aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/sched/core.c 420
-rw-r--r-- kernel/sched/debug.c 12
-rw-r--r-- kernel/sched/fair.c 462
-rw-r--r-- kernel/sched/idle_task.c 2
-rw-r--r-- kernel/sched/rt.c 56
-rw-r--r-- kernel/sched/sched.h 8
6 files changed, 352 insertions, 608 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2e2e173d8f7..d833cc94eedc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -693,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
693} 693}
694#endif 694#endif
695 695
696void update_cpu_load(struct rq *this_rq);
697
698static void set_load_weight(struct task_struct *p) 696static void set_load_weight(struct task_struct *p)
699{ 697{
700 int prio = p->static_prio - MAX_RT_PRIO; 698 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2481,22 +2479,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2481 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2479 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2482 * every tick. We fix it up based on jiffies. 2480 * every tick. We fix it up based on jiffies.
2483 */ 2481 */
2484void update_cpu_load(struct rq *this_rq) 2482static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2483 unsigned long pending_updates)
2485{ 2484{
2486 unsigned long this_load = this_rq->load.weight;
2487 unsigned long curr_jiffies = jiffies;
2488 unsigned long pending_updates;
2489 int i, scale; 2485 int i, scale;
2490 2486
2491 this_rq->nr_load_updates++; 2487 this_rq->nr_load_updates++;
2492 2488
2493 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2494 if (curr_jiffies == this_rq->last_load_update_tick)
2495 return;
2496
2497 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2498 this_rq->last_load_update_tick = curr_jiffies;
2499
2500 /* Update our load: */ 2489 /* Update our load: */
2501 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2490 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2502 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2491 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2521,9 +2510,45 @@ void update_cpu_load(struct rq *this_rq)
2521 sched_avg_update(this_rq); 2510 sched_avg_update(this_rq);
2522} 2511}
2523 2512
2513/*
2514 * Called from nohz_idle_balance() to update the load ratings before doing the
2515 * idle balance.
2516 */
2517void update_idle_cpu_load(struct rq *this_rq)
2518{
2519 unsigned long curr_jiffies = jiffies;
2520 unsigned long load = this_rq->load.weight;
2521 unsigned long pending_updates;
2522
2523 /*
2524 * Bloody broken means of dealing with nohz, but better than nothing..
2525 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2526 * update and see 0 difference the one time and 2 the next, even though
2527 * we ticked at roughly the same rate.
2528 *
2529 * Hence we only use this from nohz_idle_balance() and skip this
2530 * nonsense when called from the scheduler_tick() since that's
2531 * guaranteed a stable rate.
2532 */
2533 if (load || curr_jiffies == this_rq->last_load_update_tick)
2534 return;
2535
2536 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2537 this_rq->last_load_update_tick = curr_jiffies;
2538
2539 __update_cpu_load(this_rq, load, pending_updates);
2540}
2541
2542/*
2543 * Called from scheduler_tick()
2544 */
2524static void update_cpu_load_active(struct rq *this_rq) 2545static void update_cpu_load_active(struct rq *this_rq)
2525{ 2546{
2526 update_cpu_load(this_rq); 2547 /*
2548 * See the mess in update_idle_cpu_load().
2549 */
2550 this_rq->last_load_update_tick = jiffies;
2551 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2527 2552
2528 calc_load_account_active(this_rq); 2553 calc_load_account_active(this_rq);
2529} 2554}
@@ -3108,6 +3133,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3108 if (irqs_disabled()) 3133 if (irqs_disabled())
3109 print_irqtrace_events(prev); 3134 print_irqtrace_events(prev);
3110 dump_stack(); 3135 dump_stack();
3136 add_taint(TAINT_WARN);
3111} 3137}
3112 3138
3113/* 3139/*
@@ -5555,7 +5581,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5555 break; 5581 break;
5556 } 5582 }
5557 5583
5558 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5584 if (!(sd->flags & SD_OVERLAP) &&
5585 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5559 printk(KERN_CONT "\n"); 5586 printk(KERN_CONT "\n");
5560 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5587 printk(KERN_ERR "ERROR: repeated CPUs\n");
5561 break; 5588 break;
@@ -5893,99 +5920,11 @@ static int __init isolated_cpu_setup(char *str)
5893 5920
5894__setup("isolcpus=", isolated_cpu_setup); 5921__setup("isolcpus=", isolated_cpu_setup);
5895 5922
5896#ifdef CONFIG_NUMA
5897
5898/**
5899 * find_next_best_node - find the next node to include in a sched_domain
5900 * @node: node whose sched_domain we're building
5901 * @used_nodes: nodes already in the sched_domain
5902 *
5903 * Find the next node to include in a given scheduling domain. Simply
5904 * finds the closest node not already in the @used_nodes map.
5905 *
5906 * Should use nodemask_t.
5907 */
5908static int find_next_best_node(int node, nodemask_t *used_nodes)
5909{
5910 int i, n, val, min_val, best_node = -1;
5911
5912 min_val = INT_MAX;
5913
5914 for (i = 0; i < nr_node_ids; i++) {
5915 /* Start at @node */
5916 n = (node + i) % nr_node_ids;
5917
5918 if (!nr_cpus_node(n))
5919 continue;
5920
5921 /* Skip already used nodes */
5922 if (node_isset(n, *used_nodes))
5923 continue;
5924
5925 /* Simple min distance search */
5926 val = node_distance(node, n);
5927
5928 if (val < min_val) {
5929 min_val = val;
5930 best_node = n;
5931 }
5932 }
5933
5934 if (best_node != -1)
5935 node_set(best_node, *used_nodes);
5936 return best_node;
5937}
5938
5939/**
5940 * sched_domain_node_span - get a cpumask for a node's sched_domain
5941 * @node: node whose cpumask we're constructing
5942 * @span: resulting cpumask
5943 *
5944 * Given a node, construct a good cpumask for its sched_domain to span. It
5945 * should be one that prevents unnecessary balancing, but also spreads tasks
5946 * out optimally.
5947 */
5948static void sched_domain_node_span(int node, struct cpumask *span)
5949{
5950 nodemask_t used_nodes;
5951 int i;
5952
5953 cpumask_clear(span);
5954 nodes_clear(used_nodes);
5955
5956 cpumask_or(span, span, cpumask_of_node(node));
5957 node_set(node, used_nodes);
5958
5959 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5960 int next_node = find_next_best_node(node, &used_nodes);
5961 if (next_node < 0)
5962 break;
5963 cpumask_or(span, span, cpumask_of_node(next_node));
5964 }
5965}
5966
5967static const struct cpumask *cpu_node_mask(int cpu)
5968{
5969 lockdep_assert_held(&sched_domains_mutex);
5970
5971 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5972
5973 return sched_domains_tmpmask;
5974}
5975
5976static const struct cpumask *cpu_allnodes_mask(int cpu)
5977{
5978 return cpu_possible_mask;
5979}
5980#endif /* CONFIG_NUMA */
5981
5982static const struct cpumask *cpu_cpu_mask(int cpu) 5923static const struct cpumask *cpu_cpu_mask(int cpu)
5983{ 5924{
5984 return cpumask_of_node(cpu_to_node(cpu)); 5925 return cpumask_of_node(cpu_to_node(cpu));
5985} 5926}
5986 5927
5987int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5988
5989struct sd_data { 5928struct sd_data {
5990 struct sched_domain **__percpu sd; 5929 struct sched_domain **__percpu sd;
5991 struct sched_group **__percpu sg; 5930 struct sched_group **__percpu sg;
@@ -6015,6 +5954,7 @@ struct sched_domain_topology_level {
6015 sched_domain_init_f init; 5954 sched_domain_init_f init;
6016 sched_domain_mask_f mask; 5955 sched_domain_mask_f mask;
6017 int flags; 5956 int flags;
5957 int numa_level;
6018 struct sd_data data; 5958 struct sd_data data;
6019}; 5959};
6020 5960
@@ -6206,10 +6146,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6206} 6146}
6207 6147
6208SD_INIT_FUNC(CPU) 6148SD_INIT_FUNC(CPU)
6209#ifdef CONFIG_NUMA
6210 SD_INIT_FUNC(ALLNODES)
6211 SD_INIT_FUNC(NODE)
6212#endif
6213#ifdef CONFIG_SCHED_SMT 6149#ifdef CONFIG_SCHED_SMT
6214 SD_INIT_FUNC(SIBLING) 6150 SD_INIT_FUNC(SIBLING)
6215#endif 6151#endif
@@ -6331,15 +6267,184 @@ static struct sched_domain_topology_level default_topology[] = {
6331 { sd_init_BOOK, cpu_book_mask, }, 6267 { sd_init_BOOK, cpu_book_mask, },
6332#endif 6268#endif
6333 { sd_init_CPU, cpu_cpu_mask, }, 6269 { sd_init_CPU, cpu_cpu_mask, },
6334#ifdef CONFIG_NUMA
6335 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6336 { sd_init_ALLNODES, cpu_allnodes_mask, },
6337#endif
6338 { NULL, }, 6270 { NULL, },
6339}; 6271};
6340 6272
6341static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6273static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6342 6274
6275#ifdef CONFIG_NUMA
6276
6277static int sched_domains_numa_levels;
6278static int sched_domains_numa_scale;
6279static int *sched_domains_numa_distance;
6280static struct cpumask ***sched_domains_numa_masks;
6281static int sched_domains_curr_level;
6282
6283static inline int sd_local_flags(int level)
6284{
6285 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
6286 return 0;
6287
6288 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6289}
6290
6291static struct sched_domain *
6292sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6293{
6294 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6295 int level = tl->numa_level;
6296 int sd_weight = cpumask_weight(
6297 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6298
6299 *sd = (struct sched_domain){
6300 .min_interval = sd_weight,
6301 .max_interval = 2*sd_weight,
6302 .busy_factor = 32,
6303 .imbalance_pct = 125,
6304 .cache_nice_tries = 2,
6305 .busy_idx = 3,
6306 .idle_idx = 2,
6307 .newidle_idx = 0,
6308 .wake_idx = 0,
6309 .forkexec_idx = 0,
6310
6311 .flags = 1*SD_LOAD_BALANCE
6312 | 1*SD_BALANCE_NEWIDLE
6313 | 0*SD_BALANCE_EXEC
6314 | 0*SD_BALANCE_FORK
6315 | 0*SD_BALANCE_WAKE
6316 | 0*SD_WAKE_AFFINE
6317 | 0*SD_PREFER_LOCAL
6318 | 0*SD_SHARE_CPUPOWER
6319 | 0*SD_SHARE_PKG_RESOURCES
6320 | 1*SD_SERIALIZE
6321 | 0*SD_PREFER_SIBLING
6322 | sd_local_flags(level)
6323 ,
6324 .last_balance = jiffies,
6325 .balance_interval = sd_weight,
6326 };
6327 SD_INIT_NAME(sd, NUMA);
6328 sd->private = &tl->data;
6329
6330 /*
6331 * Ugly hack to pass state to sd_numa_mask()...
6332 */
6333 sched_domains_curr_level = tl->numa_level;
6334
6335 return sd;
6336}
6337
6338static const struct cpumask *sd_numa_mask(int cpu)
6339{
6340 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6341}
6342
6343static void sched_init_numa(void)
6344{
6345 int next_distance, curr_distance = node_distance(0, 0);
6346 struct sched_domain_topology_level *tl;
6347 int level = 0;
6348 int i, j, k;
6349
6350 sched_domains_numa_scale = curr_distance;
6351 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6352 if (!sched_domains_numa_distance)
6353 return;
6354
6355 /*
6356 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6357 * unique distances in the node_distance() table.
6358 *
6359 * Assumes node_distance(0,j) includes all distances in
6360 * node_distance(i,j) in order to avoid cubic time.
6361 *
6362 * XXX: could be optimized to O(n log n) by using sort()
6363 */
6364 next_distance = curr_distance;
6365 for (i = 0; i < nr_node_ids; i++) {
6366 for (j = 0; j < nr_node_ids; j++) {
6367 int distance = node_distance(0, j);
6368 if (distance > curr_distance &&
6369 (distance < next_distance ||
6370 next_distance == curr_distance))
6371 next_distance = distance;
6372 }
6373 if (next_distance != curr_distance) {
6374 sched_domains_numa_distance[level++] = next_distance;
6375 sched_domains_numa_levels = level;
6376 curr_distance = next_distance;
6377 } else break;
6378 }
6379 /*
6380 * 'level' contains the number of unique distances, excluding the
6381 * identity distance node_distance(i,i).
6382 *
6383 * The sched_domains_numa_distance[] array includes the actual distance
6384 * numbers.
6385 */
6386
6387 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6388 if (!sched_domains_numa_masks)
6389 return;
6390
6391 /*
6392 * Now for each level, construct a mask per node which contains all
6393 * cpus of nodes that are that many hops away from us.
6394 */
6395 for (i = 0; i < level; i++) {
6396 sched_domains_numa_masks[i] =
6397 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6398 if (!sched_domains_numa_masks[i])
6399 return;
6400
6401 for (j = 0; j < nr_node_ids; j++) {
6402 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
6403 if (!mask)
6404 return;
6405
6406 sched_domains_numa_masks[i][j] = mask;
6407
6408 for (k = 0; k < nr_node_ids; k++) {
6409 if (node_distance(j, k) > sched_domains_numa_distance[i])
6410 continue;
6411
6412 cpumask_or(mask, mask, cpumask_of_node(k));
6413 }
6414 }
6415 }
6416
6417 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6418 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6419 if (!tl)
6420 return;
6421
6422 /*
6423 * Copy the default topology bits..
6424 */
6425 for (i = 0; default_topology[i].init; i++)
6426 tl[i] = default_topology[i];
6427
6428 /*
6429 * .. and append 'j' levels of NUMA goodness.
6430 */
6431 for (j = 0; j < level; i++, j++) {
6432 tl[i] = (struct sched_domain_topology_level){
6433 .init = sd_numa_init,
6434 .mask = sd_numa_mask,
6435 .flags = SDTL_OVERLAP,
6436 .numa_level = j,
6437 };
6438 }
6439
6440 sched_domain_topology = tl;
6441}
6442#else
6443static inline void sched_init_numa(void)
6444{
6445}
6446#endif /* CONFIG_NUMA */
6447
6343static int __sdt_alloc(const struct cpumask *cpu_map) 6448static int __sdt_alloc(const struct cpumask *cpu_map)
6344{ 6449{
6345 struct sched_domain_topology_level *tl; 6450 struct sched_domain_topology_level *tl;
@@ -6707,97 +6812,6 @@ match2:
6707 mutex_unlock(&sched_domains_mutex); 6812 mutex_unlock(&sched_domains_mutex);
6708} 6813}
6709 6814
6710#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6711static void reinit_sched_domains(void)
6712{
6713 get_online_cpus();
6714
6715 /* Destroy domains first to force the rebuild */
6716 partition_sched_domains(0, NULL, NULL);
6717
6718 rebuild_sched_domains();
6719 put_online_cpus();
6720}
6721
6722static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6723{
6724 unsigned int level = 0;
6725
6726 if (sscanf(buf, "%u", &level) != 1)
6727 return -EINVAL;
6728
6729 /*
6730 * level is always positive so don't check for
6731 * level < POWERSAVINGS_BALANCE_NONE which is 0
6732 * What happens on 0 or 1 byte write,
6733 * need to check for count as well?
6734 */
6735
6736 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6737 return -EINVAL;
6738
6739 if (smt)
6740 sched_smt_power_savings = level;
6741 else
6742 sched_mc_power_savings = level;
6743
6744 reinit_sched_domains();
6745
6746 return count;
6747}
6748
6749#ifdef CONFIG_SCHED_MC
6750static ssize_t sched_mc_power_savings_show(struct device *dev,
6751 struct device_attribute *attr,
6752 char *buf)
6753{
6754 return sprintf(buf, "%u\n", sched_mc_power_savings);
6755}
6756static ssize_t sched_mc_power_savings_store(struct device *dev,
6757 struct device_attribute *attr,
6758 const char *buf, size_t count)
6759{
6760 return sched_power_savings_store(buf, count, 0);
6761}
6762static DEVICE_ATTR(sched_mc_power_savings, 0644,
6763 sched_mc_power_savings_show,
6764 sched_mc_power_savings_store);
6765#endif
6766
6767#ifdef CONFIG_SCHED_SMT
6768static ssize_t sched_smt_power_savings_show(struct device *dev,
6769 struct device_attribute *attr,
6770 char *buf)
6771{
6772 return sprintf(buf, "%u\n", sched_smt_power_savings);
6773}
6774static ssize_t sched_smt_power_savings_store(struct device *dev,
6775 struct device_attribute *attr,
6776 const char *buf, size_t count)
6777{
6778 return sched_power_savings_store(buf, count, 1);
6779}
6780static DEVICE_ATTR(sched_smt_power_savings, 0644,
6781 sched_smt_power_savings_show,
6782 sched_smt_power_savings_store);
6783#endif
6784
6785int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6786{
6787 int err = 0;
6788
6789#ifdef CONFIG_SCHED_SMT
6790 if (smt_capable())
6791 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6792#endif
6793#ifdef CONFIG_SCHED_MC
6794 if (!err && mc_capable())
6795 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6796#endif
6797 return err;
6798}
6799#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6800
6801/* 6815/*
6802 * Update cpusets according to cpu_active mask. If cpusets are 6816 * Update cpusets according to cpu_active mask. If cpusets are
6803 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6817 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6835,6 +6849,8 @@ void __init sched_init_smp(void)
6835 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6849 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6836 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6850 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6837 6851
6852 sched_init_numa();
6853
6838 get_online_cpus(); 6854 get_online_cpus();
6839 mutex_lock(&sched_domains_mutex); 6855 mutex_lock(&sched_domains_mutex);
6840 init_sched_domains(cpu_active_mask); 6856 init_sched_domains(cpu_active_mask);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..940e6d17cf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2721 * If power savings logic is enabled for a domain, see if we 2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider. 2722 * are not overloaded, if so, don't balance wider.
2723 */ 2723 */
2724 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { 2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0; 2725 unsigned long power = 0;
2726 unsigned long nr_running = 0; 2726 unsigned long nr_running = 0;
2727 unsigned long capacity; 2727 unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2734 2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736 2736
2737 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2738 nr_running /= 2;
2739
2740 if (nr_running < capacity) 2737 if (nr_running < capacity)
2741 want_sd = 0; 2738 want_sd = 0;
2742 } 2739 }
@@ -3082,7 +3079,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3079 struct rq *dst_rq;
3083 3080
3084 enum cpu_idle_type idle; 3081 enum cpu_idle_type idle;
3085 long load_move; 3082 long imbalance;
3086 unsigned int flags; 3083 unsigned int flags;
3087 3084
3088 unsigned int loop; 3085 unsigned int loop;
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p);
3218static const unsigned int sched_nr_migrate_break = 32; 3215static const unsigned int sched_nr_migrate_break = 32;
3219 3216
3220/* 3217/*
3221 * move_tasks tries to move up to load_move weighted load from busiest to 3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3222 * this_rq, as part of a balancing operation within domain "sd". 3219 * this_rq, as part of a balancing operation within domain "sd".
3223 * Returns 1 if successful and 0 otherwise. 3220 * Returns 1 if successful and 0 otherwise.
3224 * 3221 *
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
3231 unsigned long load; 3228 unsigned long load;
3232 int pulled = 0; 3229 int pulled = 0;
3233 3230
3234 if (env->load_move <= 0) 3231 if (env->imbalance <= 0)
3235 return 0; 3232 return 0;
3236 3233
3237 while (!list_empty(tasks)) { 3234 while (!list_empty(tasks)) {
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env)
3257 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) 3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3258 goto next; 3255 goto next;
3259 3256
3260 if ((load / 2) > env->load_move) 3257 if ((load / 2) > env->imbalance)
3261 goto next; 3258 goto next;
3262 3259
3263 if (!can_migrate_task(p, env)) 3260 if (!can_migrate_task(p, env))
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
3265 3262
3266 move_task(p, env); 3263 move_task(p, env);
3267 pulled++; 3264 pulled++;
3268 env->load_move -= load; 3265 env->imbalance -= load;
3269 3266
3270#ifdef CONFIG_PREEMPT 3267#ifdef CONFIG_PREEMPT
3271 /* 3268 /*
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
3281 * We only want to steal up to the prescribed amount of 3278 * We only want to steal up to the prescribed amount of
3282 * weighted load. 3279 * weighted load.
3283 */ 3280 */
3284 if (env->load_move <= 0) 3281 if (env->imbalance <= 0)
3285 break; 3282 break;
3286 3283
3287 continue; 3284 continue;
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
3435 unsigned int busiest_group_weight; 3432 unsigned int busiest_group_weight;
3436 3433
3437 int group_imb; /* Is there imbalance in this sd */ 3434 int group_imb; /* Is there imbalance in this sd */
3438#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3439 int power_savings_balance; /* Is powersave balance needed for this sd */
3440 struct sched_group *group_min; /* Least loaded group in sd */
3441 struct sched_group *group_leader; /* Group which relieves group_min */
3442 unsigned long min_load_per_task; /* load_per_task in group_min */
3443 unsigned long leader_nr_running; /* Nr running of group_leader */
3444 unsigned long min_nr_running; /* Nr running of group_min */
3445#endif
3446}; 3435};
3447 3436
3448/* 3437/*
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
3486 return load_idx; 3475 return load_idx;
3487} 3476}
3488 3477
3489
3490#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3491/**
3492 * init_sd_power_savings_stats - Initialize power savings statistics for
3493 * the given sched_domain, during load balancing.
3494 *
3495 * @sd: Sched domain whose power-savings statistics are to be initialized.
3496 * @sds: Variable containing the statistics for sd.
3497 * @idle: Idle status of the CPU at which we're performing load-balancing.
3498 */
3499static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3500 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3501{
3502 /*
3503 * Busy processors will not participate in power savings
3504 * balance.
3505 */
3506 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3507 sds->power_savings_balance = 0;
3508 else {
3509 sds->power_savings_balance = 1;
3510 sds->min_nr_running = ULONG_MAX;
3511 sds->leader_nr_running = 0;
3512 }
3513}
3514
3515/**
3516 * update_sd_power_savings_stats - Update the power saving stats for a
3517 * sched_domain while performing load balancing.
3518 *
3519 * @group: sched_group belonging to the sched_domain under consideration.
3520 * @sds: Variable containing the statistics of the sched_domain
3521 * @local_group: Does group contain the CPU for which we're performing
3522 * load balancing ?
3523 * @sgs: Variable containing the statistics of the group.
3524 */
3525static inline void update_sd_power_savings_stats(struct sched_group *group,
3526 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3527{
3528
3529 if (!sds->power_savings_balance)
3530 return;
3531
3532 /*
3533 * If the local group is idle or completely loaded
3534 * no need to do power savings balance at this domain
3535 */
3536 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3537 !sds->this_nr_running))
3538 sds->power_savings_balance = 0;
3539
3540 /*
3541 * If a group is already running at full capacity or idle,
3542 * don't include that group in power savings calculations
3543 */
3544 if (!sds->power_savings_balance ||
3545 sgs->sum_nr_running >= sgs->group_capacity ||
3546 !sgs->sum_nr_running)
3547 return;
3548
3549 /*
3550 * Calculate the group which has the least non-idle load.
3551 * This is the group from where we need to pick up the load
3552 * for saving power
3553 */
3554 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3555 (sgs->sum_nr_running == sds->min_nr_running &&
3556 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3557 sds->group_min = group;
3558 sds->min_nr_running = sgs->sum_nr_running;
3559 sds->min_load_per_task = sgs->sum_weighted_load /
3560 sgs->sum_nr_running;
3561 }
3562
3563 /*
3564 * Calculate the group which is almost near its
3565 * capacity but still has some space to pick up some load
3566 * from other group and save more power
3567 */
3568 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3569 return;
3570
3571 if (sgs->sum_nr_running > sds->leader_nr_running ||
3572 (sgs->sum_nr_running == sds->leader_nr_running &&
3573 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3574 sds->group_leader = group;
3575 sds->leader_nr_running = sgs->sum_nr_running;
3576 }
3577}
3578
3579/**
3580 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3581 * @sds: Variable containing the statistics of the sched_domain
3582 * under consideration.
3583 * @this_cpu: Cpu at which we're currently performing load-balancing.
3584 * @imbalance: Variable to store the imbalance.
3585 *
3586 * Description:
3587 * Check if we have potential to perform some power-savings balance.
3588 * If yes, set the busiest group to be the least loaded group in the
3589 * sched_domain, so that it's CPUs can be put to idle.
3590 *
3591 * Returns 1 if there is potential to perform power-savings balance.
3592 * Else returns 0.
3593 */
3594static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3595 int this_cpu, unsigned long *imbalance)
3596{
3597 if (!sds->power_savings_balance)
3598 return 0;
3599
3600 if (sds->this != sds->group_leader ||
3601 sds->group_leader == sds->group_min)
3602 return 0;
3603
3604 *imbalance = sds->min_load_per_task;
3605 sds->busiest = sds->group_min;
3606
3607 return 1;
3608
3609}
3610#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3611static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3612 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3613{
3614 return;
3615}
3616
3617static inline void update_sd_power_savings_stats(struct sched_group *group,
3618 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3619{
3620 return;
3621}
3622
3623static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 int this_cpu, unsigned long *imbalance)
3625{
3626 return 0;
3627}
3628#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3629
3630
3631unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3632{ 3479{
3633 return SCHED_POWER_SCALE; 3480 return SCHED_POWER_SCALE;
@@ -3765,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3765 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3766 * @sd: The sched_domain whose statistics are to be updated. 3613 * @sd: The sched_domain whose statistics are to be updated.
3767 * @group: sched_group whose statistics are to be updated. 3614 * @group: sched_group whose statistics are to be updated.
3768 * @this_cpu: Cpu for which load balance is currently performed.
3769 * @idle: Idle status of this_cpu
3770 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3615 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3771 * @local_group: Does group contain this_cpu. 3616 * @local_group: Does group contain this_cpu.
3772 * @cpus: Set of cpus considered for load balancing. 3617 * @cpus: Set of cpus considered for load balancing.
3773 * @balance: Should we balance. 3618 * @balance: Should we balance.
3774 * @sgs: variable to hold the statistics for this group. 3619 * @sgs: variable to hold the statistics for this group.
3775 */ 3620 */
3776static inline void update_sg_lb_stats(struct sched_domain *sd, 3621static inline void update_sg_lb_stats(struct lb_env *env,
3777 struct sched_group *group, int this_cpu, 3622 struct sched_group *group, int load_idx,
3778 enum cpu_idle_type idle, int load_idx,
3779 int local_group, const struct cpumask *cpus, 3623 int local_group, const struct cpumask *cpus,
3780 int *balance, struct sg_lb_stats *sgs) 3624 int *balance, struct sg_lb_stats *sgs)
3781{ 3625{
3782 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3626 unsigned long nr_running, max_nr_running, min_nr_running;
3783 int i; 3627 unsigned long load, max_cpu_load, min_cpu_load;
3784 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3628 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3785 unsigned long avg_load_per_task = 0; 3629 unsigned long avg_load_per_task = 0;
3630 int i;
3786 3631
3787 if (local_group) 3632 if (local_group)
3788 balance_cpu = group_first_cpu(group); 3633 balance_cpu = group_first_cpu(group);
@@ -3791,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3791 max_cpu_load = 0; 3636 max_cpu_load = 0;
3792 min_cpu_load = ~0UL; 3637 min_cpu_load = ~0UL;
3793 max_nr_running = 0; 3638 max_nr_running = 0;
3639 min_nr_running = ~0UL;
3794 3640
3795 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3641 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3796 struct rq *rq = cpu_rq(i); 3642 struct rq *rq = cpu_rq(i);
3797 3643
3644 nr_running = rq->nr_running;
3645
3798 /* Bias balancing toward cpus of our domain */ 3646 /* Bias balancing toward cpus of our domain */
3799 if (local_group) { 3647 if (local_group) {
3800 if (idle_cpu(i) && !first_idle_cpu) { 3648 if (idle_cpu(i) && !first_idle_cpu) {
@@ -3805,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3805 load = target_load(i, load_idx); 3653 load = target_load(i, load_idx);
3806 } else { 3654 } else {
3807 load = source_load(i, load_idx); 3655 load = source_load(i, load_idx);
3808 if (load > max_cpu_load) { 3656 if (load > max_cpu_load)
3809 max_cpu_load = load; 3657 max_cpu_load = load;
3810 max_nr_running = rq->nr_running;
3811 }
3812 if (min_cpu_load > load) 3658 if (min_cpu_load > load)
3813 min_cpu_load = load; 3659 min_cpu_load = load;
3660
3661 if (nr_running > max_nr_running)
3662 max_nr_running = nr_running;
3663 if (min_nr_running > nr_running)
3664 min_nr_running = nr_running;
3814 } 3665 }
3815 3666
3816 sgs->group_load += load; 3667 sgs->group_load += load;
3817 sgs->sum_nr_running += rq->nr_running; 3668 sgs->sum_nr_running += nr_running;
3818 sgs->sum_weighted_load += weighted_cpuload(i); 3669 sgs->sum_weighted_load += weighted_cpuload(i);
3819 if (idle_cpu(i)) 3670 if (idle_cpu(i))
3820 sgs->idle_cpus++; 3671 sgs->idle_cpus++;
@@ -3827,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3827 * to do the newly idle load balance. 3678 * to do the newly idle load balance.
3828 */ 3679 */
3829 if (local_group) { 3680 if (local_group) {
3830 if (idle != CPU_NEWLY_IDLE) { 3681 if (env->idle != CPU_NEWLY_IDLE) {
3831 if (balance_cpu != this_cpu) { 3682 if (balance_cpu != env->dst_cpu) {
3832 *balance = 0; 3683 *balance = 0;
3833 return; 3684 return;
3834 } 3685 }
3835 update_group_power(sd, this_cpu); 3686 update_group_power(env->sd, env->dst_cpu);
3836 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3687 } else if (time_after_eq(jiffies, group->sgp->next_update))
3837 update_group_power(sd, this_cpu); 3688 update_group_power(env->sd, env->dst_cpu);
3838 } 3689 }
3839 3690
3840 /* Adjust by relative CPU power of the group */ 3691 /* Adjust by relative CPU power of the group */
@@ -3852,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3852 if (sgs->sum_nr_running) 3703 if (sgs->sum_nr_running)
3853 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3704 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3854 3705
3855 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3706 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3707 (max_nr_running - min_nr_running) > 1)
3856 sgs->group_imb = 1; 3708 sgs->group_imb = 1;
3857 3709
3858 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3710 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3859 SCHED_POWER_SCALE); 3711 SCHED_POWER_SCALE);
3860 if (!sgs->group_capacity) 3712 if (!sgs->group_capacity)
3861 sgs->group_capacity = fix_small_capacity(sd, group); 3713 sgs->group_capacity = fix_small_capacity(env->sd, group);
3862 sgs->group_weight = group->group_weight; 3714 sgs->group_weight = group->group_weight;
3863 3715
3864 if (sgs->group_capacity > sgs->sum_nr_running) 3716 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -3876,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3876 * Determine if @sg is a busier group than the previously selected 3728 * Determine if @sg is a busier group than the previously selected
3877 * busiest group. 3729 * busiest group.
3878 */ 3730 */
3879static bool update_sd_pick_busiest(struct sched_domain *sd, 3731static bool update_sd_pick_busiest(struct lb_env *env,
3880 struct sd_lb_stats *sds, 3732 struct sd_lb_stats *sds,
3881 struct sched_group *sg, 3733 struct sched_group *sg,
3882 struct sg_lb_stats *sgs, 3734 struct sg_lb_stats *sgs)
3883 int this_cpu)
3884{ 3735{
3885 if (sgs->avg_load <= sds->max_load) 3736 if (sgs->avg_load <= sds->max_load)
3886 return false; 3737 return false;
@@ -3896,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3896 * numbered CPUs in the group, therefore mark all groups 3747 * numbered CPUs in the group, therefore mark all groups
3897 * higher than ourself as busy. 3748 * higher than ourself as busy.
3898 */ 3749 */
3899 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3750 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3900 this_cpu < group_first_cpu(sg)) { 3751 env->dst_cpu < group_first_cpu(sg)) {
3901 if (!sds->busiest) 3752 if (!sds->busiest)
3902 return true; 3753 return true;
3903 3754
@@ -3917,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3917 * @balance: Should we balance. 3768 * @balance: Should we balance.
3918 * @sds: variable to hold the statistics for this sched_domain. 3769 * @sds: variable to hold the statistics for this sched_domain.
3919 */ 3770 */
3920static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3771static inline void update_sd_lb_stats(struct lb_env *env,
3921 enum cpu_idle_type idle, const struct cpumask *cpus, 3772 const struct cpumask *cpus,
3922 int *balance, struct sd_lb_stats *sds) 3773 int *balance, struct sd_lb_stats *sds)
3923{ 3774{
3924 struct sched_domain *child = sd->child; 3775 struct sched_domain *child = env->sd->child;
3925 struct sched_group *sg = sd->groups; 3776 struct sched_group *sg = env->sd->groups;
3926 struct sg_lb_stats sgs; 3777 struct sg_lb_stats sgs;
3927 int load_idx, prefer_sibling = 0; 3778 int load_idx, prefer_sibling = 0;
3928 3779
3929 if (child && child->flags & SD_PREFER_SIBLING) 3780 if (child && child->flags & SD_PREFER_SIBLING)
3930 prefer_sibling = 1; 3781 prefer_sibling = 1;
3931 3782
3932 init_sd_power_savings_stats(sd, sds, idle); 3783 load_idx = get_sd_load_idx(env->sd, env->idle);
3933 load_idx = get_sd_load_idx(sd, idle);
3934 3784
3935 do { 3785 do {
3936 int local_group; 3786 int local_group;
3937 3787
3938 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3788 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3939 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3940 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3790 update_sg_lb_stats(env, sg, load_idx, local_group,
3941 local_group, cpus, balance, &sgs); 3791 cpus, balance, &sgs);
3942 3792
3943 if (local_group && !(*balance)) 3793 if (local_group && !(*balance))
3944 return; 3794 return;
@@ -3966,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3966 sds->this_load_per_task = sgs.sum_weighted_load; 3816 sds->this_load_per_task = sgs.sum_weighted_load;
3967 sds->this_has_capacity = sgs.group_has_capacity; 3817 sds->this_has_capacity = sgs.group_has_capacity;
3968 sds->this_idle_cpus = sgs.idle_cpus; 3818 sds->this_idle_cpus = sgs.idle_cpus;
3969 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3819 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3970 sds->max_load = sgs.avg_load; 3820 sds->max_load = sgs.avg_load;
3971 sds->busiest = sg; 3821 sds->busiest = sg;
3972 sds->busiest_nr_running = sgs.sum_nr_running; 3822 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3978,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3978 sds->group_imb = sgs.group_imb; 3828 sds->group_imb = sgs.group_imb;
3979 } 3829 }
3980 3830
3981 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3982 sg = sg->next; 3831 sg = sg->next;
3983 } while (sg != sd->groups); 3832 } while (sg != env->sd->groups);
3984} 3833}
3985 3834
3986/** 3835/**
@@ -4008,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4008 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4009 * @imbalance: returns amount of imbalanced due to packing. 3858 * @imbalance: returns amount of imbalanced due to packing.
4010 */ 3859 */
4011static int check_asym_packing(struct sched_domain *sd, 3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4012 struct sd_lb_stats *sds,
4013 int this_cpu, unsigned long *imbalance)
4014{ 3861{
4015 int busiest_cpu; 3862 int busiest_cpu;
4016 3863
4017 if (!(sd->flags & SD_ASYM_PACKING)) 3864 if (!(env->sd->flags & SD_ASYM_PACKING))
4018 return 0; 3865 return 0;
4019 3866
4020 if (!sds->busiest) 3867 if (!sds->busiest)
4021 return 0; 3868 return 0;
4022 3869
4023 busiest_cpu = group_first_cpu(sds->busiest); 3870 busiest_cpu = group_first_cpu(sds->busiest);
4024 if (this_cpu > busiest_cpu) 3871 if (env->dst_cpu > busiest_cpu)
4025 return 0; 3872 return 0;
4026 3873
4027 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 3874 env->imbalance = DIV_ROUND_CLOSEST(
4028 SCHED_POWER_SCALE); 3875 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3876
4029 return 1; 3877 return 1;
4030} 3878}
4031 3879
@@ -4037,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd,
4037 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4038 * @imbalance: Variable to store the imbalance. 3886 * @imbalance: Variable to store the imbalance.
4039 */ 3887 */
4040static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3888static inline
4041 int this_cpu, unsigned long *imbalance) 3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4042{ 3890{
4043 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3891 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4044 unsigned int imbn = 2; 3892 unsigned int imbn = 2;
@@ -4049,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4049 if (sds->busiest_load_per_task > 3897 if (sds->busiest_load_per_task >
4050 sds->this_load_per_task) 3898 sds->this_load_per_task)
4051 imbn = 1; 3899 imbn = 1;
4052 } else 3900 } else {
4053 sds->this_load_per_task = 3901 sds->this_load_per_task =
4054 cpu_avg_load_per_task(this_cpu); 3902 cpu_avg_load_per_task(env->dst_cpu);
3903 }
4055 3904
4056 scaled_busy_load_per_task = sds->busiest_load_per_task 3905 scaled_busy_load_per_task = sds->busiest_load_per_task
4057 * SCHED_POWER_SCALE; 3906 * SCHED_POWER_SCALE;
@@ -4059,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4059 3908
4060 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3909 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4061 (scaled_busy_load_per_task * imbn)) { 3910 (scaled_busy_load_per_task * imbn)) {
4062 *imbalance = sds->busiest_load_per_task; 3911 env->imbalance = sds->busiest_load_per_task;
4063 return; 3912 return;
4064 } 3913 }
4065 3914
@@ -4096,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4096 3945
4097 /* Move if we gain throughput */ 3946 /* Move if we gain throughput */
4098 if (pwr_move > pwr_now) 3947 if (pwr_move > pwr_now)
4099 *imbalance = sds->busiest_load_per_task; 3948 env->imbalance = sds->busiest_load_per_task;
4100} 3949}
4101 3950
4102/** 3951/**
4103 * calculate_imbalance - Calculate the amount of imbalance present within the 3952 * calculate_imbalance - Calculate the amount of imbalance present within the
4104 * groups of a given sched_domain during load balance. 3953 * groups of a given sched_domain during load balance.
3954 * @env: load balance environment
4105 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3955 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4106 * @this_cpu: Cpu for which currently load balance is being performed.
4107 * @imbalance: The variable to store the imbalance.
4108 */ 3956 */
4109static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3957static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4110 unsigned long *imbalance)
4111{ 3958{
4112 unsigned long max_pull, load_above_capacity = ~0UL; 3959 unsigned long max_pull, load_above_capacity = ~0UL;
4113 3960
@@ -4123,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4123 * its cpu_power, while calculating max_load..) 3970 * its cpu_power, while calculating max_load..)
4124 */ 3971 */
4125 if (sds->max_load < sds->avg_load) { 3972 if (sds->max_load < sds->avg_load) {
4126 *imbalance = 0; 3973 env->imbalance = 0;
4127 return fix_small_imbalance(sds, this_cpu, imbalance); 3974 return fix_small_imbalance(env, sds);
4128 } 3975 }
4129 3976
4130 if (!sds->group_imb) { 3977 if (!sds->group_imb) {
@@ -4152,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4152 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3999 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4153 4000
4154 /* How much load to actually move to equalise the imbalance */ 4001 /* How much load to actually move to equalise the imbalance */
4155 *imbalance = min(max_pull * sds->busiest->sgp->power, 4002 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4156 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4003 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4157 / SCHED_POWER_SCALE; 4004 / SCHED_POWER_SCALE;
4158 4005
@@ -4162,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4162 * a think about bumping its value to force at least one task to be 4009 * a think about bumping its value to force at least one task to be
4163 * moved 4010 * moved
4164 */ 4011 */
4165 if (*imbalance < sds->busiest_load_per_task) 4012 if (env->imbalance < sds->busiest_load_per_task)
4166 return fix_small_imbalance(sds, this_cpu, imbalance); 4013 return fix_small_imbalance(env, sds);
4167 4014
4168} 4015}
4169 4016
@@ -4194,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4194 * put to idle by rebalancing its tasks onto our group. 4041 * put to idle by rebalancing its tasks onto our group.
4195 */ 4042 */
4196static struct sched_group * 4043static struct sched_group *
4197find_busiest_group(struct sched_domain *sd, int this_cpu, 4044find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4198 unsigned long *imbalance, enum cpu_idle_type idle,
4199 const struct cpumask *cpus, int *balance)
4200{ 4045{
4201 struct sd_lb_stats sds; 4046 struct sd_lb_stats sds;
4202 4047
@@ -4206,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4206 * Compute the various statistics relavent for load balancing at 4051 * Compute the various statistics relavent for load balancing at
4207 * this level. 4052 * this level.
4208 */ 4053 */
4209 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4054 update_sd_lb_stats(env, cpus, balance, &sds);
4210 4055
4211 /* 4056 /*
4212 * this_cpu is not the appropriate cpu to perform load balancing at 4057 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4215,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4215 if (!(*balance)) 4060 if (!(*balance))
4216 goto ret; 4061 goto ret;
4217 4062
4218 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4063 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4219 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4064 check_asym_packing(env, &sds))
4220 return sds.busiest; 4065 return sds.busiest;
4221 4066
4222 /* There is no busy sibling group to pull tasks from */ 4067 /* There is no busy sibling group to pull tasks from */
@@ -4234,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4234 goto force_balance; 4079 goto force_balance;
4235 4080
4236 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4081 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4237 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4082 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4238 !sds.busiest_has_capacity) 4083 !sds.busiest_has_capacity)
4239 goto force_balance; 4084 goto force_balance;
4240 4085
@@ -4252,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4252 if (sds.this_load >= sds.avg_load) 4097 if (sds.this_load >= sds.avg_load)
4253 goto out_balanced; 4098 goto out_balanced;
4254 4099
4255 if (idle == CPU_IDLE) { 4100 if (env->idle == CPU_IDLE) {
4256 /* 4101 /*
4257 * This cpu is idle. If the busiest group load doesn't 4102 * This cpu is idle. If the busiest group load doesn't
4258 * have more tasks than the number of available cpu's and 4103 * have more tasks than the number of available cpu's and
@@ -4267,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4267 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4112 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4268 * imbalance_pct to be conservative. 4113 * imbalance_pct to be conservative.
4269 */ 4114 */
4270 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4115 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4271 goto out_balanced; 4116 goto out_balanced;
4272 } 4117 }
4273 4118
4274force_balance: 4119force_balance:
4275 /* Looks like there is an imbalance. Compute it */ 4120 /* Looks like there is an imbalance. Compute it */
4276 calculate_imbalance(&sds, this_cpu, imbalance); 4121 calculate_imbalance(env, &sds);
4277 return sds.busiest; 4122 return sds.busiest;
4278 4123
4279out_balanced: 4124out_balanced:
4280 /*
4281 * There is no obvious imbalance. But check if we can do some balancing
4282 * to save power.
4283 */
4284 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4285 return sds.busiest;
4286ret: 4125ret:
4287 *imbalance = 0; 4126 env->imbalance = 0;
4288 return NULL; 4127 return NULL;
4289} 4128}
4290 4129
4291/* 4130/*
4292 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4131 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4293 */ 4132 */
4294static struct rq * 4133static struct rq *find_busiest_queue(struct lb_env *env,
4295find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4134 struct sched_group *group,
4296 enum cpu_idle_type idle, unsigned long imbalance, 4135 const struct cpumask *cpus)
4297 const struct cpumask *cpus)
4298{ 4136{
4299 struct rq *busiest = NULL, *rq; 4137 struct rq *busiest = NULL, *rq;
4300 unsigned long max_load = 0; 4138 unsigned long max_load = 0;
@@ -4307,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4307 unsigned long wl; 4145 unsigned long wl;
4308 4146
4309 if (!capacity) 4147 if (!capacity)
4310 capacity = fix_small_capacity(sd, group); 4148 capacity = fix_small_capacity(env->sd, group);
4311 4149
4312 if (!cpumask_test_cpu(i, cpus)) 4150 if (!cpumask_test_cpu(i, cpus))
4313 continue; 4151 continue;
@@ -4319,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4319 * When comparing with imbalance, use weighted_cpuload() 4157 * When comparing with imbalance, use weighted_cpuload()
4320 * which is not scaled with the cpu power. 4158 * which is not scaled with the cpu power.
4321 */ 4159 */
4322 if (capacity && rq->nr_running == 1 && wl > imbalance) 4160 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4323 continue; 4161 continue;
4324 4162
4325 /* 4163 /*
@@ -4348,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4348/* Working cpumask for load_balance and load_balance_newidle. */ 4186/* Working cpumask for load_balance and load_balance_newidle. */
4349DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4187DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4350 4188
4351static int need_active_balance(struct sched_domain *sd, int idle, 4189static int need_active_balance(struct lb_env *env)
4352 int busiest_cpu, int this_cpu)
4353{ 4190{
4354 if (idle == CPU_NEWLY_IDLE) { 4191 struct sched_domain *sd = env->sd;
4192
4193 if (env->idle == CPU_NEWLY_IDLE) {
4355 4194
4356 /* 4195 /*
4357 * ASYM_PACKING needs to force migrate tasks from busy but 4196 * ASYM_PACKING needs to force migrate tasks from busy but
4358 * higher numbered CPUs in order to pack all tasks in the 4197 * higher numbered CPUs in order to pack all tasks in the
4359 * lowest numbered CPUs. 4198 * lowest numbered CPUs.
4360 */ 4199 */
4361 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4200 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4362 return 1; 4201 return 1;
4363
4364 /*
4365 * The only task running in a non-idle cpu can be moved to this
4366 * cpu in an attempt to completely freeup the other CPU
4367 * package.
4368 *
4369 * The package power saving logic comes from
4370 * find_busiest_group(). If there are no imbalance, then
4371 * f_b_g() will return NULL. However when sched_mc={1,2} then
4372 * f_b_g() will select a group from which a running task may be
4373 * pulled to this cpu in order to make the other package idle.
4374 * If there is no opportunity to make a package idle and if
4375 * there are no imbalance, then f_b_g() will return NULL and no
4376 * action will be taken in load_balance_newidle().
4377 *
4378 * Under normal task pull operation due to imbalance, there
4379 * will be more than one task in the source run queue and
4380 * move_tasks() will succeed. ld_moved will be true and this
4381 * active balance code will not be triggered.
4382 */
4383 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4384 return 0;
4385 } 4202 }
4386 4203
4387 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 4204 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4399,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4399{ 4216{
4400 int ld_moved, active_balance = 0; 4217 int ld_moved, active_balance = 0;
4401 struct sched_group *group; 4218 struct sched_group *group;
4402 unsigned long imbalance;
4403 struct rq *busiest; 4219 struct rq *busiest;
4404 unsigned long flags; 4220 unsigned long flags;
4405 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4221 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4417,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4417 schedstat_inc(sd, lb_count[idle]); 4233 schedstat_inc(sd, lb_count[idle]);
4418 4234
4419redo: 4235redo:
4420 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4236 group = find_busiest_group(&env, cpus, balance);
4421 cpus, balance);
4422 4237
4423 if (*balance == 0) 4238 if (*balance == 0)
4424 goto out_balanced; 4239 goto out_balanced;
@@ -4428,7 +4243,7 @@ redo:
4428 goto out_balanced; 4243 goto out_balanced;
4429 } 4244 }
4430 4245
4431 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4246 busiest = find_busiest_queue(&env, group, cpus);
4432 if (!busiest) { 4247 if (!busiest) {
4433 schedstat_inc(sd, lb_nobusyq[idle]); 4248 schedstat_inc(sd, lb_nobusyq[idle]);
4434 goto out_balanced; 4249 goto out_balanced;
@@ -4436,7 +4251,7 @@ redo:
4436 4251
4437 BUG_ON(busiest == this_rq); 4252 BUG_ON(busiest == this_rq);
4438 4253
4439 schedstat_add(sd, lb_imbalance[idle], imbalance); 4254 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4440 4255
4441 ld_moved = 0; 4256 ld_moved = 0;
4442 if (busiest->nr_running > 1) { 4257 if (busiest->nr_running > 1) {
@@ -4447,10 +4262,9 @@ redo:
4447 * correctly treated as an imbalance. 4262 * correctly treated as an imbalance.
4448 */ 4263 */
4449 env.flags |= LBF_ALL_PINNED; 4264 env.flags |= LBF_ALL_PINNED;
4450 env.load_move = imbalance; 4265 env.src_cpu = busiest->cpu;
4451 env.src_cpu = busiest->cpu; 4266 env.src_rq = busiest;
4452 env.src_rq = busiest; 4267 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4453 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
4454 4268
4455more_balance: 4269more_balance:
4456 local_irq_save(flags); 4270 local_irq_save(flags);
@@ -4492,7 +4306,7 @@ more_balance:
4492 if (idle != CPU_NEWLY_IDLE) 4306 if (idle != CPU_NEWLY_IDLE)
4493 sd->nr_balance_failed++; 4307 sd->nr_balance_failed++;
4494 4308
4495 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4309 if (need_active_balance(&env)) {
4496 raw_spin_lock_irqsave(&busiest->lock, flags); 4310 raw_spin_lock_irqsave(&busiest->lock, flags);
4497 4311
4498 /* don't kick the active_load_balance_cpu_stop, 4312 /* don't kick the active_load_balance_cpu_stop,
@@ -4519,10 +4333,11 @@ more_balance:
4519 } 4333 }
4520 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4334 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4521 4335
4522 if (active_balance) 4336 if (active_balance) {
4523 stop_one_cpu_nowait(cpu_of(busiest), 4337 stop_one_cpu_nowait(cpu_of(busiest),
4524 active_load_balance_cpu_stop, busiest, 4338 active_load_balance_cpu_stop, busiest,
4525 &busiest->active_balance_work); 4339 &busiest->active_balance_work);
4340 }
4526 4341
4527 /* 4342 /*
4528 * We've kicked active balancing, reset the failure 4343 * We've kicked active balancing, reset the failure
@@ -4703,104 +4518,15 @@ static struct {
4703 unsigned long next_balance; /* in jiffy units */ 4518 unsigned long next_balance; /* in jiffy units */
4704} nohz ____cacheline_aligned; 4519} nohz ____cacheline_aligned;
4705 4520
4706#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4521static inline int find_new_ilb(int call_cpu)
4707/**
4708 * lowest_flag_domain - Return lowest sched_domain containing flag.
4709 * @cpu: The cpu whose lowest level of sched domain is to
4710 * be returned.
4711 * @flag: The flag to check for the lowest sched_domain
4712 * for the given cpu.
4713 *
4714 * Returns the lowest sched_domain of a cpu which contains the given flag.
4715 */
4716static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4717{
4718 struct sched_domain *sd;
4719
4720 for_each_domain(cpu, sd)
4721 if (sd->flags & flag)
4722 break;
4723
4724 return sd;
4725}
4726
4727/**
4728 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4729 * @cpu: The cpu whose domains we're iterating over.
4730 * @sd: variable holding the value of the power_savings_sd
4731 * for cpu.
4732 * @flag: The flag to filter the sched_domains to be iterated.
4733 *
4734 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4735 * set, starting from the lowest sched_domain to the highest.
4736 */
4737#define for_each_flag_domain(cpu, sd, flag) \
4738 for (sd = lowest_flag_domain(cpu, flag); \
4739 (sd && (sd->flags & flag)); sd = sd->parent)
4740
4741/**
4742 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4743 * @cpu: The cpu which is nominating a new idle_load_balancer.
4744 *
4745 * Returns: Returns the id of the idle load balancer if it exists,
4746 * Else, returns >= nr_cpu_ids.
4747 *
4748 * This algorithm picks the idle load balancer such that it belongs to a
4749 * semi-idle powersavings sched_domain. The idea is to try and avoid
4750 * completely idle packages/cores just for the purpose of idle load balancing
4751 * when there are other idle cpu's which are better suited for that job.
4752 */
4753static int find_new_ilb(int cpu)
4754{ 4522{
4755 int ilb = cpumask_first(nohz.idle_cpus_mask); 4523 int ilb = cpumask_first(nohz.idle_cpus_mask);
4756 struct sched_group *ilbg;
4757 struct sched_domain *sd;
4758
4759 /*
4760 * Have idle load balancer selection from semi-idle packages only
4761 * when power-aware load balancing is enabled
4762 */
4763 if (!(sched_smt_power_savings || sched_mc_power_savings))
4764 goto out_done;
4765
4766 /*
4767 * Optimize for the case when we have no idle CPUs or only one
4768 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4769 */
4770 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4771 goto out_done;
4772
4773 rcu_read_lock();
4774 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4775 ilbg = sd->groups;
4776
4777 do {
4778 if (ilbg->group_weight !=
4779 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4780 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4781 sched_group_cpus(ilbg));
4782 goto unlock;
4783 }
4784
4785 ilbg = ilbg->next;
4786
4787 } while (ilbg != sd->groups);
4788 }
4789unlock:
4790 rcu_read_unlock();
4791 4524
4792out_done:
4793 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 4525 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4794 return ilb; 4526 return ilb;
4795 4527
4796 return nr_cpu_ids; 4528 return nr_cpu_ids;
4797} 4529}
4798#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4799static inline int find_new_ilb(int call_cpu)
4800{
4801 return nr_cpu_ids;
4802}
4803#endif
4804 4530
4805/* 4531/*
4806 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 4532 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5023,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5023 4749
5024 raw_spin_lock_irq(&this_rq->lock); 4750 raw_spin_lock_irq(&this_rq->lock);
5025 update_rq_clock(this_rq); 4751 update_rq_clock(this_rq);
5026 update_cpu_load(this_rq); 4752 update_idle_cpu_load(this_rq);
5027 raw_spin_unlock_irq(&this_rq->lock); 4753 raw_spin_unlock_irq(&this_rq->lock);
5028 4754
5029 rebalance_domains(balance_cpu, CPU_IDLE); 4755 rebalance_domains(balance_cpu, CPU_IDLE);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..c5565c3c515f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1803static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1804 const struct cpumask *new_mask)
1805{ 1805{
1806 int weight = cpumask_weight(new_mask); 1806 struct rq *rq;
1807 int weight;
1807 1808
1808 BUG_ON(!rt_task(p)); 1809 BUG_ON(!rt_task(p));
1809 1810
1810 /* 1811 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1812 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816
1817 if (!task_current(rq, p)) {
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1813
1827 /* 1814 weight = cpumask_weight(new_mask);
1828 * Requeue if our weight is changing and still > 1
1829 */
1830 if (weight > 1)
1831 enqueue_pushable_task(rq, p);
1832 1815
1833 } 1816 /*
1817 * Only update if the process changes its state from whether it
1818 * can migrate or not.
1819 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
1821 return;
1834 1822
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1823 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1824
1842 update_rt_migration(&rq->rt); 1825 /*
1826 * The process used to be able to migrate OR it can now migrate
1827 */
1828 if (weight <= 1) {
1829 if (!task_current(rq, p))
1830 dequeue_pushable_task(rq, p);
1831 BUG_ON(!rq->rt.rt_nr_migratory);
1832 rq->rt.rt_nr_migratory--;
1833 } else {
1834 if (!task_current(rq, p))
1835 enqueue_pushable_task(rq, p);
1836 rq->rt.rt_nr_migratory++;
1843 } 1837 }
1838
1839 update_rt_migration(&rq->rt);
1844} 1840}
1845 1841
1846/* Assumes rq->lock is held */ 1842/* Assumes rq->lock is held */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..ba9dccfd24ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 876extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 878
879extern void update_cpu_load(struct rq *this_rq); 879extern void update_idle_cpu_load(struct rq *this_rq);
880 880
881#ifdef CONFIG_CGROUP_CPUACCT 881#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 882#include <linux/cgroup.h>