Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c        327
-rw-r--r--  kernel/sched/debug.c        12
-rw-r--r--  kernel/sched/fair.c        203
-rw-r--r--  kernel/sched/idle_task.c     2
-rw-r--r--  kernel/sched/rt.c           56
-rw-r--r--  kernel/sched/sched.h         8
6 files changed, 358 insertions, 250 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..bd314d7cd9f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 692}
693#endif 693#endif
694 694
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
698{ 696{
699 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2486,22 +2484,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2484 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2485 * every tick. We fix it up based on jiffies.
2488 */ 2486 */
2489void update_cpu_load(struct rq *this_rq) 2487static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2488 unsigned long pending_updates)
2490{ 2489{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2490 int i, scale;
2495 2491
2496 this_rq->nr_load_updates++; 2492 this_rq->nr_load_updates++;
2497 2493
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2494 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2495 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2496 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2515,45 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2515 sched_avg_update(this_rq);
2527} 2516}
2528 2517
2518/*
2519 * Called from nohz_idle_balance() to update the load ratings before doing the
2520 * idle balance.
2521 */
2522void update_idle_cpu_load(struct rq *this_rq)
2523{
2524 unsigned long curr_jiffies = jiffies;
2525 unsigned long load = this_rq->load.weight;
2526 unsigned long pending_updates;
2527
2528 /*
2529 * Bloody broken means of dealing with nohz, but better than nothing..
2530 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2531 * update and see 0 difference the one time and 2 the next, even though
2532 * we ticked at roughly the same rate.
2533 *
2534 * Hence we only use this from nohz_idle_balance() and skip this
2535 * nonsense when called from the scheduler_tick() since that's
2536 * guaranteed a stable rate.
2537 */
2538 if (load || curr_jiffies == this_rq->last_load_update_tick)
2539 return;
2540
2541 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2542 this_rq->last_load_update_tick = curr_jiffies;
2543
2544 __update_cpu_load(this_rq, load, pending_updates);
2545}
2546
2547/*
2548 * Called from scheduler_tick()
2549 */
2529static void update_cpu_load_active(struct rq *this_rq) 2550static void update_cpu_load_active(struct rq *this_rq)
2530{ 2551{
2531 update_cpu_load(this_rq); 2552 /*
2553 * See the mess in update_idle_cpu_load().
2554 */
2555 this_rq->last_load_update_tick = jiffies;
2556 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2557
2533 calc_load_account_active(this_rq); 2558 calc_load_account_active(this_rq);
2534} 2559}
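
The refactor above separates the jiffies bookkeeping from the averaging itself: __update_cpu_load() only takes a pending_updates count, update_idle_cpu_load() derives it from last_load_update_tick on the nohz path, and the tick path always passes 1. Below is a minimal userspace sketch of that averaging step; decay_missed() and sketch_update_cpu_load() are illustrative stand-ins, not the kernel functions (the kernel's decay_load_missed() uses a precomputed factor table rather than a loop).

#define LOAD_IDX_MAX 5

/*
 * Naive stand-in for decay_load_missed(): apply the per-tick factor
 * (2^idx - 1) / 2^idx once for every missed (zero-load) tick.
 */
static unsigned long decay_missed(unsigned long load,
				  unsigned long missed, int idx)
{
	while (missed--)
		load -= load >> idx;
	return load;
}

/*
 * Decay the stale averages over the 'pending_updates - 1' missed ticks,
 * then fold in the current load with one normal averaging step.
 */
static void sketch_update_cpu_load(unsigned long *cpu_load,
				   unsigned long this_load,
				   unsigned long pending_updates)
{
	int i, scale;

	cpu_load[0] = this_load;	/* index 0 is the instantaneous load */
	for (i = 1, scale = 2; i < LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		old_load = decay_missed(cpu_load[i], pending_updates - 1, i);
		new_load = this_load;
		/* round up when the load is rising so new tasks show up quickly */
		if (new_load > old_load)
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}
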
@@ -5560,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5560 break; 5585 break;
5561 } 5586 }
5562 5587
5563 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5588 if (!(sd->flags & SD_OVERLAP) &&
5589 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5564 printk(KERN_CONT "\n"); 5590 printk(KERN_CONT "\n");
5565 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5591 printk(KERN_ERR "ERROR: repeated CPUs\n");
5566 break; 5592 break;
@@ -5898,92 +5924,6 @@ static int __init isolated_cpu_setup(char *str)
5898 5924
5899__setup("isolcpus=", isolated_cpu_setup); 5925__setup("isolcpus=", isolated_cpu_setup);
5900 5926
5901#ifdef CONFIG_NUMA
5902
5903/**
5904 * find_next_best_node - find the next node to include in a sched_domain
5905 * @node: node whose sched_domain we're building
5906 * @used_nodes: nodes already in the sched_domain
5907 *
5908 * Find the next node to include in a given scheduling domain. Simply
5909 * finds the closest node not already in the @used_nodes map.
5910 *
5911 * Should use nodemask_t.
5912 */
5913static int find_next_best_node(int node, nodemask_t *used_nodes)
5914{
5915 int i, n, val, min_val, best_node = -1;
5916
5917 min_val = INT_MAX;
5918
5919 for (i = 0; i < nr_node_ids; i++) {
5920 /* Start at @node */
5921 n = (node + i) % nr_node_ids;
5922
5923 if (!nr_cpus_node(n))
5924 continue;
5925
5926 /* Skip already used nodes */
5927 if (node_isset(n, *used_nodes))
5928 continue;
5929
5930 /* Simple min distance search */
5931 val = node_distance(node, n);
5932
5933 if (val < min_val) {
5934 min_val = val;
5935 best_node = n;
5936 }
5937 }
5938
5939 if (best_node != -1)
5940 node_set(best_node, *used_nodes);
5941 return best_node;
5942}
5943
5944/**
5945 * sched_domain_node_span - get a cpumask for a node's sched_domain
5946 * @node: node whose cpumask we're constructing
5947 * @span: resulting cpumask
5948 *
5949 * Given a node, construct a good cpumask for its sched_domain to span. It
5950 * should be one that prevents unnecessary balancing, but also spreads tasks
5951 * out optimally.
5952 */
5953static void sched_domain_node_span(int node, struct cpumask *span)
5954{
5955 nodemask_t used_nodes;
5956 int i;
5957
5958 cpumask_clear(span);
5959 nodes_clear(used_nodes);
5960
5961 cpumask_or(span, span, cpumask_of_node(node));
5962 node_set(node, used_nodes);
5963
5964 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5965 int next_node = find_next_best_node(node, &used_nodes);
5966 if (next_node < 0)
5967 break;
5968 cpumask_or(span, span, cpumask_of_node(next_node));
5969 }
5970}
5971
5972static const struct cpumask *cpu_node_mask(int cpu)
5973{
5974 lockdep_assert_held(&sched_domains_mutex);
5975
5976 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5977
5978 return sched_domains_tmpmask;
5979}
5980
5981static const struct cpumask *cpu_allnodes_mask(int cpu)
5982{
5983 return cpu_possible_mask;
5984}
5985#endif /* CONFIG_NUMA */
5986
5987static const struct cpumask *cpu_cpu_mask(int cpu) 5927static const struct cpumask *cpu_cpu_mask(int cpu)
5988{ 5928{
5989 return cpumask_of_node(cpu_to_node(cpu)); 5929 return cpumask_of_node(cpu_to_node(cpu));
@@ -6020,6 +5960,7 @@ struct sched_domain_topology_level {
6020 sched_domain_init_f init; 5960 sched_domain_init_f init;
6021 sched_domain_mask_f mask; 5961 sched_domain_mask_f mask;
6022 int flags; 5962 int flags;
5963 int numa_level;
6023 struct sd_data data; 5964 struct sd_data data;
6024}; 5965};
6025 5966
@@ -6211,10 +6152,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6211} 6152}
6212 6153
6213SD_INIT_FUNC(CPU) 6154SD_INIT_FUNC(CPU)
6214#ifdef CONFIG_NUMA
6215 SD_INIT_FUNC(ALLNODES)
6216 SD_INIT_FUNC(NODE)
6217#endif
6218#ifdef CONFIG_SCHED_SMT 6155#ifdef CONFIG_SCHED_SMT
6219 SD_INIT_FUNC(SIBLING) 6156 SD_INIT_FUNC(SIBLING)
6220#endif 6157#endif
@@ -6336,15 +6273,185 @@ static struct sched_domain_topology_level default_topology[] = {
6336 { sd_init_BOOK, cpu_book_mask, }, 6273 { sd_init_BOOK, cpu_book_mask, },
6337#endif 6274#endif
6338 { sd_init_CPU, cpu_cpu_mask, }, 6275 { sd_init_CPU, cpu_cpu_mask, },
6339#ifdef CONFIG_NUMA
6340 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6341 { sd_init_ALLNODES, cpu_allnodes_mask, },
6342#endif
6343 { NULL, }, 6276 { NULL, },
6344}; 6277};
6345 6278
6346static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6279static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6347 6280
6281#ifdef CONFIG_NUMA
6282
6283static int sched_domains_numa_levels;
6284static int sched_domains_numa_scale;
6285static int *sched_domains_numa_distance;
6286static struct cpumask ***sched_domains_numa_masks;
6287static int sched_domains_curr_level;
6288
6289static inline int sd_local_flags(int level)
6290{
6291 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
6292 return 0;
6293
6294 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6295}
6296
6297static struct sched_domain *
6298sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6299{
6300 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6301 int level = tl->numa_level;
6302 int sd_weight = cpumask_weight(
6303 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6304
6305 *sd = (struct sched_domain){
6306 .min_interval = sd_weight,
6307 .max_interval = 2*sd_weight,
6308 .busy_factor = 32,
6309 .imbalance_pct = 125,
6310 .cache_nice_tries = 2,
6311 .busy_idx = 3,
6312 .idle_idx = 2,
6313 .newidle_idx = 0,
6314 .wake_idx = 0,
6315 .forkexec_idx = 0,
6316
6317 .flags = 1*SD_LOAD_BALANCE
6318 | 1*SD_BALANCE_NEWIDLE
6319 | 0*SD_BALANCE_EXEC
6320 | 0*SD_BALANCE_FORK
6321 | 0*SD_BALANCE_WAKE
6322 | 0*SD_WAKE_AFFINE
6323 | 0*SD_PREFER_LOCAL
6324 | 0*SD_SHARE_CPUPOWER
6325 | 0*SD_POWERSAVINGS_BALANCE
6326 | 0*SD_SHARE_PKG_RESOURCES
6327 | 1*SD_SERIALIZE
6328 | 0*SD_PREFER_SIBLING
6329 | sd_local_flags(level)
6330 ,
6331 .last_balance = jiffies,
6332 .balance_interval = sd_weight,
6333 };
6334 SD_INIT_NAME(sd, NUMA);
6335 sd->private = &tl->data;
6336
6337 /*
6338 * Ugly hack to pass state to sd_numa_mask()...
6339 */
6340 sched_domains_curr_level = tl->numa_level;
6341
6342 return sd;
6343}
6344
6345static const struct cpumask *sd_numa_mask(int cpu)
6346{
6347 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6348}
6349
6350static void sched_init_numa(void)
6351{
6352 int next_distance, curr_distance = node_distance(0, 0);
6353 struct sched_domain_topology_level *tl;
6354 int level = 0;
6355 int i, j, k;
6356
6357 sched_domains_numa_scale = curr_distance;
6358 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6359 if (!sched_domains_numa_distance)
6360 return;
6361
6362 /*
6363 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6364 * unique distances in the node_distance() table.
6365 *
6366 * Assumes node_distance(0,j) includes all distances in
6367 * node_distance(i,j) in order to avoid cubic time.
6368 *
6369 * XXX: could be optimized to O(n log n) by using sort()
6370 */
6371 next_distance = curr_distance;
6372 for (i = 0; i < nr_node_ids; i++) {
6373 for (j = 0; j < nr_node_ids; j++) {
6374 int distance = node_distance(0, j);
6375 if (distance > curr_distance &&
6376 (distance < next_distance ||
6377 next_distance == curr_distance))
6378 next_distance = distance;
6379 }
6380 if (next_distance != curr_distance) {
6381 sched_domains_numa_distance[level++] = next_distance;
6382 sched_domains_numa_levels = level;
6383 curr_distance = next_distance;
6384 } else break;
6385 }
6386 /*
6387 * 'level' contains the number of unique distances, excluding the
6388 * identity distance node_distance(i,i).
6389 *
6390 * The sched_domains_numa_distance[] array includes the actual distance
6391 * numbers.
6392 */
6393
6394 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6395 if (!sched_domains_numa_masks)
6396 return;
6397
6398 /*
6399 * Now for each level, construct a mask per node which contains all
6400 * cpus of nodes that are that many hops away from us.
6401 */
6402 for (i = 0; i < level; i++) {
6403 sched_domains_numa_masks[i] =
6404 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6405 if (!sched_domains_numa_masks[i])
6406 return;
6407
6408 for (j = 0; j < nr_node_ids; j++) {
6409 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
6410 if (!mask)
6411 return;
6412
6413 sched_domains_numa_masks[i][j] = mask;
6414
6415 for (k = 0; k < nr_node_ids; k++) {
6416 if (node_distance(j, k) > sched_domains_numa_distance[i])
6417 continue;
6418
6419 cpumask_or(mask, mask, cpumask_of_node(k));
6420 }
6421 }
6422 }
6423
6424 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6425 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6426 if (!tl)
6427 return;
6428
6429 /*
6430 * Copy the default topology bits..
6431 */
6432 for (i = 0; default_topology[i].init; i++)
6433 tl[i] = default_topology[i];
6434
6435 /*
6436 * .. and append 'j' levels of NUMA goodness.
6437 */
6438 for (j = 0; j < level; i++, j++) {
6439 tl[i] = (struct sched_domain_topology_level){
6440 .init = sd_numa_init,
6441 .mask = sd_numa_mask,
6442 .flags = SDTL_OVERLAP,
6443 .numa_level = j,
6444 };
6445 }
6446
6447 sched_domain_topology = tl;
6448}
6449#else
6450static inline void sched_init_numa(void)
6451{
6452}
6453#endif /* CONFIG_NUMA */
6454
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6455static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6456{
6350 struct sched_domain_topology_level *tl; 6457 struct sched_domain_topology_level *tl;
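
The core of sched_init_numa() above is extracting the distinct node distances, one per NUMA topology level, from row 0 of the node_distance() table (under the stated assumption that row 0 contains every distance that occurs). A standalone sketch of that selection, using a made-up 4-node distance table:

#include <stdio.h>

#define NR_NODES 4

/* invented example: two pairs of close nodes, far across the pairs */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};

static int node_distance(int i, int j)
{
	return dist[i][j];
}

int main(void)
{
	int distances[NR_NODES];
	int curr = node_distance(0, 0);		/* identity distance: 10 */
	int levels = 0;
	int i, j;

	for (i = 0; i < NR_NODES; i++) {	/* at most NR_NODES levels */
		int next = curr;

		/* smallest distance in row 0 strictly larger than curr */
		for (j = 0; j < NR_NODES; j++) {
			int d = node_distance(0, j);
			if (d > curr && (d < next || next == curr))
				next = d;
		}
		if (next == curr)
			break;
		distances[levels++] = next;
		curr = next;
	}

	for (i = 0; i < levels; i++)
		printf("level %d: distance %d\n", i, distances[i]);
	/* prints "level 0: distance 20" and "level 1: distance 30" */
	return 0;
}
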
@@ -6840,6 +6947,8 @@ void __init sched_init_smp(void)
6840 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6947 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6841 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6948 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6842 6949
6950 sched_init_numa();
6951
6843 get_online_cpus(); 6952 get_online_cpus();
6844 mutex_lock(&sched_domains_mutex); 6953 mutex_lock(&sched_domains_mutex);
6845 init_sched_domains(cpu_active_mask); 6954 init_sched_domains(cpu_active_mask);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
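
The effect of the new P() macro can be reproduced in userspace; printf stands in for SEQ_printf and the struct is a made-up stand-in, so this only illustrates the sizeof() dispatch, not the kernel's output path:

#include <stdio.h>

/* stand-in struct: one 32-bit and one native-width counter */
struct rq_sketch {
	unsigned int  nr_running;
	unsigned long nr_load_updates;
};

/* print 4-byte fields as long, wider ones as long long */
#define P(rq, x)							\
do {									\
	if (sizeof((rq)->x) == 4)					\
		printf("  .%-30s: %ld\n", #x, (long)((rq)->x));		\
	else								\
		printf("  .%-30s: %lld\n", #x, (long long)((rq)->x));	\
} while (0)

int main(void)
{
	struct rq_sketch rq = { .nr_running = 3, .nr_load_updates = 123456 };

	P(&rq, nr_running);		/* 32-bit path */
	P(&rq, nr_load_updates);	/* 64-bit path on LP64 */
	return 0;
}
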
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..0b42f4487329 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3082,7 +3082,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3082 struct rq *dst_rq;
3083 3083
3084 enum cpu_idle_type idle; 3084 enum cpu_idle_type idle;
3085 long load_move; 3085 long imbalance;
3086 unsigned int flags; 3086 unsigned int flags;
3087 3087
3088 unsigned int loop; 3088 unsigned int loop;
@@ -3218,7 +3218,7 @@ static unsigned long task_h_load(struct task_struct *p);
3218static const unsigned int sched_nr_migrate_break = 32; 3218static const unsigned int sched_nr_migrate_break = 32;
3219 3219
3220/* 3220/*
3221 * move_tasks tries to move up to load_move weighted load from busiest to 3221 * move_tasks tries to move up to imbalance weighted load from busiest to
3222 * this_rq, as part of a balancing operation within domain "sd". 3222 * this_rq, as part of a balancing operation within domain "sd".
3223 * Returns 1 if successful and 0 otherwise. 3223 * Returns 1 if successful and 0 otherwise.
3224 * 3224 *
@@ -3231,7 +3231,7 @@ static int move_tasks(struct lb_env *env)
3231 unsigned long load; 3231 unsigned long load;
3232 int pulled = 0; 3232 int pulled = 0;
3233 3233
3234 if (env->load_move <= 0) 3234 if (env->imbalance <= 0)
3235 return 0; 3235 return 0;
3236 3236
3237 while (!list_empty(tasks)) { 3237 while (!list_empty(tasks)) {
@@ -3257,7 +3257,7 @@ static int move_tasks(struct lb_env *env)
3257 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) 3257 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3258 goto next; 3258 goto next;
3259 3259
3260 if ((load / 2) > env->load_move) 3260 if ((load / 2) > env->imbalance)
3261 goto next; 3261 goto next;
3262 3262
3263 if (!can_migrate_task(p, env)) 3263 if (!can_migrate_task(p, env))
@@ -3265,7 +3265,7 @@ static int move_tasks(struct lb_env *env)
3265 3265
3266 move_task(p, env); 3266 move_task(p, env);
3267 pulled++; 3267 pulled++;
3268 env->load_move -= load; 3268 env->imbalance -= load;
3269 3269
3270#ifdef CONFIG_PREEMPT 3270#ifdef CONFIG_PREEMPT
3271 /* 3271 /*
@@ -3281,7 +3281,7 @@ static int move_tasks(struct lb_env *env)
3281 * We only want to steal up to the prescribed amount of 3281 * We only want to steal up to the prescribed amount of
3282 * weighted load. 3282 * weighted load.
3283 */ 3283 */
3284 if (env->load_move <= 0) 3284 if (env->imbalance <= 0)
3285 break; 3285 break;
3286 3286
3287 continue; 3287 continue;
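
Most of the fair.c churn that follows is mechanical: the (sd, this_cpu, idle, *imbalance) argument bundle becomes a single struct lb_env pointer, and the imbalance turns from an out-parameter into a field the helpers update in place. A schematic of the pattern, with an abbreviated struct (the real struct lb_env carries more state) and an illustrative helper name:

struct lb_env_sketch {
	int	dst_cpu;	/* cpu doing the balancing */
	int	src_cpu;	/* cpu being pulled from */
	int	idle;		/* CPU_IDLE / CPU_NEWLY_IDLE / CPU_NOT_IDLE */
	long	imbalance;	/* weighted load still to move */
};

/*
 * Before: helper(sd, this_cpu, idle, &imbalance) with a write-back pointer.
 * After:  helper(env) mutates env->imbalance directly.
 */
static void set_imbalance_sketch(struct lb_env_sketch *env,
				 long busiest_load_per_task)
{
	env->imbalance = busiest_load_per_task;
}
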
@@ -3578,10 +3578,9 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3578 3578
3579/** 3579/**
3580 * check_power_save_busiest_group - see if there is potential for some power-savings balance 3580 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3581 * @env: load balance environment
3581 * @sds: Variable containing the statistics of the sched_domain 3582 * @sds: Variable containing the statistics of the sched_domain
3582 * under consideration. 3583 * under consideration.
3583 * @this_cpu: Cpu at which we're currently performing load-balancing.
3584 * @imbalance: Variable to store the imbalance.
3585 * 3584 *
3586 * Description: 3585 * Description:
3587 * Check if we have potential to perform some power-savings balance. 3586 * Check if we have potential to perform some power-savings balance.
@@ -3591,8 +3590,8 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3591 * Returns 1 if there is potential to perform power-savings balance. 3590 * Returns 1 if there is potential to perform power-savings balance.
3592 * Else returns 0. 3591 * Else returns 0.
3593 */ 3592 */
3594static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3593static inline
3595 int this_cpu, unsigned long *imbalance) 3594int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
3596{ 3595{
3597 if (!sds->power_savings_balance) 3596 if (!sds->power_savings_balance)
3598 return 0; 3597 return 0;
@@ -3601,7 +3600,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3601 sds->group_leader == sds->group_min) 3600 sds->group_leader == sds->group_min)
3602 return 0; 3601 return 0;
3603 3602
3604 *imbalance = sds->min_load_per_task; 3603 env->imbalance = sds->min_load_per_task;
3605 sds->busiest = sds->group_min; 3604 sds->busiest = sds->group_min;
3606 3605
3607 return 1; 3606 return 1;
@@ -3620,8 +3619,8 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3620 return; 3619 return;
3621} 3620}
3622 3621
3623static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3622static inline
3624 int this_cpu, unsigned long *imbalance) 3623int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
3625{ 3624{
3626 return 0; 3625 return 0;
3627} 3626}
@@ -3765,24 +3764,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3765 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3764 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3766 * @sd: The sched_domain whose statistics are to be updated. 3765 * @sd: The sched_domain whose statistics are to be updated.
3767 * @group: sched_group whose statistics are to be updated. 3766 * @group: sched_group whose statistics are to be updated.
3768 * @this_cpu: Cpu for which load balance is currently performed.
3769 * @idle: Idle status of this_cpu
3770 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3767 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3771 * @local_group: Does group contain this_cpu. 3768 * @local_group: Does group contain this_cpu.
3772 * @cpus: Set of cpus considered for load balancing. 3769 * @cpus: Set of cpus considered for load balancing.
3773 * @balance: Should we balance. 3770 * @balance: Should we balance.
3774 * @sgs: variable to hold the statistics for this group. 3771 * @sgs: variable to hold the statistics for this group.
3775 */ 3772 */
3776static inline void update_sg_lb_stats(struct sched_domain *sd, 3773static inline void update_sg_lb_stats(struct lb_env *env,
3777 struct sched_group *group, int this_cpu, 3774 struct sched_group *group, int load_idx,
3778 enum cpu_idle_type idle, int load_idx,
3779 int local_group, const struct cpumask *cpus, 3775 int local_group, const struct cpumask *cpus,
3780 int *balance, struct sg_lb_stats *sgs) 3776 int *balance, struct sg_lb_stats *sgs)
3781{ 3777{
3782 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3778 unsigned long nr_running, max_nr_running, min_nr_running;
3783 int i; 3779 unsigned long load, max_cpu_load, min_cpu_load;
3784 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3780 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3785 unsigned long avg_load_per_task = 0; 3781 unsigned long avg_load_per_task = 0;
3782 int i;
3786 3783
3787 if (local_group) 3784 if (local_group)
3788 balance_cpu = group_first_cpu(group); 3785 balance_cpu = group_first_cpu(group);
@@ -3791,10 +3788,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3791 max_cpu_load = 0; 3788 max_cpu_load = 0;
3792 min_cpu_load = ~0UL; 3789 min_cpu_load = ~0UL;
3793 max_nr_running = 0; 3790 max_nr_running = 0;
3791 min_nr_running = ~0UL;
3794 3792
3795 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3793 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3796 struct rq *rq = cpu_rq(i); 3794 struct rq *rq = cpu_rq(i);
3797 3795
3796 nr_running = rq->nr_running;
3797
3798 /* Bias balancing toward cpus of our domain */ 3798 /* Bias balancing toward cpus of our domain */
3799 if (local_group) { 3799 if (local_group) {
3800 if (idle_cpu(i) && !first_idle_cpu) { 3800 if (idle_cpu(i) && !first_idle_cpu) {
@@ -3805,16 +3805,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3805 load = target_load(i, load_idx); 3805 load = target_load(i, load_idx);
3806 } else { 3806 } else {
3807 load = source_load(i, load_idx); 3807 load = source_load(i, load_idx);
3808 if (load > max_cpu_load) { 3808 if (load > max_cpu_load)
3809 max_cpu_load = load; 3809 max_cpu_load = load;
3810 max_nr_running = rq->nr_running;
3811 }
3812 if (min_cpu_load > load) 3810 if (min_cpu_load > load)
3813 min_cpu_load = load; 3811 min_cpu_load = load;
3812
3813 if (nr_running > max_nr_running)
3814 max_nr_running = nr_running;
3815 if (min_nr_running > nr_running)
3816 min_nr_running = nr_running;
3814 } 3817 }
3815 3818
3816 sgs->group_load += load; 3819 sgs->group_load += load;
3817 sgs->sum_nr_running += rq->nr_running; 3820 sgs->sum_nr_running += nr_running;
3818 sgs->sum_weighted_load += weighted_cpuload(i); 3821 sgs->sum_weighted_load += weighted_cpuload(i);
3819 if (idle_cpu(i)) 3822 if (idle_cpu(i))
3820 sgs->idle_cpus++; 3823 sgs->idle_cpus++;
@@ -3827,14 +3830,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3827 * to do the newly idle load balance. 3830 * to do the newly idle load balance.
3828 */ 3831 */
3829 if (local_group) { 3832 if (local_group) {
3830 if (idle != CPU_NEWLY_IDLE) { 3833 if (env->idle != CPU_NEWLY_IDLE) {
3831 if (balance_cpu != this_cpu) { 3834 if (balance_cpu != env->dst_cpu) {
3832 *balance = 0; 3835 *balance = 0;
3833 return; 3836 return;
3834 } 3837 }
3835 update_group_power(sd, this_cpu); 3838 update_group_power(env->sd, env->dst_cpu);
3836 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3839 } else if (time_after_eq(jiffies, group->sgp->next_update))
3837 update_group_power(sd, this_cpu); 3840 update_group_power(env->sd, env->dst_cpu);
3838 } 3841 }
3839 3842
3840 /* Adjust by relative CPU power of the group */ 3843 /* Adjust by relative CPU power of the group */
@@ -3852,13 +3855,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3852 if (sgs->sum_nr_running) 3855 if (sgs->sum_nr_running)
3853 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3856 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3854 3857
3855 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3858 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3859 (max_nr_running - min_nr_running) > 1)
3856 sgs->group_imb = 1; 3860 sgs->group_imb = 1;
3857 3861
3858 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3862 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3859 SCHED_POWER_SCALE); 3863 SCHED_POWER_SCALE);
3860 if (!sgs->group_capacity) 3864 if (!sgs->group_capacity)
3861 sgs->group_capacity = fix_small_capacity(sd, group); 3865 sgs->group_capacity = fix_small_capacity(env->sd, group);
3862 sgs->group_weight = group->group_weight; 3866 sgs->group_weight = group->group_weight;
3863 3867
3864 if (sgs->group_capacity > sgs->sum_nr_running) 3868 if (sgs->group_capacity > sgs->sum_nr_running)
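
The hunk above also tightens the group-imbalance heuristic: a large load spread inside a group is only flagged when the busiest cpu runs at least two more tasks than the least busy one. A toy evaluation of the new predicate (numbers are invented):

#include <stdio.h>

int main(void)
{
	/* two cpus, two tasks each, but very different task weights */
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long avg_load_per_task = 1024;
	unsigned long max_nr_running = 2, min_nr_running = 2;
	int group_imb;

	group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
		    (max_nr_running - min_nr_running) > 1;

	/* the old test (max_nr_running > 1) would have fired; the new one does not */
	printf("group_imb = %d\n", group_imb);	/* prints 0 */
	return 0;
}
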
@@ -3876,11 +3880,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3876 * Determine if @sg is a busier group than the previously selected 3880 * Determine if @sg is a busier group than the previously selected
3877 * busiest group. 3881 * busiest group.
3878 */ 3882 */
3879static bool update_sd_pick_busiest(struct sched_domain *sd, 3883static bool update_sd_pick_busiest(struct lb_env *env,
3880 struct sd_lb_stats *sds, 3884 struct sd_lb_stats *sds,
3881 struct sched_group *sg, 3885 struct sched_group *sg,
3882 struct sg_lb_stats *sgs, 3886 struct sg_lb_stats *sgs)
3883 int this_cpu)
3884{ 3887{
3885 if (sgs->avg_load <= sds->max_load) 3888 if (sgs->avg_load <= sds->max_load)
3886 return false; 3889 return false;
@@ -3896,8 +3899,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3896 * numbered CPUs in the group, therefore mark all groups 3899 * numbered CPUs in the group, therefore mark all groups
3897 * higher than ourself as busy. 3900 * higher than ourself as busy.
3898 */ 3901 */
3899 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3902 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3900 this_cpu < group_first_cpu(sg)) { 3903 env->dst_cpu < group_first_cpu(sg)) {
3901 if (!sds->busiest) 3904 if (!sds->busiest)
3902 return true; 3905 return true;
3903 3906
@@ -3917,28 +3920,28 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3917 * @balance: Should we balance. 3920 * @balance: Should we balance.
3918 * @sds: variable to hold the statistics for this sched_domain. 3921 * @sds: variable to hold the statistics for this sched_domain.
3919 */ 3922 */
3920static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3923static inline void update_sd_lb_stats(struct lb_env *env,
3921 enum cpu_idle_type idle, const struct cpumask *cpus, 3924 const struct cpumask *cpus,
3922 int *balance, struct sd_lb_stats *sds) 3925 int *balance, struct sd_lb_stats *sds)
3923{ 3926{
3924 struct sched_domain *child = sd->child; 3927 struct sched_domain *child = env->sd->child;
3925 struct sched_group *sg = sd->groups; 3928 struct sched_group *sg = env->sd->groups;
3926 struct sg_lb_stats sgs; 3929 struct sg_lb_stats sgs;
3927 int load_idx, prefer_sibling = 0; 3930 int load_idx, prefer_sibling = 0;
3928 3931
3929 if (child && child->flags & SD_PREFER_SIBLING) 3932 if (child && child->flags & SD_PREFER_SIBLING)
3930 prefer_sibling = 1; 3933 prefer_sibling = 1;
3931 3934
3932 init_sd_power_savings_stats(sd, sds, idle); 3935 init_sd_power_savings_stats(env->sd, sds, env->idle);
3933 load_idx = get_sd_load_idx(sd, idle); 3936 load_idx = get_sd_load_idx(env->sd, env->idle);
3934 3937
3935 do { 3938 do {
3936 int local_group; 3939 int local_group;
3937 3940
3938 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3941 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3939 memset(&sgs, 0, sizeof(sgs)); 3942 memset(&sgs, 0, sizeof(sgs));
3940 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3943 update_sg_lb_stats(env, sg, load_idx, local_group,
3941 local_group, cpus, balance, &sgs); 3944 cpus, balance, &sgs);
3942 3945
3943 if (local_group && !(*balance)) 3946 if (local_group && !(*balance))
3944 return; 3947 return;
@@ -3966,7 +3969,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3966 sds->this_load_per_task = sgs.sum_weighted_load; 3969 sds->this_load_per_task = sgs.sum_weighted_load;
3967 sds->this_has_capacity = sgs.group_has_capacity; 3970 sds->this_has_capacity = sgs.group_has_capacity;
3968 sds->this_idle_cpus = sgs.idle_cpus; 3971 sds->this_idle_cpus = sgs.idle_cpus;
3969 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3972 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3970 sds->max_load = sgs.avg_load; 3973 sds->max_load = sgs.avg_load;
3971 sds->busiest = sg; 3974 sds->busiest = sg;
3972 sds->busiest_nr_running = sgs.sum_nr_running; 3975 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3980,7 +3983,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3980 3983
3981 update_sd_power_savings_stats(sg, sds, local_group, &sgs); 3984 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3982 sg = sg->next; 3985 sg = sg->next;
3983 } while (sg != sd->groups); 3986 } while (sg != env->sd->groups);
3984} 3987}
3985 3988
3986/** 3989/**
@@ -4008,24 +4011,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4008 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 4011 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4009 * @imbalance: returns amount of imbalanced due to packing. 4012 * @imbalance: returns amount of imbalanced due to packing.
4010 */ 4013 */
4011static int check_asym_packing(struct sched_domain *sd, 4014static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4012 struct sd_lb_stats *sds,
4013 int this_cpu, unsigned long *imbalance)
4014{ 4015{
4015 int busiest_cpu; 4016 int busiest_cpu;
4016 4017
4017 if (!(sd->flags & SD_ASYM_PACKING)) 4018 if (!(env->sd->flags & SD_ASYM_PACKING))
4018 return 0; 4019 return 0;
4019 4020
4020 if (!sds->busiest) 4021 if (!sds->busiest)
4021 return 0; 4022 return 0;
4022 4023
4023 busiest_cpu = group_first_cpu(sds->busiest); 4024 busiest_cpu = group_first_cpu(sds->busiest);
4024 if (this_cpu > busiest_cpu) 4025 if (env->dst_cpu > busiest_cpu)
4025 return 0; 4026 return 0;
4026 4027
4027 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 4028 env->imbalance = DIV_ROUND_CLOSEST(
4028 SCHED_POWER_SCALE); 4029 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4030
4029 return 1; 4031 return 1;
4030} 4032}
4031 4033
@@ -4037,8 +4039,8 @@ static int check_asym_packing(struct sched_domain *sd,
4037 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 4039 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4038 * @imbalance: Variable to store the imbalance. 4040 * @imbalance: Variable to store the imbalance.
4039 */ 4041 */
4040static inline void fix_small_imbalance(struct sd_lb_stats *sds, 4042static inline
4041 int this_cpu, unsigned long *imbalance) 4043void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4042{ 4044{
4043 unsigned long tmp, pwr_now = 0, pwr_move = 0; 4045 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4044 unsigned int imbn = 2; 4046 unsigned int imbn = 2;
@@ -4049,9 +4051,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4049 if (sds->busiest_load_per_task > 4051 if (sds->busiest_load_per_task >
4050 sds->this_load_per_task) 4052 sds->this_load_per_task)
4051 imbn = 1; 4053 imbn = 1;
4052 } else 4054 } else {
4053 sds->this_load_per_task = 4055 sds->this_load_per_task =
4054 cpu_avg_load_per_task(this_cpu); 4056 cpu_avg_load_per_task(env->dst_cpu);
4057 }
4055 4058
4056 scaled_busy_load_per_task = sds->busiest_load_per_task 4059 scaled_busy_load_per_task = sds->busiest_load_per_task
4057 * SCHED_POWER_SCALE; 4060 * SCHED_POWER_SCALE;
@@ -4059,7 +4062,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4059 4062
4060 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 4063 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4061 (scaled_busy_load_per_task * imbn)) { 4064 (scaled_busy_load_per_task * imbn)) {
4062 *imbalance = sds->busiest_load_per_task; 4065 env->imbalance = sds->busiest_load_per_task;
4063 return; 4066 return;
4064 } 4067 }
4065 4068
@@ -4096,18 +4099,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4096 4099
4097 /* Move if we gain throughput */ 4100 /* Move if we gain throughput */
4098 if (pwr_move > pwr_now) 4101 if (pwr_move > pwr_now)
4099 *imbalance = sds->busiest_load_per_task; 4102 env->imbalance = sds->busiest_load_per_task;
4100} 4103}
4101 4104
4102/** 4105/**
4103 * calculate_imbalance - Calculate the amount of imbalance present within the 4106 * calculate_imbalance - Calculate the amount of imbalance present within the
4104 * groups of a given sched_domain during load balance. 4107 * groups of a given sched_domain during load balance.
4108 * @env: load balance environment
4105 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 4109 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4106 * @this_cpu: Cpu for which currently load balance is being performed.
4107 * @imbalance: The variable to store the imbalance.
4108 */ 4110 */
4109static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 4111static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4110 unsigned long *imbalance)
4111{ 4112{
4112 unsigned long max_pull, load_above_capacity = ~0UL; 4113 unsigned long max_pull, load_above_capacity = ~0UL;
4113 4114
@@ -4123,8 +4124,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4123 * its cpu_power, while calculating max_load..) 4124 * its cpu_power, while calculating max_load..)
4124 */ 4125 */
4125 if (sds->max_load < sds->avg_load) { 4126 if (sds->max_load < sds->avg_load) {
4126 *imbalance = 0; 4127 env->imbalance = 0;
4127 return fix_small_imbalance(sds, this_cpu, imbalance); 4128 return fix_small_imbalance(env, sds);
4128 } 4129 }
4129 4130
4130 if (!sds->group_imb) { 4131 if (!sds->group_imb) {
@@ -4152,7 +4153,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4152 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4153 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4153 4154
4154 /* How much load to actually move to equalise the imbalance */ 4155 /* How much load to actually move to equalise the imbalance */
4155 *imbalance = min(max_pull * sds->busiest->sgp->power, 4156 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4156 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4157 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4157 / SCHED_POWER_SCALE; 4158 / SCHED_POWER_SCALE;
4158 4159
@@ -4162,8 +4163,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4162 * a think about bumping its value to force at least one task to be 4163 * a think about bumping its value to force at least one task to be
4163 * moved 4164 * moved
4164 */ 4165 */
4165 if (*imbalance < sds->busiest_load_per_task) 4166 if (env->imbalance < sds->busiest_load_per_task)
4166 return fix_small_imbalance(sds, this_cpu, imbalance); 4167 return fix_small_imbalance(env, sds);
4167 4168
4168} 4169}
4169 4170
@@ -4194,9 +4195,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4194 * put to idle by rebalancing its tasks onto our group. 4195 * put to idle by rebalancing its tasks onto our group.
4195 */ 4196 */
4196static struct sched_group * 4197static struct sched_group *
4197find_busiest_group(struct sched_domain *sd, int this_cpu, 4198find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4198 unsigned long *imbalance, enum cpu_idle_type idle,
4199 const struct cpumask *cpus, int *balance)
4200{ 4199{
4201 struct sd_lb_stats sds; 4200 struct sd_lb_stats sds;
4202 4201
@@ -4206,7 +4205,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4206 * Compute the various statistics relevant for load balancing at 4205 * Compute the various statistics relevant for load balancing at
4207 * this level. 4206 * this level.
4208 */ 4207 */
4209 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4208 update_sd_lb_stats(env, cpus, balance, &sds);
4210 4209
4211 /* 4210 /*
4212 * this_cpu is not the appropriate cpu to perform load balancing at 4211 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4215,8 +4214,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4215 if (!(*balance)) 4214 if (!(*balance))
4216 goto ret; 4215 goto ret;
4217 4216
4218 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4217 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4219 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4218 check_asym_packing(env, &sds))
4220 return sds.busiest; 4219 return sds.busiest;
4221 4220
4222 /* There is no busy sibling group to pull tasks from */ 4221 /* There is no busy sibling group to pull tasks from */
@@ -4234,7 +4233,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4234 goto force_balance; 4233 goto force_balance;
4235 4234
4236 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4235 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4237 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4236 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4238 !sds.busiest_has_capacity) 4237 !sds.busiest_has_capacity)
4239 goto force_balance; 4238 goto force_balance;
4240 4239
@@ -4252,7 +4251,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4252 if (sds.this_load >= sds.avg_load) 4251 if (sds.this_load >= sds.avg_load)
4253 goto out_balanced; 4252 goto out_balanced;
4254 4253
4255 if (idle == CPU_IDLE) { 4254 if (env->idle == CPU_IDLE) {
4256 /* 4255 /*
4257 * This cpu is idle. If the busiest group load doesn't 4256 * This cpu is idle. If the busiest group load doesn't
4258 * have more tasks than the number of available cpu's and 4257 * have more tasks than the number of available cpu's and
@@ -4267,13 +4266,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4267 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4266 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4268 * imbalance_pct to be conservative. 4267 * imbalance_pct to be conservative.
4269 */ 4268 */
4270 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4269 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4271 goto out_balanced; 4270 goto out_balanced;
4272 } 4271 }
4273 4272
4274force_balance: 4273force_balance:
4275 /* Looks like there is an imbalance. Compute it */ 4274 /* Looks like there is an imbalance. Compute it */
4276 calculate_imbalance(&sds, this_cpu, imbalance); 4275 calculate_imbalance(env, &sds);
4277 return sds.busiest; 4276 return sds.busiest;
4278 4277
4279out_balanced: 4278out_balanced:
@@ -4281,20 +4280,19 @@ out_balanced:
4281 * There is no obvious imbalance. But check if we can do some balancing 4280 * There is no obvious imbalance. But check if we can do some balancing
4282 * to save power. 4281 * to save power.
4283 */ 4282 */
4284 if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) 4283 if (check_power_save_busiest_group(env, &sds))
4285 return sds.busiest; 4284 return sds.busiest;
4286ret: 4285ret:
4287 *imbalance = 0; 4286 env->imbalance = 0;
4288 return NULL; 4287 return NULL;
4289} 4288}
4290 4289
4291/* 4290/*
4292 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4291 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4293 */ 4292 */
4294static struct rq * 4293static struct rq *find_busiest_queue(struct lb_env *env,
4295find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4294 struct sched_group *group,
4296 enum cpu_idle_type idle, unsigned long imbalance, 4295 const struct cpumask *cpus)
4297 const struct cpumask *cpus)
4298{ 4296{
4299 struct rq *busiest = NULL, *rq; 4297 struct rq *busiest = NULL, *rq;
4300 unsigned long max_load = 0; 4298 unsigned long max_load = 0;
@@ -4307,7 +4305,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4307 unsigned long wl; 4305 unsigned long wl;
4308 4306
4309 if (!capacity) 4307 if (!capacity)
4310 capacity = fix_small_capacity(sd, group); 4308 capacity = fix_small_capacity(env->sd, group);
4311 4309
4312 if (!cpumask_test_cpu(i, cpus)) 4310 if (!cpumask_test_cpu(i, cpus))
4313 continue; 4311 continue;
@@ -4319,7 +4317,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4319 * When comparing with imbalance, use weighted_cpuload() 4317 * When comparing with imbalance, use weighted_cpuload()
4320 * which is not scaled with the cpu power. 4318 * which is not scaled with the cpu power.
4321 */ 4319 */
4322 if (capacity && rq->nr_running == 1 && wl > imbalance) 4320 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4323 continue; 4321 continue;
4324 4322
4325 /* 4323 /*
@@ -4348,17 +4346,18 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4348/* Working cpumask for load_balance and load_balance_newidle. */ 4346/* Working cpumask for load_balance and load_balance_newidle. */
4349DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4347DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4350 4348
4351static int need_active_balance(struct sched_domain *sd, int idle, 4349static int need_active_balance(struct lb_env *env)
4352 int busiest_cpu, int this_cpu)
4353{ 4350{
4354 if (idle == CPU_NEWLY_IDLE) { 4351 struct sched_domain *sd = env->sd;
4352
4353 if (env->idle == CPU_NEWLY_IDLE) {
4355 4354
4356 /* 4355 /*
4357 * ASYM_PACKING needs to force migrate tasks from busy but 4356 * ASYM_PACKING needs to force migrate tasks from busy but
4358 * higher numbered CPUs in order to pack all tasks in the 4357 * higher numbered CPUs in order to pack all tasks in the
4359 * lowest numbered CPUs. 4358 * lowest numbered CPUs.
4360 */ 4359 */
4361 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4360 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4362 return 1; 4361 return 1;
4363 4362
4364 /* 4363 /*
@@ -4399,7 +4398,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4399{ 4398{
4400 int ld_moved, active_balance = 0; 4399 int ld_moved, active_balance = 0;
4401 struct sched_group *group; 4400 struct sched_group *group;
4402 unsigned long imbalance;
4403 struct rq *busiest; 4401 struct rq *busiest;
4404 unsigned long flags; 4402 unsigned long flags;
4405 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4417,8 +4415,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4417 schedstat_inc(sd, lb_count[idle]); 4415 schedstat_inc(sd, lb_count[idle]);
4418 4416
4419redo: 4417redo:
4420 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4418 group = find_busiest_group(&env, cpus, balance);
4421 cpus, balance);
4422 4419
4423 if (*balance == 0) 4420 if (*balance == 0)
4424 goto out_balanced; 4421 goto out_balanced;
@@ -4428,7 +4425,7 @@ redo:
4428 goto out_balanced; 4425 goto out_balanced;
4429 } 4426 }
4430 4427
4431 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4428 busiest = find_busiest_queue(&env, group, cpus);
4432 if (!busiest) { 4429 if (!busiest) {
4433 schedstat_inc(sd, lb_nobusyq[idle]); 4430 schedstat_inc(sd, lb_nobusyq[idle]);
4434 goto out_balanced; 4431 goto out_balanced;
@@ -4436,7 +4433,7 @@ redo:
4436 4433
4437 BUG_ON(busiest == this_rq); 4434 BUG_ON(busiest == this_rq);
4438 4435
4439 schedstat_add(sd, lb_imbalance[idle], imbalance); 4436 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4440 4437
4441 ld_moved = 0; 4438 ld_moved = 0;
4442 if (busiest->nr_running > 1) { 4439 if (busiest->nr_running > 1) {
@@ -4447,10 +4444,9 @@ redo:
4447 * correctly treated as an imbalance. 4444 * correctly treated as an imbalance.
4448 */ 4445 */
4449 env.flags |= LBF_ALL_PINNED; 4446 env.flags |= LBF_ALL_PINNED;
4450 env.load_move = imbalance; 4447 env.src_cpu = busiest->cpu;
4451 env.src_cpu = busiest->cpu; 4448 env.src_rq = busiest;
4452 env.src_rq = busiest; 4449 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4453 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
4454 4450
4455more_balance: 4451more_balance:
4456 local_irq_save(flags); 4452 local_irq_save(flags);
@@ -4492,7 +4488,7 @@ more_balance:
4492 if (idle != CPU_NEWLY_IDLE) 4488 if (idle != CPU_NEWLY_IDLE)
4493 sd->nr_balance_failed++; 4489 sd->nr_balance_failed++;
4494 4490
4495 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4491 if (need_active_balance(&env)) {
4496 raw_spin_lock_irqsave(&busiest->lock, flags); 4492 raw_spin_lock_irqsave(&busiest->lock, flags);
4497 4493
4498 /* don't kick the active_load_balance_cpu_stop, 4494 /* don't kick the active_load_balance_cpu_stop,
@@ -4519,10 +4515,11 @@ more_balance:
4519 } 4515 }
4520 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4516 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4521 4517
4522 if (active_balance) 4518 if (active_balance) {
4523 stop_one_cpu_nowait(cpu_of(busiest), 4519 stop_one_cpu_nowait(cpu_of(busiest),
4524 active_load_balance_cpu_stop, busiest, 4520 active_load_balance_cpu_stop, busiest,
4525 &busiest->active_balance_work); 4521 &busiest->active_balance_work);
4522 }
4526 4523
4527 /* 4524 /*
4528 * We've kicked active balancing, reset the failure 4525 * We've kicked active balancing, reset the failure
@@ -5023,7 +5020,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5023 5020
5024 raw_spin_lock_irq(&this_rq->lock); 5021 raw_spin_lock_irq(&this_rq->lock);
5025 update_rq_clock(this_rq); 5022 update_rq_clock(this_rq);
5026 update_cpu_load(this_rq); 5023 update_idle_cpu_load(this_rq);
5027 raw_spin_unlock_irq(&this_rq->lock); 5024 raw_spin_unlock_irq(&this_rq->lock);
5028 5025
5029 rebalance_domains(balance_cpu, CPU_IDLE); 5026 rebalance_domains(balance_cpu, CPU_IDLE);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..c5565c3c515f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1803static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1804 const struct cpumask *new_mask)
1805{ 1805{
1806 int weight = cpumask_weight(new_mask); 1806 struct rq *rq;
1807 int weight;
1807 1808
1808 BUG_ON(!rt_task(p)); 1809 BUG_ON(!rt_task(p));
1809 1810
1810 /* 1811 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1812 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816
1817 if (!task_current(rq, p)) {
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1813
1827 /* 1814 weight = cpumask_weight(new_mask);
1828 * Requeue if our weight is changing and still > 1
1829 */
1830 if (weight > 1)
1831 enqueue_pushable_task(rq, p);
1832 1815
1833 } 1816 /*
1817 * Only update if the process changes its state from whether it
1818 * can migrate or not.
1819 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
1821 return;
1834 1822
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1823 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1824
1842 update_rt_migration(&rq->rt); 1825 /*
1826 * The process used to be able to migrate OR it can now migrate
1827 */
1828 if (weight <= 1) {
1829 if (!task_current(rq, p))
1830 dequeue_pushable_task(rq, p);
1831 BUG_ON(!rq->rt.rt_nr_migratory);
1832 rq->rt.rt_nr_migratory--;
1833 } else {
1834 if (!task_current(rq, p))
1835 enqueue_pushable_task(rq, p);
1836 rq->rt.rt_nr_migratory++;
1843 } 1837 }
1838
1839 update_rt_migration(&rq->rt);
1844} 1840}
1845 1841
1846/* Assumes rq->lock is held */ 1842/* Assumes rq->lock is held */
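
The rewritten set_cpus_allowed_rt() boils down to: do nothing unless the task's ability to migrate actually flips, then adjust the pushable list and rt_nr_migratory in one place. A condensed sketch of that control flow (types and helpers are stand-ins, and the task_current() special-casing is omitted):

struct rt_rq_sketch { unsigned long rt_nr_migratory; };

struct rt_task_sketch {
	int on_rq;
	int nr_cpus_allowed;		/* weight of the old affinity mask */
	struct rt_rq_sketch *rq;
};

static void enqueue_pushable(struct rt_task_sketch *p) { (void)p; }
static void dequeue_pushable(struct rt_task_sketch *p) { (void)p; }
static void update_rt_migration(struct rt_rq_sketch *rq) { (void)rq; }

static void sketch_set_cpus_allowed_rt(struct rt_task_sketch *p, int new_weight)
{
	int was_migratable = p->nr_cpus_allowed > 1;
	int now_migratable = new_weight > 1;

	if (!p->on_rq || was_migratable == now_migratable)
		return;				/* migratability unchanged */

	if (now_migratable) {
		enqueue_pushable(p);		/* eligible for pushing again */
		p->rq->rt_nr_migratory++;
	} else {
		dequeue_pushable(p);
		p->rq->rt_nr_migratory--;
	}
	update_rt_migration(p->rq);
}
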
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..ba9dccfd24ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 876extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 878
879extern void update_cpu_load(struct rq *this_rq); 879extern void update_idle_cpu_load(struct rq *this_rq);
880 880
881#ifdef CONFIG_CGROUP_CPUACCT 881#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 882#include <linux/cgroup.h>