path: root/kernel/sched.c
author     Ingo Molnar <mingo@elte.hu>    2009-01-10 20:42:53 -0500
committer  Ingo Molnar <mingo@elte.hu>    2009-01-10 20:42:53 -0500
commit     506c10f26c481b7f8ef27c1c79290f68989b2e9e
tree       03de82e812f00957aa6276dac2fe51c3358e88d7    /kernel/sched.c
parent     e1df957670aef74ffd9a4ad93e6d2c90bf6b4845
parent     c59765042f53a79a7a65585042ff463b69cb248c
Merge commit 'v2.6.29-rc1' into perfcounters/core
Conflicts: include/linux/kernel_stat.h
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1110
1 file changed, 632 insertions(+), 478 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dfbff5fb1ac..43fd21233b93 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -209,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
209 hrtimer_init(&rt_b->rt_period_timer, 209 hrtimer_init(&rt_b->rt_period_timer,
210 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 210 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
211 rt_b->rt_period_timer.function = sched_rt_period_timer; 211 rt_b->rt_period_timer.function = sched_rt_period_timer;
212 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
213} 212}
214 213
215static inline int rt_bandwidth_enabled(void) 214static inline int rt_bandwidth_enabled(void)
@@ -499,18 +498,26 @@ struct rt_rq {
499 */ 498 */
500struct root_domain { 499struct root_domain {
501 atomic_t refcount; 500 atomic_t refcount;
502 cpumask_t span; 501 cpumask_var_t span;
503 cpumask_t online; 502 cpumask_var_t online;
504 503
505 /* 504 /*
506 * The "RT overload" flag: it gets set if a CPU has more than 505 * The "RT overload" flag: it gets set if a CPU has more than
507 * one runnable RT task. 506 * one runnable RT task.
508 */ 507 */
509 cpumask_t rto_mask; 508 cpumask_var_t rto_mask;
510 atomic_t rto_count; 509 atomic_t rto_count;
511#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
512 struct cpupri cpupri; 511 struct cpupri cpupri;
513#endif 512#endif
513#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
514 /*
515 * Preferred wake up cpu nominated by sched_mc balance that will be
516 * used when most cpus are idle in the system indicating overall very
517 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
518 */
519 unsigned int sched_mc_preferred_wakeup_cpu;
520#endif
514}; 521};
515 522
516/* 523/*
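
The hunk above is part of the tree-wide cpumask rework: struct root_domain's span, online and rto_mask change from embedded cpumask_t fields to cpumask_var_t, which becomes a pointer when CONFIG_CPUMASK_OFFSTACK=y and therefore needs explicit allocation. This excerpt only shows the struct; as a minimal sketch of the alloc/free pairing such fields rely on (the struct and function names below are illustrative, not the scheduler's actual init path):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

struct my_domain {
	cpumask_var_t span;	/* a pointer when CONFIG_CPUMASK_OFFSTACK=y */
	cpumask_var_t online;
};

/* Allocate both masks, unwinding on partial failure. */
static int my_domain_init(struct my_domain *d, gfp_t gfp)
{
	if (!alloc_cpumask_var(&d->span, gfp))
		return -ENOMEM;
	if (!alloc_cpumask_var(&d->online, gfp)) {
		free_cpumask_var(d->span);
		return -ENOMEM;
	}
	cpumask_clear(d->span);
	cpumask_clear(d->online);
	return 0;
}

static void my_domain_free(struct my_domain *d)
{
	free_cpumask_var(d->online);
	free_cpumask_var(d->span);
}
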
@@ -1159,7 +1166,6 @@ static void init_rq_hrtick(struct rq *rq)
1159 1166
1160 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1167 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1161 rq->hrtick_timer.function = hrtick; 1168 rq->hrtick_timer.function = hrtick;
1162 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1163} 1169}
1164#else /* CONFIG_SCHED_HRTICK */ 1170#else /* CONFIG_SCHED_HRTICK */
1165static inline void hrtick_clear(struct rq *rq) 1171static inline void hrtick_clear(struct rq *rq)
@@ -1536,7 +1542,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1536 struct sched_domain *sd = data; 1542 struct sched_domain *sd = data;
1537 int i; 1543 int i;
1538 1544
1539 for_each_cpu_mask(i, sd->span) { 1545 for_each_cpu(i, sched_domain_span(sd)) {
1540 /* 1546 /*
1541 * If there are currently no tasks on the cpu pretend there 1547 * If there are currently no tasks on the cpu pretend there
1542 * is one of average load so that when a new task gets to 1548 * is one of average load so that when a new task gets to
@@ -1557,7 +1563,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1557 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1563 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1558 shares = tg->shares; 1564 shares = tg->shares;
1559 1565
1560 for_each_cpu_mask(i, sd->span) 1566 for_each_cpu(i, sched_domain_span(sd))
1561 update_group_shares_cpu(tg, i, shares, rq_weight); 1567 update_group_shares_cpu(tg, i, shares, rq_weight);
1562 1568
1563 return 0; 1569 return 0;
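
These two hunks replace for_each_cpu_mask(i, sd->span) with for_each_cpu(i, sched_domain_span(sd)): the iterator now takes a const struct cpumask * and the span is reached through an accessor rather than a copied cpumask_t field. A small illustrative walker in the same style (the per-cpu callback is a placeholder, not a scheduler API):

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Sum a per-cpu quantity over every CPU covered by a sched domain. */
static unsigned long sum_over_domain(struct sched_domain *sd,
				     unsigned long (*per_cpu_val)(int cpu))
{
	unsigned long sum = 0;
	int i;

	for_each_cpu(i, sched_domain_span(sd))
		sum += per_cpu_val(i);

	return sum;
}
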
@@ -2125,15 +2131,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2125 int i; 2131 int i;
2126 2132
2127 /* Skip over this group if it has no CPUs allowed */ 2133 /* Skip over this group if it has no CPUs allowed */
2128 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2134 if (!cpumask_intersects(sched_group_cpus(group),
2135 &p->cpus_allowed))
2129 continue; 2136 continue;
2130 2137
2131 local_group = cpu_isset(this_cpu, group->cpumask); 2138 local_group = cpumask_test_cpu(this_cpu,
2139 sched_group_cpus(group));
2132 2140
2133 /* Tally up the load of all CPUs in the group */ 2141 /* Tally up the load of all CPUs in the group */
2134 avg_load = 0; 2142 avg_load = 0;
2135 2143
2136 for_each_cpu_mask_nr(i, group->cpumask) { 2144 for_each_cpu(i, sched_group_cpus(group)) {
2137 /* Bias balancing toward cpus of our domain */ 2145 /* Bias balancing toward cpus of our domain */
2138 if (local_group) 2146 if (local_group)
2139 load = source_load(i, load_idx); 2147 load = source_load(i, load_idx);
@@ -2165,17 +2173,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2165 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2173 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2166 */ 2174 */
2167static int 2175static int
2168find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2176find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2169 cpumask_t *tmp)
2170{ 2177{
2171 unsigned long load, min_load = ULONG_MAX; 2178 unsigned long load, min_load = ULONG_MAX;
2172 int idlest = -1; 2179 int idlest = -1;
2173 int i; 2180 int i;
2174 2181
2175 /* Traverse only the allowed CPUs */ 2182 /* Traverse only the allowed CPUs */
2176 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2183 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2177
2178 for_each_cpu_mask_nr(i, *tmp) {
2179 load = weighted_cpuload(i); 2184 load = weighted_cpuload(i);
2180 2185
2181 if (load < min_load || (load == min_load && i == this_cpu)) { 2186 if (load < min_load || (load == min_load && i == this_cpu)) {
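
With for_each_cpu_and(), find_idlest_cpu() can walk the intersection of the group's CPUs and p->cpus_allowed directly, so the cpumask_t *tmp scratch argument disappears. The same idea in isolation, as a sketch (the load callback stands in for weighted_cpuload()):

#include <linux/kernel.h>
#include <linux/cpumask.h>

/*
 * Pick the least-loaded CPU that is in both @candidates and @allowed,
 * without first materialising a temporary "candidates & allowed" mask.
 */
static int pick_least_loaded(const struct cpumask *candidates,
			     const struct cpumask *allowed,
			     unsigned long (*load_of)(int cpu))
{
	unsigned long load, min_load = ULONG_MAX;
	int cpu, best = -1;

	for_each_cpu_and(cpu, candidates, allowed) {
		load = load_of(cpu);
		if (load < min_load) {
			min_load = load;
			best = cpu;
		}
	}

	return best;	/* -1 if the intersection is empty */
}
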
@@ -2217,7 +2222,6 @@ static int sched_balance_self(int cpu, int flag)
2217 update_shares(sd); 2222 update_shares(sd);
2218 2223
2219 while (sd) { 2224 while (sd) {
2220 cpumask_t span, tmpmask;
2221 struct sched_group *group; 2225 struct sched_group *group;
2222 int new_cpu, weight; 2226 int new_cpu, weight;
2223 2227
@@ -2226,14 +2230,13 @@ static int sched_balance_self(int cpu, int flag)
2226 continue; 2230 continue;
2227 } 2231 }
2228 2232
2229 span = sd->span;
2230 group = find_idlest_group(sd, t, cpu); 2233 group = find_idlest_group(sd, t, cpu);
2231 if (!group) { 2234 if (!group) {
2232 sd = sd->child; 2235 sd = sd->child;
2233 continue; 2236 continue;
2234 } 2237 }
2235 2238
2236 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2239 new_cpu = find_idlest_cpu(group, t, cpu);
2237 if (new_cpu == -1 || new_cpu == cpu) { 2240 if (new_cpu == -1 || new_cpu == cpu) {
2238 /* Now try balancing at a lower domain level of cpu */ 2241 /* Now try balancing at a lower domain level of cpu */
2239 sd = sd->child; 2242 sd = sd->child;
@@ -2242,10 +2245,10 @@ static int sched_balance_self(int cpu, int flag)
2242 2245
2243 /* Now try balancing at a lower domain level of new_cpu */ 2246 /* Now try balancing at a lower domain level of new_cpu */
2244 cpu = new_cpu; 2247 cpu = new_cpu;
2248 weight = cpumask_weight(sched_domain_span(sd));
2245 sd = NULL; 2249 sd = NULL;
2246 weight = cpus_weight(span);
2247 for_each_domain(cpu, tmp) { 2250 for_each_domain(cpu, tmp) {
2248 if (weight <= cpus_weight(tmp->span)) 2251 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2249 break; 2252 break;
2250 if (tmp->flags & flag) 2253 if (tmp->flags & flag)
2251 sd = tmp; 2254 sd = tmp;
@@ -2311,7 +2314,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2311 cpu = task_cpu(p); 2314 cpu = task_cpu(p);
2312 2315
2313 for_each_domain(this_cpu, sd) { 2316 for_each_domain(this_cpu, sd) {
2314 if (cpu_isset(cpu, sd->span)) { 2317 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2315 update_shares(sd); 2318 update_shares(sd);
2316 break; 2319 break;
2317 } 2320 }
@@ -2360,7 +2363,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2360 else { 2363 else {
2361 struct sched_domain *sd; 2364 struct sched_domain *sd;
2362 for_each_domain(this_cpu, sd) { 2365 for_each_domain(this_cpu, sd) {
2363 if (cpu_isset(cpu, sd->span)) { 2366 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2364 schedstat_inc(sd, ttwu_wake_remote); 2367 schedstat_inc(sd, ttwu_wake_remote);
2365 break; 2368 break;
2366 } 2369 }
@@ -2893,7 +2896,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2893 struct rq *rq; 2896 struct rq *rq;
2894 2897
2895 rq = task_rq_lock(p, &flags); 2898 rq = task_rq_lock(p, &flags);
2896 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2899 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2897 || unlikely(!cpu_active(dest_cpu))) 2900 || unlikely(!cpu_active(dest_cpu)))
2898 goto out; 2901 goto out;
2899 2902
@@ -2958,7 +2961,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2958 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2961 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2959 * 3) are cache-hot on their current CPU. 2962 * 3) are cache-hot on their current CPU.
2960 */ 2963 */
2961 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2964 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2962 schedstat_inc(p, se.nr_failed_migrations_affine); 2965 schedstat_inc(p, se.nr_failed_migrations_affine);
2963 return 0; 2966 return 0;
2964 } 2967 }
@@ -3133,7 +3136,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3133static struct sched_group * 3136static struct sched_group *
3134find_busiest_group(struct sched_domain *sd, int this_cpu, 3137find_busiest_group(struct sched_domain *sd, int this_cpu,
3135 unsigned long *imbalance, enum cpu_idle_type idle, 3138 unsigned long *imbalance, enum cpu_idle_type idle,
3136 int *sd_idle, const cpumask_t *cpus, int *balance) 3139 int *sd_idle, const struct cpumask *cpus, int *balance)
3137{ 3140{
3138 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3141 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3139 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3142 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3169,10 +3172,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3169 unsigned long sum_avg_load_per_task; 3172 unsigned long sum_avg_load_per_task;
3170 unsigned long avg_load_per_task; 3173 unsigned long avg_load_per_task;
3171 3174
3172 local_group = cpu_isset(this_cpu, group->cpumask); 3175 local_group = cpumask_test_cpu(this_cpu,
3176 sched_group_cpus(group));
3173 3177
3174 if (local_group) 3178 if (local_group)
3175 balance_cpu = first_cpu(group->cpumask); 3179 balance_cpu = cpumask_first(sched_group_cpus(group));
3176 3180
3177 /* Tally up the load of all CPUs in the group */ 3181 /* Tally up the load of all CPUs in the group */
3178 sum_weighted_load = sum_nr_running = avg_load = 0; 3182 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3181,13 +3185,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3181 max_cpu_load = 0; 3185 max_cpu_load = 0;
3182 min_cpu_load = ~0UL; 3186 min_cpu_load = ~0UL;
3183 3187
3184 for_each_cpu_mask_nr(i, group->cpumask) { 3188 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3185 struct rq *rq; 3189 struct rq *rq = cpu_rq(i);
3186
3187 if (!cpu_isset(i, *cpus))
3188 continue;
3189
3190 rq = cpu_rq(i);
3191 3190
3192 if (*sd_idle && rq->nr_running) 3191 if (*sd_idle && rq->nr_running)
3193 *sd_idle = 0; 3192 *sd_idle = 0;
@@ -3298,8 +3297,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3298 */ 3297 */
3299 if ((sum_nr_running < min_nr_running) || 3298 if ((sum_nr_running < min_nr_running) ||
3300 (sum_nr_running == min_nr_running && 3299 (sum_nr_running == min_nr_running &&
3301 first_cpu(group->cpumask) < 3300 cpumask_first(sched_group_cpus(group)) >
3302 first_cpu(group_min->cpumask))) { 3301 cpumask_first(sched_group_cpus(group_min)))) {
3303 group_min = group; 3302 group_min = group;
3304 min_nr_running = sum_nr_running; 3303 min_nr_running = sum_nr_running;
3305 min_load_per_task = sum_weighted_load / 3304 min_load_per_task = sum_weighted_load /
@@ -3314,8 +3313,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3314 if (sum_nr_running <= group_capacity - 1) { 3313 if (sum_nr_running <= group_capacity - 1) {
3315 if (sum_nr_running > leader_nr_running || 3314 if (sum_nr_running > leader_nr_running ||
3316 (sum_nr_running == leader_nr_running && 3315 (sum_nr_running == leader_nr_running &&
3317 first_cpu(group->cpumask) > 3316 cpumask_first(sched_group_cpus(group)) <
3318 first_cpu(group_leader->cpumask))) { 3317 cpumask_first(sched_group_cpus(group_leader)))) {
3319 group_leader = group; 3318 group_leader = group;
3320 leader_nr_running = sum_nr_running; 3319 leader_nr_running = sum_nr_running;
3321 } 3320 }
@@ -3441,6 +3440,10 @@ out_balanced:
3441 3440
3442 if (this == group_leader && group_leader != group_min) { 3441 if (this == group_leader && group_leader != group_min) {
3443 *imbalance = min_load_per_task; 3442 *imbalance = min_load_per_task;
3443 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3444 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3445 cpumask_first(sched_group_cpus(group_leader));
3446 }
3444 return group_min; 3447 return group_min;
3445 } 3448 }
3446#endif 3449#endif
@@ -3454,16 +3457,16 @@ ret:
3454 */ 3457 */
3455static struct rq * 3458static struct rq *
3456find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3459find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3457 unsigned long imbalance, const cpumask_t *cpus) 3460 unsigned long imbalance, const struct cpumask *cpus)
3458{ 3461{
3459 struct rq *busiest = NULL, *rq; 3462 struct rq *busiest = NULL, *rq;
3460 unsigned long max_load = 0; 3463 unsigned long max_load = 0;
3461 int i; 3464 int i;
3462 3465
3463 for_each_cpu_mask_nr(i, group->cpumask) { 3466 for_each_cpu(i, sched_group_cpus(group)) {
3464 unsigned long wl; 3467 unsigned long wl;
3465 3468
3466 if (!cpu_isset(i, *cpus)) 3469 if (!cpumask_test_cpu(i, cpus))
3467 continue; 3470 continue;
3468 3471
3469 rq = cpu_rq(i); 3472 rq = cpu_rq(i);
@@ -3493,7 +3496,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3493 */ 3496 */
3494static int load_balance(int this_cpu, struct rq *this_rq, 3497static int load_balance(int this_cpu, struct rq *this_rq,
3495 struct sched_domain *sd, enum cpu_idle_type idle, 3498 struct sched_domain *sd, enum cpu_idle_type idle,
3496 int *balance, cpumask_t *cpus) 3499 int *balance, struct cpumask *cpus)
3497{ 3500{
3498 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3501 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3499 struct sched_group *group; 3502 struct sched_group *group;
@@ -3501,7 +3504,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3501 struct rq *busiest; 3504 struct rq *busiest;
3502 unsigned long flags; 3505 unsigned long flags;
3503 3506
3504 cpus_setall(*cpus); 3507 cpumask_setall(cpus);
3505 3508
3506 /* 3509 /*
3507 * When power savings policy is enabled for the parent domain, idle 3510 * When power savings policy is enabled for the parent domain, idle
@@ -3561,8 +3564,8 @@ redo:
3561 3564
3562 /* All tasks on this runqueue were pinned by CPU affinity */ 3565 /* All tasks on this runqueue were pinned by CPU affinity */
3563 if (unlikely(all_pinned)) { 3566 if (unlikely(all_pinned)) {
3564 cpu_clear(cpu_of(busiest), *cpus); 3567 cpumask_clear_cpu(cpu_of(busiest), cpus);
3565 if (!cpus_empty(*cpus)) 3568 if (!cpumask_empty(cpus))
3566 goto redo; 3569 goto redo;
3567 goto out_balanced; 3570 goto out_balanced;
3568 } 3571 }
@@ -3579,7 +3582,8 @@ redo:
3579 /* don't kick the migration_thread, if the curr 3582 /* don't kick the migration_thread, if the curr
3580 * task on busiest cpu can't be moved to this_cpu 3583 * task on busiest cpu can't be moved to this_cpu
3581 */ 3584 */
3582 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3585 if (!cpumask_test_cpu(this_cpu,
3586 &busiest->curr->cpus_allowed)) {
3583 spin_unlock_irqrestore(&busiest->lock, flags); 3587 spin_unlock_irqrestore(&busiest->lock, flags);
3584 all_pinned = 1; 3588 all_pinned = 1;
3585 goto out_one_pinned; 3589 goto out_one_pinned;
@@ -3654,7 +3658,7 @@ out:
3654 */ 3658 */
3655static int 3659static int
3656load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3660load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3657 cpumask_t *cpus) 3661 struct cpumask *cpus)
3658{ 3662{
3659 struct sched_group *group; 3663 struct sched_group *group;
3660 struct rq *busiest = NULL; 3664 struct rq *busiest = NULL;
@@ -3663,7 +3667,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3663 int sd_idle = 0; 3667 int sd_idle = 0;
3664 int all_pinned = 0; 3668 int all_pinned = 0;
3665 3669
3666 cpus_setall(*cpus); 3670 cpumask_setall(cpus);
3667 3671
3668 /* 3672 /*
3669 * When power savings policy is enabled for the parent domain, idle 3673 * When power savings policy is enabled for the parent domain, idle
@@ -3707,17 +3711,76 @@ redo:
3707 double_unlock_balance(this_rq, busiest); 3711 double_unlock_balance(this_rq, busiest);
3708 3712
3709 if (unlikely(all_pinned)) { 3713 if (unlikely(all_pinned)) {
3710 cpu_clear(cpu_of(busiest), *cpus); 3714 cpumask_clear_cpu(cpu_of(busiest), cpus);
3711 if (!cpus_empty(*cpus)) 3715 if (!cpumask_empty(cpus))
3712 goto redo; 3716 goto redo;
3713 } 3717 }
3714 } 3718 }
3715 3719
3716 if (!ld_moved) { 3720 if (!ld_moved) {
3721 int active_balance = 0;
3722
3717 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 3723 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3718 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3724 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3719 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3725 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3720 return -1; 3726 return -1;
3727
3728 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3729 return -1;
3730
3731 if (sd->nr_balance_failed++ < 2)
3732 return -1;
3733
3734 /*
3735 * The only task running on a non-idle cpu can be moved to this
3736 * cpu in an attempt to completely free up the other CPU
3737 * package. The same method used to move a task in load_balance()
3738 * has been extended to load_balance_newidle() to speed up
3739 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
3740 *
3741 * The package power saving logic comes from
3742 * find_busiest_group(). If there is no imbalance, then
3743 * f_b_g() will return NULL. However, when sched_mc={1,2},
3744 * f_b_g() will select a group from which a running task may be
3745 * pulled to this cpu in order to make the other package idle.
3746 * If there is no opportunity to make a package idle and if
3747 * there is no imbalance, then f_b_g() will return NULL and no
3748 * action will be taken in load_balance_newidle().
3749 *
3750 * Under normal task pull operation due to imbalance, there
3751 * will be more than one task in the source run queue and
3752 * move_tasks() will succeed. ld_moved will be true and this
3753 * active balance code will not be triggered.
3754 */
3755
3756 /* Lock busiest in correct order while this_rq is held */
3757 double_lock_balance(this_rq, busiest);
3758
3759 /*
3760 * don't kick the migration_thread, if the curr
3761 * task on busiest cpu can't be moved to this_cpu
3762 */
3763 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
3764 double_unlock_balance(this_rq, busiest);
3765 all_pinned = 1;
3766 return ld_moved;
3767 }
3768
3769 if (!busiest->active_balance) {
3770 busiest->active_balance = 1;
3771 busiest->push_cpu = this_cpu;
3772 active_balance = 1;
3773 }
3774
3775 double_unlock_balance(this_rq, busiest);
3776 /*
3777 * Should not call ttwu while holding a rq->lock
3778 */
3779 spin_unlock(&this_rq->lock);
3780 if (active_balance)
3781 wake_up_process(busiest->migration_thread);
3782 spin_lock(&this_rq->lock);
3783
3721 } else 3784 } else
3722 sd->nr_balance_failed = 0; 3785 sd->nr_balance_failed = 0;
3723 3786
@@ -3743,7 +3806,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3743 struct sched_domain *sd; 3806 struct sched_domain *sd;
3744 int pulled_task = 0; 3807 int pulled_task = 0;
3745 unsigned long next_balance = jiffies + HZ; 3808 unsigned long next_balance = jiffies + HZ;
3746 cpumask_t tmpmask; 3809 cpumask_var_t tmpmask;
3810
3811 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3812 return;
3747 3813
3748 for_each_domain(this_cpu, sd) { 3814 for_each_domain(this_cpu, sd) {
3749 unsigned long interval; 3815 unsigned long interval;
@@ -3754,7 +3820,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3754 if (sd->flags & SD_BALANCE_NEWIDLE) 3820 if (sd->flags & SD_BALANCE_NEWIDLE)
3755 /* If we've pulled tasks over stop searching: */ 3821 /* If we've pulled tasks over stop searching: */
3756 pulled_task = load_balance_newidle(this_cpu, this_rq, 3822 pulled_task = load_balance_newidle(this_cpu, this_rq,
3757 sd, &tmpmask); 3823 sd, tmpmask);
3758 3824
3759 interval = msecs_to_jiffies(sd->balance_interval); 3825 interval = msecs_to_jiffies(sd->balance_interval);
3760 if (time_after(next_balance, sd->last_balance + interval)) 3826 if (time_after(next_balance, sd->last_balance + interval))
@@ -3769,6 +3835,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3769 */ 3835 */
3770 this_rq->next_balance = next_balance; 3836 this_rq->next_balance = next_balance;
3771 } 3837 }
3838 free_cpumask_var(tmpmask);
3772} 3839}
3773 3840
3774/* 3841/*
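
idle_balance() now takes its scratch mask from alloc_cpumask_var(..., GFP_ATOMIC) and simply returns if the allocation fails; rebalance_domains() further down does the same. The shape of that best-effort pattern, condensed into a sketch (the pass callback is a placeholder):

#include <linux/cpumask.h>
#include <linux/gfp.h>

/* Run one best-effort balancing pass that needs a scratch cpumask. */
static void best_effort_pass(void (*do_pass)(struct cpumask *scratch))
{
	cpumask_var_t scratch;

	/* Atomic context: no sleeping allocation; skipping a pass is harmless. */
	if (!alloc_cpumask_var(&scratch, GFP_ATOMIC))
		return;

	do_pass(scratch);

	free_cpumask_var(scratch);
}
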
@@ -3806,7 +3873,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3806 /* Search for an sd spanning us and the target CPU. */ 3873 /* Search for an sd spanning us and the target CPU. */
3807 for_each_domain(target_cpu, sd) { 3874 for_each_domain(target_cpu, sd) {
3808 if ((sd->flags & SD_LOAD_BALANCE) && 3875 if ((sd->flags & SD_LOAD_BALANCE) &&
3809 cpu_isset(busiest_cpu, sd->span)) 3876 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3810 break; 3877 break;
3811 } 3878 }
3812 3879
@@ -3825,10 +3892,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3825#ifdef CONFIG_NO_HZ 3892#ifdef CONFIG_NO_HZ
3826static struct { 3893static struct {
3827 atomic_t load_balancer; 3894 atomic_t load_balancer;
3828 cpumask_t cpu_mask; 3895 cpumask_var_t cpu_mask;
3829} nohz ____cacheline_aligned = { 3896} nohz ____cacheline_aligned = {
3830 .load_balancer = ATOMIC_INIT(-1), 3897 .load_balancer = ATOMIC_INIT(-1),
3831 .cpu_mask = CPU_MASK_NONE,
3832}; 3898};
3833 3899
3834/* 3900/*
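
Turning nohz.cpu_mask into cpumask_var_t removes the static CPU_MASK_NONE initializer, so the mask must be given storage once before the nohz idle-balancer paths below touch it. The hunk that performs that allocation is not in this excerpt; the sketch below only illustrates the kind of one-time init call involved (the helper name is hypothetical):

#include <linux/init.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

/* Hypothetical one-time init: give nohz.cpu_mask real storage before use. */
static int __init nohz_cpumask_init(void)
{
	if (!alloc_cpumask_var(&nohz.cpu_mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_clear(nohz.cpu_mask);	/* empty, as CPU_MASK_NONE was */
	return 0;
}
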
@@ -3856,7 +3922,7 @@ int select_nohz_load_balancer(int stop_tick)
3856 int cpu = smp_processor_id(); 3922 int cpu = smp_processor_id();
3857 3923
3858 if (stop_tick) { 3924 if (stop_tick) {
3859 cpu_set(cpu, nohz.cpu_mask); 3925 cpumask_set_cpu(cpu, nohz.cpu_mask);
3860 cpu_rq(cpu)->in_nohz_recently = 1; 3926 cpu_rq(cpu)->in_nohz_recently = 1;
3861 3927
3862 /* 3928 /*
@@ -3870,7 +3936,7 @@ int select_nohz_load_balancer(int stop_tick)
3870 } 3936 }
3871 3937
3872 /* time for ilb owner also to sleep */ 3938 /* time for ilb owner also to sleep */
3873 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3939 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3874 if (atomic_read(&nohz.load_balancer) == cpu) 3940 if (atomic_read(&nohz.load_balancer) == cpu)
3875 atomic_set(&nohz.load_balancer, -1); 3941 atomic_set(&nohz.load_balancer, -1);
3876 return 0; 3942 return 0;
@@ -3883,10 +3949,10 @@ int select_nohz_load_balancer(int stop_tick)
3883 } else if (atomic_read(&nohz.load_balancer) == cpu) 3949 } else if (atomic_read(&nohz.load_balancer) == cpu)
3884 return 1; 3950 return 1;
3885 } else { 3951 } else {
3886 if (!cpu_isset(cpu, nohz.cpu_mask)) 3952 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3887 return 0; 3953 return 0;
3888 3954
3889 cpu_clear(cpu, nohz.cpu_mask); 3955 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3890 3956
3891 if (atomic_read(&nohz.load_balancer) == cpu) 3957 if (atomic_read(&nohz.load_balancer) == cpu)
3892 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3958 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3914,7 +3980,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3914 unsigned long next_balance = jiffies + 60*HZ; 3980 unsigned long next_balance = jiffies + 60*HZ;
3915 int update_next_balance = 0; 3981 int update_next_balance = 0;
3916 int need_serialize; 3982 int need_serialize;
3917 cpumask_t tmp; 3983 cpumask_var_t tmp;
3984
3985 /* Fails alloc? Rebalancing probably not a priority right now. */
3986 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3987 return;
3918 3988
3919 for_each_domain(cpu, sd) { 3989 for_each_domain(cpu, sd) {
3920 if (!(sd->flags & SD_LOAD_BALANCE)) 3990 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3939,7 +4009,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3939 } 4009 }
3940 4010
3941 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4011 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3942 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 4012 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3943 /* 4013 /*
3944 * We've pulled tasks over so either we're no 4014 * We've pulled tasks over so either we're no
3945 * longer idle, or one of our SMT siblings is 4015 * longer idle, or one of our SMT siblings is
@@ -3973,6 +4043,8 @@ out:
3973 */ 4043 */
3974 if (likely(update_next_balance)) 4044 if (likely(update_next_balance))
3975 rq->next_balance = next_balance; 4045 rq->next_balance = next_balance;
4046
4047 free_cpumask_var(tmp);
3976} 4048}
3977 4049
3978/* 4050/*
@@ -3997,12 +4069,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3997 */ 4069 */
3998 if (this_rq->idle_at_tick && 4070 if (this_rq->idle_at_tick &&
3999 atomic_read(&nohz.load_balancer) == this_cpu) { 4071 atomic_read(&nohz.load_balancer) == this_cpu) {
4000 cpumask_t cpus = nohz.cpu_mask;
4001 struct rq *rq; 4072 struct rq *rq;
4002 int balance_cpu; 4073 int balance_cpu;
4003 4074
4004 cpu_clear(this_cpu, cpus); 4075 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4005 for_each_cpu_mask_nr(balance_cpu, cpus) { 4076 if (balance_cpu == this_cpu)
4077 continue;
4078
4006 /* 4079 /*
4007 * If this cpu gets work to do, stop the load balancing 4080 * If this cpu gets work to do, stop the load balancing
4008 * work being done for other cpus. Next load 4081 * work being done for other cpus. Next load
@@ -4040,7 +4113,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4040 rq->in_nohz_recently = 0; 4113 rq->in_nohz_recently = 0;
4041 4114
4042 if (atomic_read(&nohz.load_balancer) == cpu) { 4115 if (atomic_read(&nohz.load_balancer) == cpu) {
4043 cpu_clear(cpu, nohz.cpu_mask); 4116 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4044 atomic_set(&nohz.load_balancer, -1); 4117 atomic_set(&nohz.load_balancer, -1);
4045 } 4118 }
4046 4119
@@ -4053,7 +4126,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4053 * TBD: Traverse the sched domains and nominate 4126 * TBD: Traverse the sched domains and nominate
4054 * the nearest cpu in the nohz.cpu_mask. 4127 * the nearest cpu in the nohz.cpu_mask.
4055 */ 4128 */
4056 int ilb = first_cpu(nohz.cpu_mask); 4129 int ilb = cpumask_first(nohz.cpu_mask);
4057 4130
4058 if (ilb < nr_cpu_ids) 4131 if (ilb < nr_cpu_ids)
4059 resched_cpu(ilb); 4132 resched_cpu(ilb);
@@ -4065,7 +4138,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4065 * cpus with ticks stopped, is it time for that to stop? 4138 * cpus with ticks stopped, is it time for that to stop?
4066 */ 4139 */
4067 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4140 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4068 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4141 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4069 resched_cpu(cpu); 4142 resched_cpu(cpu);
4070 return; 4143 return;
4071 } 4144 }
@@ -4075,7 +4148,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4075 * someone else, then no need raise the SCHED_SOFTIRQ 4148 * someone else, then no need raise the SCHED_SOFTIRQ
4076 */ 4149 */
4077 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4150 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4078 cpu_isset(cpu, nohz.cpu_mask)) 4151 cpumask_test_cpu(cpu, nohz.cpu_mask))
4079 return; 4152 return;
4080#endif 4153#endif
4081 if (time_after_eq(jiffies, rq->next_balance)) 4154 if (time_after_eq(jiffies, rq->next_balance))
@@ -4150,13 +4223,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
4150 * Account user cpu time to a process. 4223 * Account user cpu time to a process.
4151 * @p: the process that the cpu time gets accounted to 4224 * @p: the process that the cpu time gets accounted to
4152 * @cputime: the cpu time spent in user space since the last update 4225 * @cputime: the cpu time spent in user space since the last update
4226 * @cputime_scaled: cputime scaled by cpu frequency
4153 */ 4227 */
4154void account_user_time(struct task_struct *p, cputime_t cputime) 4228void account_user_time(struct task_struct *p, cputime_t cputime,
4229 cputime_t cputime_scaled)
4155{ 4230{
4156 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4231 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4157 cputime64_t tmp; 4232 cputime64_t tmp;
4158 4233
4234 /* Add user time to process. */
4159 p->utime = cputime_add(p->utime, cputime); 4235 p->utime = cputime_add(p->utime, cputime);
4236 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4160 account_group_user_time(p, cputime); 4237 account_group_user_time(p, cputime);
4161 4238
4162 /* Add user time to cpustat. */ 4239 /* Add user time to cpustat. */
@@ -4173,51 +4250,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4173 * Account guest cpu time to a process. 4250 * Account guest cpu time to a process.
4174 * @p: the process that the cpu time gets accounted to 4251 * @p: the process that the cpu time gets accounted to
4175 * @cputime: the cpu time spent in virtual machine since the last update 4252 * @cputime: the cpu time spent in virtual machine since the last update
4253 * @cputime_scaled: cputime scaled by cpu frequency
4176 */ 4254 */
4177static void account_guest_time(struct task_struct *p, cputime_t cputime) 4255static void account_guest_time(struct task_struct *p, cputime_t cputime,
4256 cputime_t cputime_scaled)
4178{ 4257{
4179 cputime64_t tmp; 4258 cputime64_t tmp;
4180 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4259 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4181 4260
4182 tmp = cputime_to_cputime64(cputime); 4261 tmp = cputime_to_cputime64(cputime);
4183 4262
4263 /* Add guest time to process. */
4184 p->utime = cputime_add(p->utime, cputime); 4264 p->utime = cputime_add(p->utime, cputime);
4265 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4185 account_group_user_time(p, cputime); 4266 account_group_user_time(p, cputime);
4186 p->gtime = cputime_add(p->gtime, cputime); 4267 p->gtime = cputime_add(p->gtime, cputime);
4187 4268
4269 /* Add guest time to cpustat. */
4188 cpustat->user = cputime64_add(cpustat->user, tmp); 4270 cpustat->user = cputime64_add(cpustat->user, tmp);
4189 cpustat->guest = cputime64_add(cpustat->guest, tmp); 4271 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4190} 4272}
4191 4273
4192/* 4274/*
4193 * Account scaled user cpu time to a process.
4194 * @p: the process that the cpu time gets accounted to
4195 * @cputime: the cpu time spent in user space since the last update
4196 */
4197void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4198{
4199 p->utimescaled = cputime_add(p->utimescaled, cputime);
4200}
4201
4202/*
4203 * Account system cpu time to a process. 4275 * Account system cpu time to a process.
4204 * @p: the process that the cpu time gets accounted to 4276 * @p: the process that the cpu time gets accounted to
4205 * @hardirq_offset: the offset to subtract from hardirq_count() 4277 * @hardirq_offset: the offset to subtract from hardirq_count()
4206 * @cputime: the cpu time spent in kernel space since the last update 4278 * @cputime: the cpu time spent in kernel space since the last update
4279 * @cputime_scaled: cputime scaled by cpu frequency
4207 */ 4280 */
4208void account_system_time(struct task_struct *p, int hardirq_offset, 4281void account_system_time(struct task_struct *p, int hardirq_offset,
4209 cputime_t cputime) 4282 cputime_t cputime, cputime_t cputime_scaled)
4210{ 4283{
4211 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4284 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4212 struct rq *rq = this_rq();
4213 cputime64_t tmp; 4285 cputime64_t tmp;
4214 4286
4215 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 4287 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4216 account_guest_time(p, cputime); 4288 account_guest_time(p, cputime, cputime_scaled);
4217 return; 4289 return;
4218 } 4290 }
4219 4291
4292 /* Add system time to process. */
4220 p->stime = cputime_add(p->stime, cputime); 4293 p->stime = cputime_add(p->stime, cputime);
4294 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4221 account_group_system_time(p, cputime); 4295 account_group_system_time(p, cputime);
4222 4296
4223 /* Add system time to cpustat. */ 4297 /* Add system time to cpustat. */
@@ -4226,49 +4300,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4226 cpustat->irq = cputime64_add(cpustat->irq, tmp); 4300 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4227 else if (softirq_count()) 4301 else if (softirq_count())
4228 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4302 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4229 else if (p != rq->idle)
4230 cpustat->system = cputime64_add(cpustat->system, tmp);
4231 else if (atomic_read(&rq->nr_iowait) > 0)
4232 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4233 else 4303 else
4234 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4304 cpustat->system = cputime64_add(cpustat->system, tmp);
4305
4235 /* Account for system time used */ 4306 /* Account for system time used */
4236 acct_update_integrals(p); 4307 acct_update_integrals(p);
4237} 4308}
4238 4309
4239/* 4310/*
4240 * Account scaled system cpu time to a process. 4311 * Account for involuntary wait time.
4241 * @p: the process that the cpu time gets accounted to 4312 * @steal: the cpu time spent in involuntary wait
4242 * @hardirq_offset: the offset to subtract from hardirq_count()
4243 * @cputime: the cpu time spent in kernel space since the last update
4244 */ 4313 */
4245void account_system_time_scaled(struct task_struct *p, cputime_t cputime) 4314void account_steal_time(cputime_t cputime)
4246{ 4315{
4247 p->stimescaled = cputime_add(p->stimescaled, cputime); 4316 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4317 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4318
4319 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4248} 4320}
4249 4321
4250/* 4322/*
4251 * Account for involuntary wait time. 4323 * Account for idle time.
4252 * @p: the process from which the cpu time has been stolen 4324 * @cputime: the cpu time spent in idle wait
4253 * @steal: the cpu time spent in involuntary wait
4254 */ 4325 */
4255void account_steal_time(struct task_struct *p, cputime_t steal) 4326void account_idle_time(cputime_t cputime)
4256{ 4327{
4257 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4328 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4258 cputime64_t tmp = cputime_to_cputime64(steal); 4329 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4259 struct rq *rq = this_rq(); 4330 struct rq *rq = this_rq();
4260 4331
4261 if (p == rq->idle) { 4332 if (atomic_read(&rq->nr_iowait) > 0)
4262 p->stime = cputime_add(p->stime, steal); 4333 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4263 account_group_system_time(p, steal); 4334 else
4264 if (atomic_read(&rq->nr_iowait) > 0) 4335 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4265 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4266 else
4267 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4268 } else
4269 cpustat->steal = cputime64_add(cpustat->steal, tmp);
4270} 4336}
4271 4337
4338#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4339
4340/*
4341 * Account a single tick of cpu time.
4342 * @p: the process that the cpu time gets accounted to
4343 * @user_tick: indicates if the tick is a user or a system tick
4344 */
4345void account_process_tick(struct task_struct *p, int user_tick)
4346{
4347 cputime_t one_jiffy = jiffies_to_cputime(1);
4348 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4349 struct rq *rq = this_rq();
4350
4351 if (user_tick)
4352 account_user_time(p, one_jiffy, one_jiffy_scaled);
4353 else if (p != rq->idle)
4354 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4355 one_jiffy_scaled);
4356 else
4357 account_idle_time(one_jiffy);
4358}
4359
4360/*
4361 * Account multiple ticks of steal time.
4362 * @p: the process from which the cpu time has been stolen
4363 * @ticks: number of stolen ticks
4364 */
4365void account_steal_ticks(unsigned long ticks)
4366{
4367 account_steal_time(jiffies_to_cputime(ticks));
4368}
4369
4370/*
4371 * Account multiple ticks of idle time.
4372 * @ticks: number of idle ticks
4373 */
4374void account_idle_ticks(unsigned long ticks)
4375{
4376 account_idle_time(jiffies_to_cputime(ticks));
4377}
4378
4379#endif
4380
4272/* 4381/*
4273 * Use precise platform statistics if available: 4382 * Use precise platform statistics if available:
4274 */ 4383 */
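
Taken together, the accounting hunks drop account_user_time_scaled()/account_system_time_scaled() and the old two-argument account_steal_time(p, ...), replacing them with account_user_time(p, cputime, cputime_scaled), account_system_time(p, hardirq_offset, cputime, cputime_scaled), account_steal_time(cputime) and account_idle_time(cputime), plus the account_process_tick()/account_steal_ticks()/account_idle_ticks() helpers defined above for the !CONFIG_VIRT_CPU_ACCOUNTING case. A hedged sketch of how a periodic-tick path might drive the new entry points (the hook names and the regs plumbing are illustrative):

#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/ptrace.h>

/* Per-tick hook: charge the interrupted task one tick, classified by
 * whether the CPU was interrupted in user or kernel mode. */
static void example_tick_hook(struct pt_regs *regs)
{
	account_process_tick(current, user_mode(regs));
}

/* Catch-up path, e.g. after a long tickless idle period or when a
 * hypervisor reports stolen time in whole ticks. */
static void example_catch_up(unsigned long idle_ticks,
			     unsigned long stolen_ticks)
{
	account_idle_ticks(idle_ticks);
	account_steal_ticks(stolen_ticks);
}
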
@@ -4397,7 +4506,7 @@ void __kprobes sub_preempt_count(int val)
4397 /* 4506 /*
4398 * Underflow? 4507 * Underflow?
4399 */ 4508 */
4400 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4509 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4401 return; 4510 return;
4402 /* 4511 /*
4403 * Is the spinlock portion underflowing? 4512 * Is the spinlock portion underflowing?
@@ -5474,10 +5583,9 @@ out_unlock:
5474 return retval; 5583 return retval;
5475} 5584}
5476 5585
5477long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5586long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5478{ 5587{
5479 cpumask_t cpus_allowed; 5588 cpumask_var_t cpus_allowed, new_mask;
5480 cpumask_t new_mask = *in_mask;
5481 struct task_struct *p; 5589 struct task_struct *p;
5482 int retval; 5590 int retval;
5483 5591
@@ -5499,6 +5607,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5499 get_task_struct(p); 5607 get_task_struct(p);
5500 read_unlock(&tasklist_lock); 5608 read_unlock(&tasklist_lock);
5501 5609
5610 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5611 retval = -ENOMEM;
5612 goto out_put_task;
5613 }
5614 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5615 retval = -ENOMEM;
5616 goto out_free_cpus_allowed;
5617 }
5502 retval = -EPERM; 5618 retval = -EPERM;
5503 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5619 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5504 goto out_unlock; 5620 goto out_unlock;
@@ -5507,37 +5623,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5507 if (retval) 5623 if (retval)
5508 goto out_unlock; 5624 goto out_unlock;
5509 5625
5510 cpuset_cpus_allowed(p, &cpus_allowed); 5626 cpuset_cpus_allowed(p, cpus_allowed);
5511 cpus_and(new_mask, new_mask, cpus_allowed); 5627 cpumask_and(new_mask, in_mask, cpus_allowed);
5512 again: 5628 again:
5513 retval = set_cpus_allowed_ptr(p, &new_mask); 5629 retval = set_cpus_allowed_ptr(p, new_mask);
5514 5630
5515 if (!retval) { 5631 if (!retval) {
5516 cpuset_cpus_allowed(p, &cpus_allowed); 5632 cpuset_cpus_allowed(p, cpus_allowed);
5517 if (!cpus_subset(new_mask, cpus_allowed)) { 5633 if (!cpumask_subset(new_mask, cpus_allowed)) {
5518 /* 5634 /*
5519 * We must have raced with a concurrent cpuset 5635 * We must have raced with a concurrent cpuset
5520 * update. Just reset the cpus_allowed to the 5636 * update. Just reset the cpus_allowed to the
5521 * cpuset's cpus_allowed 5637 * cpuset's cpus_allowed
5522 */ 5638 */
5523 new_mask = cpus_allowed; 5639 cpumask_copy(new_mask, cpus_allowed);
5524 goto again; 5640 goto again;
5525 } 5641 }
5526 } 5642 }
5527out_unlock: 5643out_unlock:
5644 free_cpumask_var(new_mask);
5645out_free_cpus_allowed:
5646 free_cpumask_var(cpus_allowed);
5647out_put_task:
5528 put_task_struct(p); 5648 put_task_struct(p);
5529 put_online_cpus(); 5649 put_online_cpus();
5530 return retval; 5650 return retval;
5531} 5651}
5532 5652
5533static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5653static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5534 cpumask_t *new_mask) 5654 struct cpumask *new_mask)
5535{ 5655{
5536 if (len < sizeof(cpumask_t)) { 5656 if (len < cpumask_size())
5537 memset(new_mask, 0, sizeof(cpumask_t)); 5657 cpumask_clear(new_mask);
5538 } else if (len > sizeof(cpumask_t)) { 5658 else if (len > cpumask_size())
5539 len = sizeof(cpumask_t); 5659 len = cpumask_size();
5540 } 5660
5541 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5661 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5542} 5662}
5543 5663
@@ -5550,17 +5670,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5550asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5670asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5551 unsigned long __user *user_mask_ptr) 5671 unsigned long __user *user_mask_ptr)
5552{ 5672{
5553 cpumask_t new_mask; 5673 cpumask_var_t new_mask;
5554 int retval; 5674 int retval;
5555 5675
5556 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5676 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5557 if (retval) 5677 return -ENOMEM;
5558 return retval;
5559 5678
5560 return sched_setaffinity(pid, &new_mask); 5679 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5680 if (retval == 0)
5681 retval = sched_setaffinity(pid, new_mask);
5682 free_cpumask_var(new_mask);
5683 return retval;
5561} 5684}
5562 5685
5563long sched_getaffinity(pid_t pid, cpumask_t *mask) 5686long sched_getaffinity(pid_t pid, struct cpumask *mask)
5564{ 5687{
5565 struct task_struct *p; 5688 struct task_struct *p;
5566 int retval; 5689 int retval;
@@ -5577,7 +5700,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5577 if (retval) 5700 if (retval)
5578 goto out_unlock; 5701 goto out_unlock;
5579 5702
5580 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5703 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5581 5704
5582out_unlock: 5705out_unlock:
5583 read_unlock(&tasklist_lock); 5706 read_unlock(&tasklist_lock);
@@ -5596,19 +5719,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5596 unsigned long __user *user_mask_ptr) 5719 unsigned long __user *user_mask_ptr)
5597{ 5720{
5598 int ret; 5721 int ret;
5599 cpumask_t mask; 5722 cpumask_var_t mask;
5600 5723
5601 if (len < sizeof(cpumask_t)) 5724 if (len < cpumask_size())
5602 return -EINVAL; 5725 return -EINVAL;
5603 5726
5604 ret = sched_getaffinity(pid, &mask); 5727 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5605 if (ret < 0) 5728 return -ENOMEM;
5606 return ret;
5607 5729
5608 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5730 ret = sched_getaffinity(pid, mask);
5609 return -EFAULT; 5731 if (ret == 0) {
5732 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5733 ret = -EFAULT;
5734 else
5735 ret = cpumask_size();
5736 }
5737 free_cpumask_var(mask);
5610 5738
5611 return sizeof(cpumask_t); 5739 return ret;
5612} 5740}
5613 5741
5614/** 5742/**
@@ -5950,7 +6078,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5950 idle->se.exec_start = sched_clock(); 6078 idle->se.exec_start = sched_clock();
5951 6079
5952 idle->prio = idle->normal_prio = MAX_PRIO; 6080 idle->prio = idle->normal_prio = MAX_PRIO;
5953 idle->cpus_allowed = cpumask_of_cpu(cpu); 6081 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5954 __set_task_cpu(idle, cpu); 6082 __set_task_cpu(idle, cpu);
5955 6083
5956 rq->curr = rq->idle = idle; 6084 rq->curr = rq->idle = idle;
@@ -5977,9 +6105,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5977 * indicates which cpus entered this state. This is used 6105 * indicates which cpus entered this state. This is used
5978 * in the rcu update to wait only for active cpus. For system 6106 * in the rcu update to wait only for active cpus. For system
5979 * which do not switch off the HZ timer nohz_cpu_mask should 6107 * which do not switch off the HZ timer nohz_cpu_mask should
5980 * always be CPU_MASK_NONE. 6108 * always be CPU_BITS_NONE.
5981 */ 6109 */
5982cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 6110cpumask_var_t nohz_cpu_mask;
5983 6111
5984/* 6112/*
5985 * Increase the granularity value when there are more CPUs, 6113 * Increase the granularity value when there are more CPUs,
@@ -6034,7 +6162,7 @@ static inline void sched_init_granularity(void)
6034 * task must not exit() & deallocate itself prematurely. The 6162 * task must not exit() & deallocate itself prematurely. The
6035 * call is not atomic; no spinlocks may be held. 6163 * call is not atomic; no spinlocks may be held.
6036 */ 6164 */
6037int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 6165int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6038{ 6166{
6039 struct migration_req req; 6167 struct migration_req req;
6040 unsigned long flags; 6168 unsigned long flags;
@@ -6042,13 +6170,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6042 int ret = 0; 6170 int ret = 0;
6043 6171
6044 rq = task_rq_lock(p, &flags); 6172 rq = task_rq_lock(p, &flags);
6045 if (!cpus_intersects(*new_mask, cpu_online_map)) { 6173 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
6046 ret = -EINVAL; 6174 ret = -EINVAL;
6047 goto out; 6175 goto out;
6048 } 6176 }
6049 6177
6050 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6178 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
6051 !cpus_equal(p->cpus_allowed, *new_mask))) { 6179 !cpumask_equal(&p->cpus_allowed, new_mask))) {
6052 ret = -EINVAL; 6180 ret = -EINVAL;
6053 goto out; 6181 goto out;
6054 } 6182 }
@@ -6056,15 +6184,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6056 if (p->sched_class->set_cpus_allowed) 6184 if (p->sched_class->set_cpus_allowed)
6057 p->sched_class->set_cpus_allowed(p, new_mask); 6185 p->sched_class->set_cpus_allowed(p, new_mask);
6058 else { 6186 else {
6059 p->cpus_allowed = *new_mask; 6187 cpumask_copy(&p->cpus_allowed, new_mask);
6060 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 6188 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6061 } 6189 }
6062 6190
6063 /* Can the task run on the task's current CPU? If so, we're done */ 6191 /* Can the task run on the task's current CPU? If so, we're done */
6064 if (cpu_isset(task_cpu(p), *new_mask)) 6192 if (cpumask_test_cpu(task_cpu(p), new_mask))
6065 goto out; 6193 goto out;
6066 6194
6067 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 6195 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6068 /* Need help from migration thread: drop lock and wait. */ 6196 /* Need help from migration thread: drop lock and wait. */
6069 task_rq_unlock(rq, &flags); 6197 task_rq_unlock(rq, &flags);
6070 wake_up_process(rq->migration_thread); 6198 wake_up_process(rq->migration_thread);
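
set_cpus_allowed_ptr() now takes a const struct cpumask * and rejects masks that do not intersect cpu_online_mask. A minimal sketch of a caller, here a kernel thread pinning itself to one CPU before doing its work (the thread body is illustrative):

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/kthread.h>

/* Illustrative kthread body: pin to @data's CPU, then idle until stopped. */
static int pinned_worker(void *data)
{
	int cpu = (long)data;
	int ret;

	/* cpumask_of(cpu) yields a const struct cpumask * covering one CPU. */
	ret = set_cpus_allowed_ptr(current, cpumask_of(cpu));
	if (ret)
		return ret;	/* e.g. -EINVAL if that CPU is offline */

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}
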
@@ -6106,7 +6234,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6106 if (task_cpu(p) != src_cpu) 6234 if (task_cpu(p) != src_cpu)
6107 goto done; 6235 goto done;
6108 /* Affinity changed (again). */ 6236 /* Affinity changed (again). */
6109 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6237 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6110 goto fail; 6238 goto fail;
6111 6239
6112 on_rq = p->se.on_rq; 6240 on_rq = p->se.on_rq;
@@ -6203,50 +6331,41 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6203 */ 6331 */
6204static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6332static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6205{ 6333{
6206 unsigned long flags;
6207 cpumask_t mask;
6208 struct rq *rq;
6209 int dest_cpu; 6334 int dest_cpu;
6335 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
6210 6336
6211 do { 6337again:
6212 /* On same node? */ 6338 /* Look for allowed, online CPU in same node. */
6213 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6339 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6214 cpus_and(mask, mask, p->cpus_allowed); 6340 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6215 dest_cpu = any_online_cpu(mask); 6341 goto move;
6216 6342
6217 /* On any allowed CPU? */ 6343 /* Any allowed, online CPU? */
6218 if (dest_cpu >= nr_cpu_ids) 6344 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6219 dest_cpu = any_online_cpu(p->cpus_allowed); 6345 if (dest_cpu < nr_cpu_ids)
6346 goto move;
6220 6347
6221 /* No more Mr. Nice Guy. */ 6348 /* No more Mr. Nice Guy. */
6222 if (dest_cpu >= nr_cpu_ids) { 6349 if (dest_cpu >= nr_cpu_ids) {
6223 cpumask_t cpus_allowed; 6350 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6351 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6224 6352
6225 cpuset_cpus_allowed_locked(p, &cpus_allowed); 6353 /*
6226 /* 6354 * Don't tell them about moving exiting tasks or
6227 * Try to stay on the same cpuset, where the 6355 * kernel threads (both mm NULL), since they never
6228 * current cpuset may be a subset of all cpus. 6356 * leave kernel.
6229 * The cpuset_cpus_allowed_locked() variant of 6357 */
6230 * cpuset_cpus_allowed() will not block. It must be 6358 if (p->mm && printk_ratelimit()) {
6231 * called within calls to cpuset_lock/cpuset_unlock. 6359 printk(KERN_INFO "process %d (%s) no "
6232 */ 6360 "longer affine to cpu%d\n",
6233 rq = task_rq_lock(p, &flags); 6361 task_pid_nr(p), p->comm, dead_cpu);
6234 p->cpus_allowed = cpus_allowed;
6235 dest_cpu = any_online_cpu(p->cpus_allowed);
6236 task_rq_unlock(rq, &flags);
6237
6238 /*
6239 * Don't tell them about moving exiting tasks or
6240 * kernel threads (both mm NULL), since they never
6241 * leave kernel.
6242 */
6243 if (p->mm && printk_ratelimit()) {
6244 printk(KERN_INFO "process %d (%s) no "
6245 "longer affine to cpu%d\n",
6246 task_pid_nr(p), p->comm, dead_cpu);
6247 }
6248 } 6362 }
6249 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6363 }
6364
6365move:
6366 /* It can have affinity changed while we were choosing. */
6367 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6368 goto again;
6250} 6369}
6251 6370
6252/* 6371/*
@@ -6258,7 +6377,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6258 */ 6377 */
6259static void migrate_nr_uninterruptible(struct rq *rq_src) 6378static void migrate_nr_uninterruptible(struct rq *rq_src)
6260{ 6379{
6261 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6380 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6262 unsigned long flags; 6381 unsigned long flags;
6263 6382
6264 local_irq_save(flags); 6383 local_irq_save(flags);
@@ -6548,7 +6667,7 @@ static void set_rq_online(struct rq *rq)
6548 if (!rq->online) { 6667 if (!rq->online) {
6549 const struct sched_class *class; 6668 const struct sched_class *class;
6550 6669
6551 cpu_set(rq->cpu, rq->rd->online); 6670 cpumask_set_cpu(rq->cpu, rq->rd->online);
6552 rq->online = 1; 6671 rq->online = 1;
6553 6672
6554 for_each_class(class) { 6673 for_each_class(class) {
@@ -6568,7 +6687,7 @@ static void set_rq_offline(struct rq *rq)
6568 class->rq_offline(rq); 6687 class->rq_offline(rq);
6569 } 6688 }
6570 6689
6571 cpu_clear(rq->cpu, rq->rd->online); 6690 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6572 rq->online = 0; 6691 rq->online = 0;
6573 } 6692 }
6574} 6693}
@@ -6609,7 +6728,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6609 rq = cpu_rq(cpu); 6728 rq = cpu_rq(cpu);
6610 spin_lock_irqsave(&rq->lock, flags); 6729 spin_lock_irqsave(&rq->lock, flags);
6611 if (rq->rd) { 6730 if (rq->rd) {
6612 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6731 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6613 6732
6614 set_rq_online(rq); 6733 set_rq_online(rq);
6615 } 6734 }
@@ -6623,7 +6742,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6623 break; 6742 break;
6624 /* Unbind it from offline cpu so it can run. Fall thru. */ 6743 /* Unbind it from offline cpu so it can run. Fall thru. */
6625 kthread_bind(cpu_rq(cpu)->migration_thread, 6744 kthread_bind(cpu_rq(cpu)->migration_thread,
6626 any_online_cpu(cpu_online_map)); 6745 cpumask_any(cpu_online_mask));
6627 kthread_stop(cpu_rq(cpu)->migration_thread); 6746 kthread_stop(cpu_rq(cpu)->migration_thread);
6628 cpu_rq(cpu)->migration_thread = NULL; 6747 cpu_rq(cpu)->migration_thread = NULL;
6629 break; 6748 break;
@@ -6673,7 +6792,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6673 rq = cpu_rq(cpu); 6792 rq = cpu_rq(cpu);
6674 spin_lock_irqsave(&rq->lock, flags); 6793 spin_lock_irqsave(&rq->lock, flags);
6675 if (rq->rd) { 6794 if (rq->rd) {
6676 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6795 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6677 set_rq_offline(rq); 6796 set_rq_offline(rq);
6678 } 6797 }
6679 spin_unlock_irqrestore(&rq->lock, flags); 6798 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6712,13 +6831,13 @@ early_initcall(migration_init);
6712#ifdef CONFIG_SCHED_DEBUG 6831#ifdef CONFIG_SCHED_DEBUG
6713 6832
6714static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6833static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6715 cpumask_t *groupmask) 6834 struct cpumask *groupmask)
6716{ 6835{
6717 struct sched_group *group = sd->groups; 6836 struct sched_group *group = sd->groups;
6718 char str[256]; 6837 char str[256];
6719 6838
6720 cpulist_scnprintf(str, sizeof(str), sd->span); 6839 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6721 cpus_clear(*groupmask); 6840 cpumask_clear(groupmask);
6722 6841
6723 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6842 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6724 6843
@@ -6732,11 +6851,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6732 6851
6733 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6852 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6734 6853
6735 if (!cpu_isset(cpu, sd->span)) { 6854 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6736 printk(KERN_ERR "ERROR: domain->span does not contain " 6855 printk(KERN_ERR "ERROR: domain->span does not contain "
6737 "CPU%d\n", cpu); 6856 "CPU%d\n", cpu);
6738 } 6857 }
6739 if (!cpu_isset(cpu, group->cpumask)) { 6858 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6740 printk(KERN_ERR "ERROR: domain->groups does not contain" 6859 printk(KERN_ERR "ERROR: domain->groups does not contain"
6741 " CPU%d\n", cpu); 6860 " CPU%d\n", cpu);
6742 } 6861 }
@@ -6756,31 +6875,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6756 break; 6875 break;
6757 } 6876 }
6758 6877
6759 if (!cpus_weight(group->cpumask)) { 6878 if (!cpumask_weight(sched_group_cpus(group))) {
6760 printk(KERN_CONT "\n"); 6879 printk(KERN_CONT "\n");
6761 printk(KERN_ERR "ERROR: empty group\n"); 6880 printk(KERN_ERR "ERROR: empty group\n");
6762 break; 6881 break;
6763 } 6882 }
6764 6883
6765 if (cpus_intersects(*groupmask, group->cpumask)) { 6884 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6766 printk(KERN_CONT "\n"); 6885 printk(KERN_CONT "\n");
6767 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6886 printk(KERN_ERR "ERROR: repeated CPUs\n");
6768 break; 6887 break;
6769 } 6888 }
6770 6889
6771 cpus_or(*groupmask, *groupmask, group->cpumask); 6890 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6772 6891
6773 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6892 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6774 printk(KERN_CONT " %s", str); 6893 printk(KERN_CONT " %s", str);
6775 6894
6776 group = group->next; 6895 group = group->next;
6777 } while (group != sd->groups); 6896 } while (group != sd->groups);
6778 printk(KERN_CONT "\n"); 6897 printk(KERN_CONT "\n");
6779 6898
6780 if (!cpus_equal(sd->span, *groupmask)) 6899 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6781 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6900 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6782 6901
6783 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6902 if (sd->parent &&
6903 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6784 printk(KERN_ERR "ERROR: parent span is not a superset " 6904 printk(KERN_ERR "ERROR: parent span is not a superset "
6785 "of domain->span\n"); 6905 "of domain->span\n");
6786 return 0; 6906 return 0;
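
The group walk above accumulates every group's CPUs into groupmask and then insists that this union equals the domain's span, with no empty and no repeated groups along the way. A minimal userspace sketch of the same consistency check, using a 64-bit integer as a stand-in for struct cpumask (all names here are illustrative, not kernel API):

#include <stdio.h>
#include <stdint.h>

/* Illustrative analogue: each "group" covers a subset of a 64-CPU span. */
struct group { uint64_t cpus; struct group *next; };

/* Return 0 if the circular group list exactly covers span with no overlap. */
static int check_groups(uint64_t span, struct group *head)
{
	uint64_t covered = 0;
	struct group *g = head;

	do {
		if (!g->cpus)
			return -1;		/* empty group */
		if (covered & g->cpus)
			return -2;		/* repeated CPUs */
		covered |= g->cpus;
		g = g->next;
	} while (g != head);

	return covered == span ? 0 : -3;	/* groups must span the domain */
}

int main(void)
{
	struct group g1 = { 0x3, NULL }, g0 = { 0xc, &g1 };

	g1.next = &g0;				/* close the circular list */
	printf("check: %d\n", check_groups(0xf, &g0));	/* prints 0 */
	return 0;
}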
@@ -6788,7 +6908,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6788 6908
6789static void sched_domain_debug(struct sched_domain *sd, int cpu) 6909static void sched_domain_debug(struct sched_domain *sd, int cpu)
6790{ 6910{
6791 cpumask_t *groupmask; 6911 cpumask_var_t groupmask;
6792 int level = 0; 6912 int level = 0;
6793 6913
6794 if (!sd) { 6914 if (!sd) {
@@ -6798,8 +6918,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6798 6918
6799 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6919 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6800 6920
6801 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6921 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6802 if (!groupmask) {
6803 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6922 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6804 return; 6923 return;
6805 } 6924 }
@@ -6812,7 +6931,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6812 if (!sd) 6931 if (!sd)
6813 break; 6932 break;
6814 } 6933 }
6815 kfree(groupmask); 6934 free_cpumask_var(groupmask);
6816} 6935}
6817#else /* !CONFIG_SCHED_DEBUG */ 6936#else /* !CONFIG_SCHED_DEBUG */
6818# define sched_domain_debug(sd, cpu) do { } while (0) 6937# define sched_domain_debug(sd, cpu) do { } while (0)
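
sched_domain_debug() now obtains its scratch mask with alloc_cpumask_var()/free_cpumask_var() instead of a bare kmalloc(). The point of cpumask_var_t is that on CONFIG_CPUMASK_OFFSTACK kernels it is a real pointer backed by heap storage, while on small-NR_CPUS builds it degenerates to a one-element on-stack array and the alloc/free calls become no-ops. A hedged userspace sketch of that dual definition (simplified: no gfp flags, made-up names):

#include <stdbool.h>
#include <stdlib.h>

struct mask { unsigned long bits[64]; };	/* toy stand-in for struct cpumask */

#ifdef MASK_OFFSTACK
/* Large configs: the variable is a pointer and the storage comes from the heap. */
typedef struct mask *mask_var_t;

static bool alloc_mask_var(mask_var_t *m) { return (*m = calloc(1, sizeof(**m))) != NULL; }
static void free_mask_var(mask_var_t m) { free(m); }
#else
/* Small configs: a one-element array on the stack; alloc/free cost nothing. */
typedef struct mask mask_var_t[1];

static bool alloc_mask_var(mask_var_t *m) { (void)m; return true; }
static void free_mask_var(mask_var_t m) { (void)m; }
#endif

int main(void)
{
	mask_var_t groupmask;

	if (!alloc_mask_var(&groupmask))
		return 1;	/* the "Cannot load-balance (out of memory)" case */
	/* ... use groupmask as a struct mask * in either configuration ... */
	free_mask_var(groupmask);
	return 0;
}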
@@ -6820,7 +6939,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6820 6939
6821static int sd_degenerate(struct sched_domain *sd) 6940static int sd_degenerate(struct sched_domain *sd)
6822{ 6941{
6823 if (cpus_weight(sd->span) == 1) 6942 if (cpumask_weight(sched_domain_span(sd)) == 1)
6824 return 1; 6943 return 1;
6825 6944
6826 /* Following flags need at least 2 groups */ 6945 /* Following flags need at least 2 groups */
@@ -6851,7 +6970,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6851 if (sd_degenerate(parent)) 6970 if (sd_degenerate(parent))
6852 return 1; 6971 return 1;
6853 6972
6854 if (!cpus_equal(sd->span, parent->span)) 6973 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6855 return 0; 6974 return 0;
6856 6975
6857 /* Does parent contain flags not in child? */ 6976 /* Does parent contain flags not in child? */
@@ -6875,6 +6994,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6875 return 1; 6994 return 1;
6876} 6995}
6877 6996
6997static void free_rootdomain(struct root_domain *rd)
6998{
6999 cpupri_cleanup(&rd->cpupri);
7000
7001 free_cpumask_var(rd->rto_mask);
7002 free_cpumask_var(rd->online);
7003 free_cpumask_var(rd->span);
7004 kfree(rd);
7005}
7006
6878static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7007static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6879{ 7008{
6880 unsigned long flags; 7009 unsigned long flags;
@@ -6884,38 +7013,62 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6884 if (rq->rd) { 7013 if (rq->rd) {
6885 struct root_domain *old_rd = rq->rd; 7014 struct root_domain *old_rd = rq->rd;
6886 7015
6887 if (cpu_isset(rq->cpu, old_rd->online)) 7016 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6888 set_rq_offline(rq); 7017 set_rq_offline(rq);
6889 7018
6890 cpu_clear(rq->cpu, old_rd->span); 7019 cpumask_clear_cpu(rq->cpu, old_rd->span);
6891 7020
6892 if (atomic_dec_and_test(&old_rd->refcount)) 7021 if (atomic_dec_and_test(&old_rd->refcount))
6893 kfree(old_rd); 7022 free_rootdomain(old_rd);
6894 } 7023 }
6895 7024
6896 atomic_inc(&rd->refcount); 7025 atomic_inc(&rd->refcount);
6897 rq->rd = rd; 7026 rq->rd = rd;
6898 7027
6899 cpu_set(rq->cpu, rd->span); 7028 cpumask_set_cpu(rq->cpu, rd->span);
6900 if (cpu_isset(rq->cpu, cpu_online_map)) 7029 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6901 set_rq_online(rq); 7030 set_rq_online(rq);
6902 7031
6903 spin_unlock_irqrestore(&rq->lock, flags); 7032 spin_unlock_irqrestore(&rq->lock, flags);
6904} 7033}
6905 7034
6906static void init_rootdomain(struct root_domain *rd) 7035static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
6907{ 7036{
6908 memset(rd, 0, sizeof(*rd)); 7037 memset(rd, 0, sizeof(*rd));
6909 7038
6910 cpus_clear(rd->span); 7039 if (bootmem) {
6911 cpus_clear(rd->online); 7040 alloc_bootmem_cpumask_var(&def_root_domain.span);
7041 alloc_bootmem_cpumask_var(&def_root_domain.online);
7042 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7043 cpupri_init(&rd->cpupri, true);
7044 return 0;
7045 }
6912 7046
6913 cpupri_init(&rd->cpupri); 7047 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
7048 goto out;
7049 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
7050 goto free_span;
7051 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
7052 goto free_online;
7053
7054 if (cpupri_init(&rd->cpupri, false) != 0)
7055 goto free_rto_mask;
7056 return 0;
7057
7058free_rto_mask:
7059 free_cpumask_var(rd->rto_mask);
7060free_online:
7061 free_cpumask_var(rd->online);
7062free_span:
7063 free_cpumask_var(rd->span);
7064out:
7065 return -ENOMEM;
6914} 7066}
6915 7067
6916static void init_defrootdomain(void) 7068static void init_defrootdomain(void)
6917{ 7069{
6918 init_rootdomain(&def_root_domain); 7070 init_rootdomain(&def_root_domain, true);
7071
6919 atomic_set(&def_root_domain.refcount, 1); 7072 atomic_set(&def_root_domain.refcount, 1);
6920} 7073}
6921 7074
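
init_rootdomain() above allocates three cpumasks plus the cpupri structure and, on any failure, releases exactly what it already took by falling through a ladder of goto labels in reverse order. A small userspace sketch of that unwind idiom under the same shape (resource names invented for the demo):

#include <stdlib.h>

struct rd_demo { void *span, *online, *rto_mask; };

/* Allocate several resources; on any failure, free what was already taken. */
static int init_demo(struct rd_demo *rd)
{
	if (!(rd->span = malloc(32)))
		goto out;
	if (!(rd->online = malloc(32)))
		goto free_span;
	if (!(rd->rto_mask = malloc(32)))
		goto free_online;
	return 0;

free_online:
	free(rd->online);
free_span:
	free(rd->span);
out:
	return -1;	/* stands in for -ENOMEM */
}

int main(void)
{
	struct rd_demo rd;

	if (init_demo(&rd))
		return 1;
	free(rd.rto_mask);
	free(rd.online);
	free(rd.span);
	return 0;
}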
@@ -6927,7 +7080,10 @@ static struct root_domain *alloc_rootdomain(void)
6927 if (!rd) 7080 if (!rd)
6928 return NULL; 7081 return NULL;
6929 7082
6930 init_rootdomain(rd); 7083 if (init_rootdomain(rd, false) != 0) {
7084 kfree(rd);
7085 return NULL;
7086 }
6931 7087
6932 return rd; 7088 return rd;
6933} 7089}
@@ -6969,19 +7125,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6969} 7125}
6970 7126
6971/* cpus with isolated domains */ 7127/* cpus with isolated domains */
6972static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 7128static cpumask_var_t cpu_isolated_map;
6973 7129
6974/* Setup the mask of cpus configured for isolated domains */ 7130/* Setup the mask of cpus configured for isolated domains */
6975static int __init isolated_cpu_setup(char *str) 7131static int __init isolated_cpu_setup(char *str)
6976{ 7132{
6977 static int __initdata ints[NR_CPUS]; 7133 cpulist_parse(str, cpu_isolated_map);
6978 int i;
6979
6980 str = get_options(str, ARRAY_SIZE(ints), ints);
6981 cpus_clear(cpu_isolated_map);
6982 for (i = 1; i <= ints[0]; i++)
6983 if (ints[i] < NR_CPUS)
6984 cpu_set(ints[i], cpu_isolated_map);
6985 return 1; 7134 return 1;
6986} 7135}
6987 7136
@@ -6990,42 +7139,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6990/* 7139/*
6991 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 7140 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6992 * to a function which identifies what group (along with sched group) a CPU 7141 * to a function which identifies what group (along with sched group) a CPU
6993 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS 7142 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6994 * (due to the fact that we keep track of groups covered with a cpumask_t). 7143 * (due to the fact that we keep track of groups covered with a struct cpumask).
6995 * 7144 *
6996 * init_sched_build_groups will build a circular linked list of the groups 7145 * init_sched_build_groups will build a circular linked list of the groups
6997 * covered by the given span, and will set each group's ->cpumask correctly, 7146 * covered by the given span, and will set each group's ->cpumask correctly,
6998 * and ->cpu_power to 0. 7147 * and ->cpu_power to 0.
6999 */ 7148 */
7000static void 7149static void
7001init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 7150init_sched_build_groups(const struct cpumask *span,
7002 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 7151 const struct cpumask *cpu_map,
7152 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg, 7153 struct sched_group **sg,
7004 cpumask_t *tmpmask), 7154 struct cpumask *tmpmask),
7005 cpumask_t *covered, cpumask_t *tmpmask) 7155 struct cpumask *covered, struct cpumask *tmpmask)
7006{ 7156{
7007 struct sched_group *first = NULL, *last = NULL; 7157 struct sched_group *first = NULL, *last = NULL;
7008 int i; 7158 int i;
7009 7159
7010 cpus_clear(*covered); 7160 cpumask_clear(covered);
7011 7161
7012 for_each_cpu_mask_nr(i, *span) { 7162 for_each_cpu(i, span) {
7013 struct sched_group *sg; 7163 struct sched_group *sg;
7014 int group = group_fn(i, cpu_map, &sg, tmpmask); 7164 int group = group_fn(i, cpu_map, &sg, tmpmask);
7015 int j; 7165 int j;
7016 7166
7017 if (cpu_isset(i, *covered)) 7167 if (cpumask_test_cpu(i, covered))
7018 continue; 7168 continue;
7019 7169
7020 cpus_clear(sg->cpumask); 7170 cpumask_clear(sched_group_cpus(sg));
7021 sg->__cpu_power = 0; 7171 sg->__cpu_power = 0;
7022 7172
7023 for_each_cpu_mask_nr(j, *span) { 7173 for_each_cpu(j, span) {
7024 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 7174 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
7025 continue; 7175 continue;
7026 7176
7027 cpu_set(j, *covered); 7177 cpumask_set_cpu(j, covered);
7028 cpu_set(j, sg->cpumask); 7178 cpumask_set_cpu(j, sched_group_cpus(sg));
7029 } 7179 }
7030 if (!first) 7180 if (!first)
7031 first = sg; 7181 first = sg;
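
As the comment above says, init_sched_build_groups() walks the span, asks group_fn() which group each CPU belongs to, fills each group's mask exactly once with the help of the covered mask, and links the groups into a circular list. A compact userspace sketch of that covering pass, again with 64-bit masks standing in for struct cpumask and a toy group function (everything here is illustrative):

#include <stdio.h>
#include <stdint.h>

#define MAX_GROUPS 32	/* enough for 64 CPUs grouped in sibling pairs */

struct grp { uint64_t cpus; struct grp *next; };

/* Toy group_fn: CPUs i and i^1 are SMT siblings, the lower one leads. */
static int group_of(int cpu) { return cpu & ~1; }

/* Build a circular list of groups covering 'span', one per group_of() value. */
static struct grp *build_groups(uint64_t span, struct grp pool[MAX_GROUPS])
{
	struct grp *first = NULL, *last = NULL;
	uint64_t covered = 0;
	int n = 0;

	for (int i = 0; i < 64; i++) {
		if (!(span & (1ULL << i)) || (covered & (1ULL << i)))
			continue;		/* not in span, or already grouped */
		struct grp *sg = &pool[n++];
		sg->cpus = 0;
		for (int j = 0; j < 64; j++)
			if ((span & (1ULL << j)) && group_of(j) == group_of(i)) {
				covered |= 1ULL << j;
				sg->cpus |= 1ULL << j;
			}
		if (!first)
			first = sg;
		else
			last->next = sg;
		last = sg;
	}
	if (last)
		last->next = first;		/* close the circle */
	return first;
}

int main(void)
{
	struct grp pool[MAX_GROUPS];
	struct grp *g = build_groups(0xf, pool), *p = g;

	do {
		printf("group %#llx\n", (unsigned long long)p->cpus);
		p = p->next;
	} while (p != g);			/* prints 0x3 then 0xc */
	return 0;
}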
@@ -7089,23 +7239,21 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7089 * should be one that prevents unnecessary balancing, but also spreads tasks 7239 * should be one that prevents unnecessary balancing, but also spreads tasks
7090 * out optimally. 7240 * out optimally.
7091 */ 7241 */
7092static void sched_domain_node_span(int node, cpumask_t *span) 7242static void sched_domain_node_span(int node, struct cpumask *span)
7093{ 7243{
7094 nodemask_t used_nodes; 7244 nodemask_t used_nodes;
7095 node_to_cpumask_ptr(nodemask, node);
7096 int i; 7245 int i;
7097 7246
7098 cpus_clear(*span); 7247 cpumask_clear(span);
7099 nodes_clear(used_nodes); 7248 nodes_clear(used_nodes);
7100 7249
7101 cpus_or(*span, *span, *nodemask); 7250 cpumask_or(span, span, cpumask_of_node(node));
7102 node_set(node, used_nodes); 7251 node_set(node, used_nodes);
7103 7252
7104 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7253 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7105 int next_node = find_next_best_node(node, &used_nodes); 7254 int next_node = find_next_best_node(node, &used_nodes);
7106 7255
7107 node_to_cpumask_ptr_next(nodemask, next_node); 7256 cpumask_or(span, span, cpumask_of_node(next_node));
7108 cpus_or(*span, *span, *nodemask);
7109 } 7257 }
7110} 7258}
7111#endif /* CONFIG_NUMA */ 7259#endif /* CONFIG_NUMA */
@@ -7113,18 +7261,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7113int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7261int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7114 7262
7115/* 7263/*
7264 * The cpus mask in sched_group and sched_domain hangs off the end.
7265 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7266 * for nr_cpu_ids < CONFIG_NR_CPUS.
7267 */
7268struct static_sched_group {
7269 struct sched_group sg;
7270 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7271};
7272
7273struct static_sched_domain {
7274 struct sched_domain sd;
7275 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7276};
7277
7278/*
7116 * SMT sched-domains: 7279 * SMT sched-domains:
7117 */ 7280 */
7118#ifdef CONFIG_SCHED_SMT 7281#ifdef CONFIG_SCHED_SMT
7119static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7282static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7120static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7283static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7121 7284
7122static int 7285static int
7123cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7286cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7124 cpumask_t *unused) 7287 struct sched_group **sg, struct cpumask *unused)
7125{ 7288{
7126 if (sg) 7289 if (sg)
7127 *sg = &per_cpu(sched_group_cpus, cpu); 7290 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7128 return cpu; 7291 return cpu;
7129} 7292}
7130#endif /* CONFIG_SCHED_SMT */ 7293#endif /* CONFIG_SCHED_SMT */
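
The static_sched_group/static_sched_domain wrappers above exist because sched_group and sched_domain now end in a zero-length cpumask member: the wrapper supplies CONFIG_NR_CPUS bits of storage immediately behind the structure, and sched_group_cpus()/sched_domain_span() hand back a pointer into that trailing bitmap. A hedged sketch of the same "mask hangs off the end" layout in plain C (relies on the GCC zero-length-array extension, just as the kernel does; demo names are invented):

#include <stdio.h>
#include <string.h>

#define DEMO_NR_CPUS 128
#define DEMO_LONGS (DEMO_NR_CPUS / (8 * sizeof(unsigned long)))

/* Zero-length trailing array, as in the kernel's sched_group (GCC extension). */
struct demo_group {
	unsigned int power;
	unsigned long cpumask[0];
};

/* Wrapper that actually provides the storage right behind the struct. */
struct static_demo_group {
	struct demo_group sg;
	unsigned long cpus[DEMO_LONGS];
};

/* Analogue of sched_group_cpus(): point at the trailing bitmap. */
static unsigned long *demo_group_cpus(struct demo_group *sg)
{
	return sg->cpumask;
}

int main(void)
{
	struct static_demo_group g;

	memset(&g, 0, sizeof(g));
	demo_group_cpus(&g.sg)[0] |= 1UL << 3;		/* mark CPU 3 */
	printf("cpus[0] = %#lx\n", g.cpus[0]);		/* prints 0x8 */
	return 0;
}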
@@ -7133,56 +7296,53 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7133 * multi-core sched-domains: 7296 * multi-core sched-domains:
7134 */ 7297 */
7135#ifdef CONFIG_SCHED_MC 7298#ifdef CONFIG_SCHED_MC
7136static DEFINE_PER_CPU(struct sched_domain, core_domains); 7299static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7137static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7300static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7138#endif /* CONFIG_SCHED_MC */ 7301#endif /* CONFIG_SCHED_MC */
7139 7302
7140#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7303#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7141static int 7304static int
7142cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7305cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7143 cpumask_t *mask) 7306 struct sched_group **sg, struct cpumask *mask)
7144{ 7307{
7145 int group; 7308 int group;
7146 7309
7147 *mask = per_cpu(cpu_sibling_map, cpu); 7310 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7148 cpus_and(*mask, *mask, *cpu_map); 7311 group = cpumask_first(mask);
7149 group = first_cpu(*mask);
7150 if (sg) 7312 if (sg)
7151 *sg = &per_cpu(sched_group_core, group); 7313 *sg = &per_cpu(sched_group_core, group).sg;
7152 return group; 7314 return group;
7153} 7315}
7154#elif defined(CONFIG_SCHED_MC) 7316#elif defined(CONFIG_SCHED_MC)
7155static int 7317static int
7156cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7318cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7157 cpumask_t *unused) 7319 struct sched_group **sg, struct cpumask *unused)
7158{ 7320{
7159 if (sg) 7321 if (sg)
7160 *sg = &per_cpu(sched_group_core, cpu); 7322 *sg = &per_cpu(sched_group_core, cpu).sg;
7161 return cpu; 7323 return cpu;
7162} 7324}
7163#endif 7325#endif
7164 7326
7165static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7327static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7166static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7328static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7167 7329
7168static int 7330static int
7169cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7331cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7170 cpumask_t *mask) 7332 struct sched_group **sg, struct cpumask *mask)
7171{ 7333{
7172 int group; 7334 int group;
7173#ifdef CONFIG_SCHED_MC 7335#ifdef CONFIG_SCHED_MC
7174 *mask = cpu_coregroup_map(cpu); 7336 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7175 cpus_and(*mask, *mask, *cpu_map); 7337 group = cpumask_first(mask);
7176 group = first_cpu(*mask);
7177#elif defined(CONFIG_SCHED_SMT) 7338#elif defined(CONFIG_SCHED_SMT)
7178 *mask = per_cpu(cpu_sibling_map, cpu); 7339 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7179 cpus_and(*mask, *mask, *cpu_map); 7340 group = cpumask_first(mask);
7180 group = first_cpu(*mask);
7181#else 7341#else
7182 group = cpu; 7342 group = cpu;
7183#endif 7343#endif
7184 if (sg) 7344 if (sg)
7185 *sg = &per_cpu(sched_group_phys, group); 7345 *sg = &per_cpu(sched_group_phys, group).sg;
7186 return group; 7346 return group;
7187} 7347}
7188 7348
@@ -7196,19 +7356,19 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
7196static struct sched_group ***sched_group_nodes_bycpu; 7356static struct sched_group ***sched_group_nodes_bycpu;
7197 7357
7198static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7358static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7199static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7359static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7200 7360
7201static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7361static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7202 struct sched_group **sg, cpumask_t *nodemask) 7362 struct sched_group **sg,
7363 struct cpumask *nodemask)
7203{ 7364{
7204 int group; 7365 int group;
7205 7366
7206 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7367 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7207 cpus_and(*nodemask, *nodemask, *cpu_map); 7368 group = cpumask_first(nodemask);
7208 group = first_cpu(*nodemask);
7209 7369
7210 if (sg) 7370 if (sg)
7211 *sg = &per_cpu(sched_group_allnodes, group); 7371 *sg = &per_cpu(sched_group_allnodes, group).sg;
7212 return group; 7372 return group;
7213} 7373}
7214 7374
@@ -7220,11 +7380,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7220 if (!sg) 7380 if (!sg)
7221 return; 7381 return;
7222 do { 7382 do {
7223 for_each_cpu_mask_nr(j, sg->cpumask) { 7383 for_each_cpu(j, sched_group_cpus(sg)) {
7224 struct sched_domain *sd; 7384 struct sched_domain *sd;
7225 7385
7226 sd = &per_cpu(phys_domains, j); 7386 sd = &per_cpu(phys_domains, j).sd;
7227 if (j != first_cpu(sd->groups->cpumask)) { 7387 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7228 /* 7388 /*
7229 * Only add "power" once for each 7389 * Only add "power" once for each
7230 * physical package. 7390 * physical package.
@@ -7241,11 +7401,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7241 7401
7242#ifdef CONFIG_NUMA 7402#ifdef CONFIG_NUMA
7243/* Free memory allocated for various sched_group structures */ 7403/* Free memory allocated for various sched_group structures */
7244static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7404static void free_sched_groups(const struct cpumask *cpu_map,
7405 struct cpumask *nodemask)
7245{ 7406{
7246 int cpu, i; 7407 int cpu, i;
7247 7408
7248 for_each_cpu_mask_nr(cpu, *cpu_map) { 7409 for_each_cpu(cpu, cpu_map) {
7249 struct sched_group **sched_group_nodes 7410 struct sched_group **sched_group_nodes
7250 = sched_group_nodes_bycpu[cpu]; 7411 = sched_group_nodes_bycpu[cpu];
7251 7412
@@ -7255,9 +7416,8 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7255 for (i = 0; i < nr_node_ids; i++) { 7416 for (i = 0; i < nr_node_ids; i++) {
7256 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7417 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7257 7418
7258 *nodemask = node_to_cpumask(i); 7419 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7259 cpus_and(*nodemask, *nodemask, *cpu_map); 7420 if (cpumask_empty(nodemask))
7260 if (cpus_empty(*nodemask))
7261 continue; 7421 continue;
7262 7422
7263 if (sg == NULL) 7423 if (sg == NULL)
@@ -7275,7 +7435,8 @@ next_sg:
7275 } 7435 }
7276} 7436}
7277#else /* !CONFIG_NUMA */ 7437#else /* !CONFIG_NUMA */
7278static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7438static void free_sched_groups(const struct cpumask *cpu_map,
7439 struct cpumask *nodemask)
7279{ 7440{
7280} 7441}
7281#endif /* CONFIG_NUMA */ 7442#endif /* CONFIG_NUMA */
@@ -7301,7 +7462,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7301 7462
7302 WARN_ON(!sd || !sd->groups); 7463 WARN_ON(!sd || !sd->groups);
7303 7464
7304 if (cpu != first_cpu(sd->groups->cpumask)) 7465 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7305 return; 7466 return;
7306 7467
7307 child = sd->child; 7468 child = sd->child;
@@ -7366,48 +7527,6 @@ SD_INIT_FUNC(CPU)
7366 SD_INIT_FUNC(MC) 7527 SD_INIT_FUNC(MC)
7367#endif 7528#endif
7368 7529
7369/*
7370 * To minimize stack usage kmalloc room for cpumasks and share the
7371 * space as the usage in build_sched_domains() dictates. Used only
7372 * if the amount of space is significant.
7373 */
7374struct allmasks {
7375 cpumask_t tmpmask; /* make this one first */
7376 union {
7377 cpumask_t nodemask;
7378 cpumask_t this_sibling_map;
7379 cpumask_t this_core_map;
7380 };
7381 cpumask_t send_covered;
7382
7383#ifdef CONFIG_NUMA
7384 cpumask_t domainspan;
7385 cpumask_t covered;
7386 cpumask_t notcovered;
7387#endif
7388};
7389
7390#if NR_CPUS > 128
7391#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7392static inline void sched_cpumask_alloc(struct allmasks **masks)
7393{
7394 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7395}
7396static inline void sched_cpumask_free(struct allmasks *masks)
7397{
7398 kfree(masks);
7399}
7400#else
7401#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7402static inline void sched_cpumask_alloc(struct allmasks **masks)
7403{ }
7404static inline void sched_cpumask_free(struct allmasks *masks)
7405{ }
7406#endif
7407
7408#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7409 ((unsigned long)(a) + offsetof(struct allmasks, v))
7410
7411static int default_relax_domain_level = -1; 7530static int default_relax_domain_level = -1;
7412 7531
7413static int __init setup_relax_domain_level(char *str) 7532static int __init setup_relax_domain_level(char *str)
@@ -7447,17 +7566,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7447 * Build sched domains for a given set of cpus and attach the sched domains 7566 * Build sched domains for a given set of cpus and attach the sched domains
7448 * to the individual cpus 7567 * to the individual cpus
7449 */ 7568 */
7450static int __build_sched_domains(const cpumask_t *cpu_map, 7569static int __build_sched_domains(const struct cpumask *cpu_map,
7451 struct sched_domain_attr *attr) 7570 struct sched_domain_attr *attr)
7452{ 7571{
7453 int i; 7572 int i, err = -ENOMEM;
7454 struct root_domain *rd; 7573 struct root_domain *rd;
7455 SCHED_CPUMASK_DECLARE(allmasks); 7574 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7456 cpumask_t *tmpmask; 7575 tmpmask;
7457#ifdef CONFIG_NUMA 7576#ifdef CONFIG_NUMA
7577 cpumask_var_t domainspan, covered, notcovered;
7458 struct sched_group **sched_group_nodes = NULL; 7578 struct sched_group **sched_group_nodes = NULL;
7459 int sd_allnodes = 0; 7579 int sd_allnodes = 0;
7460 7580
7581 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7582 goto out;
7583 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7584 goto free_domainspan;
7585 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7586 goto free_covered;
7587#endif
7588
7589 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7590 goto free_notcovered;
7591 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7592 goto free_nodemask;
7593 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7594 goto free_this_sibling_map;
7595 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7596 goto free_this_core_map;
7597 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7598 goto free_send_covered;
7599
7600#ifdef CONFIG_NUMA
7461 /* 7601 /*
7462 * Allocate the per-node list of sched groups 7602 * Allocate the per-node list of sched groups
7463 */ 7603 */
@@ -7465,54 +7605,35 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7465 GFP_KERNEL); 7605 GFP_KERNEL);
7466 if (!sched_group_nodes) { 7606 if (!sched_group_nodes) {
7467 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7607 printk(KERN_WARNING "Can not alloc sched group node list\n");
7468 return -ENOMEM; 7608 goto free_tmpmask;
7469 } 7609 }
7470#endif 7610#endif
7471 7611
7472 rd = alloc_rootdomain(); 7612 rd = alloc_rootdomain();
7473 if (!rd) { 7613 if (!rd) {
7474 printk(KERN_WARNING "Cannot alloc root domain\n"); 7614 printk(KERN_WARNING "Cannot alloc root domain\n");
7475#ifdef CONFIG_NUMA 7615 goto free_sched_groups;
7476 kfree(sched_group_nodes);
7477#endif
7478 return -ENOMEM;
7479 }
7480
7481 /* get space for all scratch cpumask variables */
7482 sched_cpumask_alloc(&allmasks);
7483 if (!allmasks) {
7484 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7485 kfree(rd);
7486#ifdef CONFIG_NUMA
7487 kfree(sched_group_nodes);
7488#endif
7489 return -ENOMEM;
7490 } 7616 }
7491 7617
7492 tmpmask = (cpumask_t *)allmasks;
7493
7494
7495#ifdef CONFIG_NUMA 7618#ifdef CONFIG_NUMA
7496 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 7619 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7497#endif 7620#endif
7498 7621
7499 /* 7622 /*
7500 * Set up domains for cpus specified by the cpu_map. 7623 * Set up domains for cpus specified by the cpu_map.
7501 */ 7624 */
7502 for_each_cpu_mask_nr(i, *cpu_map) { 7625 for_each_cpu(i, cpu_map) {
7503 struct sched_domain *sd = NULL, *p; 7626 struct sched_domain *sd = NULL, *p;
7504 SCHED_CPUMASK_VAR(nodemask, allmasks);
7505 7627
7506 *nodemask = node_to_cpumask(cpu_to_node(i)); 7628 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
7507 cpus_and(*nodemask, *nodemask, *cpu_map);
7508 7629
7509#ifdef CONFIG_NUMA 7630#ifdef CONFIG_NUMA
7510 if (cpus_weight(*cpu_map) > 7631 if (cpumask_weight(cpu_map) >
7511 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7632 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7512 sd = &per_cpu(allnodes_domains, i); 7633 sd = &per_cpu(allnodes_domains, i);
7513 SD_INIT(sd, ALLNODES); 7634 SD_INIT(sd, ALLNODES);
7514 set_domain_attribute(sd, attr); 7635 set_domain_attribute(sd, attr);
7515 sd->span = *cpu_map; 7636 cpumask_copy(sched_domain_span(sd), cpu_map);
7516 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7637 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7517 p = sd; 7638 p = sd;
7518 sd_allnodes = 1; 7639 sd_allnodes = 1;
@@ -7522,18 +7643,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7522 sd = &per_cpu(node_domains, i); 7643 sd = &per_cpu(node_domains, i);
7523 SD_INIT(sd, NODE); 7644 SD_INIT(sd, NODE);
7524 set_domain_attribute(sd, attr); 7645 set_domain_attribute(sd, attr);
7525 sched_domain_node_span(cpu_to_node(i), &sd->span); 7646 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7526 sd->parent = p; 7647 sd->parent = p;
7527 if (p) 7648 if (p)
7528 p->child = sd; 7649 p->child = sd;
7529 cpus_and(sd->span, sd->span, *cpu_map); 7650 cpumask_and(sched_domain_span(sd),
7651 sched_domain_span(sd), cpu_map);
7530#endif 7652#endif
7531 7653
7532 p = sd; 7654 p = sd;
7533 sd = &per_cpu(phys_domains, i); 7655 sd = &per_cpu(phys_domains, i).sd;
7534 SD_INIT(sd, CPU); 7656 SD_INIT(sd, CPU);
7535 set_domain_attribute(sd, attr); 7657 set_domain_attribute(sd, attr);
7536 sd->span = *nodemask; 7658 cpumask_copy(sched_domain_span(sd), nodemask);
7537 sd->parent = p; 7659 sd->parent = p;
7538 if (p) 7660 if (p)
7539 p->child = sd; 7661 p->child = sd;
@@ -7541,11 +7663,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7541 7663
7542#ifdef CONFIG_SCHED_MC 7664#ifdef CONFIG_SCHED_MC
7543 p = sd; 7665 p = sd;
7544 sd = &per_cpu(core_domains, i); 7666 sd = &per_cpu(core_domains, i).sd;
7545 SD_INIT(sd, MC); 7667 SD_INIT(sd, MC);
7546 set_domain_attribute(sd, attr); 7668 set_domain_attribute(sd, attr);
7547 sd->span = cpu_coregroup_map(i); 7669 cpumask_and(sched_domain_span(sd), cpu_map,
7548 cpus_and(sd->span, sd->span, *cpu_map); 7670 cpu_coregroup_mask(i));
7549 sd->parent = p; 7671 sd->parent = p;
7550 p->child = sd; 7672 p->child = sd;
7551 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7673 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7553,11 +7675,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7553 7675
7554#ifdef CONFIG_SCHED_SMT 7676#ifdef CONFIG_SCHED_SMT
7555 p = sd; 7677 p = sd;
7556 sd = &per_cpu(cpu_domains, i); 7678 sd = &per_cpu(cpu_domains, i).sd;
7557 SD_INIT(sd, SIBLING); 7679 SD_INIT(sd, SIBLING);
7558 set_domain_attribute(sd, attr); 7680 set_domain_attribute(sd, attr);
7559 sd->span = per_cpu(cpu_sibling_map, i); 7681 cpumask_and(sched_domain_span(sd),
7560 cpus_and(sd->span, sd->span, *cpu_map); 7682 &per_cpu(cpu_sibling_map, i), cpu_map);
7561 sd->parent = p; 7683 sd->parent = p;
7562 p->child = sd; 7684 p->child = sd;
7563 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7685 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7566,13 +7688,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7566 7688
7567#ifdef CONFIG_SCHED_SMT 7689#ifdef CONFIG_SCHED_SMT
7568 /* Set up CPU (sibling) groups */ 7690 /* Set up CPU (sibling) groups */
7569 for_each_cpu_mask_nr(i, *cpu_map) { 7691 for_each_cpu(i, cpu_map) {
7570 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7692 cpumask_and(this_sibling_map,
7571 SCHED_CPUMASK_VAR(send_covered, allmasks); 7693 &per_cpu(cpu_sibling_map, i), cpu_map);
7572 7694 if (i != cpumask_first(this_sibling_map))
7573 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7574 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7575 if (i != first_cpu(*this_sibling_map))
7576 continue; 7695 continue;
7577 7696
7578 init_sched_build_groups(this_sibling_map, cpu_map, 7697 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7583,13 +7702,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7583 7702
7584#ifdef CONFIG_SCHED_MC 7703#ifdef CONFIG_SCHED_MC
7585 /* Set up multi-core groups */ 7704 /* Set up multi-core groups */
7586 for_each_cpu_mask_nr(i, *cpu_map) { 7705 for_each_cpu(i, cpu_map) {
7587 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7706 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
7588 SCHED_CPUMASK_VAR(send_covered, allmasks); 7707 if (i != cpumask_first(this_core_map))
7589
7590 *this_core_map = cpu_coregroup_map(i);
7591 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7592 if (i != first_cpu(*this_core_map))
7593 continue; 7708 continue;
7594 7709
7595 init_sched_build_groups(this_core_map, cpu_map, 7710 init_sched_build_groups(this_core_map, cpu_map,
@@ -7600,12 +7715,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7600 7715
7601 /* Set up physical groups */ 7716 /* Set up physical groups */
7602 for (i = 0; i < nr_node_ids; i++) { 7717 for (i = 0; i < nr_node_ids; i++) {
7603 SCHED_CPUMASK_VAR(nodemask, allmasks); 7718 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7604 SCHED_CPUMASK_VAR(send_covered, allmasks); 7719 if (cpumask_empty(nodemask))
7605
7606 *nodemask = node_to_cpumask(i);
7607 cpus_and(*nodemask, *nodemask, *cpu_map);
7608 if (cpus_empty(*nodemask))
7609 continue; 7720 continue;
7610 7721
7611 init_sched_build_groups(nodemask, cpu_map, 7722 init_sched_build_groups(nodemask, cpu_map,
@@ -7616,8 +7727,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7616#ifdef CONFIG_NUMA 7727#ifdef CONFIG_NUMA
7617 /* Set up node groups */ 7728 /* Set up node groups */
7618 if (sd_allnodes) { 7729 if (sd_allnodes) {
7619 SCHED_CPUMASK_VAR(send_covered, allmasks);
7620
7621 init_sched_build_groups(cpu_map, cpu_map, 7730 init_sched_build_groups(cpu_map, cpu_map,
7622 &cpu_to_allnodes_group, 7731 &cpu_to_allnodes_group,
7623 send_covered, tmpmask); 7732 send_covered, tmpmask);
@@ -7626,58 +7735,53 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7626 for (i = 0; i < nr_node_ids; i++) { 7735 for (i = 0; i < nr_node_ids; i++) {
7627 /* Set up node groups */ 7736 /* Set up node groups */
7628 struct sched_group *sg, *prev; 7737 struct sched_group *sg, *prev;
7629 SCHED_CPUMASK_VAR(nodemask, allmasks);
7630 SCHED_CPUMASK_VAR(domainspan, allmasks);
7631 SCHED_CPUMASK_VAR(covered, allmasks);
7632 int j; 7738 int j;
7633 7739
7634 *nodemask = node_to_cpumask(i); 7740 cpumask_clear(covered);
7635 cpus_clear(*covered); 7741 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7636 7742 if (cpumask_empty(nodemask)) {
7637 cpus_and(*nodemask, *nodemask, *cpu_map);
7638 if (cpus_empty(*nodemask)) {
7639 sched_group_nodes[i] = NULL; 7743 sched_group_nodes[i] = NULL;
7640 continue; 7744 continue;
7641 } 7745 }
7642 7746
7643 sched_domain_node_span(i, domainspan); 7747 sched_domain_node_span(i, domainspan);
7644 cpus_and(*domainspan, *domainspan, *cpu_map); 7748 cpumask_and(domainspan, domainspan, cpu_map);
7645 7749
7646 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7750 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7751 GFP_KERNEL, i);
7647 if (!sg) { 7752 if (!sg) {
7648 printk(KERN_WARNING "Can not alloc domain group for " 7753 printk(KERN_WARNING "Can not alloc domain group for "
7649 "node %d\n", i); 7754 "node %d\n", i);
7650 goto error; 7755 goto error;
7651 } 7756 }
7652 sched_group_nodes[i] = sg; 7757 sched_group_nodes[i] = sg;
7653 for_each_cpu_mask_nr(j, *nodemask) { 7758 for_each_cpu(j, nodemask) {
7654 struct sched_domain *sd; 7759 struct sched_domain *sd;
7655 7760
7656 sd = &per_cpu(node_domains, j); 7761 sd = &per_cpu(node_domains, j);
7657 sd->groups = sg; 7762 sd->groups = sg;
7658 } 7763 }
7659 sg->__cpu_power = 0; 7764 sg->__cpu_power = 0;
7660 sg->cpumask = *nodemask; 7765 cpumask_copy(sched_group_cpus(sg), nodemask);
7661 sg->next = sg; 7766 sg->next = sg;
7662 cpus_or(*covered, *covered, *nodemask); 7767 cpumask_or(covered, covered, nodemask);
7663 prev = sg; 7768 prev = sg;
7664 7769
7665 for (j = 0; j < nr_node_ids; j++) { 7770 for (j = 0; j < nr_node_ids; j++) {
7666 SCHED_CPUMASK_VAR(notcovered, allmasks);
7667 int n = (i + j) % nr_node_ids; 7771 int n = (i + j) % nr_node_ids;
7668 node_to_cpumask_ptr(pnodemask, n);
7669 7772
7670 cpus_complement(*notcovered, *covered); 7773 cpumask_complement(notcovered, covered);
7671 cpus_and(*tmpmask, *notcovered, *cpu_map); 7774 cpumask_and(tmpmask, notcovered, cpu_map);
7672 cpus_and(*tmpmask, *tmpmask, *domainspan); 7775 cpumask_and(tmpmask, tmpmask, domainspan);
7673 if (cpus_empty(*tmpmask)) 7776 if (cpumask_empty(tmpmask))
7674 break; 7777 break;
7675 7778
7676 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7779 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
7677 if (cpus_empty(*tmpmask)) 7780 if (cpumask_empty(tmpmask))
7678 continue; 7781 continue;
7679 7782
7680 sg = kmalloc_node(sizeof(struct sched_group), 7783 sg = kmalloc_node(sizeof(struct sched_group) +
7784 cpumask_size(),
7681 GFP_KERNEL, i); 7785 GFP_KERNEL, i);
7682 if (!sg) { 7786 if (!sg) {
7683 printk(KERN_WARNING 7787 printk(KERN_WARNING
@@ -7685,9 +7789,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7685 goto error; 7789 goto error;
7686 } 7790 }
7687 sg->__cpu_power = 0; 7791 sg->__cpu_power = 0;
7688 sg->cpumask = *tmpmask; 7792 cpumask_copy(sched_group_cpus(sg), tmpmask);
7689 sg->next = prev->next; 7793 sg->next = prev->next;
7690 cpus_or(*covered, *covered, *tmpmask); 7794 cpumask_or(covered, covered, tmpmask);
7691 prev->next = sg; 7795 prev->next = sg;
7692 prev = sg; 7796 prev = sg;
7693 } 7797 }
@@ -7696,22 +7800,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7696 7800
7697 /* Calculate CPU power for physical packages and nodes */ 7801 /* Calculate CPU power for physical packages and nodes */
7698#ifdef CONFIG_SCHED_SMT 7802#ifdef CONFIG_SCHED_SMT
7699 for_each_cpu_mask_nr(i, *cpu_map) { 7803 for_each_cpu(i, cpu_map) {
7700 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7804 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7701 7805
7702 init_sched_groups_power(i, sd); 7806 init_sched_groups_power(i, sd);
7703 } 7807 }
7704#endif 7808#endif
7705#ifdef CONFIG_SCHED_MC 7809#ifdef CONFIG_SCHED_MC
7706 for_each_cpu_mask_nr(i, *cpu_map) { 7810 for_each_cpu(i, cpu_map) {
7707 struct sched_domain *sd = &per_cpu(core_domains, i); 7811 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7708 7812
7709 init_sched_groups_power(i, sd); 7813 init_sched_groups_power(i, sd);
7710 } 7814 }
7711#endif 7815#endif
7712 7816
7713 for_each_cpu_mask_nr(i, *cpu_map) { 7817 for_each_cpu(i, cpu_map) {
7714 struct sched_domain *sd = &per_cpu(phys_domains, i); 7818 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7715 7819
7716 init_sched_groups_power(i, sd); 7820 init_sched_groups_power(i, sd);
7717 } 7821 }
@@ -7723,53 +7827,78 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7723 if (sd_allnodes) { 7827 if (sd_allnodes) {
7724 struct sched_group *sg; 7828 struct sched_group *sg;
7725 7829
7726 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7830 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7727 tmpmask); 7831 tmpmask);
7728 init_numa_sched_groups_power(sg); 7832 init_numa_sched_groups_power(sg);
7729 } 7833 }
7730#endif 7834#endif
7731 7835
7732 /* Attach the domains */ 7836 /* Attach the domains */
7733 for_each_cpu_mask_nr(i, *cpu_map) { 7837 for_each_cpu(i, cpu_map) {
7734 struct sched_domain *sd; 7838 struct sched_domain *sd;
7735#ifdef CONFIG_SCHED_SMT 7839#ifdef CONFIG_SCHED_SMT
7736 sd = &per_cpu(cpu_domains, i); 7840 sd = &per_cpu(cpu_domains, i).sd;
7737#elif defined(CONFIG_SCHED_MC) 7841#elif defined(CONFIG_SCHED_MC)
7738 sd = &per_cpu(core_domains, i); 7842 sd = &per_cpu(core_domains, i).sd;
7739#else 7843#else
7740 sd = &per_cpu(phys_domains, i); 7844 sd = &per_cpu(phys_domains, i).sd;
7741#endif 7845#endif
7742 cpu_attach_domain(sd, rd, i); 7846 cpu_attach_domain(sd, rd, i);
7743 } 7847 }
7744 7848
7745 sched_cpumask_free(allmasks); 7849 err = 0;
7746 return 0; 7850
7851free_tmpmask:
7852 free_cpumask_var(tmpmask);
7853free_send_covered:
7854 free_cpumask_var(send_covered);
7855free_this_core_map:
7856 free_cpumask_var(this_core_map);
7857free_this_sibling_map:
7858 free_cpumask_var(this_sibling_map);
7859free_nodemask:
7860 free_cpumask_var(nodemask);
7861free_notcovered:
7862#ifdef CONFIG_NUMA
7863 free_cpumask_var(notcovered);
7864free_covered:
7865 free_cpumask_var(covered);
7866free_domainspan:
7867 free_cpumask_var(domainspan);
7868out:
7869#endif
7870 return err;
7871
7872free_sched_groups:
7873#ifdef CONFIG_NUMA
7874 kfree(sched_group_nodes);
7875#endif
7876 goto free_tmpmask;
7747 7877
7748#ifdef CONFIG_NUMA 7878#ifdef CONFIG_NUMA
7749error: 7879error:
7750 free_sched_groups(cpu_map, tmpmask); 7880 free_sched_groups(cpu_map, tmpmask);
7751 sched_cpumask_free(allmasks); 7881 free_rootdomain(rd);
7752 kfree(rd); 7882 goto free_tmpmask;
7753 return -ENOMEM;
7754#endif 7883#endif
7755} 7884}
7756 7885
7757static int build_sched_domains(const cpumask_t *cpu_map) 7886static int build_sched_domains(const struct cpumask *cpu_map)
7758{ 7887{
7759 return __build_sched_domains(cpu_map, NULL); 7888 return __build_sched_domains(cpu_map, NULL);
7760} 7889}
7761 7890
7762static cpumask_t *doms_cur; /* current sched domains */ 7891static struct cpumask *doms_cur; /* current sched domains */
7763static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7892static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7764static struct sched_domain_attr *dattr_cur; 7893static struct sched_domain_attr *dattr_cur;
7765 /* attributes of custom domains in 'doms_cur' */ 7894 /* attributes of custom domains in 'doms_cur' */
7766 7895
7767/* 7896/*
7768 * Special case: If a kmalloc of a doms_cur partition (array of 7897 * Special case: If a kmalloc of a doms_cur partition (array of
7769 * cpumask_t) fails, then fall back to a single sched domain, 7898 * cpumask) fails, then fall back to a single sched domain,
7770 * as determined by the single cpumask_t fallback_doms. 7899 * as determined by the single cpumask fallback_doms.
7771 */ 7900 */
7772static cpumask_t fallback_doms; 7901static cpumask_var_t fallback_doms;
7773 7902
7774/* 7903/*
7775 * arch_update_cpu_topology lets virtualized architectures update the 7904 * arch_update_cpu_topology lets virtualized architectures update the
@@ -7786,16 +7915,16 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
7786 * For now this just excludes isolated cpus, but could be used to 7915 * For now this just excludes isolated cpus, but could be used to
7787 * exclude other special cases in the future. 7916 * exclude other special cases in the future.
7788 */ 7917 */
7789static int arch_init_sched_domains(const cpumask_t *cpu_map) 7918static int arch_init_sched_domains(const struct cpumask *cpu_map)
7790{ 7919{
7791 int err; 7920 int err;
7792 7921
7793 arch_update_cpu_topology(); 7922 arch_update_cpu_topology();
7794 ndoms_cur = 1; 7923 ndoms_cur = 1;
7795 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7924 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7796 if (!doms_cur) 7925 if (!doms_cur)
7797 doms_cur = &fallback_doms; 7926 doms_cur = fallback_doms;
7798 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7927 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7799 dattr_cur = NULL; 7928 dattr_cur = NULL;
7800 err = build_sched_domains(doms_cur); 7929 err = build_sched_domains(doms_cur);
7801 register_sched_domain_sysctl(); 7930 register_sched_domain_sysctl();
@@ -7803,8 +7932,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7803 return err; 7932 return err;
7804} 7933}
7805 7934
7806static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7935static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7807 cpumask_t *tmpmask) 7936 struct cpumask *tmpmask)
7808{ 7937{
7809 free_sched_groups(cpu_map, tmpmask); 7938 free_sched_groups(cpu_map, tmpmask);
7810} 7939}
@@ -7813,15 +7942,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7813 * Detach sched domains from a group of cpus specified in cpu_map 7942 * Detach sched domains from a group of cpus specified in cpu_map
7814 * These cpus will now be attached to the NULL domain 7943 * These cpus will now be attached to the NULL domain
7815 */ 7944 */
7816static void detach_destroy_domains(const cpumask_t *cpu_map) 7945static void detach_destroy_domains(const struct cpumask *cpu_map)
7817{ 7946{
7818 cpumask_t tmpmask; 7947 /* Safe because the hotplug lock is held. */
7948 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7819 int i; 7949 int i;
7820 7950
7821 for_each_cpu_mask_nr(i, *cpu_map) 7951 for_each_cpu(i, cpu_map)
7822 cpu_attach_domain(NULL, &def_root_domain, i); 7952 cpu_attach_domain(NULL, &def_root_domain, i);
7823 synchronize_sched(); 7953 synchronize_sched();
7824 arch_destroy_sched_domains(cpu_map, &tmpmask); 7954 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7825} 7955}
7826 7956
7827/* handle null as "default" */ 7957/* handle null as "default" */
@@ -7846,7 +7976,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7846 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7976 * doms_new[] to the current sched domain partitioning, doms_cur[].
7847 * It destroys each deleted domain and builds each new domain. 7977 * It destroys each deleted domain and builds each new domain.
7848 * 7978 *
7849 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7979 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7850 * The masks don't intersect (don't overlap); we should set up one 7980 * The masks don't intersect (don't overlap); we should set up one
7851 * sched domain for each mask. CPUs not in any of the cpumasks will 7981 * sched domain for each mask. CPUs not in any of the cpumasks will
7852 * not be load balanced. If the same cpumask appears both in the 7982 * not be load balanced. If the same cpumask appears both in the
@@ -7860,13 +7990,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7860 * the single partition 'fallback_doms', it also forces the domains 7990 * the single partition 'fallback_doms', it also forces the domains
7861 * to be rebuilt. 7991 * to be rebuilt.
7862 * 7992 *
7863 * If doms_new == NULL it will be replaced with cpu_online_map. 7993 * If doms_new == NULL it will be replaced with cpu_online_mask.
7864 * ndoms_new == 0 is a special case for destroying existing domains, 7994 * ndoms_new == 0 is a special case for destroying existing domains,
7865 * and it will not create the default domain. 7995 * and it will not create the default domain.
7866 * 7996 *
7867 * Call with hotplug lock held 7997 * Call with hotplug lock held
7868 */ 7998 */
7869void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7999/* FIXME: Change to struct cpumask *doms_new[] */
8000void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7870 struct sched_domain_attr *dattr_new) 8001 struct sched_domain_attr *dattr_new)
7871{ 8002{
7872 int i, j, n; 8003 int i, j, n;
@@ -7885,7 +8016,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7885 /* Destroy deleted domains */ 8016 /* Destroy deleted domains */
7886 for (i = 0; i < ndoms_cur; i++) { 8017 for (i = 0; i < ndoms_cur; i++) {
7887 for (j = 0; j < n && !new_topology; j++) { 8018 for (j = 0; j < n && !new_topology; j++) {
7888 if (cpus_equal(doms_cur[i], doms_new[j]) 8019 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7889 && dattrs_equal(dattr_cur, i, dattr_new, j)) 8020 && dattrs_equal(dattr_cur, i, dattr_new, j))
7890 goto match1; 8021 goto match1;
7891 } 8022 }
@@ -7897,15 +8028,15 @@ match1:
7897 8028
7898 if (doms_new == NULL) { 8029 if (doms_new == NULL) {
7899 ndoms_cur = 0; 8030 ndoms_cur = 0;
7900 doms_new = &fallback_doms; 8031 doms_new = fallback_doms;
7901 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 8032 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7902 WARN_ON_ONCE(dattr_new); 8033 WARN_ON_ONCE(dattr_new);
7903 } 8034 }
7904 8035
7905 /* Build new domains */ 8036 /* Build new domains */
7906 for (i = 0; i < ndoms_new; i++) { 8037 for (i = 0; i < ndoms_new; i++) {
7907 for (j = 0; j < ndoms_cur && !new_topology; j++) { 8038 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7908 if (cpus_equal(doms_new[i], doms_cur[j]) 8039 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7909 && dattrs_equal(dattr_new, i, dattr_cur, j)) 8040 && dattrs_equal(dattr_new, i, dattr_cur, j))
7910 goto match2; 8041 goto match2;
7911 } 8042 }
@@ -7917,7 +8048,7 @@ match2:
7917 } 8048 }
7918 8049
7919 /* Remember the new sched domains */ 8050 /* Remember the new sched domains */
7920 if (doms_cur != &fallback_doms) 8051 if (doms_cur != fallback_doms)
7921 kfree(doms_cur); 8052 kfree(doms_cur);
7922 kfree(dattr_cur); /* kfree(NULL) is safe */ 8053 kfree(dattr_cur); /* kfree(NULL) is safe */
7923 doms_cur = doms_new; 8054 doms_cur = doms_new;
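
partition_sched_domains() treats the old and new partitions as sets: anything present only in doms_cur is detached and destroyed (match1), anything present only in doms_new is built (match2), and domains that appear in both are left untouched. A small sketch of that array-diff control flow, with integers standing in for cpumasks and printf() standing in for the detach/build calls (names illustrative):

#include <stdio.h>

static void repartition(const unsigned *cur, int ncur, const unsigned *new, int nnew)
{
	int i, j;

	/* Destroy domains that are in cur but not in new. */
	for (i = 0; i < ncur; i++) {
		for (j = 0; j < nnew; j++)
			if (cur[i] == new[j])
				goto match1;
		printf("detach_destroy_domains(%#x)\n", cur[i]);
match1:
		;
	}

	/* Build domains that are in new but not in cur. */
	for (i = 0; i < nnew; i++) {
		for (j = 0; j < ncur; j++)
			if (new[i] == cur[j])
				goto match2;
		printf("build_sched_domains(%#x)\n", new[i]);
match2:
		;
	}
}

int main(void)
{
	unsigned cur[] = { 0x0f, 0xf0 }, new[] = { 0x0f, 0xff00 };

	repartition(cur, 2, new, 2);	/* destroys 0xf0, builds 0xff00 */
	return 0;
}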
@@ -7930,7 +8061,7 @@ match2:
7930} 8061}
7931 8062
7932#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 8063#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7933int arch_reinit_sched_domains(void) 8064static void arch_reinit_sched_domains(void)
7934{ 8065{
7935 get_online_cpus(); 8066 get_online_cpus();
7936 8067
@@ -7939,25 +8070,33 @@ int arch_reinit_sched_domains(void)
7939 8070
7940 rebuild_sched_domains(); 8071 rebuild_sched_domains();
7941 put_online_cpus(); 8072 put_online_cpus();
7942
7943 return 0;
7944} 8073}
7945 8074
7946static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 8075static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7947{ 8076{
7948 int ret; 8077 unsigned int level = 0;
8078
8079 if (sscanf(buf, "%u", &level) != 1)
8080 return -EINVAL;
8081
8082 /*
 8083 * level is always positive, so don't check for
 8084 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
 8085 * What happens on a 0 or 1 byte write?
 8086 * Do we need to check count as well?
8087 */
7949 8088
7950 if (buf[0] != '0' && buf[0] != '1') 8089 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7951 return -EINVAL; 8090 return -EINVAL;
7952 8091
7953 if (smt) 8092 if (smt)
7954 sched_smt_power_savings = (buf[0] == '1'); 8093 sched_smt_power_savings = level;
7955 else 8094 else
7956 sched_mc_power_savings = (buf[0] == '1'); 8095 sched_mc_power_savings = level;
7957 8096
7958 ret = arch_reinit_sched_domains(); 8097 arch_reinit_sched_domains();
7959 8098
7960 return ret ? ret : count; 8099 return count;
7961} 8100}
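
sched_power_savings_store() now accepts a numeric power-savings level rather than only '0' or '1': it parses an unsigned integer with sscanf() and rejects anything at or above MAX_POWERSAVINGS_BALANCE_LEVELS. A trivial userspace sketch of that parse-and-range-check step (the constant below is invented for the demo):

#include <stdio.h>

#define DEMO_MAX_LEVELS 3	/* e.g. none, basic, wakeup: valid levels 0..2 */

/* Return the accepted level, or -1 on a malformed or out-of-range value. */
static int parse_level(const char *buf)
{
	unsigned int level;

	if (sscanf(buf, "%u", &level) != 1)
		return -1;
	if (level >= DEMO_MAX_LEVELS)
		return -1;
	return (int)level;
}

int main(void)
{
	printf("%d %d %d\n", parse_level("2"), parse_level("7"), parse_level("x"));
	/* prints: 2 -1 -1 */
	return 0;
}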
7962 8101
7963#ifdef CONFIG_SCHED_MC 8102#ifdef CONFIG_SCHED_MC
@@ -7992,7 +8131,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7992 sched_smt_power_savings_store); 8131 sched_smt_power_savings_store);
7993#endif 8132#endif
7994 8133
7995int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 8134int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7996{ 8135{
7997 int err = 0; 8136 int err = 0;
7998 8137
@@ -8057,7 +8196,9 @@ static int update_runtime(struct notifier_block *nfb,
8057 8196
8058void __init sched_init_smp(void) 8197void __init sched_init_smp(void)
8059{ 8198{
8060 cpumask_t non_isolated_cpus; 8199 cpumask_var_t non_isolated_cpus;
8200
8201 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
8061 8202
8062#if defined(CONFIG_NUMA) 8203#if defined(CONFIG_NUMA)
8063 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8204 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -8066,10 +8207,10 @@ void __init sched_init_smp(void)
8066#endif 8207#endif
8067 get_online_cpus(); 8208 get_online_cpus();
8068 mutex_lock(&sched_domains_mutex); 8209 mutex_lock(&sched_domains_mutex);
8069 arch_init_sched_domains(&cpu_online_map); 8210 arch_init_sched_domains(cpu_online_mask);
8070 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8211 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8071 if (cpus_empty(non_isolated_cpus)) 8212 if (cpumask_empty(non_isolated_cpus))
8072 cpu_set(smp_processor_id(), non_isolated_cpus); 8213 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8073 mutex_unlock(&sched_domains_mutex); 8214 mutex_unlock(&sched_domains_mutex);
8074 put_online_cpus(); 8215 put_online_cpus();
8075 8216
@@ -8084,9 +8225,13 @@ void __init sched_init_smp(void)
8084 init_hrtick(); 8225 init_hrtick();
8085 8226
8086 /* Move init over to a non-isolated CPU */ 8227 /* Move init over to a non-isolated CPU */
8087 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8228 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8088 BUG(); 8229 BUG();
8089 sched_init_granularity(); 8230 sched_init_granularity();
8231 free_cpumask_var(non_isolated_cpus);
8232
8233 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8234 init_sched_rt_class();
8090} 8235}
8091#else 8236#else
8092void __init sched_init_smp(void) 8237void __init sched_init_smp(void)
@@ -8401,6 +8546,15 @@ void __init sched_init(void)
8401 */ 8546 */
8402 current->sched_class = &fair_sched_class; 8547 current->sched_class = &fair_sched_class;
8403 8548
8549 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8550 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8551#ifdef CONFIG_SMP
8552#ifdef CONFIG_NO_HZ
8553 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8554#endif
8555 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8556#endif /* SMP */
8557
8404 scheduler_running = 1; 8558 scheduler_running = 1;
8405} 8559}
8406 8560