Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  511
1 file changed, 303 insertions(+), 208 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index f385eff4682d..8a0afb97af71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,8 +225,10 @@ struct rq {
225 unsigned long nr_uninterruptible; 225 unsigned long nr_uninterruptible;
226 226
227 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
228 unsigned long long timestamp_last_tick; 228 /* Cached timestamp set by update_cpu_clock() */
229 unsigned long long most_recent_timestamp;
229 struct task_struct *curr, *idle; 230 struct task_struct *curr, *idle;
231 unsigned long next_balance;
230 struct mm_struct *prev_mm; 232 struct mm_struct *prev_mm;
231 struct prio_array *active, *expired, arrays[2]; 233 struct prio_array *active, *expired, arrays[2];
232 int best_expired_prio; 234 int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
426 * bump this up when changing the output format or the meaning of an existing 428 * bump this up when changing the output format or the meaning of an existing
427 * format, so that tools can adapt (or abort) 429 * format, so that tools can adapt (or abort)
428 */ 430 */
429#define SCHEDSTAT_VERSION 12 431#define SCHEDSTAT_VERSION 14
430 432
431static int show_schedstat(struct seq_file *seq, void *v) 433static int show_schedstat(struct seq_file *seq, void *v)
432{ 434{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
464 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 466 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
465 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 467 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
466 itype++) { 468 itype++) {
467 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 469 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
470 "%lu",
468 sd->lb_cnt[itype], 471 sd->lb_cnt[itype],
469 sd->lb_balanced[itype], 472 sd->lb_balanced[itype],
470 sd->lb_failed[itype], 473 sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
474 sd->lb_nobusyq[itype], 477 sd->lb_nobusyq[itype],
475 sd->lb_nobusyg[itype]); 478 sd->lb_nobusyg[itype]);
476 } 479 }
477 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 480 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
481 " %lu %lu %lu\n",
478 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 482 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
479 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 483 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
480 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 484 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
481 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 485 sd->ttwu_wake_remote, sd->ttwu_move_affine,
486 sd->ttwu_move_balance);
482 } 487 }
483 preempt_enable(); 488 preempt_enable();
484#endif 489#endif
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547#endif 552#endif
548 553
549/* 554/*
550 * rq_lock - lock a given runqueue and disable interrupts. 555 * this_rq_lock - lock this runqueue and disable interrupts.
551 */ 556 */
552static inline struct rq *this_rq_lock(void) 557static inline struct rq *this_rq_lock(void)
553 __acquires(rq->lock) 558 __acquires(rq->lock)
@@ -938,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
938{ 943{
939 unsigned long long now; 944 unsigned long long now;
940 945
946 if (rt_task(p))
947 goto out;
948
941 now = sched_clock(); 949 now = sched_clock();
942#ifdef CONFIG_SMP 950#ifdef CONFIG_SMP
943 if (!local) { 951 if (!local) {
944 /* Compensate for drifting sched_clock */ 952 /* Compensate for drifting sched_clock */
945 struct rq *this_rq = this_rq(); 953 struct rq *this_rq = this_rq();
946 now = (now - this_rq->timestamp_last_tick) 954 now = (now - this_rq->most_recent_timestamp)
947 + rq->timestamp_last_tick; 955 + rq->most_recent_timestamp;
948 } 956 }
949#endif 957#endif
950 958
@@ -959,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
959 (now - p->timestamp) >> 20); 967 (now - p->timestamp) >> 20);
960 } 968 }
961 969
962 if (!rt_task(p)) 970 p->prio = recalc_task_prio(p, now);
963 p->prio = recalc_task_prio(p, now);
964 971
965 /* 972 /*
966 * This checks to make sure it's not an uninterruptible task 973 * This checks to make sure it's not an uninterruptible task
@@ -985,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
985 } 992 }
986 } 993 }
987 p->timestamp = now; 994 p->timestamp = now;
988 995out:
989 __activate_task(p, rq); 996 __activate_task(p, rq);
990} 997}
991 998
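The activate_task() hunks above rebase a timestamp taken on one CPU onto another CPU's clock by subtracting the local runqueue's cached most_recent_timestamp and adding the remote one, compensating for sched_clock() drift between CPUs. A minimal userspace sketch of that rebasing arithmetic, with an invented struct and made-up clock values rather than kernel code:

#include <stdio.h>

/* Illustrative stand-in for a per-CPU runqueue clock cache. */
struct fake_rq {
	unsigned long long most_recent_timestamp;
};

/*
 * Rebase a timestamp taken against 'from' onto 'to', mirroring the
 * "now = (now - this_rq->most_recent_timestamp) + rq->most_recent_timestamp"
 * arithmetic in the patch: only the offset past the last cached tick
 * survives the move between clocks.
 */
static unsigned long long
rebase_timestamp(unsigned long long ts,
		 const struct fake_rq *from, const struct fake_rq *to)
{
	return ts - from->most_recent_timestamp + to->most_recent_timestamp;
}

int main(void)
{
	struct fake_rq cpu0 = { .most_recent_timestamp = 1000000ULL };
	struct fake_rq cpu1 = { .most_recent_timestamp = 1000700ULL };
	unsigned long long now_cpu0 = 1000250ULL; /* 250ns past cpu0's cached tick */

	printf("cpu0 time %llu maps to cpu1 time %llu\n",
	       now_cpu0, rebase_timestamp(now_cpu0, &cpu0, &cpu1));
	return 0;
}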
@@ -1450,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1450 1457
1451 if (this_sd->flags & SD_WAKE_AFFINE) { 1458 if (this_sd->flags & SD_WAKE_AFFINE) {
1452 unsigned long tl = this_load; 1459 unsigned long tl = this_load;
1453 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); 1460 unsigned long tl_per_task;
1461
1462 tl_per_task = cpu_avg_load_per_task(this_cpu);
1454 1463
1455 /* 1464 /*
1456 * If sync wakeup then subtract the (maximum possible) 1465 * If sync wakeup then subtract the (maximum possible)
@@ -1688,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1688 * Not the local CPU - must adjust timestamp. This should 1697 * Not the local CPU - must adjust timestamp. This should
1689 * get optimised away in the !CONFIG_SMP case. 1698 * get optimised away in the !CONFIG_SMP case.
1690 */ 1699 */
1691 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1700 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1692 + rq->timestamp_last_tick; 1701 + rq->most_recent_timestamp;
1693 __activate_task(p, rq); 1702 __activate_task(p, rq);
1694 if (TASK_PREEMPTS_CURR(p, rq)) 1703 if (TASK_PREEMPTS_CURR(p, rq))
1695 resched_task(rq->curr); 1704 resched_task(rq->curr);
@@ -1952,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1952 __acquires(rq1->lock) 1961 __acquires(rq1->lock)
1953 __acquires(rq2->lock) 1962 __acquires(rq2->lock)
1954{ 1963{
1964 BUG_ON(!irqs_disabled());
1955 if (rq1 == rq2) { 1965 if (rq1 == rq2) {
1956 spin_lock(&rq1->lock); 1966 spin_lock(&rq1->lock);
1957 __acquire(rq2->lock); /* Fake it out ;) */ 1967 __acquire(rq2->lock); /* Fake it out ;) */
@@ -1991,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1991 __acquires(busiest->lock) 2001 __acquires(busiest->lock)
1992 __acquires(this_rq->lock) 2002 __acquires(this_rq->lock)
1993{ 2003{
2004 if (unlikely(!irqs_disabled())) {
2005 /* printk() doesn't work good under rq->lock */
2006 spin_unlock(&this_rq->lock);
2007 BUG_ON(1);
2008 }
1994 if (unlikely(!spin_trylock(&busiest->lock))) { 2009 if (unlikely(!spin_trylock(&busiest->lock))) {
1995 if (busiest < this_rq) { 2010 if (busiest < this_rq) {
1996 spin_unlock(&this_rq->lock); 2011 spin_unlock(&this_rq->lock);
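double_lock_balance() falls back to taking the lower-addressed runqueue lock first whenever the trylock on the busiest runqueue fails, so two CPUs locking the same pair of runqueues can never deadlock. A small pthreads sketch of the same address-ordering rule, with illustrative names:

#include <pthread.h>
#include <stdio.h>

/* Always lock the mutex with the lower address first, as
 * double_lock_balance()/double_rq_lock() do with runqueue locks,
 * so two threads locking the same pair cannot deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

static pthread_mutex_t rq_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_b = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	lock_pair(&rq_b, &rq_a);	/* argument order does not matter */
	printf("both locks held\n");
	unlock_pair(&rq_b, &rq_a);
	return 0;
}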
@@ -2061,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2061 set_task_cpu(p, this_cpu); 2076 set_task_cpu(p, this_cpu);
2062 inc_nr_running(p, this_rq); 2077 inc_nr_running(p, this_rq);
2063 enqueue_task(p, this_array); 2078 enqueue_task(p, this_array);
2064 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2079 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2065 + this_rq->timestamp_last_tick; 2080 + this_rq->most_recent_timestamp;
2066 /* 2081 /*
2067 * Note that idle threads have a prio of MAX_PRIO, for this test 2082 * Note that idle threads have a prio of MAX_PRIO, for this test
2068 * to be always true for them. 2083 * to be always true for them.
@@ -2098,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2098 * 2) too many balance attempts have failed. 2113 * 2) too many balance attempts have failed.
2099 */ 2114 */
2100 2115
2101 if (sd->nr_balance_failed > sd->cache_nice_tries) 2116 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2117#ifdef CONFIG_SCHEDSTATS
2118 if (task_hot(p, rq->most_recent_timestamp, sd))
2119 schedstat_inc(sd, lb_hot_gained[idle]);
2120#endif
2102 return 1; 2121 return 1;
2122 }
2103 2123
2104 if (task_hot(p, rq->timestamp_last_tick, sd)) 2124 if (task_hot(p, rq->most_recent_timestamp, sd))
2105 return 0; 2125 return 0;
2106 return 1; 2126 return 1;
2107} 2127}
@@ -2199,11 +2219,6 @@ skip_queue:
2199 goto skip_bitmap; 2219 goto skip_bitmap;
2200 } 2220 }
2201 2221
2202#ifdef CONFIG_SCHEDSTATS
2203 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
2204 schedstat_inc(sd, lb_hot_gained[idle]);
2205#endif
2206
2207 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2222 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2208 pulled++; 2223 pulled++;
2209 rem_load_move -= tmp->load_weight; 2224 rem_load_move -= tmp->load_weight;
@@ -2241,7 +2256,7 @@ out:
2241static struct sched_group * 2256static struct sched_group *
2242find_busiest_group(struct sched_domain *sd, int this_cpu, 2257find_busiest_group(struct sched_domain *sd, int this_cpu,
2243 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2258 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2244 cpumask_t *cpus) 2259 cpumask_t *cpus, int *balance)
2245{ 2260{
2246 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2261 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2247 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2262 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2270,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2270 unsigned long load, group_capacity; 2285 unsigned long load, group_capacity;
2271 int local_group; 2286 int local_group;
2272 int i; 2287 int i;
2288 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2273 unsigned long sum_nr_running, sum_weighted_load; 2289 unsigned long sum_nr_running, sum_weighted_load;
2274 2290
2275 local_group = cpu_isset(this_cpu, group->cpumask); 2291 local_group = cpu_isset(this_cpu, group->cpumask);
2276 2292
2293 if (local_group)
2294 balance_cpu = first_cpu(group->cpumask);
2295
2277 /* Tally up the load of all CPUs in the group */ 2296 /* Tally up the load of all CPUs in the group */
2278 sum_weighted_load = sum_nr_running = avg_load = 0; 2297 sum_weighted_load = sum_nr_running = avg_load = 0;
2279 2298
@@ -2289,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2289 *sd_idle = 0; 2308 *sd_idle = 0;
2290 2309
2291 /* Bias balancing toward cpus of our domain */ 2310 /* Bias balancing toward cpus of our domain */
2292 if (local_group) 2311 if (local_group) {
2312 if (idle_cpu(i) && !first_idle_cpu) {
2313 first_idle_cpu = 1;
2314 balance_cpu = i;
2315 }
2316
2293 load = target_load(i, load_idx); 2317 load = target_load(i, load_idx);
2294 else 2318 } else
2295 load = source_load(i, load_idx); 2319 load = source_load(i, load_idx);
2296 2320
2297 avg_load += load; 2321 avg_load += load;
@@ -2299,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2299 sum_weighted_load += rq->raw_weighted_load; 2323 sum_weighted_load += rq->raw_weighted_load;
2300 } 2324 }
2301 2325
2326 /*
2327 * First idle cpu or the first cpu(busiest) in this sched group
2328 * is eligible for doing load balancing at this and above
2329 * domains.
2330 */
2331 if (local_group && balance_cpu != this_cpu && balance) {
2332 *balance = 0;
2333 goto ret;
2334 }
2335
2302 total_load += avg_load; 2336 total_load += avg_load;
2303 total_pwr += group->cpu_power; 2337 total_pwr += group->cpu_power;
2304 2338
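The balance_cpu/first_idle_cpu logic added to find_busiest_group() elects one CPU per local group, preferring the first idle CPU and falling back to the group's first CPU; every other CPU sets *balance = 0 and backs off. A standalone sketch of that election, with made-up arrays:

#include <stdio.h>

/*
 * Pick the CPU that should do the balancing for a group: the first
 * idle CPU if there is one, otherwise the first CPU in the group.
 * Mirrors the balance_cpu/first_idle_cpu selection in the patch.
 */
static int elect_balance_cpu(const int *group_cpus, const int *cpu_is_idle,
			     int nr_cpus)
{
	int i;

	for (i = 0; i < nr_cpus; i++)
		if (cpu_is_idle[group_cpus[i]])
			return group_cpus[i];
	return group_cpus[0];
}

int main(void)
{
	int group[] = { 4, 5, 6, 7 };
	int idle[8] = { 0, 0, 0, 0, 0, 0, 1, 1 };	/* CPUs 6 and 7 are idle */
	int this_cpu = 5;
	int balance_cpu = elect_balance_cpu(group, idle, 4);

	/* Like "*balance = 0" in find_busiest_group(): CPU 6 is
	 * responsible here, so CPU 5 backs off. */
	printf("balance_cpu=%d, cpu %d should balance: %s\n",
	       balance_cpu, this_cpu,
	       balance_cpu == this_cpu ? "yes" : "no");
	return 0;
}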
@@ -2458,18 +2492,21 @@ small_imbalance:
2458 pwr_now /= SCHED_LOAD_SCALE; 2492 pwr_now /= SCHED_LOAD_SCALE;
2459 2493
2460 /* Amount of load we'd subtract */ 2494 /* Amount of load we'd subtract */
2461 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; 2495 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2496 busiest->cpu_power;
2462 if (max_load > tmp) 2497 if (max_load > tmp)
2463 pwr_move += busiest->cpu_power * 2498 pwr_move += busiest->cpu_power *
2464 min(busiest_load_per_task, max_load - tmp); 2499 min(busiest_load_per_task, max_load - tmp);
2465 2500
2466 /* Amount of load we'd add */ 2501 /* Amount of load we'd add */
2467 if (max_load*busiest->cpu_power < 2502 if (max_load * busiest->cpu_power <
2468 busiest_load_per_task*SCHED_LOAD_SCALE) 2503 busiest_load_per_task * SCHED_LOAD_SCALE)
2469 tmp = max_load*busiest->cpu_power/this->cpu_power; 2504 tmp = max_load * busiest->cpu_power / this->cpu_power;
2470 else 2505 else
2471 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; 2506 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2472 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); 2507 this->cpu_power;
2508 pwr_move += this->cpu_power *
2509 min(this_load_per_task, this_load + tmp);
2473 pwr_move /= SCHED_LOAD_SCALE; 2510 pwr_move /= SCHED_LOAD_SCALE;
2474 2511
2475 /* Move if we gain throughput */ 2512 /* Move if we gain throughput */
@@ -2490,8 +2527,8 @@ out_balanced:
2490 *imbalance = min_load_per_task; 2527 *imbalance = min_load_per_task;
2491 return group_min; 2528 return group_min;
2492 } 2529 }
2493ret:
2494#endif 2530#endif
2531ret:
2495 *imbalance = 0; 2532 *imbalance = 0;
2496 return NULL; 2533 return NULL;
2497} 2534}
@@ -2540,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2540/* 2577/*
2541 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2578 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2542 * tasks if there is an imbalance. 2579 * tasks if there is an imbalance.
2543 *
2544 * Called with this_rq unlocked.
2545 */ 2580 */
2546static int load_balance(int this_cpu, struct rq *this_rq, 2581static int load_balance(int this_cpu, struct rq *this_rq,
2547 struct sched_domain *sd, enum idle_type idle) 2582 struct sched_domain *sd, enum idle_type idle,
2583 int *balance)
2548{ 2584{
2549 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2585 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2550 struct sched_group *group; 2586 struct sched_group *group;
2551 unsigned long imbalance; 2587 unsigned long imbalance;
2552 struct rq *busiest; 2588 struct rq *busiest;
2553 cpumask_t cpus = CPU_MASK_ALL; 2589 cpumask_t cpus = CPU_MASK_ALL;
2590 unsigned long flags;
2554 2591
2555 /* 2592 /*
2556 * When power savings policy is enabled for the parent domain, idle 2593 * When power savings policy is enabled for the parent domain, idle
@@ -2566,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2566 2603
2567redo: 2604redo:
2568 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2605 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2569 &cpus); 2606 &cpus, balance);
2607
2608 if (*balance == 0)
2609 goto out_balanced;
2610
2570 if (!group) { 2611 if (!group) {
2571 schedstat_inc(sd, lb_nobusyg[idle]); 2612 schedstat_inc(sd, lb_nobusyg[idle]);
2572 goto out_balanced; 2613 goto out_balanced;
@@ -2590,11 +2631,13 @@ redo:
2590 * still unbalanced. nr_moved simply stays zero, so it is 2631 * still unbalanced. nr_moved simply stays zero, so it is
2591 * correctly treated as an imbalance. 2632 * correctly treated as an imbalance.
2592 */ 2633 */
2634 local_irq_save(flags);
2593 double_rq_lock(this_rq, busiest); 2635 double_rq_lock(this_rq, busiest);
2594 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2636 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2595 minus_1_or_zero(busiest->nr_running), 2637 minus_1_or_zero(busiest->nr_running),
2596 imbalance, sd, idle, &all_pinned); 2638 imbalance, sd, idle, &all_pinned);
2597 double_rq_unlock(this_rq, busiest); 2639 double_rq_unlock(this_rq, busiest);
2640 local_irq_restore(flags);
2598 2641
2599 /* All tasks on this runqueue were pinned by CPU affinity */ 2642 /* All tasks on this runqueue were pinned by CPU affinity */
2600 if (unlikely(all_pinned)) { 2643 if (unlikely(all_pinned)) {
@@ -2611,13 +2654,13 @@ redo:
2611 2654
2612 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2655 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2613 2656
2614 spin_lock(&busiest->lock); 2657 spin_lock_irqsave(&busiest->lock, flags);
2615 2658
2616 /* don't kick the migration_thread, if the curr 2659 /* don't kick the migration_thread, if the curr
2617 * task on busiest cpu can't be moved to this_cpu 2660 * task on busiest cpu can't be moved to this_cpu
2618 */ 2661 */
2619 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2662 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2620 spin_unlock(&busiest->lock); 2663 spin_unlock_irqrestore(&busiest->lock, flags);
2621 all_pinned = 1; 2664 all_pinned = 1;
2622 goto out_one_pinned; 2665 goto out_one_pinned;
2623 } 2666 }
@@ -2627,7 +2670,7 @@ redo:
2627 busiest->push_cpu = this_cpu; 2670 busiest->push_cpu = this_cpu;
2628 active_balance = 1; 2671 active_balance = 1;
2629 } 2672 }
2630 spin_unlock(&busiest->lock); 2673 spin_unlock_irqrestore(&busiest->lock, flags);
2631 if (active_balance) 2674 if (active_balance)
2632 wake_up_process(busiest->migration_thread); 2675 wake_up_process(busiest->migration_thread);
2633 2676
@@ -2706,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2706 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2749 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2707redo: 2750redo:
2708 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2751 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2709 &sd_idle, &cpus); 2752 &sd_idle, &cpus, NULL);
2710 if (!group) { 2753 if (!group) {
2711 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2754 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2712 goto out_balanced; 2755 goto out_balanced;
@@ -2766,14 +2809,28 @@ out_balanced:
2766static void idle_balance(int this_cpu, struct rq *this_rq) 2809static void idle_balance(int this_cpu, struct rq *this_rq)
2767{ 2810{
2768 struct sched_domain *sd; 2811 struct sched_domain *sd;
2812 int pulled_task = 0;
2813 unsigned long next_balance = jiffies + 60 * HZ;
2769 2814
2770 for_each_domain(this_cpu, sd) { 2815 for_each_domain(this_cpu, sd) {
2771 if (sd->flags & SD_BALANCE_NEWIDLE) { 2816 if (sd->flags & SD_BALANCE_NEWIDLE) {
2772 /* If we've pulled tasks over stop searching: */ 2817 /* If we've pulled tasks over stop searching: */
2773 if (load_balance_newidle(this_cpu, this_rq, sd)) 2818 pulled_task = load_balance_newidle(this_cpu,
2819 this_rq, sd);
2820 if (time_after(next_balance,
2821 sd->last_balance + sd->balance_interval))
2822 next_balance = sd->last_balance
2823 + sd->balance_interval;
2824 if (pulled_task)
2774 break; 2825 break;
2775 } 2826 }
2776 } 2827 }
2828 if (!pulled_task)
2829 /*
2830 * We are going idle. next_balance may be set based on
2831 * a busy processor. So reset next_balance.
2832 */
2833 this_rq->next_balance = next_balance;
2777} 2834}
2778 2835
2779/* 2836/*
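idle_balance() now tracks the earliest upcoming balance deadline across the domains it visits and stores it in rq->next_balance, using wrap-safe jiffies comparisons. A sketch of the signed-difference trick behind time_after() and of the keep-the-earliest-deadline loop (the macro below reimplements the idiom for illustration; it is not the kernel header):

#include <stdio.h>

/* Wrap-safe "a is after b" for an unsigned tick counter, the same
 * signed-difference trick the kernel's time_after() uses. */
#define TIME_AFTER(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long jiffies = 1000;
	unsigned long last_balance[] = { 990, 940, 999 };
	unsigned long interval[]     = {  16,  64,   8 };
	unsigned long next_balance = jiffies + 60 * 100;	/* 60*HZ with HZ=100 */
	int i;

	/* Keep the earliest deadline, as idle_balance() does per domain. */
	for (i = 0; i < 3; i++) {
		unsigned long due = last_balance[i] + interval[i];

		if (TIME_AFTER(next_balance, due))
			next_balance = due;
	}
	printf("next balance due at jiffy %lu (now %lu)\n",
	       next_balance, jiffies);
	return 0;
}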
@@ -2826,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2826 spin_unlock(&target_rq->lock); 2883 spin_unlock(&target_rq->lock);
2827} 2884}
2828 2885
2829/* 2886static void update_load(struct rq *this_rq)
2830 * rebalance_tick will get called every timer tick, on every CPU.
2831 *
2832 * It checks each scheduling domain to see if it is due to be balanced,
2833 * and initiates a balancing operation if so.
2834 *
2835 * Balancing parameters are set up in arch_init_sched_domains.
2836 */
2837
2838/* Don't have all balancing operations going off at once: */
2839static inline unsigned long cpu_offset(int cpu)
2840{ 2887{
2841 return jiffies + cpu * HZ / NR_CPUS; 2888 unsigned long this_load;
2842}
2843
2844static void
2845rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2846{
2847 unsigned long this_load, interval, j = cpu_offset(this_cpu);
2848 struct sched_domain *sd;
2849 int i, scale; 2889 int i, scale;
2850 2890
2851 this_load = this_rq->raw_weighted_load; 2891 this_load = this_rq->raw_weighted_load;
@@ -2865,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2865 new_load += scale-1; 2905 new_load += scale-1;
2866 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2906 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2867 } 2907 }
2908}
2909
2910/*
2911 * run_rebalance_domains is triggered when needed from the scheduler tick.
2912 *
2913 * It checks each scheduling domain to see if it is due to be balanced,
2914 * and initiates a balancing operation if so.
2915 *
2916 * Balancing parameters are set up in arch_init_sched_domains.
2917 */
2918static DEFINE_SPINLOCK(balancing);
2919
2920static void run_rebalance_domains(struct softirq_action *h)
2921{
2922 int this_cpu = smp_processor_id(), balance = 1;
2923 struct rq *this_rq = cpu_rq(this_cpu);
2924 unsigned long interval;
2925 struct sched_domain *sd;
2926 /*
2927 * We are idle if there are no processes running. This
2928 * is valid even if we are the idle process (SMT).
2929 */
2930 enum idle_type idle = !this_rq->nr_running ?
2931 SCHED_IDLE : NOT_IDLE;
2932 /* Earliest time when we have to call run_rebalance_domains again */
2933 unsigned long next_balance = jiffies + 60*HZ;
2868 2934
2869 for_each_domain(this_cpu, sd) { 2935 for_each_domain(this_cpu, sd) {
2870 if (!(sd->flags & SD_LOAD_BALANCE)) 2936 if (!(sd->flags & SD_LOAD_BALANCE))
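update_load() maintains several cpu_load[] estimates with growing inertia: cpu_load[i] = (old*(scale-1) + new)/scale with scale = 2^i, so higher indices follow the raw load more slowly. A quick simulation of that decay under an assumed load step (all values invented):

#include <stdio.h>

#define NR_IDX 3

int main(void)
{
	unsigned long cpu_load[NR_IDX] = { 0, 0, 0 };
	unsigned long raw_load = 1024;		/* load jumps to 1024 and stays there */
	int tick, i;

	for (tick = 1; tick <= 8; tick++) {
		for (i = 0; i < NR_IDX; i++) {
			unsigned long scale = 1UL << i;	/* 1, 2, 4 */
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = raw_load;

			/* Round upward, as the retained "new_load += scale-1"
			 * does, so the estimate never stalls just below the
			 * target when the load is rising. */
			if (new_load > old_load)
				new_load += scale - 1;
			cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
		}
		printf("tick %d: cpu_load = { %lu, %lu, %lu }\n",
		       tick, cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}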
@@ -2879,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2879 if (unlikely(!interval)) 2945 if (unlikely(!interval))
2880 interval = 1; 2946 interval = 1;
2881 2947
2882 if (j - sd->last_balance >= interval) { 2948 if (sd->flags & SD_SERIALIZE) {
2883 if (load_balance(this_cpu, this_rq, sd, idle)) { 2949 if (!spin_trylock(&balancing))
2950 goto out;
2951 }
2952
2953 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2954 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2884 /* 2955 /*
2885 * We've pulled tasks over so either we're no 2956 * We've pulled tasks over so either we're no
2886 * longer idle, or one of our SMT siblings is 2957 * longer idle, or one of our SMT siblings is
@@ -2888,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2888 */ 2959 */
2889 idle = NOT_IDLE; 2960 idle = NOT_IDLE;
2890 } 2961 }
2891 sd->last_balance += interval; 2962 sd->last_balance = jiffies;
2892 } 2963 }
2964 if (sd->flags & SD_SERIALIZE)
2965 spin_unlock(&balancing);
2966out:
2967 if (time_after(next_balance, sd->last_balance + interval))
2968 next_balance = sd->last_balance + interval;
2969
2970 /*
2971 * Stop the load balance at this level. There is another
2972 * CPU in our sched group which is doing load balancing more
2973 * actively.
2974 */
2975 if (!balance)
2976 break;
2893 } 2977 }
2978 this_rq->next_balance = next_balance;
2894} 2979}
2895#else 2980#else
2896/* 2981/*
2897 * on UP we do not need to balance between CPUs: 2982 * on UP we do not need to balance between CPUs:
2898 */ 2983 */
2899static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2900{
2901}
2902static inline void idle_balance(int cpu, struct rq *rq) 2984static inline void idle_balance(int cpu, struct rq *rq)
2903{ 2985{
2904} 2986}
2905#endif 2987#endif
2906 2988
2907static inline int wake_priority_sleeper(struct rq *rq) 2989static inline void wake_priority_sleeper(struct rq *rq)
2908{ 2990{
2909 int ret = 0;
2910
2911#ifdef CONFIG_SCHED_SMT 2991#ifdef CONFIG_SCHED_SMT
2992 if (!rq->nr_running)
2993 return;
2994
2912 spin_lock(&rq->lock); 2995 spin_lock(&rq->lock);
2913 /* 2996 /*
2914 * If an SMT sibling task has been put to sleep for priority 2997 * If an SMT sibling task has been put to sleep for priority
2915 * reasons reschedule the idle task to see if it can now run. 2998 * reasons reschedule the idle task to see if it can now run.
2916 */ 2999 */
2917 if (rq->nr_running) { 3000 if (rq->nr_running)
2918 resched_task(rq->idle); 3001 resched_task(rq->idle);
2919 ret = 1;
2920 }
2921 spin_unlock(&rq->lock); 3002 spin_unlock(&rq->lock);
2922#endif 3003#endif
2923 return ret;
2924} 3004}
2925 3005
2926DEFINE_PER_CPU(struct kernel_stat, kstat); 3006DEFINE_PER_CPU(struct kernel_stat, kstat);
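run_rebalance_domains() funnels balancing of SD_SERIALIZE domains through one spinlock taken with spin_trylock(), so only one CPU at a time walks those wide domains and the others simply skip that round. A pthreads sketch of the skip-if-busy pattern, with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t balancing = PTHREAD_MUTEX_INITIALIZER;

/*
 * Mirror of the SD_SERIALIZE handling: try to take the shared lock,
 * and if another CPU already holds it, skip the expensive work for
 * now instead of waiting.
 */
static void rebalance_serialized_domain(int cpu)
{
	if (pthread_mutex_trylock(&balancing) != 0) {
		printf("cpu %d: someone else is balancing, skip\n", cpu);
		return;
	}
	printf("cpu %d: balancing the serialized domain\n", cpu);
	pthread_mutex_unlock(&balancing);
}

static void *cpu_thread(void *arg)
{
	rebalance_serialized_domain((int)(long)arg);
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	long i;

	for (i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, cpu_thread, (void *)i);
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}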
@@ -2934,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2934static inline void 3014static inline void
2935update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3015update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2936{ 3016{
2937 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); 3017 p->sched_time += now - p->last_ran;
3018 p->last_ran = rq->most_recent_timestamp = now;
2938} 3019}
2939 3020
2940/* 3021/*
@@ -2947,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
2947 unsigned long flags; 3028 unsigned long flags;
2948 3029
2949 local_irq_save(flags); 3030 local_irq_save(flags);
2950 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); 3031 ns = p->sched_time + sched_clock() - p->last_ran;
2951 ns = p->sched_time + sched_clock() - ns;
2952 local_irq_restore(flags); 3032 local_irq_restore(flags);
2953 3033
2954 return ns; 3034 return ns;
@@ -3048,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3048 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3128 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3049} 3129}
3050 3130
3051/* 3131static void task_running_tick(struct rq *rq, struct task_struct *p)
3052 * This function gets called by the timer code, with HZ frequency.
3053 * We call it with interrupts disabled.
3054 *
3055 * It also gets called by the fork code, when changing the parent's
3056 * timeslices.
3057 */
3058void scheduler_tick(void)
3059{ 3132{
3060 unsigned long long now = sched_clock();
3061 struct task_struct *p = current;
3062 int cpu = smp_processor_id();
3063 struct rq *rq = cpu_rq(cpu);
3064
3065 update_cpu_clock(p, rq, now);
3066
3067 rq->timestamp_last_tick = now;
3068
3069 if (p == rq->idle) {
3070 if (wake_priority_sleeper(rq))
3071 goto out;
3072 rebalance_tick(cpu, rq, SCHED_IDLE);
3073 return;
3074 }
3075
3076 /* Task might have expired already, but not scheduled off yet */
3077 if (p->array != rq->active) { 3133 if (p->array != rq->active) {
3134 /* Task has expired but was not scheduled yet */
3078 set_tsk_need_resched(p); 3135 set_tsk_need_resched(p);
3079 goto out; 3136 return;
3080 } 3137 }
3081 spin_lock(&rq->lock); 3138 spin_lock(&rq->lock);
3082 /* 3139 /*
@@ -3144,8 +3201,34 @@ void scheduler_tick(void)
3144 } 3201 }
3145out_unlock: 3202out_unlock:
3146 spin_unlock(&rq->lock); 3203 spin_unlock(&rq->lock);
3147out: 3204}
3148 rebalance_tick(cpu, rq, NOT_IDLE); 3205
3206/*
3207 * This function gets called by the timer code, with HZ frequency.
3208 * We call it with interrupts disabled.
3209 *
3210 * It also gets called by the fork code, when changing the parent's
3211 * timeslices.
3212 */
3213void scheduler_tick(void)
3214{
3215 unsigned long long now = sched_clock();
3216 struct task_struct *p = current;
3217 int cpu = smp_processor_id();
3218 struct rq *rq = cpu_rq(cpu);
3219
3220 update_cpu_clock(p, rq, now);
3221
3222 if (p == rq->idle)
3223 /* Task on the idle queue */
3224 wake_priority_sleeper(rq);
3225 else
3226 task_running_tick(rq, p);
3227#ifdef CONFIG_SMP
3228 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance))
3230 raise_softirq(SCHED_SOFTIRQ);
3231#endif
3149} 3232}
3150 3233
3151#ifdef CONFIG_SCHED_SMT 3234#ifdef CONFIG_SCHED_SMT
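scheduler_tick() no longer balances directly: it updates the load estimates and raises SCHED_SOFTIRQ once jiffies reaches rq->next_balance, deferring the real work to run_rebalance_domains(). A minimal sketch of a tick that defers work through a pending flag when a deadline is hit (the flag stands in for the softirq):

#include <stdio.h>

static unsigned long jiffies;
static unsigned long next_balance = 4;
static int rebalance_pending;		/* stands in for raise_softirq() */

static void run_rebalance(void)
{
	next_balance = jiffies + 4;
	printf("jiffy %lu: rebalancing, next run at %lu\n", jiffies, next_balance);
}

/* Called "from the timer tick": a cheap check, the heavy work is deferred. */
static void tick(void)
{
	jiffies++;
	if ((long)(jiffies - next_balance) >= 0)	/* time_after_eq() idiom */
		rebalance_pending = 1;
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++) {
		tick();
		if (rebalance_pending) {	/* the "softirq" runs after the tick */
			rebalance_pending = 0;
			run_rebalance();
		}
	}
	return 0;
}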
@@ -3291,7 +3374,8 @@ void fastcall add_preempt_count(int val)
3291 /* 3374 /*
3292 * Spinlock count overflowing soon? 3375 * Spinlock count overflowing soon?
3293 */ 3376 */
3294 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3377 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3378 PREEMPT_MASK - 10);
3295} 3379}
3296EXPORT_SYMBOL(add_preempt_count); 3380EXPORT_SYMBOL(add_preempt_count);
3297 3381
@@ -4990,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4990 * afterwards, and pretending it was a local activate. 5074 * afterwards, and pretending it was a local activate.
4991 * This way is cleaner and logically correct. 5075 * This way is cleaner and logically correct.
4992 */ 5076 */
4993 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 5077 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4994 + rq_dest->timestamp_last_tick; 5078 + rq_dest->most_recent_timestamp;
4995 deactivate_task(p, rq_src); 5079 deactivate_task(p, rq_src);
4996 __activate_task(p, rq_dest); 5080 __activate_task(p, rq_dest);
4997 if (TASK_PREEMPTS_CURR(p, rq_dest)) 5081 if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5067,7 +5151,10 @@ wait_to_die:
5067} 5151}
5068 5152
5069#ifdef CONFIG_HOTPLUG_CPU 5153#ifdef CONFIG_HOTPLUG_CPU
5070/* Figure out where task on dead CPU should go, use force if neccessary. */ 5154/*
5155 * Figure out where task on dead CPU should go, use force if neccessary.
5156 * NOTE: interrupts should be disabled by the caller
5157 */
5071static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5158static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5072{ 5159{
5073 unsigned long flags; 5160 unsigned long flags;
@@ -5187,6 +5274,7 @@ void idle_task_exit(void)
5187 mmdrop(mm); 5274 mmdrop(mm);
5188} 5275}
5189 5276
5277/* called under rq->lock with disabled interrupts */
5190static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5278static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5191{ 5279{
5192 struct rq *rq = cpu_rq(dead_cpu); 5280 struct rq *rq = cpu_rq(dead_cpu);
@@ -5203,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5203 * Drop lock around migration; if someone else moves it, 5291 * Drop lock around migration; if someone else moves it,
5204 * that's OK. No task can be added to this CPU, so iteration is 5292 * that's OK. No task can be added to this CPU, so iteration is
5205 * fine. 5293 * fine.
5294 * NOTE: interrupts should be left disabled --dev@
5206 */ 5295 */
5207 spin_unlock_irq(&rq->lock); 5296 spin_unlock(&rq->lock);
5208 move_task_off_dead_cpu(dead_cpu, p); 5297 move_task_off_dead_cpu(dead_cpu, p);
5209 spin_lock_irq(&rq->lock); 5298 spin_lock(&rq->lock);
5210 5299
5211 put_task_struct(p); 5300 put_task_struct(p);
5212} 5301}
@@ -5359,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5359 if (!(sd->flags & SD_LOAD_BALANCE)) { 5448 if (!(sd->flags & SD_LOAD_BALANCE)) {
5360 printk("does not load-balance\n"); 5449 printk("does not load-balance\n");
5361 if (sd->parent) 5450 if (sd->parent)
5362 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 5451 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5452 " has parent");
5363 break; 5453 break;
5364 } 5454 }
5365 5455
5366 printk("span %s\n", str); 5456 printk("span %s\n", str);
5367 5457
5368 if (!cpu_isset(cpu, sd->span)) 5458 if (!cpu_isset(cpu, sd->span))
5369 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 5459 printk(KERN_ERR "ERROR: domain->span does not contain "
5460 "CPU%d\n", cpu);
5370 if (!cpu_isset(cpu, group->cpumask)) 5461 if (!cpu_isset(cpu, group->cpumask))
5371 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 5462 printk(KERN_ERR "ERROR: domain->groups does not contain"
5463 " CPU%d\n", cpu);
5372 5464
5373 printk(KERN_DEBUG); 5465 printk(KERN_DEBUG);
5374 for (i = 0; i < level + 2; i++) 5466 for (i = 0; i < level + 2; i++)
@@ -5383,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5383 5475
5384 if (!group->cpu_power) { 5476 if (!group->cpu_power) {
5385 printk("\n"); 5477 printk("\n");
5386 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 5478 printk(KERN_ERR "ERROR: domain->cpu_power not "
5479 "set\n");
5387 } 5480 }
5388 5481
5389 if (!cpus_weight(group->cpumask)) { 5482 if (!cpus_weight(group->cpumask)) {
@@ -5406,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5406 printk("\n"); 5499 printk("\n");
5407 5500
5408 if (!cpus_equal(sd->span, groupmask)) 5501 if (!cpus_equal(sd->span, groupmask))
5409 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5502 printk(KERN_ERR "ERROR: groups don't span "
5503 "domain->span\n");
5410 5504
5411 level++; 5505 level++;
5412 sd = sd->parent; 5506 sd = sd->parent;
5507 if (!sd)
5508 continue;
5413 5509
5414 if (sd) { 5510 if (!cpus_subset(groupmask, sd->span))
5415 if (!cpus_subset(groupmask, sd->span)) 5511 printk(KERN_ERR "ERROR: parent span is not a superset "
5416 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 5512 "of domain->span\n");
5417 }
5418 5513
5419 } while (sd); 5514 } while (sd);
5420} 5515}
@@ -5528,28 +5623,27 @@ static int __init isolated_cpu_setup(char *str)
5528__setup ("isolcpus=", isolated_cpu_setup); 5623__setup ("isolcpus=", isolated_cpu_setup);
5529 5624
5530/* 5625/*
5531 * init_sched_build_groups takes an array of groups, the cpumask we wish 5626 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5532 * to span, and a pointer to a function which identifies what group a CPU 5627 * to a function which identifies what group(along with sched group) a CPU
5533 * belongs to. The return value of group_fn must be a valid index into the 5628 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5534 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 5629 * (due to the fact that we keep track of groups covered with a cpumask_t).
5535 * keep track of groups covered with a cpumask_t).
5536 * 5630 *
5537 * init_sched_build_groups will build a circular linked list of the groups 5631 * init_sched_build_groups will build a circular linked list of the groups
5538 * covered by the given span, and will set each group's ->cpumask correctly, 5632 * covered by the given span, and will set each group's ->cpumask correctly,
5539 * and ->cpu_power to 0. 5633 * and ->cpu_power to 0.
5540 */ 5634 */
5541static void 5635static void
5542init_sched_build_groups(struct sched_group groups[], cpumask_t span, 5636init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5543 const cpumask_t *cpu_map, 5637 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5544 int (*group_fn)(int cpu, const cpumask_t *cpu_map)) 5638 struct sched_group **sg))
5545{ 5639{
5546 struct sched_group *first = NULL, *last = NULL; 5640 struct sched_group *first = NULL, *last = NULL;
5547 cpumask_t covered = CPU_MASK_NONE; 5641 cpumask_t covered = CPU_MASK_NONE;
5548 int i; 5642 int i;
5549 5643
5550 for_each_cpu_mask(i, span) { 5644 for_each_cpu_mask(i, span) {
5551 int group = group_fn(i, cpu_map); 5645 struct sched_group *sg;
5552 struct sched_group *sg = &groups[group]; 5646 int group = group_fn(i, cpu_map, &sg);
5553 int j; 5647 int j;
5554 5648
5555 if (cpu_isset(i, covered)) 5649 if (cpu_isset(i, covered))
@@ -5559,7 +5653,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5559 sg->cpu_power = 0; 5653 sg->cpu_power = 0;
5560 5654
5561 for_each_cpu_mask(j, span) { 5655 for_each_cpu_mask(j, span) {
5562 if (group_fn(j, cpu_map) != group) 5656 if (group_fn(j, cpu_map, NULL) != group)
5563 continue; 5657 continue;
5564 5658
5565 cpu_set(j, covered); 5659 cpu_set(j, covered);
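The reworked group_fn callbacks (cpu_to_cpu_group() and friends) return the group index and, only when the caller passes a non-NULL pointer, also hand back the per-CPU sched_group; init_sched_build_groups() passes NULL where it needs just the index. A small sketch of that optional out-parameter convention, with invented types:

#include <stdio.h>

struct fake_group {
	int id;
};

static struct fake_group groups[4] = { {0}, {1}, {2}, {3} };

/*
 * Like cpu_to_phys_group() after the patch: always return the group
 * index, and fill *sg only if the caller actually asked for it.
 */
static int cpu_to_group(int cpu, struct fake_group **sg)
{
	int group = cpu / 2;		/* say, two CPUs per group */

	if (sg)
		*sg = &groups[group];
	return group;
}

int main(void)
{
	struct fake_group *sg;
	int g = cpu_to_group(3, &sg);

	printf("cpu 3 -> group %d (struct id %d)\n", g, sg->id);
	/* Index-only query, as init_sched_build_groups() does for 'j': */
	printf("cpu 1 -> group %d\n", cpu_to_group(1, NULL));
	return 0;
}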
@@ -5733,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size);
5733 */ 5827 */
5734static void touch_cache(void *__cache, unsigned long __size) 5828static void touch_cache(void *__cache, unsigned long __size)
5735{ 5829{
5736 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5830 unsigned long size = __size / sizeof(long);
5737 chunk2 = 2*size/3; 5831 unsigned long chunk1 = size / 3;
5832 unsigned long chunk2 = 2 * size / 3;
5738 unsigned long *cache = __cache; 5833 unsigned long *cache = __cache;
5739 int i; 5834 int i;
5740 5835
@@ -5843,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5843 */ 5938 */
5844 measure_one(cache, size, cpu1, cpu2); 5939 measure_one(cache, size, cpu1, cpu2);
5845 for (i = 0; i < ITERATIONS; i++) 5940 for (i = 0; i < ITERATIONS; i++)
5846 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5941 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5847 5942
5848 measure_one(cache, size, cpu2, cpu1); 5943 measure_one(cache, size, cpu2, cpu1);
5849 for (i = 0; i < ITERATIONS; i++) 5944 for (i = 0; i < ITERATIONS; i++)
5850 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5945 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5851 5946
5852 /* 5947 /*
5853 * (We measure the non-migrating [cached] cost on both 5948 * (We measure the non-migrating [cached] cost on both
@@ -5857,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5857 5952
5858 measure_one(cache, size, cpu1, cpu1); 5953 measure_one(cache, size, cpu1, cpu1);
5859 for (i = 0; i < ITERATIONS; i++) 5954 for (i = 0; i < ITERATIONS; i++)
5860 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5955 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5861 5956
5862 measure_one(cache, size, cpu2, cpu2); 5957 measure_one(cache, size, cpu2, cpu2);
5863 for (i = 0; i < ITERATIONS; i++) 5958 for (i = 0; i < ITERATIONS; i++)
5864 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5959 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5865 5960
5866 /* 5961 /*
5867 * Get the per-iteration migration cost: 5962 * Get the per-iteration migration cost:
5868 */ 5963 */
5869 do_div(cost1, 2*ITERATIONS); 5964 do_div(cost1, 2 * ITERATIONS);
5870 do_div(cost2, 2*ITERATIONS); 5965 do_div(cost2, 2 * ITERATIONS);
5871 5966
5872 return cost1 - cost2; 5967 return cost1 - cost2;
5873} 5968}
@@ -5905,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5905 */ 6000 */
5906 cache = vmalloc(max_size); 6001 cache = vmalloc(max_size);
5907 if (!cache) { 6002 if (!cache) {
5908 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 6003 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5909 return 1000000; /* return 1 msec on very small boxen */ 6004 return 1000000; /* return 1 msec on very small boxen */
5910 } 6005 }
5911 6006
@@ -5930,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5930 avg_fluct = (avg_fluct + fluct)/2; 6025 avg_fluct = (avg_fluct + fluct)/2;
5931 6026
5932 if (migration_debug) 6027 if (migration_debug)
5933 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 6028 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6029 "(%8Ld %8Ld)\n",
5934 cpu1, cpu2, size, 6030 cpu1, cpu2, size,
5935 (long)cost / 1000000, 6031 (long)cost / 1000000,
5936 ((long)cost / 100000) % 10, 6032 ((long)cost / 100000) % 10,
@@ -6025,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
6025 -1 6121 -1
6026#endif 6122#endif
6027 ); 6123 );
6028 if (system_state == SYSTEM_BOOTING) { 6124 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6029 if (num_online_cpus() > 1) { 6125 printk("migration_cost=");
6030 printk("migration_cost="); 6126 for (distance = 0; distance <= max_distance; distance++) {
6031 for (distance = 0; distance <= max_distance; distance++) { 6127 if (distance)
6032 if (distance) 6128 printk(",");
6033 printk(","); 6129 printk("%ld", (long)migration_cost[distance] / 1000);
6034 printk("%ld", (long)migration_cost[distance] / 1000);
6035 }
6036 printk("\n");
6037 } 6130 }
6131 printk("\n");
6038 } 6132 }
6039 j1 = jiffies; 6133 j1 = jiffies;
6040 if (migration_debug) 6134 if (migration_debug)
6041 printk("migration: %ld seconds\n", (j1-j0)/HZ); 6135 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6042 6136
6043 /* 6137 /*
6044 * Move back to the original CPU. NUMA-Q gets confused 6138 * Move back to the original CPU. NUMA-Q gets confused
@@ -6135,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6135 */ 6229 */
6136#ifdef CONFIG_SCHED_SMT 6230#ifdef CONFIG_SCHED_SMT
6137static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6231static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6138static struct sched_group sched_group_cpus[NR_CPUS]; 6232static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6139 6233
6140static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) 6234static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6235 struct sched_group **sg)
6141{ 6236{
6237 if (sg)
6238 *sg = &per_cpu(sched_group_cpus, cpu);
6142 return cpu; 6239 return cpu;
6143} 6240}
6144#endif 6241#endif
@@ -6148,39 +6245,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
6148 */ 6245 */
6149#ifdef CONFIG_SCHED_MC 6246#ifdef CONFIG_SCHED_MC
6150static DEFINE_PER_CPU(struct sched_domain, core_domains); 6247static DEFINE_PER_CPU(struct sched_domain, core_domains);
6151static struct sched_group sched_group_core[NR_CPUS]; 6248static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6152#endif 6249#endif
6153 6250
6154#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6251#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6155static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6252static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6253 struct sched_group **sg)
6156{ 6254{
6255 int group;
6157 cpumask_t mask = cpu_sibling_map[cpu]; 6256 cpumask_t mask = cpu_sibling_map[cpu];
6158 cpus_and(mask, mask, *cpu_map); 6257 cpus_and(mask, mask, *cpu_map);
6159 return first_cpu(mask); 6258 group = first_cpu(mask);
6259 if (sg)
6260 *sg = &per_cpu(sched_group_core, group);
6261 return group;
6160} 6262}
6161#elif defined(CONFIG_SCHED_MC) 6263#elif defined(CONFIG_SCHED_MC)
6162static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6264static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6265 struct sched_group **sg)
6163{ 6266{
6267 if (sg)
6268 *sg = &per_cpu(sched_group_core, cpu);
6164 return cpu; 6269 return cpu;
6165} 6270}
6166#endif 6271#endif
6167 6272
6168static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6273static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6169static struct sched_group sched_group_phys[NR_CPUS]; 6274static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6170 6275
6171static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) 6276static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6277 struct sched_group **sg)
6172{ 6278{
6279 int group;
6173#ifdef CONFIG_SCHED_MC 6280#ifdef CONFIG_SCHED_MC
6174 cpumask_t mask = cpu_coregroup_map(cpu); 6281 cpumask_t mask = cpu_coregroup_map(cpu);
6175 cpus_and(mask, mask, *cpu_map); 6282 cpus_and(mask, mask, *cpu_map);
6176 return first_cpu(mask); 6283 group = first_cpu(mask);
6177#elif defined(CONFIG_SCHED_SMT) 6284#elif defined(CONFIG_SCHED_SMT)
6178 cpumask_t mask = cpu_sibling_map[cpu]; 6285 cpumask_t mask = cpu_sibling_map[cpu];
6179 cpus_and(mask, mask, *cpu_map); 6286 cpus_and(mask, mask, *cpu_map);
6180 return first_cpu(mask); 6287 group = first_cpu(mask);
6181#else 6288#else
6182 return cpu; 6289 group = cpu;
6183#endif 6290#endif
6291 if (sg)
6292 *sg = &per_cpu(sched_group_phys, group);
6293 return group;
6184} 6294}
6185 6295
6186#ifdef CONFIG_NUMA 6296#ifdef CONFIG_NUMA
@@ -6193,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
6193static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6303static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6194 6304
6195static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6305static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6196static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 6306static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6197 6307
6198static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) 6308static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6309 struct sched_group **sg)
6199{ 6310{
6200 return cpu_to_node(cpu); 6311 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6312 int group;
6313
6314 cpus_and(nodemask, nodemask, *cpu_map);
6315 group = first_cpu(nodemask);
6316
6317 if (sg)
6318 *sg = &per_cpu(sched_group_allnodes, group);
6319 return group;
6201} 6320}
6321
6202static void init_numa_sched_groups_power(struct sched_group *group_head) 6322static void init_numa_sched_groups_power(struct sched_group *group_head)
6203{ 6323{
6204 struct sched_group *sg = group_head; 6324 struct sched_group *sg = group_head;
@@ -6234,16 +6354,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6234 int cpu, i; 6354 int cpu, i;
6235 6355
6236 for_each_cpu_mask(cpu, *cpu_map) { 6356 for_each_cpu_mask(cpu, *cpu_map) {
6237 struct sched_group *sched_group_allnodes
6238 = sched_group_allnodes_bycpu[cpu];
6239 struct sched_group **sched_group_nodes 6357 struct sched_group **sched_group_nodes
6240 = sched_group_nodes_bycpu[cpu]; 6358 = sched_group_nodes_bycpu[cpu];
6241 6359
6242 if (sched_group_allnodes) {
6243 kfree(sched_group_allnodes);
6244 sched_group_allnodes_bycpu[cpu] = NULL;
6245 }
6246
6247 if (!sched_group_nodes) 6360 if (!sched_group_nodes)
6248 continue; 6361 continue;
6249 6362
@@ -6337,7 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6337 struct sched_domain *sd; 6450 struct sched_domain *sd;
6338#ifdef CONFIG_NUMA 6451#ifdef CONFIG_NUMA
6339 struct sched_group **sched_group_nodes = NULL; 6452 struct sched_group **sched_group_nodes = NULL;
6340 struct sched_group *sched_group_allnodes = NULL; 6453 int sd_allnodes = 0;
6341 6454
6342 /* 6455 /*
6343 * Allocate the per-node list of sched groups 6456 * Allocate the per-node list of sched groups
@@ -6355,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6355 * Set up domains for cpus specified by the cpu_map. 6468 * Set up domains for cpus specified by the cpu_map.
6356 */ 6469 */
6357 for_each_cpu_mask(i, *cpu_map) { 6470 for_each_cpu_mask(i, *cpu_map) {
6358 int group;
6359 struct sched_domain *sd = NULL, *p; 6471 struct sched_domain *sd = NULL, *p;
6360 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6472 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6361 6473
@@ -6364,26 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6364#ifdef CONFIG_NUMA 6476#ifdef CONFIG_NUMA
6365 if (cpus_weight(*cpu_map) 6477 if (cpus_weight(*cpu_map)
6366 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6478 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6367 if (!sched_group_allnodes) {
6368 sched_group_allnodes
6369 = kmalloc_node(sizeof(struct sched_group)
6370 * MAX_NUMNODES,
6371 GFP_KERNEL,
6372 cpu_to_node(i));
6373 if (!sched_group_allnodes) {
6374 printk(KERN_WARNING
6375 "Can not alloc allnodes sched group\n");
6376 goto error;
6377 }
6378 sched_group_allnodes_bycpu[i]
6379 = sched_group_allnodes;
6380 }
6381 sd = &per_cpu(allnodes_domains, i); 6479 sd = &per_cpu(allnodes_domains, i);
6382 *sd = SD_ALLNODES_INIT; 6480 *sd = SD_ALLNODES_INIT;
6383 sd->span = *cpu_map; 6481 sd->span = *cpu_map;
6384 group = cpu_to_allnodes_group(i, cpu_map); 6482 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6385 sd->groups = &sched_group_allnodes[group];
6386 p = sd; 6483 p = sd;
6484 sd_allnodes = 1;
6387 } else 6485 } else
6388 p = NULL; 6486 p = NULL;
6389 6487
@@ -6398,36 +6496,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6398 6496
6399 p = sd; 6497 p = sd;
6400 sd = &per_cpu(phys_domains, i); 6498 sd = &per_cpu(phys_domains, i);
6401 group = cpu_to_phys_group(i, cpu_map);
6402 *sd = SD_CPU_INIT; 6499 *sd = SD_CPU_INIT;
6403 sd->span = nodemask; 6500 sd->span = nodemask;
6404 sd->parent = p; 6501 sd->parent = p;
6405 if (p) 6502 if (p)
6406 p->child = sd; 6503 p->child = sd;
6407 sd->groups = &sched_group_phys[group]; 6504 cpu_to_phys_group(i, cpu_map, &sd->groups);
6408 6505
6409#ifdef CONFIG_SCHED_MC 6506#ifdef CONFIG_SCHED_MC
6410 p = sd; 6507 p = sd;
6411 sd = &per_cpu(core_domains, i); 6508 sd = &per_cpu(core_domains, i);
6412 group = cpu_to_core_group(i, cpu_map);
6413 *sd = SD_MC_INIT; 6509 *sd = SD_MC_INIT;
6414 sd->span = cpu_coregroup_map(i); 6510 sd->span = cpu_coregroup_map(i);
6415 cpus_and(sd->span, sd->span, *cpu_map); 6511 cpus_and(sd->span, sd->span, *cpu_map);
6416 sd->parent = p; 6512 sd->parent = p;
6417 p->child = sd; 6513 p->child = sd;
6418 sd->groups = &sched_group_core[group]; 6514 cpu_to_core_group(i, cpu_map, &sd->groups);
6419#endif 6515#endif
6420 6516
6421#ifdef CONFIG_SCHED_SMT 6517#ifdef CONFIG_SCHED_SMT
6422 p = sd; 6518 p = sd;
6423 sd = &per_cpu(cpu_domains, i); 6519 sd = &per_cpu(cpu_domains, i);
6424 group = cpu_to_cpu_group(i, cpu_map);
6425 *sd = SD_SIBLING_INIT; 6520 *sd = SD_SIBLING_INIT;
6426 sd->span = cpu_sibling_map[i]; 6521 sd->span = cpu_sibling_map[i];
6427 cpus_and(sd->span, sd->span, *cpu_map); 6522 cpus_and(sd->span, sd->span, *cpu_map);
6428 sd->parent = p; 6523 sd->parent = p;
6429 p->child = sd; 6524 p->child = sd;
6430 sd->groups = &sched_group_cpus[group]; 6525 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6431#endif 6526#endif
6432 } 6527 }
6433 6528
@@ -6439,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6439 if (i != first_cpu(this_sibling_map)) 6534 if (i != first_cpu(this_sibling_map))
6440 continue; 6535 continue;
6441 6536
6442 init_sched_build_groups(sched_group_cpus, this_sibling_map, 6537 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6443 cpu_map, &cpu_to_cpu_group);
6444 } 6538 }
6445#endif 6539#endif
6446 6540
@@ -6451,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6451 cpus_and(this_core_map, this_core_map, *cpu_map); 6545 cpus_and(this_core_map, this_core_map, *cpu_map);
6452 if (i != first_cpu(this_core_map)) 6546 if (i != first_cpu(this_core_map))
6453 continue; 6547 continue;
6454 init_sched_build_groups(sched_group_core, this_core_map, 6548 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6455 cpu_map, &cpu_to_core_group);
6456 } 6549 }
6457#endif 6550#endif
6458 6551
@@ -6465,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6465 if (cpus_empty(nodemask)) 6558 if (cpus_empty(nodemask))
6466 continue; 6559 continue;
6467 6560
6468 init_sched_build_groups(sched_group_phys, nodemask, 6561 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6469 cpu_map, &cpu_to_phys_group);
6470 } 6562 }
6471 6563
6472#ifdef CONFIG_NUMA 6564#ifdef CONFIG_NUMA
6473 /* Set up node groups */ 6565 /* Set up node groups */
6474 if (sched_group_allnodes) 6566 if (sd_allnodes)
6475 init_sched_build_groups(sched_group_allnodes, *cpu_map, 6567 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6476 cpu_map, &cpu_to_allnodes_group);
6477 6568
6478 for (i = 0; i < MAX_NUMNODES; i++) { 6569 for (i = 0; i < MAX_NUMNODES; i++) {
6479 /* Set up node groups */ 6570 /* Set up node groups */
@@ -6565,10 +6656,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6565 for (i = 0; i < MAX_NUMNODES; i++) 6656 for (i = 0; i < MAX_NUMNODES; i++)
6566 init_numa_sched_groups_power(sched_group_nodes[i]); 6657 init_numa_sched_groups_power(sched_group_nodes[i]);
6567 6658
6568 if (sched_group_allnodes) { 6659 if (sd_allnodes) {
6569 int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); 6660 struct sched_group *sg;
6570 struct sched_group *sg = &sched_group_allnodes[group];
6571 6661
6662 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6572 init_numa_sched_groups_power(sg); 6663 init_numa_sched_groups_power(sg);
6573 } 6664 }
6574#endif 6665#endif
@@ -6847,6 +6938,10 @@ void __init sched_init(void)
6847 6938
6848 set_load_weight(&init_task); 6939 set_load_weight(&init_task);
6849 6940
6941#ifdef CONFIG_SMP
6942 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6943#endif
6944
6850#ifdef CONFIG_RT_MUTEXES 6945#ifdef CONFIG_RT_MUTEXES
6851 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6946 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6852#endif 6947#endif
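sched_init() registers run_rebalance_domains() as the SCHED_SOFTIRQ handler via open_softirq(). A toy sketch of the underlying idea, a function-pointer table indexed by softirq number where raising marks an entry pending and a later pass runs it; every name below is invented, not the kernel API:

#include <stdio.h>

#define NR_FAKE_SOFTIRQS 2
enum { FAKE_TIMER_SOFTIRQ, FAKE_SCHED_SOFTIRQ };

static void (*handlers[NR_FAKE_SOFTIRQS])(void);
static int pending[NR_FAKE_SOFTIRQS];

static void fake_open_softirq(int nr, void (*fn)(void)) { handlers[nr] = fn; }
static void fake_raise_softirq(int nr) { pending[nr] = 1; }

/* Run everything that was raised since the last pass. */
static void fake_do_softirq(void)
{
	int nr;

	for (nr = 0; nr < NR_FAKE_SOFTIRQS; nr++)
		if (pending[nr] && handlers[nr]) {
			pending[nr] = 0;
			handlers[nr]();
		}
}

static void rebalance_handler(void) { printf("rebalancing domains\n"); }

int main(void)
{
	fake_open_softirq(FAKE_SCHED_SOFTIRQ, rebalance_handler);
	fake_raise_softirq(FAKE_SCHED_SOFTIRQ);	/* as scheduler_tick() would */
	fake_do_softirq();			/* deferred execution */
	return 0;
}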