path: root/kernel/sched_fair.c
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-04-07 08:09:50 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-04-11 06:58:19 -0400
commit	dce840a08702bd13a9a186e07e63d1ef82256b5e (patch)
tree	168bb98aed7f5761ebe31aa92c34959e9d0f238a /kernel/sched_fair.c
parent	a9c9a9b6bff27ac9c746344a9c1a19bf3327002c (diff)
sched: Dynamically allocate sched_domain/sched_group data-structures
Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU-free them.

Allocating these dynamically also allows for some build_sched_groups() simplification, since we can now (as with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge.

One tricky thing to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down; holding it per-cpu is not sufficient, since that can lead to partial sched_group existence (this could possibly be solved by doing the tear-down backwards, but the current approach is much more robust).

A consequence of the above is that we can no longer print the sched_domain debug stuff from cpu_attach_domain(), since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations.

Another thing to note is that we now fully rely on normal RCU and not RCU-sched. This is because, with the new and exciting RCU flavours we have grown over the years, BH does not necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief, since we do sched_domain/sched_group iterations from softirq context.

This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
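For orientation, every hunk below applies the same read-side convention: now that the sched_domain/sched_group trees are RCU-freed (on the kernel/sched.c side, outside this diffstat), each domain walk has to sit inside an RCU read-side critical section. Here is a minimal sketch of that pattern, assuming the era's for_each_domain() iterator; walk_busy_domains() is a hypothetical helper used purely for illustration and is not part of the patch.

/*
 * Sketch of the read-side pattern this patch introduces (hypothetical
 * helper, not patch code).  The RCU read section pins the current
 * domain tree for the duration of the walk; once it is dropped, the
 * tree just walked may be freed after a grace period.
 */
static int walk_busy_domains(int cpu)
{
	struct sched_domain *sd;
	int ret = 0;

	rcu_read_lock();			/* pin the domain tree */
	for_each_domain(cpu, sd) {		/* dereferences rq->sd under RCU */
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;
		ret = 1;
		break;				/* still inside the read section */
	}
	rcu_read_unlock();			/* tree may now be RCU-freed */

	return ret;
}

The same bracketing appears in select_idle_sibling(), select_task_rq_fair(), idle_balance(), active_load_balance_cpu_stop(), find_new_ilb() and rebalance_domains() below; the two functions that previously returned from inside the loop (select_task_rq_fair() and find_new_ilb()) are restructured to fall through to a shared unlock label instead.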
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--	kernel/sched_fair.c	30
1 file changed, 24 insertions, 6 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4ee50f0af8d1..4a8ac7c2a18e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
+	rcu_read_lock();
 	for_each_domain(target, sd) {
 		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
 			break;
@@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
 			break;
 	}
+	rcu_read_unlock();
 
 	return target;
 }
@@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		new_cpu = prev_cpu;
 	}
 
+	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return select_idle_sibling(p, cpu);
-		else
-			return select_idle_sibling(p, prev_cpu);
+			prev_cpu = cpu;
+
+		new_cpu = select_idle_sibling(p, prev_cpu);
+		goto unlock;
 	}
 
 	while (sd) {
@@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		}
 		/* while loop will break here if sd == NULL */
 	}
+unlock:
+	rcu_read_unlock();
 
 	return new_cpu;
 }
@@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	raw_spin_unlock(&this_rq->lock);
 
 	update_shares(this_cpu);
+	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 			break;
 		}
 	}
+	rcu_read_unlock();
 
 	raw_spin_lock(&this_rq->lock);
 
@@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data)
 	double_lock_balance(busiest_rq, target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
+	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data)
 		else
 			schedstat_inc(sd, alb_failed);
 	}
+	rcu_read_unlock();
 	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
@@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu)
 {
 	struct sched_domain *sd;
 	struct sched_group *ilb_group;
+	int ilb = nr_cpu_ids;
 
 	/*
 	 * Have idle load balancer selection from semi-idle packages only
@@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu)
 	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
 		goto out_done;
 
+	rcu_read_lock();
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
 		ilb_group = sd->groups;
 
 		do {
-			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.grp_idle_mask);
+			if (is_semi_idle_group(ilb_group)) {
+				ilb = cpumask_first(nohz.grp_idle_mask);
+				goto unlock;
+			}
 
 			ilb_group = ilb_group->next;
 
 		} while (ilb_group != sd->groups);
 	}
+unlock:
+	rcu_read_unlock();
 
 out_done:
-	return nr_cpu_ids;
+	return ilb;
 }
 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
@@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
 	update_shares(cpu);
 
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -3890,6 +3907,7 @@ out:
 		if (!balance)
 			break;
 	}
+	rcu_read_unlock();
 
 	/*
 	 * next_balance will be updated only when there is a need.