author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2011-04-07 08:09:50 -0400
committer  Ingo Molnar <mingo@elte.hu>                2011-04-11 06:58:19 -0400
commit     dce840a08702bd13a9a186e07e63d1ef82256b5e (patch)
tree       168bb98aed7f5761ebe31aa92c34959e9d0f238a /kernel/sched_fair.c
parent     a9c9a9b6bff27ac9c746344a9c1a19bf3327002c (diff)
sched: Dynamically allocate sched_domain/sched_group data-structures
Instead of relying on static allocations for the sched_domain and
sched_group trees, dynamically allocate and RCU free them.
Allocating this dynamically also allows for some build_sched_groups()
simplification since we can now (like with other simplifications) rely
on the sched_domain tree instead of hard-coded knowledge.
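The lifetime rule this relies on is the usual allocate, publish, then defer the free via call_rcu() pattern. A minimal sketch of that pattern follows; 'struct foo' and its helpers are illustrative names, not the helpers this patch actually introduces:

    /* Illustrative sketch only: 'struct foo' and these helpers are
     * made-up names, not the patch's actual sched_domain helpers. */
    struct foo {
            struct rcu_head rcu;            /* enables deferred freeing */
            /* ... per-domain data ... */
    };

    static void foo_free_rcu(struct rcu_head *rcu)
    {
            kfree(container_of(rcu, struct foo, rcu));
    }

    static void foo_destroy(struct foo *f)
    {
            /*
             * Readers walking the tree under rcu_read_lock() stay safe:
             * the memory is reclaimed only after a grace period, i.e.
             * after every pre-existing read-side section has finished.
             */
            call_rcu(&f->rcu, foo_free_rcu);
    }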
One tricky thing to note is that detach_destroy_domains() needs to hold
rcu_read_lock() over the entire tear-down; per-cpu is not sufficient,
since that can lead to partial sched_group existence (this could possibly
be solved by doing the tear-down backwards, but holding the lock over the
whole loop is much more robust).
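In other words, the read-side section has to span the whole detach loop, roughly as sketched below; detach_one_cpu() is a hypothetical stand-in for the real per-cpu detach, only the locking shape matters:

    /* Hypothetical sketch of the tear-down shape described above. */
    static void detach_destroy_all(const struct cpumask *cpu_map)
    {
            int i;

            rcu_read_lock();        /* one section over the whole loop... */
            for_each_cpu(i, cpu_map)
                    detach_one_cpu(i);      /* stand-in for the real detach */
            rcu_read_unlock();      /* ...so groups queued for RCU freeing
                                     * cannot disappear while another cpu's
                                     * domain still points at them */
    }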
A consequence of the above is that we can no longer print the
sched_domain debug stuff from cpu_attach_domain(), since that might now
run with preemption disabled (due to classic RCU etc.) and
sched_domain_debug() does some GFP_KERNEL allocations.
Another thing to note is that we now fully rely on normal RCU and not
RCU-sched. This is because, with the new and exciting RCU flavours we
grew over the years, BH doesn't necessarily hold off RCU-sched grace
periods (-rt is known to break this). This would in fact already cause
us grief, since we do sched_domain/sched_group iterations from softirq
context.
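Concretely, every sched_domain walk in the diff below now takes the shape sketched here; balance_one_domain() is a placeholder for the per-domain work, and the rcu_read_lock()/rcu_read_unlock() bracketing is the actual change:

    /* Reader-side shape used throughout the patch below. */
    static void walk_domains(int cpu)
    {
            struct sched_domain *sd;

            rcu_read_lock();                /* normal RCU, not RCU-sched */
            for_each_domain(cpu, sd) {
                    if (!(sd->flags & SD_LOAD_BALANCE))
                            continue;
                    balance_one_domain(cpu, sd);    /* placeholder */
            }
            rcu_read_unlock();
    }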
This patch is somewhat larger than I would like it to be, but I didn't
find any means of shrinking or splitting it.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
 -rw-r--r--  kernel/sched_fair.c | 30
 1 file changed, 24 insertions(+), 6 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4ee50f0af8d1..4a8ac7c2a18e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
         /*
          * Otherwise, iterate the domains and find an elegible idle cpu.
          */
+        rcu_read_lock();
         for_each_domain(target, sd) {
                 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
                         break;
@@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
                     cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
                         break;
         }
+        rcu_read_unlock();
 
         return target;
 }
@@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
                 new_cpu = prev_cpu;
         }
 
+        rcu_read_lock();
         for_each_domain(cpu, tmp) {
                 if (!(tmp->flags & SD_LOAD_BALANCE))
                         continue;
@@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 
         if (affine_sd) {
                 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-                        return select_idle_sibling(p, cpu);
-                else
-                        return select_idle_sibling(p, prev_cpu);
+                        prev_cpu = cpu;
+
+                new_cpu = select_idle_sibling(p, prev_cpu);
+                goto unlock;
         }
 
         while (sd) {
@@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
                 }
                 /* while loop will break here if sd == NULL */
         }
+unlock:
+        rcu_read_unlock();
 
         return new_cpu;
 }
@@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
         raw_spin_unlock(&this_rq->lock);
 
         update_shares(this_cpu);
+        rcu_read_lock();
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
                 int balance = 1;
@@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                         break;
                 }
         }
+        rcu_read_unlock();
 
         raw_spin_lock(&this_rq->lock);
 
@@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data)
         double_lock_balance(busiest_rq, target_rq);
 
         /* Search for an sd spanning us and the target CPU. */
+        rcu_read_lock();
         for_each_domain(target_cpu, sd) {
                 if ((sd->flags & SD_LOAD_BALANCE) &&
                     cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data)
                 else
                         schedstat_inc(sd, alb_failed);
         }
+        rcu_read_unlock();
         double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
         busiest_rq->active_balance = 0;
@@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu)
 {
         struct sched_domain *sd;
         struct sched_group *ilb_group;
+        int ilb = nr_cpu_ids;
 
         /*
          * Have idle load balancer selection from semi-idle packages only
@@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu)
         if (cpumask_weight(nohz.idle_cpus_mask) < 2)
                 goto out_done;
 
+        rcu_read_lock();
         for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
                 ilb_group = sd->groups;
 
                 do {
-                        if (is_semi_idle_group(ilb_group))
-                                return cpumask_first(nohz.grp_idle_mask);
+                        if (is_semi_idle_group(ilb_group)) {
+                                ilb = cpumask_first(nohz.grp_idle_mask);
+                                goto unlock;
+                        }
 
                         ilb_group = ilb_group->next;
 
                 } while (ilb_group != sd->groups);
         }
+unlock:
+        rcu_read_unlock();
 
 out_done:
-        return nr_cpu_ids;
+        return ilb;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
@@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
         update_shares(cpu);
 
+        rcu_read_lock();
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
                         continue;
@@ -3890,6 +3907,7 @@ out:
                 if (!balance)
                         break;
         }
+        rcu_read_unlock();
 
         /*
          * next_balance will be updated only when there is a need.