author     Suresh Siddha <suresh.b.siddha@intel.com>   2011-12-01 20:07:34 -0500
committer  Ingo Molnar <mingo@elte.hu>                 2011-12-06 03:06:34 -0500
commit     0b005cf54eac170a8f22540ab096a6e07bf49e7c (patch)
tree       d06b2c7d1b6286f4116f94b9d4b38779e885a9b2 /kernel/sched
parent     69e1e811dcc436a6b129dbef273ad9ec22d095ce (diff)
sched, nohz: Implement sched group, domain aware nohz idle load balancing
When there are many logical CPUs that enter and exit idle often, the members
of the global nohz data structure are modified very frequently, causing a lot
of cache-line contention.
Make the nohz idle load balancing more scalable by using the sched domain
topology and the 'nr_busy_cpus' count in struct sched_group_power.
Idle load balance is kicked on one of the idle CPUs when there is at least
one idle CPU and:
- a busy rq has more than one task, or
- a busy rq's scheduler group that shares package resources (like HT/MC
  siblings) has more than one busy member, or
- for the SD_ASYM_PACKING domain, the lower numbered CPUs in that
  domain are idle compared to the busy ones.
This way the idle load balancing request is kicked only when there is a
potential imbalance, and once the system is mostly balanced these kicks are
minimized.
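
As a rough illustration, the busy-CPU side of the new heuristic can be
sketched in C as below. The helper name ilb_kick_wanted() is made up for this
sketch; the real logic lives in nohz_kick_needed() in the patch further down,
which additionally handles the NOHZ_TICK_STOPPED bookkeeping and the
nohz.next_balance throttling:

/* Hypothetical, simplified sketch of the new kick decision. */
static int ilb_kick_wanted(struct rq *rq, int cpu)
{
	struct sched_domain *sd;

	/* This rq alone already has more than one runnable task. */
	if (rq->nr_running >= 2)
		return 1;

	for_each_domain(cpu, sd) {
		struct sched_group *sg = sd->groups;
		int nr_busy = atomic_read(&sg->sgp->nr_busy_cpus);

		/* Multiple busy members in a group sharing package resources. */
		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
			return 1;

		/* SD_ASYM_PACKING: a lower numbered cpu in this domain is idle. */
		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight &&
		    cpumask_first_and(nohz.idle_cpus_mask,
				      sched_domain_span(sd)) < cpu)
			return 1;

		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
			break;
	}
	return 0;
}

Because the nr_busy_cpus counters are kept per scheduler group and walked
along the sched domain topology, the frequently updated state is shared only
among CPUs of the same group rather than globally, which is what reduces the
cache-line contention described above.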
These changes improved a context-switch-intensive workload running between a
number of task pairs by 2x on an 8-socket NHM-EX based system.
Reported-by: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/fair.c | 160
1 file changed, 47 insertions(+), 113 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e050563e97a4..821af14335f3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4727,28 +4727,17 @@ out_unlock:
 #ifdef CONFIG_NO_HZ
 /*
  * idle load balancing details
- * - One of the idle CPUs nominates itself as idle load_balancer, while
- *   entering idle.
- * - This idle load balancer CPU will also go into tickless mode when
- *   it is idle, just like all other idle CPUs
  * - When one of the busy CPUs notice that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
  */
 static struct {
-	atomic_t load_balancer;
-	atomic_t first_pick_cpu;
-	atomic_t second_pick_cpu;
 	cpumask_var_t idle_cpus_mask;
 	cpumask_var_t grp_idle_mask;
+	atomic_t nr_cpus;
 	unsigned long next_balance;	/* in jiffy units */
 } nohz ____cacheline_aligned;
 
-int get_nohz_load_balancer(void)
-{
-	return atomic_read(&nohz.load_balancer);
-}
-
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 /**
  * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4825,9 +4814,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
  */
 static int find_new_ilb(int cpu)
 {
+	int ilb = cpumask_first(nohz.idle_cpus_mask);
 	struct sched_domain *sd;
 	struct sched_group *ilb_group;
-	int ilb = nr_cpu_ids;
 
 	/*
 	 * Have idle load balancer selection from semi-idle packages only
@@ -4881,13 +4870,10 @@ static void nohz_balancer_kick(int cpu)
 
 	nohz.next_balance++;
 
-	ilb_cpu = get_nohz_load_balancer();
+	ilb_cpu = find_new_ilb(cpu);
 
-	if (ilb_cpu >= nr_cpu_ids) {
-		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
-		if (ilb_cpu >= nr_cpu_ids)
-			return;
-	}
+	if (ilb_cpu >= nr_cpu_ids)
+		return;
 
 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
 		return;
@@ -4932,77 +4918,20 @@ void set_cpu_sd_state_idle(void)
 }
 
 /*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus.
- *
- * When the ilb owner becomes busy, we will not have new ilb owner until some
- * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
- * idle load balancing by kicking one of the idle CPUs.
- *
- * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
- * ilb owner CPU in future (when there is a need for idle load balancing on
- * behalf of all idle CPUs).
+ * This routine will record that this cpu is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
  */
 void select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		if (!cpu_active(cpu)) {
-			if (atomic_read(&nohz.load_balancer) != cpu)
-				return;
-
-			/*
-			 * If we are going offline and still the leader,
-			 * give up!
-			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
-					   nr_cpu_ids) != cpu)
-				BUG();
-
+		if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
 			return;
-		}
 
 		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
-
-		if (atomic_read(&nohz.first_pick_cpu) == cpu)
-			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
-		if (atomic_read(&nohz.second_pick_cpu) == cpu)
-			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-
-		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
-			int new_ilb;
-
-			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
-					   cpu) != nr_cpu_ids)
-				return;
-
-			/*
-			 * Check to see if there is a more power-efficient
-			 * ilb.
-			 */
-			new_ilb = find_new_ilb(cpu);
-			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, nr_cpu_ids);
-				resched_cpu(new_ilb);
-				return;
-			}
-			return;
-		}
-
+		atomic_inc(&nohz.nr_cpus);
 		set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-	} else {
-		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-			return;
-
-		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-
-		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
-					   nr_cpu_ids) != cpu)
-				BUG();
 	}
 	return;
 }
@@ -5113,7 +5042,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
 		goto end;
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-		if (balance_cpu == this_cpu)
+		if (balance_cpu == this_cpu || !idle_cpu(this_cpu))
 			continue;
 
 		/*
@@ -5141,22 +5070,18 @@ end:
 }
 
 /*
- * Current heuristic for kicking the idle load balancer
- * - first_pick_cpu is the one of the busy CPUs. It will kick
- *   idle load balancer when it has more than one process active. This
- *   eliminates the need for idle load balancing altogether when we have
- *   only one running process in the system (common case).
- * - If there are more than one busy CPU, idle load balancer may have
- *   to run for active_load_balance to happen (i.e., two busy CPUs are
- *   SMT or core siblings and can run better if they move to different
- *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
- *   which will kick idle load balancer as soon as it has any load.
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu is the system.
+ *   - This rq has more than one task.
+ *   - At any scheduler domain level, this cpu's scheduler group has multiple
+ *     busy cpu's exceeding the group's power.
+ *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ *     domain span are idle.
  */
 static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
 	unsigned long now = jiffies;
-	int ret;
-	int first_pick_cpu, second_pick_cpu;
+	struct sched_domain *sd;
 
 	if (unlikely(idle_cpu(cpu)))
 		return 0;
@@ -5166,32 +5091,44 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 	 * busy tick after returning from idle, we will update the busy stats.
 	 */
 	set_cpu_sd_state_busy();
-	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))))
+	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+		atomic_dec(&nohz.nr_cpus);
+	}
+
+	/*
+	 * None are in tickless mode and hence no need for NOHZ idle load
+	 * balancing.
+	 */
+	if (likely(!atomic_read(&nohz.nr_cpus)))
+		return 0;
 
 	if (time_before(now, nohz.next_balance))
 		return 0;
 
-	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
-	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+	if (rq->nr_running >= 2)
+		goto need_kick;
 
-	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
-	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
-		return 0;
+	for_each_domain(cpu, sd) {
+		struct sched_group *sg = sd->groups;
+		struct sched_group_power *sgp = sg->sgp;
+		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
-	if (ret == nr_cpu_ids || ret == cpu) {
-		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-		if (rq->nr_running > 1)
-			return 1;
-	} else {
-		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
-		if (ret == nr_cpu_ids || ret == cpu) {
-			if (rq->nr_running)
-				return 1;
-		}
+		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
+			goto need_kick;
+
+		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+		    && (cpumask_first_and(nohz.idle_cpus_mask,
+					  sched_domain_span(sd)) < cpu))
+			goto need_kick;
+
+		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
+			break;
 	}
 	return 0;
+need_kick:
+	return 1;
 }
 #else
 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -5652,9 +5589,6 @@ __init void init_sched_fair_class(void)
 #ifdef CONFIG_NO_HZ
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
-	atomic_set(&nohz.load_balancer, nr_cpu_ids);
-	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
-	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 #endif /* SMP */
 