author	Suresh Siddha <suresh.b.siddha@intel.com>	2011-12-01 20:07:34 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-12-06 03:06:34 -0500
commit	0b005cf54eac170a8f22540ab096a6e07bf49e7c
tree	d06b2c7d1b6286f4116f94b9d4b38779e885a9b2
parent	69e1e811dcc436a6b129dbef273ad9ec22d095ce
sched, nohz: Implement sched group, domain aware nohz idle load balancing
When there are many logical CPUs that enter and exit idle often, members of
the global nohz data structure are modified very frequently, causing a lot of
cache-line contention.

Make the nohz idle load balancing more scalable by using the sched domain
topology and the 'nr_busy_cpus' count in struct sched_group_power.

Idle load balance is kicked on one of the idle CPUs when there is at least
one idle CPU and:

- a busy rq has more than one task, or
- a busy rq's scheduler group that shares package resources (like HT/MC
  siblings) has more than one busy member, or
- for the SD_ASYM_PACKING domain, the lower numbered CPUs in that domain
  are idle compared to the busy ones.

This helps kick the idle load balancing request only when there is a
potential imbalance; once the system is mostly balanced, these kicks are
minimized.

These changes improved a context-switch-intensive workload running between a
number of task pairs by 2x on an 8-socket NHM-EX based system.

Reported-by: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
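The kick heuristic described above is implemented in nohz_kick_needed() in the
patch below. As a rough, self-contained illustration of that decision (all of
the toy_* types, fields, and values here are invented for this sketch and are
not kernel code), a busy CPU walks its scheduler domains and checks the
per-group busy count:

/*
 * Toy user-space model of the new nohz kick heuristic.  Purely
 * illustrative; the real logic is nohz_kick_needed() in
 * kernel/sched/fair.c (see the diff below).
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_group { int nr_busy_cpus; int group_weight; };
struct toy_domain {
	bool share_pkg_resources;	/* models SD_SHARE_PKG_RESOURCES */
	bool asym_packing;		/* models SD_ASYM_PACKING */
	struct toy_group group;		/* this cpu's group in the domain */
	int first_idle_cpu_in_span;	/* -1 if no idle cpu in the span */
};

/* Should this busy cpu kick an idle cpu to run idle load balancing? */
static bool toy_nohz_kick_needed(int cpu, int rq_nr_running,
				 int nr_tickless_cpus,
				 const struct toy_domain *domains,
				 int nr_domains)
{
	if (nr_tickless_cpus == 0)	/* nobody is tickless: nothing to do */
		return false;
	if (rq_nr_running >= 2)		/* this rq has more than one task */
		return true;

	for (int i = 0; i < nr_domains; i++) {
		const struct toy_domain *sd = &domains[i];
		int nr_busy = sd->group.nr_busy_cpus;

		/* siblings sharing package resources, more than one busy */
		if (sd->share_pkg_resources && nr_busy > 1)
			return true;

		/* asymmetric packing: a lower-numbered cpu in the span is idle */
		if (sd->asym_packing && nr_busy != sd->group.group_weight &&
		    sd->first_idle_cpu_in_span >= 0 &&
		    sd->first_idle_cpu_in_span < cpu)
			return true;

		if (!sd->share_pkg_resources && !sd->asym_packing)
			break;
	}
	return false;
}

int main(void)
{
	/* cpu 3 runs one task, its SMT sibling is also busy, 4 cpus tickless */
	struct toy_domain smt = {
		.share_pkg_resources = true,
		.group = { .nr_busy_cpus = 2, .group_weight = 2 },
		.first_idle_cpu_in_span = -1,
	};
	printf("kick: %d\n", toy_nohz_kick_needed(3, 1, 4, &smt, 1));
	return 0;
}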
 kernel/sched/fair.c | 160
 1 file changed, 47 insertions(+), 113 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e050563e97a4..821af14335f3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4727,28 +4727,17 @@ out_unlock:
 #ifdef CONFIG_NO_HZ
 /*
  * idle load balancing details
- * - One of the idle CPUs nominates itself as idle load_balancer, while
- *   entering idle.
- * - This idle load balancer CPU will also go into tickless mode when
- *   it is idle, just like all other idle CPUs
  * - When one of the busy CPUs notice that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
  */
 static struct {
-	atomic_t load_balancer;
-	atomic_t first_pick_cpu;
-	atomic_t second_pick_cpu;
 	cpumask_var_t idle_cpus_mask;
 	cpumask_var_t grp_idle_mask;
+	atomic_t nr_cpus;
 	unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 
-int get_nohz_load_balancer(void)
-{
-	return atomic_read(&nohz.load_balancer);
-}
-
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 /**
  * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4825,9 +4814,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
  */
 static int find_new_ilb(int cpu)
 {
+	int ilb = cpumask_first(nohz.idle_cpus_mask);
 	struct sched_domain *sd;
 	struct sched_group *ilb_group;
-	int ilb = nr_cpu_ids;
 
 	/*
 	 * Have idle load balancer selection from semi-idle packages only
@@ -4881,13 +4870,10 @@ static void nohz_balancer_kick(int cpu)
 
 	nohz.next_balance++;
 
-	ilb_cpu = get_nohz_load_balancer();
+	ilb_cpu = find_new_ilb(cpu);
 
-	if (ilb_cpu >= nr_cpu_ids) {
-		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
-		if (ilb_cpu >= nr_cpu_ids)
-			return;
-	}
+	if (ilb_cpu >= nr_cpu_ids)
+		return;
 
 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
 		return;
@@ -4932,77 +4918,20 @@ void set_cpu_sd_state_idle(void)
 }
 
 /*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus.
- *
- * When the ilb owner becomes busy, we will not have new ilb owner until some
- * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
- * idle load balancing by kicking one of the idle CPUs.
- *
- * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
- * ilb owner CPU in future (when there is a need for idle load balancing on
- * behalf of all idle CPUs).
+ * This routine will record that this cpu is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
  */
 void select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		if (!cpu_active(cpu)) {
-			if (atomic_read(&nohz.load_balancer) != cpu)
-				return;
-
-			/*
-			 * If we are going offline and still the leader,
-			 * give up!
-			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
-					   nr_cpu_ids) != cpu)
-				BUG();
-
+		if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
 			return;
-		}
 
 		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
-
-		if (atomic_read(&nohz.first_pick_cpu) == cpu)
-			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
-		if (atomic_read(&nohz.second_pick_cpu) == cpu)
-			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-
-		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
-			int new_ilb;
-
-			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
-					   cpu) != nr_cpu_ids)
-				return;
-
-			/*
-			 * Check to see if there is a more power-efficient
-			 * ilb.
-			 */
-			new_ilb = find_new_ilb(cpu);
-			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, nr_cpu_ids);
-				resched_cpu(new_ilb);
-				return;
-			}
-			return;
-		}
-
+		atomic_inc(&nohz.nr_cpus);
 		set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-	} else {
-		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-			return;
-
-		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-
-		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
-					   nr_cpu_ids) != cpu)
-				BUG();
 	}
 	return;
 }
@@ -5113,7 +5042,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
 		goto end;
 
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-		if (balance_cpu == this_cpu)
+		if (balance_cpu == this_cpu || !idle_cpu(this_cpu))
 			continue;
 
 		/*
@@ -5141,22 +5070,18 @@ end:
 }
 
 /*
- * Current heuristic for kicking the idle load balancer
- * - first_pick_cpu is the one of the busy CPUs. It will kick
- *   idle load balancer when it has more than one process active. This
- *   eliminates the need for idle load balancing altogether when we have
- *   only one running process in the system (common case).
- * - If there are more than one busy CPU, idle load balancer may have
- *   to run for active_load_balance to happen (i.e., two busy CPUs are
- *   SMT or core siblings and can run better if they move to different
- *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
- *   which will kick idle load balancer as soon as it has any load.
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu is the system.
+ * - This rq has more than one task.
+ * - At any scheduler domain level, this cpu's scheduler group has multiple
+ *   busy cpu's exceeding the group's power.
+ * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ *   domain span are idle.
  */
 static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
 	unsigned long now = jiffies;
-	int ret;
-	int first_pick_cpu, second_pick_cpu;
+	struct sched_domain *sd;
 
 	if (unlikely(idle_cpu(cpu)))
 		return 0;
@@ -5166,32 +5091,44 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 	 * busy tick after returning from idle, we will update the busy stats.
 	 */
 	set_cpu_sd_state_busy();
-	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))))
+	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+		atomic_dec(&nohz.nr_cpus);
+	}
+
+	/*
+	 * None are in tickless mode and hence no need for NOHZ idle load
+	 * balancing.
+	 */
+	if (likely(!atomic_read(&nohz.nr_cpus)))
+		return 0;
 
 	if (time_before(now, nohz.next_balance))
 		return 0;
 
-	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
-	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+	if (rq->nr_running >= 2)
+		goto need_kick;
 
-	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
-	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
-		return 0;
+	for_each_domain(cpu, sd) {
+		struct sched_group *sg = sd->groups;
+		struct sched_group_power *sgp = sg->sgp;
+		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
-	if (ret == nr_cpu_ids || ret == cpu) {
-		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-		if (rq->nr_running > 1)
-			return 1;
-	} else {
-		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
-		if (ret == nr_cpu_ids || ret == cpu) {
-			if (rq->nr_running)
-				return 1;
-		}
-	}
+		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
+			goto need_kick;
+
+		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+		    && (cpumask_first_and(nohz.idle_cpus_mask,
+					  sched_domain_span(sd)) < cpu))
+			goto need_kick;
+
+		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
+			break;
+	}
 	return 0;
+need_kick:
+	return 1;
 }
 #else
 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -5652,9 +5589,6 @@ __init void init_sched_fair_class(void)
 #ifdef CONFIG_NO_HZ
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
-	atomic_set(&nohz.load_balancer, nr_cpu_ids);
-	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
-	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 #endif /* SMP */
 