author     Siddha, Suresh B <suresh.b.siddha@intel.com>           2007-05-08 03:32:51 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-05-08 14:15:17 -0400
commit     46cb4b7c88fa5517f64b5bee42939ea3614cddcb
tree       429b8092394974ae787bf0cfaefe5c7b6a1da782
parent     bdecea3a9282d529b54954f3f1e59877629baba1
sched: dynticks idle load balancing
Fix the process idle load balancing in the presence of dynticks. CPUs for which ticks are stopped will sleep until the next event wakes them up. These sleeps can potentially be of large durations, during which there is currently no periodic idle load balancing being done.

This patch nominates an owner among the idle CPUs, which does the idle load balancing on behalf of the other idle CPUs. Once all the CPUs are completely idle, this idle load balancing is stopped as well. The checks added to the fast path are minimized: whenever there are busy CPUs in the system, there will be an owner (an idle CPU) doing the system-wide idle load balancing.

Open items:
1. Intelligent owner selection (like an idle core in a busy package).
2. Merge with rcu's nohz_cpu_mask?

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/sched.h     |   8
-rw-r--r--  kernel/sched.c            | 223
-rw-r--r--  kernel/time/tick-sched.c  |   9
3 files changed, 227 insertions(+), 13 deletions(-)
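Before the diff itself, here is a minimal userspace sketch of the owner-nomination protocol the changelog describes. It is illustrative only: model_select_ilb(), nohz_cpus, online_cpus and MODEL_NR_CPUS are made-up names, and C11 <stdatomic.h> plus a plain bitmask stand in for the kernel's atomic_cmpxchg() on nohz.load_balancer and the cpumask API shown in the kernel/sched.c hunks below.

/*
 * Userspace model of ilb (idle load balancing) owner nomination.
 * Not kernel code; names and data structures are simplified stand-ins.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_NR_CPUS 4

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1); /* ilb owner, -1 = none */
static atomic_int nohz_cpus = ATOMIC_VAR_INIT(0);      /* bitmask of tickless cpus */
static int online_cpus = MODEL_NR_CPUS;

/* Returns 1 if @cpu must keep its tick running to act as the ilb owner. */
static int model_select_ilb(int cpu, bool stop_tick)
{
	int expected;

	if (stop_tick) {
		atomic_fetch_or(&nohz_cpus, 1 << cpu);

		/* Everyone is idle: nobody needs to keep a tick for balancing. */
		if (__builtin_popcount(atomic_load(&nohz_cpus)) == online_cpus) {
			expected = cpu;
			atomic_compare_exchange_strong(&load_balancer, &expected, -1);
			return 0;
		}

		/* Claim ownership if the slot is free, or keep it if already ours. */
		expected = -1;
		if (atomic_compare_exchange_strong(&load_balancer, &expected, cpu))
			return 1;
		return atomic_load(&load_balancer) == cpu;
	}

	/* Tick restarting: leave the tickless set, drop ownership if we held it. */
	atomic_fetch_and(&nohz_cpus, ~(1 << cpu));
	expected = cpu;
	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
	return 0;
}

int main(void)
{
	printf("cpu1 stops tick -> keeps tick as ilb owner: %d\n", model_select_ilb(1, true));
	printf("cpu2 stops tick -> may really sleep:        %d\n", model_select_ilb(2, true));
	printf("cpu1 becomes busy -> releases ownership:    %d\n", model_select_ilb(1, false));
	printf("owner slot is now %d (free for the next idle cpu)\n",
	       atomic_load(&load_balancer));
	return 0;
}

In the real patch the same claim/release is done with atomic_cmpxchg(&nohz.load_balancer, ...) from select_nohz_load_balancer(), and the re-nomination of a new owner happens from trigger_load_balance() at the next busy tick.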
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6312521df2c1..15ab3e039535 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -194,6 +194,14 @@ extern void sched_init_smp(void);
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern cpumask_t nohz_cpu_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
+extern int select_nohz_load_balancer(int cpu);
+#else
+static inline int select_nohz_load_balancer(int cpu)
+{
+	return 0;
+}
+#endif
 
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
diff --git a/kernel/sched.c b/kernel/sched.c
index ba053d88c8c6..74599286230c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -224,6 +224,9 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[3];
 	unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+	unsigned char in_nohz_recently;
+#endif
 #endif
 	unsigned long long nr_switches;
 
@@ -1050,6 +1053,17 @@ static void resched_task(struct task_struct *p)
 	if (!tsk_is_polling(p))
 		smp_send_reschedule(cpu);
 }
+
+static void resched_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	if (!spin_trylock_irqsave(&rq->lock, flags))
+		return;
+	resched_task(cpu_curr(cpu));
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
 #else
 static inline void resched_task(struct task_struct *p)
 {
@@ -2658,6 +2672,12 @@ redo:
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
+		/*
+		 * some other cpu did the load balance for us.
+		 */
+		if (nr_moved && this_cpu != smp_processor_id())
+			resched_cpu(this_cpu);
+
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(all_pinned)) {
 			cpu_clear(cpu_of(busiest), cpus);
@@ -2928,27 +2948,98 @@ static void update_load(struct rq *this_rq)
 	}
 }
 
+#ifdef CONFIG_NO_HZ
+static struct {
+	atomic_t load_balancer;
+	cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+	.load_balancer = ATOMIC_INIT(-1),
+	.cpu_mask = CPU_MASK_NONE,
+};
+
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
+ *
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask..
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for ilb owner.
  *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick()
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+	int cpu = smp_processor_id();
+
+	if (stop_tick) {
+		cpu_set(cpu, nohz.cpu_mask);
+		cpu_rq(cpu)->in_nohz_recently = 1;
+
+		/*
+		 * If we are going offline and still the leader, give up!
+		 */
+		if (cpu_is_offline(cpu) &&
+		    atomic_read(&nohz.load_balancer) == cpu) {
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+				BUG();
+			return 0;
+		}
+
+		/* time for ilb owner also to sleep */
+		if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+			if (atomic_read(&nohz.load_balancer) == cpu)
+				atomic_set(&nohz.load_balancer, -1);
+			return 0;
+		}
+
+		if (atomic_read(&nohz.load_balancer) == -1) {
+			/* make me the ilb owner */
+			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+				return 1;
+		} else if (atomic_read(&nohz.load_balancer) == cpu)
+			return 1;
+	} else {
+		if (!cpu_isset(cpu, nohz.cpu_mask))
+			return 0;
+
+		cpu_clear(cpu, nohz.cpu_mask);
+
+		if (atomic_read(&nohz.load_balancer) == cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+				BUG();
+	}
+	return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
 {
-	int this_cpu = smp_processor_id(), balance = 1;
-	struct rq *this_rq = cpu_rq(this_cpu);
+	int balance = 1;
+	struct rq *rq = cpu_rq(cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
-	enum idle_type idle = this_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
-	/* Earliest time when we have to call run_rebalance_domains again */
+	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 
-	for_each_domain(this_cpu, sd) {
+	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
@@ -2967,7 +3058,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+			if (load_balance(cpu, rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
 				 * longer idle, or one of our SMT siblings is
@@ -2991,7 +3082,114 @@ out:
 		if (!balance)
 			break;
 	}
-	this_rq->next_balance = next_balance;
+	rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+	int local_cpu = smp_processor_id();
+	struct rq *local_rq = cpu_rq(local_cpu);
+	enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+	rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+	/*
+	 * If this cpu is the owner for idle load balancing, then do the
+	 * balancing on behalf of the other idle cpus whose ticks are
+	 * stopped.
+	 */
+	if (local_rq->idle_at_tick &&
+	    atomic_read(&nohz.load_balancer) == local_cpu) {
+		cpumask_t cpus = nohz.cpu_mask;
+		struct rq *rq;
+		int balance_cpu;
+
+		cpu_clear(local_cpu, cpus);
+		for_each_cpu_mask(balance_cpu, cpus) {
+			/*
+			 * If this cpu gets work to do, stop the load balancing
+			 * work being done for other cpus. Next load
+			 * balancing owner will pick it up.
+			 */
+			if (need_resched())
+				break;
+
+			rebalance_domains(balance_cpu, SCHED_IDLE);
+
+			rq = cpu_rq(balance_cpu);
+			if (time_after(local_rq->next_balance, rq->next_balance))
+				local_rq->next_balance = rq->next_balance;
+		}
+	}
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+	/*
+	 * If we were in the nohz mode recently and busy at the current
+	 * scheduler tick, then check if we need to nominate new idle
+	 * load balancer.
+	 */
+	if (rq->in_nohz_recently && !rq->idle_at_tick) {
+		rq->in_nohz_recently = 0;
+
+		if (atomic_read(&nohz.load_balancer) == cpu) {
+			cpu_clear(cpu, nohz.cpu_mask);
+			atomic_set(&nohz.load_balancer, -1);
+		}
+
+		if (atomic_read(&nohz.load_balancer) == -1) {
+			/*
+			 * simple selection for now: Nominate the
+			 * first cpu in the nohz list to be the next
+			 * ilb owner.
+			 *
+			 * TBD: Traverse the sched domains and nominate
+			 * the nearest cpu in the nohz.cpu_mask.
+			 */
+			int ilb = first_cpu(nohz.cpu_mask);
+
+			if (ilb != NR_CPUS)
+				resched_cpu(ilb);
+		}
+	}
+
+	/*
+	 * If this cpu is idle and doing idle load balancing for all the
+	 * cpus with ticks stopped, is it time for that to stop?
+	 */
+	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+	    cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+		resched_cpu(cpu);
+		return;
+	}
+
+	/*
+	 * If this cpu is idle and the idle load balancing is done by
+	 * someone else, then no need raise the SCHED_SOFTIRQ
+	 */
+	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+	    cpu_isset(cpu, nohz.cpu_mask))
+		return;
+#endif
+	if (time_after_eq(jiffies, rq->next_balance))
+		raise_softirq(SCHED_SOFTIRQ);
 }
 #else
 /*
@@ -3224,8 +3422,7 @@ void scheduler_tick(void)
 #ifdef CONFIG_SMP
 	update_load(rq);
 	rq->idle_at_tick = idle_at_tick;
-	if (time_after_eq(jiffies, rq->next_balance))
-		raise_softirq(SCHED_SOFTIRQ);
+	trigger_load_balance(cpu);
 #endif
 }
 
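Taken together, the kernel/sched.c changes above split the old softirq handler in two: rebalance_domains() does the per-cpu domain walk, and run_rebalance_domains() additionally loops over nohz.cpu_mask when the current CPU is the ilb owner. The following condensed, standalone model of that owner-side loop is illustrative only; model_rebalance() and model_need_resched() are hypothetical stand-ins for rebalance_domains() and need_resched(), and a plain bitmask stands in for nohz.cpu_mask.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for rebalance_domains() and need_resched(). */
static void model_rebalance(int cpu) { printf("rebalance cpu %d\n", cpu); }
static bool model_need_resched(void) { return false; }

/*
 * Condensed model of run_rebalance_domains(): balance this cpu first, then,
 * if it is the ilb owner and was idle at the tick, walk the tickless cpus
 * and balance on their behalf, bailing out as soon as local work arrives.
 */
static void model_run_rebalance(int this_cpu, int ilb_owner,
				unsigned int nohz_mask, int nr_cpus)
{
	model_rebalance(this_cpu);

	if (this_cpu != ilb_owner)
		return;

	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		if (cpu == this_cpu || !(nohz_mask & (1u << cpu)))
			continue;
		if (model_need_resched())
			break;		/* local work arrived: stop early */
		model_rebalance(cpu);	/* balance on behalf of a sleeping cpu */
	}
}

int main(void)
{
	/* cpu 1 is the ilb owner; cpus 1, 2 and 3 have stopped their ticks. */
	model_run_rebalance(1, 1, 0x0e, 4);
	return 0;
}

The early break on need_resched() mirrors the kernel comment above: if the owner gets work of its own, it abandons the remaining idle CPUs and the next owner picks them up.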
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4fc867f467d..3483e6cb9549 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,6 +217,14 @@ void tick_nohz_stop_sched_tick(void)
 		 * the scheduler tick in nohz_restart_sched_tick.
 		 */
 		if (!ts->tick_stopped) {
+			if (select_nohz_load_balancer(1)) {
+				/*
+				 * sched tick not stopped!
+				 */
+				cpu_clear(cpu, nohz_cpu_mask);
+				goto out;
+			}
+
 			ts->idle_tick = ts->sched_timer.expires;
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
@@ -285,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
 	now = ktime_get();
 
 	local_irq_disable();
+	select_nohz_load_balancer(0);
 	tick_do_update_jiffies64(now);
 	cpu_clear(cpu, nohz_cpu_mask);
 