aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched.h8
-rw-r--r--kernel/sched.c223
-rw-r--r--kernel/time/tick-sched.c9
3 files changed, 227 insertions, 13 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6312521df2c1..15ab3e039535 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -194,6 +194,14 @@ extern void sched_init_smp(void);
194extern void init_idle(struct task_struct *idle, int cpu); 194extern void init_idle(struct task_struct *idle, int cpu);
195 195
196extern cpumask_t nohz_cpu_mask; 196extern cpumask_t nohz_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
/*
 * select_nohz_load_balancer - idle load balancer (ilb) bookkeeping hook
 * @stop_tick: non-zero when the calling cpu is about to stop its tick,
 *             zero when it is restarting its tick.
 *
 * Returns non-zero when the calling cpu must keep its tick running
 * because it is the idle load balancer, zero otherwise.
 */
extern int select_nohz_load_balancer(int stop_tick);
#else
/*
 * Without SMP or NO_HZ there is no idle load balancer to nominate:
 * always report that the tick may be stopped.
 */
static inline int select_nohz_load_balancer(int stop_tick)
{
	return 0;
}
#endif
197 205
198/* 206/*
199 * Only dump TASK_* tasks. (0 for all tasks) 207 * Only dump TASK_* tasks. (0 for all tasks)
diff --git a/kernel/sched.c b/kernel/sched.c
index ba053d88c8c6..74599286230c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -224,6 +224,9 @@ struct rq {
224#ifdef CONFIG_SMP 224#ifdef CONFIG_SMP
225 unsigned long cpu_load[3]; 225 unsigned long cpu_load[3];
226 unsigned char idle_at_tick; 226 unsigned char idle_at_tick;
227#ifdef CONFIG_NO_HZ
228 unsigned char in_nohz_recently;
229#endif
227#endif 230#endif
228 unsigned long long nr_switches; 231 unsigned long long nr_switches;
229 232
@@ -1050,6 +1053,17 @@ static void resched_task(struct task_struct *p)
1050 if (!tsk_is_polling(p)) 1053 if (!tsk_is_polling(p))
1051 smp_send_reschedule(cpu); 1054 smp_send_reschedule(cpu);
1052} 1055}
1056
1057static void resched_cpu(int cpu)
1058{
1059 struct rq *rq = cpu_rq(cpu);
1060 unsigned long flags;
1061
1062 if (!spin_trylock_irqsave(&rq->lock, flags))
1063 return;
1064 resched_task(cpu_curr(cpu));
1065 spin_unlock_irqrestore(&rq->lock, flags);
1066}
1053#else 1067#else
1054static inline void resched_task(struct task_struct *p) 1068static inline void resched_task(struct task_struct *p)
1055{ 1069{
@@ -2658,6 +2672,12 @@ redo:
2658 double_rq_unlock(this_rq, busiest); 2672 double_rq_unlock(this_rq, busiest);
2659 local_irq_restore(flags); 2673 local_irq_restore(flags);
2660 2674
2675 /*
2676 * some other cpu did the load balance for us.
2677 */
2678 if (nr_moved && this_cpu != smp_processor_id())
2679 resched_cpu(this_cpu);
2680
2661 /* All tasks on this runqueue were pinned by CPU affinity */ 2681 /* All tasks on this runqueue were pinned by CPU affinity */
2662 if (unlikely(all_pinned)) { 2682 if (unlikely(all_pinned)) {
2663 cpu_clear(cpu_of(busiest), cpus); 2683 cpu_clear(cpu_of(busiest), cpus);
@@ -2928,27 +2948,98 @@ static void update_load(struct rq *this_rq)
2928 } 2948 }
2929} 2949}
2930 2950
2951#ifdef CONFIG_NO_HZ
2952static struct {
2953 atomic_t load_balancer;
2954 cpumask_t cpu_mask;
2955} nohz ____cacheline_aligned = {
2956 .load_balancer = ATOMIC_INIT(-1),
2957 .cpu_mask = CPU_MASK_NONE,
2958};
2959
2931/* 2960/*
2932 * run_rebalance_domains is triggered when needed from the scheduler tick. 2961 * This routine will try to nominate the ilb (idle load balancing)
2962 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2963 * load balancing on behalf of all those cpus. If all the cpus in the system
2964 * go into this tickless mode, then there will be no ilb owner (as there is
2965 * no need for one) and all the cpus will sleep till the next wakeup event
2966 * arrives...
2967 *
2968 * For the ilb owner, the tick is not stopped; this tick is used for
2969 * idle load balancing. The ilb owner is still part of
2970 * nohz.cpu_mask.
2971 *
2972 * While stopping its tick, a cpu becomes the ilb owner if there is no
2973 * other owner. It remains the owner until it becomes busy, or until
2974 * all cpus in the system have stopped their ticks, at which point
2975 * there is no need for an ilb owner.
2933 * 2976 *
2977 * When the ilb owner becomes busy, it nominates another owner, during the
2978 * next busy scheduler_tick()
2979 */
2980int select_nohz_load_balancer(int stop_tick)
2981{
2982 int cpu = smp_processor_id();
2983
2984 if (stop_tick) {
2985 cpu_set(cpu, nohz.cpu_mask);
2986 cpu_rq(cpu)->in_nohz_recently = 1;
2987
2988 /*
2989 * If we are going offline and still the leader, give up!
2990 */
2991 if (cpu_is_offline(cpu) &&
2992 atomic_read(&nohz.load_balancer) == cpu) {
2993 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2994 BUG();
2995 return 0;
2996 }
2997
2998 /* time for ilb owner also to sleep */
2999 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3000 if (atomic_read(&nohz.load_balancer) == cpu)
3001 atomic_set(&nohz.load_balancer, -1);
3002 return 0;
3003 }
3004
3005 if (atomic_read(&nohz.load_balancer) == -1) {
3006 /* make me the ilb owner */
3007 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3008 return 1;
3009 } else if (atomic_read(&nohz.load_balancer) == cpu)
3010 return 1;
3011 } else {
3012 if (!cpu_isset(cpu, nohz.cpu_mask))
3013 return 0;
3014
3015 cpu_clear(cpu, nohz.cpu_mask);
3016
3017 if (atomic_read(&nohz.load_balancer) == cpu)
3018 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3019 BUG();
3020 }
3021 return 0;
3022}
3023#endif
3024
3025static DEFINE_SPINLOCK(balancing);
3026
3027/*
2934 * It checks each scheduling domain to see if it is due to be balanced, 3028 * It checks each scheduling domain to see if it is due to be balanced,
2935 * and initiates a balancing operation if so. 3029 * and initiates a balancing operation if so.
2936 * 3030 *
2937 * Balancing parameters are set up in arch_init_sched_domains. 3031 * Balancing parameters are set up in arch_init_sched_domains.
2938 */ 3032 */
2939static DEFINE_SPINLOCK(balancing); 3033static inline void rebalance_domains(int cpu, enum idle_type idle)
2940
2941static void run_rebalance_domains(struct softirq_action *h)
2942{ 3034{
2943 int this_cpu = smp_processor_id(), balance = 1; 3035 int balance = 1;
2944 struct rq *this_rq = cpu_rq(this_cpu); 3036 struct rq *rq = cpu_rq(cpu);
2945 unsigned long interval; 3037 unsigned long interval;
2946 struct sched_domain *sd; 3038 struct sched_domain *sd;
2947 enum idle_type idle = this_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 3039 /* Earliest time when we have to do rebalance again */
2948 /* Earliest time when we have to call run_rebalance_domains again */
2949 unsigned long next_balance = jiffies + 60*HZ; 3040 unsigned long next_balance = jiffies + 60*HZ;
2950 3041
2951 for_each_domain(this_cpu, sd) { 3042 for_each_domain(cpu, sd) {
2952 if (!(sd->flags & SD_LOAD_BALANCE)) 3043 if (!(sd->flags & SD_LOAD_BALANCE))
2953 continue; 3044 continue;
2954 3045
@@ -2967,7 +3058,7 @@ static void run_rebalance_domains(struct softirq_action *h)
2967 } 3058 }
2968 3059
2969 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3060 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2970 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { 3061 if (load_balance(cpu, rq, sd, idle, &balance)) {
2971 /* 3062 /*
2972 * We've pulled tasks over so either we're no 3063 * We've pulled tasks over so either we're no
2973 * longer idle, or one of our SMT siblings is 3064 * longer idle, or one of our SMT siblings is
@@ -2991,7 +3082,114 @@ out:
2991 if (!balance) 3082 if (!balance)
2992 break; 3083 break;
2993 } 3084 }
2994 this_rq->next_balance = next_balance; 3085 rq->next_balance = next_balance;
3086}
3087
3088/*
3089 * run_rebalance_domains is triggered when needed from the scheduler tick.
3090 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3091 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3092 */
3093static void run_rebalance_domains(struct softirq_action *h)
3094{
3095 int local_cpu = smp_processor_id();
3096 struct rq *local_rq = cpu_rq(local_cpu);
3097 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
3098
3099 rebalance_domains(local_cpu, idle);
3100
3101#ifdef CONFIG_NO_HZ
3102 /*
3103 * If this cpu is the owner for idle load balancing, then do the
3104 * balancing on behalf of the other idle cpus whose ticks are
3105 * stopped.
3106 */
3107 if (local_rq->idle_at_tick &&
3108 atomic_read(&nohz.load_balancer) == local_cpu) {
3109 cpumask_t cpus = nohz.cpu_mask;
3110 struct rq *rq;
3111 int balance_cpu;
3112
3113 cpu_clear(local_cpu, cpus);
3114 for_each_cpu_mask(balance_cpu, cpus) {
3115 /*
3116 * If this cpu gets work to do, stop the load balancing
3117 * work being done for other cpus. Next load
3118 * balancing owner will pick it up.
3119 */
3120 if (need_resched())
3121 break;
3122
3123 rebalance_domains(balance_cpu, SCHED_IDLE);
3124
3125 rq = cpu_rq(balance_cpu);
3126 if (time_after(local_rq->next_balance, rq->next_balance))
3127 local_rq->next_balance = rq->next_balance;
3128 }
3129 }
3130#endif
3131}
3132
3133/*
3134 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3135 *
3136 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3137 * idle load balancing owner or decide to stop the periodic load balancing,
3138 * if the whole system is idle.
3139 */
3140static inline void trigger_load_balance(int cpu)
3141{
3142 struct rq *rq = cpu_rq(cpu);
3143#ifdef CONFIG_NO_HZ
3144 /*
3145 * If we were in the nohz mode recently and busy at the current
3146 * scheduler tick, then check if we need to nominate new idle
3147 * load balancer.
3148 */
3149 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3150 rq->in_nohz_recently = 0;
3151
3152 if (atomic_read(&nohz.load_balancer) == cpu) {
3153 cpu_clear(cpu, nohz.cpu_mask);
3154 atomic_set(&nohz.load_balancer, -1);
3155 }
3156
3157 if (atomic_read(&nohz.load_balancer) == -1) {
3158 /*
3159 * simple selection for now: Nominate the
3160 * first cpu in the nohz list to be the next
3161 * ilb owner.
3162 *
3163 * TBD: Traverse the sched domains and nominate
3164 * the nearest cpu in the nohz.cpu_mask.
3165 */
3166 int ilb = first_cpu(nohz.cpu_mask);
3167
3168 if (ilb != NR_CPUS)
3169 resched_cpu(ilb);
3170 }
3171 }
3172
3173 /*
3174 * If this cpu is idle and doing idle load balancing for all the
3175 * cpus with ticks stopped, is it time for that to stop?
3176 */
3177 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3178 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3179 resched_cpu(cpu);
3180 return;
3181 }
3182
3183 /*
3184 * If this cpu is idle and the idle load balancing is done by
3185 * someone else, then no need raise the SCHED_SOFTIRQ
3186 */
3187 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3188 cpu_isset(cpu, nohz.cpu_mask))
3189 return;
3190#endif
3191 if (time_after_eq(jiffies, rq->next_balance))
3192 raise_softirq(SCHED_SOFTIRQ);
2995} 3193}
2996#else 3194#else
2997/* 3195/*
@@ -3224,8 +3422,7 @@ void scheduler_tick(void)
3224#ifdef CONFIG_SMP 3422#ifdef CONFIG_SMP
3225 update_load(rq); 3423 update_load(rq);
3226 rq->idle_at_tick = idle_at_tick; 3424 rq->idle_at_tick = idle_at_tick;
3227 if (time_after_eq(jiffies, rq->next_balance)) 3425 trigger_load_balance(cpu);
3228 raise_softirq(SCHED_SOFTIRQ);
3229#endif 3426#endif
3230} 3427}
3231 3428
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4fc867f467d..3483e6cb9549 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,6 +217,14 @@ void tick_nohz_stop_sched_tick(void)
217 * the scheduler tick in nohz_restart_sched_tick. 217 * the scheduler tick in nohz_restart_sched_tick.
218 */ 218 */
219 if (!ts->tick_stopped) { 219 if (!ts->tick_stopped) {
220 if (select_nohz_load_balancer(1)) {
221 /*
222 * sched tick not stopped!
223 */
224 cpu_clear(cpu, nohz_cpu_mask);
225 goto out;
226 }
227
220 ts->idle_tick = ts->sched_timer.expires; 228 ts->idle_tick = ts->sched_timer.expires;
221 ts->tick_stopped = 1; 229 ts->tick_stopped = 1;
222 ts->idle_jiffies = last_jiffies; 230 ts->idle_jiffies = last_jiffies;
@@ -285,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
285 now = ktime_get(); 293 now = ktime_get();
286 294
287 local_irq_disable(); 295 local_irq_disable();
296 select_nohz_load_balancer(0);
288 tick_do_update_jiffies64(now); 297 tick_do_update_jiffies64(now);
289 cpu_clear(cpu, nohz_cpu_mask); 298 cpu_clear(cpu, nohz_cpu_mask);
290 299