 include/linux/sched.h    |   8
 kernel/sched.c           | 223
 kernel/time/tick-sched.c |   9
 3 files changed, 227 insertions(+), 13 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6312521df2c1..15ab3e039535 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -194,6 +194,14 @@ extern void sched_init_smp(void);
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern cpumask_t nohz_cpu_mask;
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
+extern int select_nohz_load_balancer(int cpu);
+#else
+static inline int select_nohz_load_balancer(int cpu)
+{
+	return 0;
+}
+#endif
 
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
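
The hunk above is the feature's entire public interface: select_nohz_load_balancer() is declared only when both CONFIG_SMP and CONFIG_NO_HZ are set, and collapses to a static inline stub returning 0 otherwise, so callers in the timer code never need their own #ifdefs. Below is a minimal stand-alone sketch of that conditional-stub pattern, compilable on its own; the FEATURE_* macros and pick_owner() are illustrative names, not part of the patch.

#include <stdio.h>

/* Real declaration when the feature is compiled in, no-op stub otherwise. */
#if defined(FEATURE_SMP) && defined(FEATURE_NO_HZ)
extern int pick_owner(int stop_tick);
#else
static inline int pick_owner(int stop_tick)
{
	(void)stop_tick;
	return 0;		/* feature disabled: never become the owner */
}
#endif

int main(void)
{
	/* Callers can use the return value unconditionally. */
	printf("owner: %d\n", pick_owner(1));
	return 0;
}

With the feature macros undefined, the stub compiles to a constant 0 that the optimizer folds away, which mirrors what the real stub costs on builds without CONFIG_NO_HZ.
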
diff --git a/kernel/sched.c b/kernel/sched.c
index ba053d88c8c6..74599286230c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -224,6 +224,9 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[3];
 	unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+	unsigned char in_nohz_recently;
+#endif
 #endif
 	unsigned long long nr_switches;
 
@@ -1050,6 +1053,17 @@ static void resched_task(struct task_struct *p)
 	if (!tsk_is_polling(p))
 		smp_send_reschedule(cpu);
 }
+
+static void resched_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	if (!spin_trylock_irqsave(&rq->lock, flags))
+		return;
+	resched_task(cpu_curr(cpu));
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
 #else
 static inline void resched_task(struct task_struct *p)
 {
@@ -2658,6 +2672,12 @@ redo:
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
+		/*
+		 * some other cpu did the load balance for us.
+		 */
+		if (nr_moved && this_cpu != smp_processor_id())
+			resched_cpu(this_cpu);
+
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(all_pinned)) {
 			cpu_clear(cpu_of(busiest), cpus);
@@ -2928,27 +2948,98 @@ static void update_load(struct rq *this_rq)
 	}
 }
 
+#ifdef CONFIG_NO_HZ
+static struct {
+	atomic_t load_balancer;
+	cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+	.load_balancer = ATOMIC_INIT(-1),
+	.cpu_mask = CPU_MASK_NONE,
+};
+
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
+ *
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask..
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for ilb owner.
  *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick()
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+	int cpu = smp_processor_id();
+
+	if (stop_tick) {
+		cpu_set(cpu, nohz.cpu_mask);
+		cpu_rq(cpu)->in_nohz_recently = 1;
+
+		/*
+		 * If we are going offline and still the leader, give up!
+		 */
+		if (cpu_is_offline(cpu) &&
+		    atomic_read(&nohz.load_balancer) == cpu) {
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+				BUG();
+			return 0;
+		}
+
+		/* time for ilb owner also to sleep */
+		if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+			if (atomic_read(&nohz.load_balancer) == cpu)
+				atomic_set(&nohz.load_balancer, -1);
+			return 0;
+		}
+
+		if (atomic_read(&nohz.load_balancer) == -1) {
+			/* make me the ilb owner */
+			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+				return 1;
+		} else if (atomic_read(&nohz.load_balancer) == cpu)
+			return 1;
+	} else {
+		if (!cpu_isset(cpu, nohz.cpu_mask))
+			return 0;
+
+		cpu_clear(cpu, nohz.cpu_mask);
+
+		if (atomic_read(&nohz.load_balancer) == cpu)
+			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+				BUG();
+	}
+	return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
  *
  * Balancing parameters are set up in arch_init_sched_domains.
  */
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
 {
-	int this_cpu = smp_processor_id(), balance = 1;
-	struct rq *this_rq = cpu_rq(this_cpu);
+	int balance = 1;
+	struct rq *rq = cpu_rq(cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
-	enum idle_type idle = this_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
-	/* Earliest time when we have to call run_rebalance_domains again */
+	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 
-	for_each_domain(this_cpu, sd) {
+	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
@@ -2967,7 +3058,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+			if (load_balance(cpu, rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
 				 * longer idle, or one of our SMT siblings is
@@ -2991,7 +3082,114 @@ out:
 		if (!balance)
 			break;
 	}
-	this_rq->next_balance = next_balance;
+	rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+	int local_cpu = smp_processor_id();
+	struct rq *local_rq = cpu_rq(local_cpu);
+	enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+	rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+	/*
+	 * If this cpu is the owner for idle load balancing, then do the
+	 * balancing on behalf of the other idle cpus whose ticks are
+	 * stopped.
+	 */
+	if (local_rq->idle_at_tick &&
+	    atomic_read(&nohz.load_balancer) == local_cpu) {
+		cpumask_t cpus = nohz.cpu_mask;
+		struct rq *rq;
+		int balance_cpu;
+
+		cpu_clear(local_cpu, cpus);
+		for_each_cpu_mask(balance_cpu, cpus) {
+			/*
+			 * If this cpu gets work to do, stop the load balancing
+			 * work being done for other cpus. Next load
+			 * balancing owner will pick it up.
+			 */
+			if (need_resched())
+				break;
+
+			rebalance_domains(balance_cpu, SCHED_IDLE);
+
+			rq = cpu_rq(balance_cpu);
+			if (time_after(local_rq->next_balance, rq->next_balance))
+				local_rq->next_balance = rq->next_balance;
+		}
+	}
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+	/*
+	 * If we were in the nohz mode recently and busy at the current
+	 * scheduler tick, then check if we need to nominate new idle
+	 * load balancer.
+	 */
+	if (rq->in_nohz_recently && !rq->idle_at_tick) {
+		rq->in_nohz_recently = 0;
+
+		if (atomic_read(&nohz.load_balancer) == cpu) {
+			cpu_clear(cpu, nohz.cpu_mask);
+			atomic_set(&nohz.load_balancer, -1);
+		}
+
+		if (atomic_read(&nohz.load_balancer) == -1) {
+			/*
+			 * simple selection for now: Nominate the
+			 * first cpu in the nohz list to be the next
+			 * ilb owner.
+			 *
+			 * TBD: Traverse the sched domains and nominate
+			 * the nearest cpu in the nohz.cpu_mask.
+			 */
+			int ilb = first_cpu(nohz.cpu_mask);
+
+			if (ilb != NR_CPUS)
+				resched_cpu(ilb);
+		}
+	}
+
+	/*
+	 * If this cpu is idle and doing idle load balancing for all the
+	 * cpus with ticks stopped, is it time for that to stop?
+	 */
+	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+	    cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+		resched_cpu(cpu);
+		return;
+	}
+
+	/*
+	 * If this cpu is idle and the idle load balancing is done by
+	 * someone else, then no need raise the SCHED_SOFTIRQ
+	 */
+	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+	    cpu_isset(cpu, nohz.cpu_mask))
+		return;
+#endif
+	if (time_after_eq(jiffies, rq->next_balance))
+		raise_softirq(SCHED_SOFTIRQ);
 }
 #else
 /*
@@ -3224,8 +3422,7 @@ void scheduler_tick(void)
 #ifdef CONFIG_SMP
 	update_load(rq);
 	rq->idle_at_tick = idle_at_tick;
-	if (time_after_eq(jiffies, rq->next_balance))
-		raise_softirq(SCHED_SOFTIRQ);
+	trigger_load_balance(cpu);
 #endif
 }
 
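
The core of the kernel/sched.c changes is the owner election in select_nohz_load_balancer(): the first cpu to stop its tick claims nohz.load_balancer with an atomic cmpxchg and keeps its own tick running so it can call rebalance_domains() for every cpu in nohz.cpu_mask, while a busy tick releases the slot and trigger_load_balance() nominates a successor. The following stand-alone sketch shows just that election step with C11 atomics; the function names and main() driver are illustrative, and the kernel version additionally maintains nohz.cpu_mask, handles cpu hotplug, and gives up ownership once every online cpu is idle.

#include <stdatomic.h>
#include <stdio.h>

#define NO_OWNER (-1)

/* Mirrors nohz.load_balancer: the cpu currently doing idle load balancing. */
static atomic_int load_balancer = ATOMIC_VAR_INIT(NO_OWNER);

/* Called when @cpu stops its tick; returns 1 if it must keep the tick as owner. */
static int stop_tick_elect(int cpu)
{
	int expected = NO_OWNER;

	if (atomic_load(&load_balancer) == cpu)
		return 1;			/* already the owner */
	if (atomic_compare_exchange_strong(&load_balancer, &expected, cpu))
		return 1;			/* claimed ownership */
	return 0;				/* someone else balances for us */
}

/* Called when @cpu becomes busy again and restarts its tick. */
static void restart_tick_release(int cpu)
{
	int expected = cpu;

	/* Release ownership so the next busy tick can nominate a new owner. */
	atomic_compare_exchange_strong(&load_balancer, &expected, NO_OWNER);
}

int main(void)
{
	printf("cpu0: %d\n", stop_tick_elect(0));	/* 1: cpu0 becomes owner */
	printf("cpu1: %d\n", stop_tick_elect(1));	/* 0: cpu0 already owns it */
	restart_tick_release(0);
	printf("cpu1: %d\n", stop_tick_elect(1));	/* 1: cpu1 takes over */
	return 0;
}

The single-word compare-and-exchange is what keeps the nomination safe without a lock: at most one cpu can win the exchange, so at most one cpu believes it is the idle load balancer at any time.
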
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4fc867f467d..3483e6cb9549 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,6 +217,14 @@ void tick_nohz_stop_sched_tick(void)
 		 * the scheduler tick in nohz_restart_sched_tick.
 		 */
 		if (!ts->tick_stopped) {
+			if (select_nohz_load_balancer(1)) {
+				/*
+				 * sched tick not stopped!
+				 */
+				cpu_clear(cpu, nohz_cpu_mask);
+				goto out;
+			}
+
 			ts->idle_tick = ts->sched_timer.expires;
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
@@ -285,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
 	now = ktime_get();
 
 	local_irq_disable();
+	select_nohz_load_balancer(0);
 	tick_do_update_jiffies64(now);
 	cpu_clear(cpu, nohz_cpu_mask);
 
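
The two tick-sched.c hooks bracket the idle period: select_nohz_load_balancer(1) runs just before the tick would be stopped, and a non-zero return aborts the stop so the nominated cpu keeps ticking, while select_nohz_load_balancer(0) drops any ownership as soon as the tick is restarted. A compilable sketch of that call ordering follows; the idle_* helpers and the always-nominate stub are placeholders, not the kernel's tick_nohz_* functions.

#include <stdio.h>

/* Illustrative stand-in for select_nohz_load_balancer(). */
static int select_owner_stub(int stop_tick)
{
	return stop_tick;	/* pretend this cpu is always nominated */
}

static void idle_stop_tick(void)
{
	if (select_owner_stub(1)) {
		/* Nominated as idle load balancer: sched tick not stopped! */
		printf("keep tick running\n");
		return;
	}
	printf("tick stopped\n");
}

static void idle_restart_tick(void)
{
	select_owner_stub(0);	/* give up ownership before resuming normal ticks */
	printf("tick restarted\n");
}

int main(void)
{
	idle_stop_tick();
	idle_restart_tick();
	return 0;
}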