Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  382
1 file changed, 95 insertions(+), 287 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..d1ad69b270ca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
-
-	/*
-	 * shares assigned to a task group governs how much of cpu bandwidth
-	 * is allocated to the group. The more shares a group has, the more is
-	 * the cpu bandwidth allocated to it.
-	 *
-	 * For ex, lets say that there are three task groups, A, B and C which
-	 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
-	 * cpu bandwidth allocated by the scheduler to task groups A, B and C
-	 * should be:
-	 *
-	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
-	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
-	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
-	 *
-	 * The weight assigned to a task group's schedulable entities on every
-	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
-	 * group's shares. For ex: lets say that task group A has been
-	 * assigned shares of 1000 and there are two CPUs in a system. Then,
-	 *
-	 *	tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
-	 *
-	 * Note: It's not necessary that each of a task's group schedulable
-	 *	 entity have the same weight on all CPUs. If the group
-	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
-	 *	 better distribution of weight could be:
-	 *
-	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
-	 *	tg_A->se[1]->load.weight = 1/2 * 2000 = 667
-	 *
-	 * rebalance_shares() is responsible for distributing the shares of a
-	 * task groups like this among the group's schedulable entities across
-	 * cpus.
-	 *
-	 */
 	unsigned long shares;
 #endif
 
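The comment deleted above spells out how group shares translate into CPU bandwidth: each group gets a slice proportional to its shares relative to the sum of all groups' shares. The arithmetic can be checked with a small standalone program (plain C, not kernel code; the groups and share values are the ones from the deleted comment):

```c
#include <stdio.h>

/* Bandwidth is proportional to a group's shares divided by the
 * sum of all groups' shares, as the removed comment describes. */
int main(void)
{
	const char *name[] = { "A", "B", "C" };
	unsigned long shares[] = { 1000, 2000, 3000 };
	unsigned long total = 0;
	int i;

	for (i = 0; i < 3; i++)
		total += shares[i];

	for (i = 0; i < 3; i++)
		printf("Bw(%s) = %.2f%%\n", name[i],
		       100.0 * shares[i] / total);	/* ~16.67%, 33.33%, 50% */
	return 0;
}
```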
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
 static DEFINE_MUTEX(doms_cur_mutex);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
-#endif
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
-
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
 #else
 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
 #endif
 
-#define MIN_GROUP_SHARES 2
-
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -346,7 +301,7 @@ struct cfs_rq {
 	/* 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;
 
 	unsigned long nr_spread_over;
 
@@ -668,6 +623,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 
+static __read_mostly int scheduler_running;
+
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
@@ -689,14 +646,16 @@ unsigned long long cpu_clock(int cpu)
 	unsigned long flags;
 	struct rq *rq;
 
-	local_irq_save(flags);
-	rq = cpu_rq(cpu);
 	/*
 	 * Only call sched_clock() if the scheduler has already been
 	 * initialized (some code might call cpu_clock() very early):
 	 */
-	if (rq->idle)
-		update_rq_clock(rq);
+	if (unlikely(!scheduler_running))
+		return 0;
+
+	local_irq_save(flags);
+	rq = cpu_rq(cpu);
+	update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
 
@@ -1125,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	u64 tmp;
 
 	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+		lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1149,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
+	lw->inv_weight = 0;
 }
 
 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->weight -= dec;
+	lw->inv_weight = 0;
 }
 
 /*
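Background for the two hunks above: calc_delta_mine() approximates delta_exec * weight / lw->weight using a cached fixed-point reciprocal instead of a 64-bit division, which is why update_load_add()/update_load_sub() must now zero inv_weight whenever the weight changes, forcing a recomputation on the next use. A rough userspace sketch of that arithmetic (WMULT_CONST is assumed to be 2^32 here purely for illustration, and the real function's overflow handling is omitted):

```c
#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST	(1ULL << 32)	/* assumed value, for illustration only */
#define WMULT_SHIFT	32

/* Mimic the reciprocal trick: delta_exec * weight / lw_weight done
 * as a multiply by a cached 32-bit reciprocal plus a shift. */
static uint64_t scale_delta(uint64_t delta_exec, unsigned long weight,
			    unsigned long lw_weight)
{
	/* the corrected formula from the hunk above */
	uint64_t inv_weight = (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

	return (delta_exec * weight * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	/* A nice-0 task (weight 1024) on a queue of total weight 3072
	 * should be credited roughly a third of the elapsed nanoseconds. */
	printf("%llu\n",
	       (unsigned long long)scale_delta(3000000, 1024, 3072));
	return 0;
}
```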
@@ -1241,16 +1202,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_sub(&rq->load, load);
-}
-
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1268,14 +1219,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1367,7 +1330,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1379,7 +1342,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2019,7 +1982,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
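Taken together, the hunks above move per-runqueue load accounting into inc_nr_running()/dec_nr_running(): rq->load is adjusted by inc_load()/dec_load() at exactly the points where nr_running changes, so it always equals the summed se.load.weight of the queued tasks. A toy model of that invariant (standalone C, illustrative only; the nice-5 weight is approximate):

```c
#include <assert.h>
#include <stdio.h>

/* Toy runqueue: the weight is kept in lockstep with the task count,
 * mirroring inc_nr_running()/dec_nr_running() plus inc_load()/dec_load(). */
struct toy_rq {
	unsigned long nr_running;
	unsigned long load_weight;
};

static void toy_enqueue(struct toy_rq *rq, unsigned long weight)
{
	rq->nr_running++;
	rq->load_weight += weight;	/* inc_load() analogue */
}

static void toy_dequeue(struct toy_rq *rq, unsigned long weight)
{
	rq->nr_running--;
	rq->load_weight -= weight;	/* dec_load() analogue */
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	toy_enqueue(&rq, 1024);	/* nice-0 task */
	toy_enqueue(&rq, 335);	/* nice-5 task, approximate weight */
	toy_dequeue(&rq, 1024);
	assert(rq.nr_running == 1 && rq.load_weight == 335);
	printf("nr_running=%lu load=%lu\n", rq.nr_running, rq.load_weight);
	return 0;
}
```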
@@ -3885,7 +3848,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
-	long *switch_count;
+	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
@@ -4307,11 +4270,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -4320,10 +4282,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	p->prio = prio;
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		enqueue_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4358,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4369,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -4458,7 +4422,7 @@ int task_nice(const struct task_struct *p)
 {
 	return TASK_NICE(p);
 }
-EXPORT_SYMBOL_GPL(task_nice);
+EXPORT_SYMBOL(task_nice);
 
 /**
  * idle_cpu - is a given cpu idle currently?
@@ -4617,19 +4581,17 @@ recheck:
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		deactivate_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
 
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		activate_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5136,7 +5098,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
 	time_slice = 0;
 	if (p->policy == SCHED_RR) {
 		time_slice = DEF_TIMESLICE;
-	} else {
+	} else if (p->policy != SCHED_FIFO) {
 		struct sched_entity *se = &p->se;
 		unsigned long flags;
 		struct rq *rq;
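The last hunk above makes sys_sched_rr_get_interval() report a zero timeslice for SCHED_FIFO tasks instead of falling through to the CFS slice computation: FIFO tasks run until they block, yield or are preempted, so they have no timeslice. The behaviour is visible from userspace through the standard wrapper; a minimal test program (the value printed for other policies depends on the kernel version and current load):

```c
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* Query the round-robin interval of the calling task (pid 0). */
	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	/* For a SCHED_FIFO caller this prints 0.000000000 after the change. */
	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
```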
@@ -5917,7 +5879,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_unlock_irq(&rq->lock);
 		break;
 
-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
@@ -7083,21 +7046,6 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (nr_cpu_ids == 1)
-		return;
-
-	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-					 "group_balance");
-	if (!IS_ERR(lb_monitor_task)) {
-		lb_monitor_task->flags |= PF_NOFREEZE;
-		wake_up_process(lb_monitor_task);
-	} else {
-		printk(KERN_ERR "Could not create load balance monitor thread"
-			"(error = %ld) \n", PTR_ERR(lb_monitor_task));
-	}
-#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -7284,6 +7232,8 @@ void __init sched_init(void)
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
+
+	scheduler_running = 1;
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
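scheduler_running, set at the very end of sched_init() above, is the flag tested by the new cpu_clock() guard earlier in this diff: until it is set, cpu_clock() returns 0 rather than touching a runqueue that is not fully initialized. The guard pattern in isolation, as a standalone sketch (not kernel code; all names here are made up):

```c
#include <stdio.h>

/* Stand-alone sketch of the "initialized yet?" guard used by cpu_clock():
 * before init_clock() runs, read_clock() reports 0 instead of touching
 * state that does not exist yet. */
static int clock_running;		/* plays the role of scheduler_running */
static unsigned long long fake_clock;

static unsigned long long read_clock(void)
{
	if (!clock_running)		/* the unlikely(!scheduler_running) test */
		return 0;
	return fake_clock;
}

static void init_clock(void)
{
	fake_clock = 123456789ULL;
	clock_running = 1;		/* mirrors scheduler_running = 1 in sched_init() */
}

int main(void)
{
	printf("before init: %llu\n", read_clock());	/* 0 */
	init_clock();
	printf("after init:  %llu\n", read_clock());	/* 123456789 */
	return 0;
}
```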
@@ -7418,157 +7368,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_GROUP_SCHED
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
-{
-	struct cfs_rq *cfs_rq;
-	struct rq *rq = cpu_rq(this_cpu);
-	cpumask_t sdspan = sd->span;
-	int balanced = 1;
-
-	/* Walk thr' all the task groups that we have */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		int i;
-		unsigned long total_load = 0, total_shares;
-		struct task_group *tg = cfs_rq->tg;
-
-		/* Gather total task load of this group across cpus */
-		for_each_cpu_mask(i, sdspan)
-			total_load += tg->cfs_rq[i]->load.weight;
-
-		/* Nothing to do if this group has no load */
-		if (!total_load)
-			continue;
-
-		/*
-		 * tg->shares represents the number of cpu shares the task group
-		 * is eligible to hold on a single cpu. On N cpus, it is
-		 * eligible to hold (N * tg->shares) number of cpu shares.
-		 */
-		total_shares = tg->shares * cpus_weight(sdspan);
-
-		/*
-		 * redistribute total_shares across cpus as per the task load
-		 * distribution.
-		 */
-		for_each_cpu_mask(i, sdspan) {
-			unsigned long local_load, local_shares;
-
-			local_load = tg->cfs_rq[i]->load.weight;
-			local_shares = (local_load * total_shares) / total_load;
-			if (!local_shares)
-				local_shares = MIN_GROUP_SHARES;
-			if (local_shares == tg->se[i]->load.weight)
-				continue;
-
-			spin_lock_irq(&cpu_rq(i)->lock);
-			set_se_shares(tg->se[i], local_shares);
-			spin_unlock_irq(&cpu_rq(i)->lock);
-			balanced = 0;
-		}
-	}
-
-	return balanced;
-}
-
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
-{
-	unsigned int timeout = sysctl_sched_min_bal_int_shares;
-	struct sched_param schedparm;
-	int ret;
-
-	/*
-	 * We don't want this thread's execution to be limited by the shares
-	 * assigned to default group (init_task_group). Hence make it run
-	 * as a SCHED_RR RT task at the lowest priority.
-	 */
-	schedparm.sched_priority = 1;
-	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
-	if (ret)
-		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
-				" monitor thread (error = %d) \n", ret);
-
-	while (!kthread_should_stop()) {
-		int i, cpu, balanced = 1;
-
-		/* Prevent cpus going down or coming up */
-		get_online_cpus();
-		/* lockout changes to doms_cur[] array */
-		lock_doms_cur();
-		/*
-		 * Enter a rcu read-side critical section to safely walk rq->sd
-		 * chain on various cpus and to walk task group list
-		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
-		 */
-		rcu_read_lock();
-
-		for (i = 0; i < ndoms_cur; i++) {
-			cpumask_t cpumap = doms_cur[i];
-			struct sched_domain *sd = NULL, *sd_prev = NULL;
-
-			cpu = first_cpu(cpumap);
-
-			/* Find the highest domain at which to balance shares */
-			for_each_domain(cpu, sd) {
-				if (!(sd->flags & SD_LOAD_BALANCE))
-					continue;
-				sd_prev = sd;
-			}
-
-			sd = sd_prev;
-			/* sd == NULL? No load balance reqd in this domain */
-			if (!sd)
-				continue;
-
-			balanced &= rebalance_shares(sd, cpu);
-		}
-
-		rcu_read_unlock();
-
-		unlock_doms_cur();
-		put_online_cpus();
-
-		if (!balanced)
-			timeout = sysctl_sched_min_bal_int_shares;
-		else if (timeout < sysctl_sched_max_bal_int_shares)
-			timeout *= 2;
-
-		msleep_interruptible(timeout);
-	}
-
-	return 0;
-}
-#endif	/* CONFIG_SMP */
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void free_fair_sched_group(struct task_group *tg)
 {
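The block removed above is the old group-shares load balancer: for each task group it summed the group's per-CPU load over a sched domain, then handed each CPU a slice of N * tg->shares proportional to its local load, clamped to MIN_GROUP_SHARES. The redistribution arithmetic on its own, as a small standalone demo (the per-CPU load figures are invented):

```c
#include <stdio.h>

#define MIN_GROUP_SHARES	2

int main(void)
{
	/* A group with shares=1000 on a 4-CPU domain is eligible for
	 * 4 * 1000 shares in total, split in proportion to per-CPU load. */
	unsigned long shares = 1000, ncpus = 4;
	unsigned long local_load[] = { 3072, 1024, 0, 512 };	/* made up */
	unsigned long total_shares = shares * ncpus;
	unsigned long total_load = 0, local_shares;
	unsigned int i;

	for (i = 0; i < ncpus; i++)
		total_load += local_load[i];

	for (i = 0; i < ncpus; i++) {
		local_shares = (local_load[i] * total_shares) / total_load;
		if (!local_shares)
			local_shares = MIN_GROUP_SHARES;	/* idle CPU keeps a floor */
		printf("cpu%u: %lu shares\n", i, local_shares);
	}
	return 0;
}
```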
@@ -7817,47 +7616,46 @@ void sched_move_task(struct task_struct *tsk)
 	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, tsk, 0);
-		if (unlikely(running))
-			tsk->sched_class->put_prev_task(rq, tsk);
-	}
+	if (unlikely(running))
+		tsk->sched_class->put_prev_task(rq, tsk);
 
 	set_task_rq(tsk, task_cpu(tsk));
 
-	if (on_rq) {
-		if (unlikely(running))
-			tsk->sched_class->set_curr_task(rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->moved_group)
+		tsk->sched_class->moved_group(tsk);
+#endif
+
+	if (unlikely(running))
+		tsk->sched_class->set_curr_task(rq);
+	if (on_rq)
 		enqueue_task(rq, tsk, 0);
-	}
 
 	task_rq_unlock(rq, &flags);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	if (!shares)
-		shares = MIN_GROUP_SHARES;
+	spin_lock_irq(&rq->lock);
 
 	on_rq = se->on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
-		dec_cpu_load(rq, se->load.weight);
-	}
 
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-	if (on_rq) {
+	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-		inc_cpu_load(rq, se->load.weight);
-	}
+
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -7867,18 +7665,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	int i;
 	unsigned long flags;
 
+	/*
+	 * A weight of 0 or 1 can cause arithmetics problems.
+	 * (The default weight is 1024 - so there's no practical
+	 *  limitation from this.)
+	 */
+	if (shares < 2)
+		shares = 2;
+
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
 		goto done;
 
-	if (shares < MIN_GROUP_SHARES)
-		shares = MIN_GROUP_SHARES;
-
-	/*
-	 * Prevent any load balance activity (rebalance_shares,
-	 * load_balance_fair) from referring to this group first,
-	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
-	 */
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
@@ -7892,11 +7690,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		spin_lock_irq(&cpu_rq(i)->lock);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
@@ -7928,9 +7723,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	if (runtime == RUNTIME_INF)
 		return 1ULL << 16;
 
-	runtime *= (1ULL << 16);
-	div64_64(runtime, period);
-	return runtime;
+	return div64_64(runtime << 16, period);
 }
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
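to_ratio() expresses runtime/period as a fraction scaled by 2^16. The old code scaled runtime, called div64_64() but discarded its return value, and then returned the scaled runtime itself; the hunk above fixes that by returning div64_64(runtime << 16, period) directly. The intended arithmetic, as a standalone sketch:

```c
#include <stdio.h>
#include <stdint.h>

/* to_ratio() as fixed above: runtime/period scaled by 2^16. */
static unsigned long to_ratio(uint64_t period, uint64_t runtime)
{
	return (unsigned long)((runtime << 16) / period);
}

int main(void)
{
	/* The default rt bandwidth: 0.95s of runtime per 1s period
	 * (values in microseconds, as in sysctl_sched_rt_period). */
	unsigned long r = to_ratio(1000000ULL, 950000ULL);

	printf("ratio = %lu (~%.2f of 1<<16)\n", r, (double)r / (1 << 16));
	/* prints 62259, i.e. roughly 0.95 */
	return 0;
}
```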
@@ -7954,25 +7747,40 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return total + to_ratio(period, runtime) < global_ratio;
 }
 
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+	struct task_struct *g, *p;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
+	return 0;
+}
+
 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 	int err = 0;
 
-	rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+	rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us == -1)
-		rt_runtime = rt_period;
+		rt_runtime = RUNTIME_INF;
 
 	mutex_lock(&rt_constraints_mutex);
+	read_lock(&tasklist_lock);
+	if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+		err = -EBUSY;
+		goto unlock;
+	}
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
-	if (rt_runtime_us == -1)
-		rt_runtime = RUNTIME_INF;
 	tg->rt_runtime = rt_runtime;
  unlock:
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;