Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  382
1 file changed, 95 insertions, 287 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..d1ad69b270ca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
-
-	/*
-	 * shares assigned to a task group governs how much of cpu bandwidth
-	 * is allocated to the group. The more shares a group has, the more is
-	 * the cpu bandwidth allocated to it.
-	 *
-	 * For ex, lets say that there are three task groups, A, B and C which
-	 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
-	 * cpu bandwidth allocated by the scheduler to task groups A, B and C
-	 * should be:
-	 *
-	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
-	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
-	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
-	 *
-	 * The weight assigned to a task group's schedulable entities on every
-	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
-	 * group's shares. For ex: lets say that task group A has been
-	 * assigned shares of 1000 and there are two CPUs in a system. Then,
-	 *
-	 *	tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
-	 *
-	 * Note: It's not necessary that each of a task's group schedulable
-	 *	 entity have the same weight on all CPUs. If the group
-	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
-	 *	 better distribution of weight could be:
-	 *
-	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
-	 *	tg_A->se[1]->load.weight = 1/2 * 2000 = 667
-	 *
-	 * rebalance_shares() is responsible for distributing the shares of a
-	 * task groups like this among the group's schedulable entities across
-	 * cpus.
-	 *
-	 */
 	unsigned long shares;
 #endif
 
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
 static DEFINE_MUTEX(doms_cur_mutex);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
-#endif
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
-
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
 #else
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
-#define MIN_GROUP_SHARES	2
-
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -346,7 +301,7 @@ struct cfs_rq {
 	/* 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;
 
 	unsigned long nr_spread_over;
 
@@ -668,6 +623,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 
+static __read_mostly int scheduler_running;
+
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
@@ -689,14 +646,16 @@ unsigned long long cpu_clock(int cpu)
 	unsigned long flags;
 	struct rq *rq;
 
-	local_irq_save(flags);
-	rq = cpu_rq(cpu);
 	/*
 	 * Only call sched_clock() if the scheduler has already been
 	 * initialized (some code might call cpu_clock() very early):
 	 */
-	if (rq->idle)
-		update_rq_clock(rq);
+	if (unlikely(!scheduler_running))
+		return 0;
+
+	local_irq_save(flags);
+	rq = cpu_rq(cpu);
+	update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
 
@@ -1125,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	u64 tmp;
 
 	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+		lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1149,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
+	lw->inv_weight = 0;
 }
 
 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->weight -= dec;
+	lw->inv_weight = 0;
 }
 
 /*
@@ -1241,16 +1202,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_sub(&rq->load, load);
-}
-
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1268,14 +1219,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1367,7 +1330,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1379,7 +1342,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2019,7 +1982,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3885,7 +3848,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
-	long *switch_count;
+	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
@@ -4307,11 +4270,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
-	}
 
 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -4320,10 +4282,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	p->prio = prio;
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		enqueue_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4358,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4369,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -4458,7 +4422,7 @@ int task_nice(const struct task_struct *p)
 {
 	return TASK_NICE(p);
 }
-EXPORT_SYMBOL_GPL(task_nice);
+EXPORT_SYMBOL(task_nice);
 
 /**
  * idle_cpu - is a given cpu idle currently?
@@ -4617,19 +4581,17 @@ recheck:
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		deactivate_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
-	}
 
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		activate_task(rq, p, 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5136,7 +5098,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
 	time_slice = 0;
 	if (p->policy == SCHED_RR) {
 		time_slice = DEF_TIMESLICE;
-	} else {
+	} else if (p->policy != SCHED_FIFO) {
 		struct sched_entity *se = &p->se;
 		unsigned long flags;
 		struct rq *rq;
@@ -5917,7 +5879,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_unlock_irq(&rq->lock);
 		break;
 
-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
@@ -7083,21 +7046,6 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (nr_cpu_ids == 1)
-		return;
-
-	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-					 "group_balance");
-	if (!IS_ERR(lb_monitor_task)) {
-		lb_monitor_task->flags |= PF_NOFREEZE;
-		wake_up_process(lb_monitor_task);
-	} else {
-		printk(KERN_ERR "Could not create load balance monitor thread"
-			"(error = %ld) \n", PTR_ERR(lb_monitor_task));
-	}
-#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -7284,6 +7232,8 @@ void __init sched_init(void)
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
+
+	scheduler_running = 1;
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7418,157 +7368,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_GROUP_SCHED
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
-{
-	struct cfs_rq *cfs_rq;
-	struct rq *rq = cpu_rq(this_cpu);
-	cpumask_t sdspan = sd->span;
-	int balanced = 1;
-
-	/* Walk thr' all the task groups that we have */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		int i;
-		unsigned long total_load = 0, total_shares;
-		struct task_group *tg = cfs_rq->tg;
-
-		/* Gather total task load of this group across cpus */
-		for_each_cpu_mask(i, sdspan)
-			total_load += tg->cfs_rq[i]->load.weight;
-
-		/* Nothing to do if this group has no load */
-		if (!total_load)
-			continue;
-
-		/*
-		 * tg->shares represents the number of cpu shares the task group
-		 * is eligible to hold on a single cpu. On N cpus, it is
-		 * eligible to hold (N * tg->shares) number of cpu shares.
-		 */
-		total_shares = tg->shares * cpus_weight(sdspan);
-
-		/*
-		 * redistribute total_shares across cpus as per the task load
-		 * distribution.
-		 */
-		for_each_cpu_mask(i, sdspan) {
-			unsigned long local_load, local_shares;
-
-			local_load = tg->cfs_rq[i]->load.weight;
-			local_shares = (local_load * total_shares) / total_load;
-			if (!local_shares)
-				local_shares = MIN_GROUP_SHARES;
-			if (local_shares == tg->se[i]->load.weight)
-				continue;
-
-			spin_lock_irq(&cpu_rq(i)->lock);
-			set_se_shares(tg->se[i], local_shares);
-			spin_unlock_irq(&cpu_rq(i)->lock);
-			balanced = 0;
-		}
-	}
-
-	return balanced;
-}
-
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
-{
-	unsigned int timeout = sysctl_sched_min_bal_int_shares;
-	struct sched_param schedparm;
-	int ret;
-
-	/*
-	 * We don't want this thread's execution to be limited by the shares
-	 * assigned to default group (init_task_group). Hence make it run
-	 * as a SCHED_RR RT task at the lowest priority.
-	 */
-	schedparm.sched_priority = 1;
-	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
-	if (ret)
-		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
-			" monitor thread (error = %d) \n", ret);
-
-	while (!kthread_should_stop()) {
-		int i, cpu, balanced = 1;
-
-		/* Prevent cpus going down or coming up */
-		get_online_cpus();
-		/* lockout changes to doms_cur[] array */
-		lock_doms_cur();
-		/*
-		 * Enter a rcu read-side critical section to safely walk rq->sd
-		 * chain on various cpus and to walk task group list
-		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
-		 */
-		rcu_read_lock();
-
-		for (i = 0; i < ndoms_cur; i++) {
-			cpumask_t cpumap = doms_cur[i];
-			struct sched_domain *sd = NULL, *sd_prev = NULL;
-
-			cpu = first_cpu(cpumap);
-
-			/* Find the highest domain at which to balance shares */
-			for_each_domain(cpu, sd) {
-				if (!(sd->flags & SD_LOAD_BALANCE))
-					continue;
-				sd_prev = sd;
-			}
-
-			sd = sd_prev;
-			/* sd == NULL? No load balance reqd in this domain */
-			if (!sd)
-				continue;
-
-			balanced &= rebalance_shares(sd, cpu);
-		}
-
-		rcu_read_unlock();
-
-		unlock_doms_cur();
-		put_online_cpus();
-
-		if (!balanced)
-			timeout = sysctl_sched_min_bal_int_shares;
-		else if (timeout < sysctl_sched_max_bal_int_shares)
-			timeout *= 2;
-
-		msleep_interruptible(timeout);
-	}
-
-	return 0;
-}
-#endif	/* CONFIG_SMP */
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void free_fair_sched_group(struct task_group *tg)
 {
@@ -7817,47 +7616,46 @@ void sched_move_task(struct task_struct *tsk)
 	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, tsk, 0);
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
-	}
 
 	set_task_rq(tsk, task_cpu(tsk));
 
-	if (on_rq) {
-		if (unlikely(running))
-			tsk->sched_class->set_curr_task(rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->moved_group)
+		tsk->sched_class->moved_group(tsk);
+#endif
+
+	if (unlikely(running))
+		tsk->sched_class->set_curr_task(rq);
+	if (on_rq)
 		enqueue_task(rq, tsk, 0);
-	}
 
 	task_rq_unlock(rq, &flags);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	if (!shares)
-		shares = MIN_GROUP_SHARES;
+	spin_lock_irq(&rq->lock);
 
 	on_rq = se->on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
-		dec_cpu_load(rq, se->load.weight);
-	}
 
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-	if (on_rq) {
+	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-		inc_cpu_load(rq, se->load.weight);
-	}
+
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -7867,18 +7665,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	int i;
 	unsigned long flags;
 
+	/*
+	 * A weight of 0 or 1 can cause arithmetics problems.
+	 * (The default weight is 1024 - so there's no practical
+	 *  limitation from this.)
+	 */
+	if (shares < 2)
+		shares = 2;
+
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
 		goto done;
 
-	if (shares < MIN_GROUP_SHARES)
-		shares = MIN_GROUP_SHARES;
-
-	/*
-	 * Prevent any load balance activity (rebalance_shares,
-	 * load_balance_fair) from referring to this group first,
-	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
-	 */
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
@@ -7892,11 +7690,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		spin_lock_irq(&cpu_rq(i)->lock);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
@@ -7928,9 +7723,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	if (runtime == RUNTIME_INF)
 		return 1ULL << 16;
 
-	runtime *= (1ULL << 16);
-	div64_64(runtime, period);
-	return runtime;
+	return div64_64(runtime << 16, period);
 }
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
@@ -7954,25 +7747,40 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return total + to_ratio(period, runtime) < global_ratio;
 }
 
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+	struct task_struct *g, *p;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
+	return 0;
+}
+
 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 	int err = 0;
 
-	rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+	rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us == -1)
-		rt_runtime = rt_period;
+		rt_runtime = RUNTIME_INF;
 
 	mutex_lock(&rt_constraints_mutex);
+	read_lock(&tasklist_lock);
+	if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+		err = -EBUSY;
+		goto unlock;
+	}
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
-	if (rt_runtime_us == -1)
-		rt_runtime = RUNTIME_INF;
 	tg->rt_runtime = rt_runtime;
  unlock:
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;