Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 343
1 file changed, 78 insertions, 265 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f28f19e65b59..1cb53fb1fe3d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
-
-	/*
-	 * shares assigned to a task group governs how much of cpu bandwidth
-	 * is allocated to the group. The more shares a group has, the more is
-	 * the cpu bandwidth allocated to it.
-	 *
-	 * For ex, lets say that there are three task groups, A, B and C which
-	 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
-	 * cpu bandwidth allocated by the scheduler to task groups A, B and C
-	 * should be:
-	 *
-	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
-	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
-	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
-	 *
-	 * The weight assigned to a task group's schedulable entities on every
-	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
-	 * group's shares. For ex: lets say that task group A has been
-	 * assigned shares of 1000 and there are two CPUs in a system. Then,
-	 *
-	 *	tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
-	 *
-	 * Note: It's not necessary that each of a task's group schedulable
-	 *	 entity have the same weight on all CPUs. If the group
-	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
-	 *	 better distribution of weight could be:
-	 *
-	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
-	 *	tg_A->se[1]->load.weight = 1/2 * 2000 = 667
-	 *
-	 * rebalance_shares() is responsible for distributing the shares of a
-	 * task groups like this among the group's schedulable entities across
-	 * cpus.
-	 *
-	 */
 	unsigned long shares;
 #endif
 
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
 static DEFINE_MUTEX(doms_cur_mutex);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
-#endif
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
-
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
 #else
 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
 #endif
 
-#define MIN_GROUP_SHARES 2
-
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -668,6 +623,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 
+static __read_mostly int scheduler_running;
+
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
@@ -689,14 +646,16 @@ unsigned long long cpu_clock(int cpu)
 	unsigned long flags;
 	struct rq *rq;
 
-	local_irq_save(flags);
-	rq = cpu_rq(cpu);
 	/*
 	 * Only call sched_clock() if the scheduler has already been
 	 * initialized (some code might call cpu_clock() very early):
 	 */
-	if (rq->idle)
-		update_rq_clock(rq);
+	if (unlikely(!scheduler_running))
+		return 0;
+
+	local_irq_save(flags);
+	rq = cpu_rq(cpu);
+	update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
 
@@ -1241,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_sub(&rq->load, load);
-}
-
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1268,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1367,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1379,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -1831,6 +1792,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	long old_state;
 	struct rq *rq;
 
+	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
 	if (!(old_state & state))
@@ -2018,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3766,7 +3728,7 @@ void scheduler_tick(void)
 
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
 
-void add_preempt_count(int val)
+void __kprobes add_preempt_count(int val)
 {
 	/*
 	 * Underflow?
@@ -3782,7 +3744,7 @@ void add_preempt_count(int val)
 }
 EXPORT_SYMBOL(add_preempt_count);
 
-void sub_preempt_count(int val)
+void __kprobes sub_preempt_count(int val)
 {
 	/*
 	 * Underflow?
@@ -3884,7 +3846,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
-	long *switch_count;
+	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
@@ -4357,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4368,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -4457,7 +4422,7 @@ int task_nice(const struct task_struct *p)
 {
 	return TASK_NICE(p);
 }
-EXPORT_SYMBOL_GPL(task_nice);
+EXPORT_SYMBOL(task_nice);
 
 /**
  * idle_cpu - is a given cpu idle currently?
@@ -5135,7 +5100,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
 	time_slice = 0;
 	if (p->policy == SCHED_RR) {
 		time_slice = DEF_TIMESLICE;
-	} else {
+	} else if (p->policy != SCHED_FIFO) {
 		struct sched_entity *se = &p->se;
 		unsigned long flags;
 		struct rq *rq;
@@ -5916,7 +5881,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_unlock_irq(&rq->lock);
 		break;
 
-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
@@ -7082,21 +7048,6 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (nr_cpu_ids == 1)
-		return;
-
-	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-					 "group_balance");
-	if (!IS_ERR(lb_monitor_task)) {
-		lb_monitor_task->flags |= PF_NOFREEZE;
-		wake_up_process(lb_monitor_task);
-	} else {
-		printk(KERN_ERR "Could not create load balance monitor thread"
-			"(error = %ld) \n", PTR_ERR(lb_monitor_task));
-	}
-#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -7283,6 +7234,8 @@ void __init sched_init(void)
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
+
+	scheduler_running = 1;
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7417,157 +7370,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_GROUP_SCHED
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
-{
-	struct cfs_rq *cfs_rq;
-	struct rq *rq = cpu_rq(this_cpu);
-	cpumask_t sdspan = sd->span;
-	int balanced = 1;
-
-	/* Walk thr' all the task groups that we have */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		int i;
-		unsigned long total_load = 0, total_shares;
-		struct task_group *tg = cfs_rq->tg;
-
-		/* Gather total task load of this group across cpus */
-		for_each_cpu_mask(i, sdspan)
-			total_load += tg->cfs_rq[i]->load.weight;
-
-		/* Nothing to do if this group has no load */
-		if (!total_load)
-			continue;
-
-		/*
-		 * tg->shares represents the number of cpu shares the task group
-		 * is eligible to hold on a single cpu. On N cpus, it is
-		 * eligible to hold (N * tg->shares) number of cpu shares.
-		 */
-		total_shares = tg->shares * cpus_weight(sdspan);
-
-		/*
-		 * redistribute total_shares across cpus as per the task load
-		 * distribution.
-		 */
-		for_each_cpu_mask(i, sdspan) {
-			unsigned long local_load, local_shares;
-
-			local_load = tg->cfs_rq[i]->load.weight;
-			local_shares = (local_load * total_shares) / total_load;
-			if (!local_shares)
-				local_shares = MIN_GROUP_SHARES;
-			if (local_shares == tg->se[i]->load.weight)
-				continue;
-
-			spin_lock_irq(&cpu_rq(i)->lock);
-			set_se_shares(tg->se[i], local_shares);
-			spin_unlock_irq(&cpu_rq(i)->lock);
-			balanced = 0;
-		}
-	}
-
-	return balanced;
-}
-
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
-{
-	unsigned int timeout = sysctl_sched_min_bal_int_shares;
-	struct sched_param schedparm;
-	int ret;
-
-	/*
-	 * We don't want this thread's execution to be limited by the shares
-	 * assigned to default group (init_task_group). Hence make it run
-	 * as a SCHED_RR RT task at the lowest priority.
-	 */
-	schedparm.sched_priority = 1;
-	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
-	if (ret)
-		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
-			" monitor thread (error = %d) \n", ret);
-
-	while (!kthread_should_stop()) {
-		int i, cpu, balanced = 1;
-
-		/* Prevent cpus going down or coming up */
-		get_online_cpus();
-		/* lockout changes to doms_cur[] array */
-		lock_doms_cur();
-		/*
-		 * Enter a rcu read-side critical section to safely walk rq->sd
-		 * chain on various cpus and to walk task group list
-		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
-		 */
-		rcu_read_lock();
-
-		for (i = 0; i < ndoms_cur; i++) {
-			cpumask_t cpumap = doms_cur[i];
-			struct sched_domain *sd = NULL, *sd_prev = NULL;
-
-			cpu = first_cpu(cpumap);
-
-			/* Find the highest domain at which to balance shares */
-			for_each_domain(cpu, sd) {
-				if (!(sd->flags & SD_LOAD_BALANCE))
-					continue;
-				sd_prev = sd;
-			}
-
-			sd = sd_prev;
-			/* sd == NULL? No load balance reqd in this domain */
-			if (!sd)
-				continue;
-
-			balanced &= rebalance_shares(sd, cpu);
-		}
-
-		rcu_read_unlock();
-
-		unlock_doms_cur();
-		put_online_cpus();
-
-		if (!balanced)
-			timeout = sysctl_sched_min_bal_int_shares;
-		else if (timeout < sysctl_sched_max_bal_int_shares)
-			timeout *= 2;
-
-		msleep_interruptible(timeout);
-	}
-
-	return 0;
-}
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void free_fair_sched_group(struct task_group *tg)
 {
@@ -7824,6 +7626,11 @@ void sched_move_task(struct task_struct *tsk)
 
 	set_task_rq(tsk, task_cpu(tsk));
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->moved_group)
+		tsk->sched_class->moved_group(tsk);
+#endif
+
 	if (on_rq) {
 		if (unlikely(running))
 			tsk->sched_class->set_curr_task(rq);
@@ -7834,29 +7641,25 @@ void sched_move_task(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	if (!shares)
-		shares = MIN_GROUP_SHARES;
+	spin_lock_irq(&rq->lock);
 
 	on_rq = se->on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
-		dec_cpu_load(rq, se->load.weight);
-	}
 
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-	if (on_rq) {
+	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-		inc_cpu_load(rq, se->load.weight);
-	}
+
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -7866,18 +7669,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	int i;
 	unsigned long flags;
 
+	/*
+	 * A weight of 0 or 1 can cause arithmetics problems.
+	 * (The default weight is 1024 - so there's no practical
+	 * limitation from this.)
+	 */
+	if (shares < 2)
+		shares = 2;
+
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
 		goto done;
 
-	if (shares < MIN_GROUP_SHARES)
-		shares = MIN_GROUP_SHARES;
-
-	/*
-	 * Prevent any load balance activity (rebalance_shares,
-	 * load_balance_fair) from referring to this group first,
-	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
-	 */
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
@@ -7891,11 +7694,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		spin_lock_irq(&cpu_rq(i)->lock);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
@@ -7927,9 +7727,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	if (runtime == RUNTIME_INF)
 		return 1ULL << 16;
 
-	runtime *= (1ULL << 16);
-	div64_64(runtime, period);
-	return runtime;
+	return div64_64(runtime << 16, period);
 }
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
@@ -7953,25 +7751,40 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return total + to_ratio(period, runtime) < global_ratio;
 }
 
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+	struct task_struct *g, *p;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
+	return 0;
+}
+
 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 	int err = 0;
 
-	rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+	rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us == -1)
-		rt_runtime = rt_period;
+		rt_runtime = RUNTIME_INF;
 
 	mutex_lock(&rt_constraints_mutex);
+	read_lock(&tasklist_lock);
+	if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+		err = -EBUSY;
+		goto unlock;
+	}
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
-	if (rt_runtime_us == -1)
-		rt_runtime = RUNTIME_INF;
 	tg->rt_runtime = rt_runtime;
 unlock:
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;
