author      Peter Zijlstra <a.p.zijlstra@chello.nl>    2008-02-25 11:34:02 -0500
committer   Ingo Molnar <mingo@elte.hu>                2008-03-04 11:54:06 -0500
commit      62fb185130e4d420f71a30ff59d8b16b74ef5d2b
tree        474c0824a5bf90950b0a430a11a52b358c9e1f31
parent      976dde010e513a9c7c3117a32b7b015f84b37430
sched: revert load_balance_monitor() changes

The following commits cause a number of regressions:

  commit 58e2d4ca581167c2a079f4ee02be2f0bc52e8729
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
      sched: group scheduling, change how cpu load is calculated

  commit 6b2d7700266b9402e12824e11e0099ae6a4a6a79
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
      sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups

Namely:
 - very frequent wakeups on SMP, reported by PowerTop users.
 - cacheline trashing on (large) SMP
 - some latencies larger than 500ms

While there is a mergeable patch to fix the latter, the former issues
are not fixable in a manner suitable for .25 (we're at -rc3 now).

Hence we revert them and try again in v2.6.26.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--   include/linux/sched.h |   4
-rw-r--r--   kernel/sched.c        | 283
-rw-r--r--   kernel/sched_fair.c   | 115
-rw-r--r--   kernel/sched_rt.c     |   4
-rw-r--r--   kernel/sysctl.c       |  18
5 files changed, 70 insertions, 354 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c9621f8bf87..9ae4030067a9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1542,10 +1542,6 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
-extern unsigned int sysctl_sched_min_bal_int_shares;
-extern unsigned int sysctl_sched_max_bal_int_shares;
-#endif
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
                 struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index f06950c8a6ce..dcd553cc4ee8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
         struct sched_entity **se;
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
-
-        /*
-         * shares assigned to a task group governs how much of cpu bandwidth
-         * is allocated to the group. The more shares a group has, the more is
-         * the cpu bandwidth allocated to it.
-         *
-         * For ex, lets say that there are three task groups, A, B and C which
-         * have been assigned shares 1000, 2000 and 3000 respectively. Then,
-         * cpu bandwidth allocated by the scheduler to task groups A, B and C
-         * should be:
-         *
-         *      Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
-         *      Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
-         *      Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
-         *
-         * The weight assigned to a task group's schedulable entities on every
-         * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
-         * group's shares. For ex: lets say that task group A has been
-         * assigned shares of 1000 and there are two CPUs in a system. Then,
-         *
-         *      tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
-         *
-         * Note: It's not necessary that each of a task's group schedulable
-         *       entity have the same weight on all CPUs. If the group
-         *       has 2 of its tasks on CPU0 and 1 task on CPU1, then a
-         *       better distribution of weight could be:
-         *
-         *      tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
-         *      tg_A->se[1]->load.weight = 1/2 * 2000 = 667
-         *
-         * rebalance_shares() is responsible for distributing the shares of a
-         * task groups like this among the group's schedulable entities across
-         * cpus.
-         *
-         */
         unsigned long shares;
 #endif
 
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
 static DEFINE_MUTEX(doms_cur_mutex);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
-#endif
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
-
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
 #else
 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
 #endif
 
-#define MIN_GROUP_SHARES 2
-
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -1245,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-        update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-        update_load_sub(&rq->load, load);
-}
-
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1272,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+        update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+        update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
         rq->nr_running++;
+        inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
         rq->nr_running--;
+        dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1371,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
                 rq->nr_uninterruptible--;
 
         enqueue_task(rq, p, wakeup);
-        inc_nr_running(rq);
+        inc_nr_running(p, rq);
 }
 
 /*
@@ -1383,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
                 rq->nr_uninterruptible++;
 
         dequeue_task(rq, p, sleep);
-        dec_nr_running(rq);
+        dec_nr_running(p, rq);
 }
 
 /**
@@ -2023,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                  * management (if any):
                  */
                 p->sched_class->task_new(rq, p);
-                inc_nr_running(rq);
+                inc_nr_running(p, rq);
         }
         check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -4362,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
                 goto out_unlock;
         }
         on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                 dequeue_task(rq, p, 0);
+                dec_load(rq, p);
+        }
 
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
@@ -4373,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
         if (on_rq) {
                 enqueue_task(rq, p, 0);
+                inc_load(rq, p);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@ -7087,21 +7047,6 @@ void __init sched_init_smp(void)
         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                 BUG();
         sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-        if (nr_cpu_ids == 1)
-                return;
-
-        lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-                                         "group_balance");
-        if (!IS_ERR(lb_monitor_task)) {
-                lb_monitor_task->flags |= PF_NOFREEZE;
-                wake_up_process(lb_monitor_task);
-        } else {
-                printk(KERN_ERR "Could not create load balance monitor thread"
-                        "(error = %ld) \n", PTR_ERR(lb_monitor_task));
-        }
-#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -7424,157 +7369,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_GROUP_SCHED
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
-{
-        struct cfs_rq *cfs_rq;
-        struct rq *rq = cpu_rq(this_cpu);
-        cpumask_t sdspan = sd->span;
-        int balanced = 1;
-
-        /* Walk thr' all the task groups that we have */
-        for_each_leaf_cfs_rq(rq, cfs_rq) {
-                int i;
-                unsigned long total_load = 0, total_shares;
-                struct task_group *tg = cfs_rq->tg;
-
-                /* Gather total task load of this group across cpus */
-                for_each_cpu_mask(i, sdspan)
-                        total_load += tg->cfs_rq[i]->load.weight;
-
-                /* Nothing to do if this group has no load */
-                if (!total_load)
-                        continue;
-
-                /*
-                 * tg->shares represents the number of cpu shares the task group
-                 * is eligible to hold on a single cpu. On N cpus, it is
-                 * eligible to hold (N * tg->shares) number of cpu shares.
-                 */
-                total_shares = tg->shares * cpus_weight(sdspan);
-
-                /*
-                 * redistribute total_shares across cpus as per the task load
-                 * distribution.
-                 */
-                for_each_cpu_mask(i, sdspan) {
-                        unsigned long local_load, local_shares;
-
-                        local_load = tg->cfs_rq[i]->load.weight;
-                        local_shares = (local_load * total_shares) / total_load;
-                        if (!local_shares)
-                                local_shares = MIN_GROUP_SHARES;
-                        if (local_shares == tg->se[i]->load.weight)
-                                continue;
-
-                        spin_lock_irq(&cpu_rq(i)->lock);
-                        set_se_shares(tg->se[i], local_shares);
-                        spin_unlock_irq(&cpu_rq(i)->lock);
-                        balanced = 0;
-                }
-        }
-
-        return balanced;
-}
-
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
-{
-        unsigned int timeout = sysctl_sched_min_bal_int_shares;
-        struct sched_param schedparm;
-        int ret;
-
-        /*
-         * We don't want this thread's execution to be limited by the shares
-         * assigned to default group (init_task_group). Hence make it run
-         * as a SCHED_RR RT task at the lowest priority.
-         */
-        schedparm.sched_priority = 1;
-        ret = sched_setscheduler(current, SCHED_RR, &schedparm);
-        if (ret)
-                printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
-                                " monitor thread (error = %d) \n", ret);
-
-        while (!kthread_should_stop()) {
-                int i, cpu, balanced = 1;
-
-                /* Prevent cpus going down or coming up */
-                get_online_cpus();
-                /* lockout changes to doms_cur[] array */
-                lock_doms_cur();
-                /*
-                 * Enter a rcu read-side critical section to safely walk rq->sd
-                 * chain on various cpus and to walk task group list
-                 * (rq->leaf_cfs_rq_list) in rebalance_shares().
-                 */
-                rcu_read_lock();
-
-                for (i = 0; i < ndoms_cur; i++) {
-                        cpumask_t cpumap = doms_cur[i];
-                        struct sched_domain *sd = NULL, *sd_prev = NULL;
-
-                        cpu = first_cpu(cpumap);
-
-                        /* Find the highest domain at which to balance shares */
-                        for_each_domain(cpu, sd) {
-                                if (!(sd->flags & SD_LOAD_BALANCE))
-                                        continue;
-                                sd_prev = sd;
-                        }
-
-                        sd = sd_prev;
-                        /* sd == NULL? No load balance reqd in this domain */
-                        if (!sd)
-                                continue;
-
-                        balanced &= rebalance_shares(sd, cpu);
-                }
-
-                rcu_read_unlock();
-
-                unlock_doms_cur();
-                put_online_cpus();
-
-                if (!balanced)
-                        timeout = sysctl_sched_min_bal_int_shares;
-                else if (timeout < sysctl_sched_max_bal_int_shares)
-                        timeout *= 2;
-
-                msleep_interruptible(timeout);
-        }
-
-        return 0;
-}
-#endif  /* CONFIG_SMP */
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void free_fair_sched_group(struct task_group *tg)
 {
@@ -7841,29 +7635,25 @@ void sched_move_task(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
         struct cfs_rq *cfs_rq = se->cfs_rq;
         struct rq *rq = cfs_rq->rq;
         int on_rq;
 
-        if (!shares)
-                shares = MIN_GROUP_SHARES;
+        spin_lock_irq(&rq->lock);
 
         on_rq = se->on_rq;
-        if (on_rq) {
+        if (on_rq)
                 dequeue_entity(cfs_rq, se, 0);
-                dec_cpu_load(rq, se->load.weight);
-        }
 
         se->load.weight = shares;
         se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-        if (on_rq) {
+        if (on_rq)
                 enqueue_entity(cfs_rq, se, 0);
-                inc_cpu_load(rq, se->load.weight);
-        }
+
+        spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -7873,18 +7663,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         int i;
         unsigned long flags;
 
+        /*
+         * A weight of 0 or 1 can cause arithmetics problems.
+         * (The default weight is 1024 - so there's no practical
+         *  limitation from this.)
+         */
+        if (shares < 2)
+                shares = 2;
+
         mutex_lock(&shares_mutex);
         if (tg->shares == shares)
                 goto done;
 
-        if (shares < MIN_GROUP_SHARES)
-                shares = MIN_GROUP_SHARES;
-
-        /*
-         * Prevent any load balance activity (rebalance_shares,
-         * load_balance_fair) from referring to this group first,
-         * by taking it off the rq->leaf_cfs_rq_list on each cpu.
-         */
         spin_lock_irqsave(&task_group_lock, flags);
         for_each_possible_cpu(i)
                 unregister_fair_sched_group(tg, i);
@@ -7898,11 +7688,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         * w/o tripping rebalance_share or load_balance_fair.
         */
         tg->shares = shares;
-        for_each_possible_cpu(i) {
-                spin_lock_irq(&cpu_rq(i)->lock);
+        for_each_possible_cpu(i)
                 set_se_shares(tg->se[i], shares);
-                spin_unlock_irq(&cpu_rq(i)->lock);
-        }
 
         /*
         * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c8e6492c5925..3df4d46994ca 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -727,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
 }
 
-#define GROUP_IMBALANCE_PCT 20
-
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -819,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
         struct cfs_rq *cfs_rq;
-        struct sched_entity *se = &p->se,
-                            *topse = NULL;      /* Highest schedulable entity */
-        int incload = 1;
+        struct sched_entity *se = &p->se;
 
         for_each_sched_entity(se) {
-                topse = se;
-                if (se->on_rq) {
-                        incload = 0;
+                if (se->on_rq)
                         break;
-                }
                 cfs_rq = cfs_rq_of(se);
                 enqueue_entity(cfs_rq, se, wakeup);
                 wakeup = 1;
         }
-        /* Increment cpu load if we just enqueued the first task of a group on
-         * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
-         * at the highest grouping level.
-         */
-        if (incload)
-                inc_cpu_load(rq, topse->load.weight);
 
         hrtick_start_fair(rq, rq->curr);
 }
@@ -851,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 {
         struct cfs_rq *cfs_rq;
-        struct sched_entity *se = &p->se,
-                            *topse = NULL;      /* Highest schedulable entity */
-        int decload = 1;
+        struct sched_entity *se = &p->se;
 
         for_each_sched_entity(se) {
-                topse = se;
                 cfs_rq = cfs_rq_of(se);
                 dequeue_entity(cfs_rq, se, sleep);
                 /* Don't dequeue parent if it has other entities besides us */
-                if (cfs_rq->load.weight) {
-                        if (parent_entity(se))
-                                decload = 0;
+                if (cfs_rq->load.weight)
                         break;
-                }
                 sleep = 1;
         }
-        /* Decrement cpu load if we just dequeued the last task of a group on
-         * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
-         * at the highest grouping level.
-         */
-        if (decload)
-                dec_cpu_load(rq, topse->load.weight);
 
         hrtick_start_fair(rq, rq->curr);
 }
@@ -1186,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+        struct sched_entity *curr;
+        struct task_struct *p;
+
+        if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+                return MAX_PRIO;
+
+        curr = cfs_rq->curr;
+        if (!curr)
+                curr = __pick_next_entity(cfs_rq);
+
+        p = task_of(curr);
+
+        return p->prio;
+}
+#endif
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -1195,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
         struct rq_iterator cfs_rq_iterator;
-        unsigned long load_moved;
 
         cfs_rq_iterator.start = load_balance_start_fair;
         cfs_rq_iterator.next = load_balance_next_fair;
 
         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-                struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
-                unsigned long maxload, task_load, group_weight;
-                unsigned long thisload, per_task_load;
-                struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
-
-                task_load = busy_cfs_rq->load.weight;
-                group_weight = se->load.weight;
+                struct cfs_rq *this_cfs_rq;
+                long imbalance;
+                unsigned long maxload;
 
-                /*
-                 * 'group_weight' is contributed by tasks of total weight
-                 * 'task_load'. To move 'rem_load_move' worth of weight only,
-                 * we need to move a maximum task load of:
-                 *
-                 * maxload = (remload / group_weight) * task_load;
-                 */
-                maxload = (rem_load_move * task_load) / group_weight;
+                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-                if (!maxload || !task_load)
+                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+                if (imbalance <= 0)
                         continue;
 
-                per_task_load = task_load / busy_cfs_rq->nr_running;
-                /*
-                 * balance_tasks will try to forcibly move atleast one task if
-                 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
-                 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
-                 */
-                if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
-                        continue;
+                /* Don't pull more than imbalance/2 */
+                imbalance /= 2;
+                maxload = min(rem_load_move, imbalance);
 
-                /* Disable priority-based load balance */
-                *this_best_prio = 0;
-                thisload = this_cfs_rq->load.weight;
+                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
 #else
 # define maxload rem_load_move
 #endif
@@ -1242,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  * load_balance_[start|next]_fair iterators
                  */
                 cfs_rq_iterator.arg = busy_cfs_rq;
-                load_moved = balance_tasks(this_rq, this_cpu, busiest,
+                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
                                                maxload, sd, idle, all_pinned,
                                                this_best_prio,
                                                &cfs_rq_iterator);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-                /*
-                 * load_moved holds the task load that was moved. The
-                 * effective (group) weight moved would be:
-                 * load_moved_eff = load_moved/task_load * group_weight;
-                 */
-                load_moved = (group_weight * load_moved) / task_load;
-
-                /* Adjust shares on both cpus to reflect load_moved */
-                group_weight -= load_moved;
-                set_se_shares(se, group_weight);
-
-                se = busy_cfs_rq->tg->se[this_cpu];
-                if (!thisload)
-                        group_weight = load_moved;
-                else
-                        group_weight = se->load.weight + load_moved;
-                set_se_shares(se, group_weight);
-#endif
-
-                rem_load_move -= load_moved;
-
                 if (rem_load_move <= 0)
                         break;
         }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f54792b175b2..76e828517541 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
         */
         for_each_sched_rt_entity(rt_se)
                 enqueue_rt_entity(rt_se);
-
-        inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
                 if (rt_rq && rt_rq->rt_nr_running)
                         enqueue_rt_entity(rt_se);
         }
-
-        dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b7e95411795..b2a2d6889bab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
-        {
-                .ctl_name       = CTL_UNNUMBERED,
-                .procname       = "sched_min_bal_int_shares",
-                .data           = &sysctl_sched_min_bal_int_shares,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
-        {
-                .ctl_name       = CTL_UNNUMBERED,
-                .procname       = "sched_max_bal_int_shares",
-                .data           = &sysctl_sched_max_bal_int_shares,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
-#endif
 #endif
         {
                 .ctl_name       = CTL_UNNUMBERED,