aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched_fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--kernel/sched_fair.c254
1 files changed, 91 insertions, 163 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..08ae848b71d4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334#endif 334#endif
335 335
336/* 336/*
337 * delta *= w / rw
338 */
339static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se)
341{
342 for_each_sched_entity(se) {
343 delta = calc_delta_mine(delta,
344 se->load.weight, &cfs_rq_of(se)->load);
345 }
346
347 return delta;
348}
349
350/*
351 * delta *= rw / w
352 */
353static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{
356 for_each_sched_entity(se) {
357 delta = calc_delta_mine(delta,
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360
361 return delta;
362}
363
364/*
365 * The idea is to set a period in which each task runs once. 337 * The idea is to set a period in which each task runs once.
366 * 338 *
367 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 339 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
390 */ 362 */
391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 363static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
392{ 364{
393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); 365 u64 slice = __sched_period(cfs_rq->nr_running);
366
367 for_each_sched_entity(se) {
368 cfs_rq = cfs_rq_of(se);
369
370 slice *= se->load.weight;
371 do_div(slice, cfs_rq->load.weight);
372 }
373
374
375 return slice;
394} 376}
395 377
396/* 378/*
397 * We calculate the vruntime slice of a to be inserted task 379 * We calculate the vruntime slice of a to be inserted task
398 * 380 *
399 * vs = s*rw/w = p 381 * vs = s/w = p/rw
400 */ 382 */
401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 383static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
402{ 384{
403 unsigned long nr_running = cfs_rq->nr_running; 385 unsigned long nr_running = cfs_rq->nr_running;
386 unsigned long weight;
387 u64 vslice;
404 388
405 if (!se->on_rq) 389 if (!se->on_rq)
406 nr_running++; 390 nr_running++;
407 391
408 return __sched_period(nr_running); 392 vslice = __sched_period(nr_running);
409}
410
411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429 393
430 for_each_sched_entity(se) { 394 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load; 395 cfs_rq = cfs_rq_of(se);
432 396
433 if (se->load.weight < NICE_0_LOAD) 397 weight = cfs_rq->load.weight;
434 se_lw = &lw; 398 if (!se->on_rq)
399 weight += se->load.weight;
435 400
436 delta = calc_delta_mine(delta, 401 vslice *= NICE_0_LOAD;
437 cfs_rq_of(se)->load.weight, se_lw); 402 do_div(vslice, weight);
438 } 403 }
439 404
440 return delta; 405 return vslice;
441} 406}
442 407
443/* 408/*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
454 419
455 curr->sum_exec_runtime += delta_exec; 420 curr->sum_exec_runtime += delta_exec;
456 schedstat_add(cfs_rq, exec_clock, delta_exec); 421 schedstat_add(cfs_rq, exec_clock, delta_exec);
457 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 422 delta_exec_weighted = delta_exec;
423 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
424 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
425 &curr->load);
426 }
458 curr->vruntime += delta_exec_weighted; 427 curr->vruntime += delta_exec_weighted;
459} 428}
460 429
@@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
541 * Scheduling class queueing methods: 510 * Scheduling class queueing methods:
542 */ 511 */
543 512
544#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
545static void
546add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
547{
548 cfs_rq->task_weight += weight;
549}
550#else
551static inline void
552add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
553{
554}
555#endif
556
557static void 513static void
558account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 514account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
559{ 515{
560 update_load_add(&cfs_rq->load, se->load.weight); 516 update_load_add(&cfs_rq->load, se->load.weight);
561 if (!parent_entity(se))
562 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
563 if (entity_is_task(se))
564 add_cfs_task_weight(cfs_rq, se->load.weight);
565 cfs_rq->nr_running++; 517 cfs_rq->nr_running++;
566 se->on_rq = 1; 518 se->on_rq = 1;
567 list_add(&se->group_node, &cfs_rq->tasks); 519 list_add(&se->group_node, &cfs_rq->tasks);
@@ -571,10 +523,6 @@ static void
571account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 523account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
572{ 524{
573 update_load_sub(&cfs_rq->load, se->load.weight); 525 update_load_sub(&cfs_rq->load, se->load.weight);
574 if (!parent_entity(se))
575 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
576 if (entity_is_task(se))
577 add_cfs_task_weight(cfs_rq, -se->load.weight);
578 cfs_rq->nr_running--; 526 cfs_rq->nr_running--;
579 se->on_rq = 0; 527 se->on_rq = 0;
580 list_del_init(&se->group_node); 528 list_del_init(&se->group_node);
@@ -661,17 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
661 609
662 if (!initial) { 610 if (!initial) {
663 /* sleeps upto a single latency don't count. */ 611 /* sleeps upto a single latency don't count. */
664 if (sched_feat(NEW_FAIR_SLEEPERS)) { 612 if (sched_feat(NEW_FAIR_SLEEPERS))
665 unsigned long thresh = sysctl_sched_latency; 613 vruntime -= sysctl_sched_latency;
666
667 /*
668 * convert the sleeper threshold into virtual time
669 */
670 if (sched_feat(NORMALIZED_SLEEPER))
671 thresh = calc_delta_fair(thresh, se);
672
673 vruntime -= thresh;
674 }
675 614
676 /* ensure we never gain time by being placed backwards. */ 615 /* ensure we never gain time by being placed backwards. */
677 vruntime = max_vruntime(se->vruntime, vruntime); 616 vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1057,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1057 struct task_struct *curr = this_rq->curr; 996 struct task_struct *curr = this_rq->curr;
1058 unsigned long tl = this_load; 997 unsigned long tl = this_load;
1059 unsigned long tl_per_task; 998 unsigned long tl_per_task;
999 int balanced;
1060 1000
1061 if (!(this_sd->flags & SD_WAKE_AFFINE)) 1001 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1062 return 0; 1002 return 0;
1063 1003
1064 /* 1004 /*
1005 * If sync wakeup then subtract the (maximum possible)
1006 * effect of the currently running task from the load
1007 * of the current CPU:
1008 */
1009 if (sync)
1010 tl -= current->se.load.weight;
1011
1012 balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
1013
1014 /*
1065 * If the currently running task will sleep within 1015 * If the currently running task will sleep within
1066 * a reasonable amount of time then attract this newly 1016 * a reasonable amount of time then attract this newly
1067 * woken task: 1017 * woken task:
1068 */ 1018 */
1069 if (sync && curr->sched_class == &fair_sched_class) { 1019 if (sync && balanced && curr->sched_class == &fair_sched_class) {
1070 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1020 if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1071 p->se.avg_overlap < sysctl_sched_migration_cost) 1021 p->se.avg_overlap < sysctl_sched_migration_cost)
1072 return 1; 1022 return 1;
@@ -1075,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1075 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1025 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1076 tl_per_task = cpu_avg_load_per_task(this_cpu); 1026 tl_per_task = cpu_avg_load_per_task(this_cpu);
1077 1027
1078 /*
1079 * If sync wakeup then subtract the (maximum possible)
1080 * effect of the currently running task from the load
1081 * of the current CPU:
1082 */
1083 if (sync)
1084 tl -= current->se.load.weight;
1085
1086 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1028 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
1087 100*(tl + p->se.load.weight) <= imbalance*load) { 1029 balanced) {
1088 /* 1030 /*
1089 * This domain has SD_WAKE_AFFINE and 1031 * This domain has SD_WAKE_AFFINE and
1090 * p is cache cold in this domain, and 1032 * p is cache cold in this domain, and
@@ -1169,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1169 unsigned long gran = sysctl_sched_wakeup_granularity; 1111 unsigned long gran = sysctl_sched_wakeup_granularity;
1170 1112
1171 /* 1113 /*
1172 * More easily preempt - nice tasks, while not making it harder for 1114 * More easily preempt - nice tasks, while not making
1173 * + nice tasks. 1115 * it harder for + nice tasks.
1174 */ 1116 */
1175 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1117 if (unlikely(se->load.weight > NICE_0_LOAD))
1118 gran = calc_delta_fair(gran, &se->load);
1176 1119
1177 return gran; 1120 return gran;
1178} 1121}
@@ -1366,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
1366 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1309 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1367} 1310}
1368 1311
1369static unsigned long 1312#ifdef CONFIG_FAIR_GROUP_SCHED
1370__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1313static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1371 unsigned long max_load_move, struct sched_domain *sd,
1372 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1373 struct cfs_rq *cfs_rq)
1374{ 1314{
1375 struct rq_iterator cfs_rq_iterator; 1315 struct sched_entity *curr;
1316 struct task_struct *p;
1376 1317
1377 cfs_rq_iterator.start = load_balance_start_fair; 1318 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1378 cfs_rq_iterator.next = load_balance_next_fair; 1319 return MAX_PRIO;
1379 cfs_rq_iterator.arg = cfs_rq; 1320
1321 curr = cfs_rq->curr;
1322 if (!curr)
1323 curr = __pick_next_entity(cfs_rq);
1324
1325 p = task_of(curr);
1380 1326
1381 return balance_tasks(this_rq, this_cpu, busiest, 1327 return p->prio;
1382 max_load_move, sd, idle, all_pinned,
1383 this_best_prio, &cfs_rq_iterator);
1384} 1328}
1329#endif
1385 1330
1386#ifdef CONFIG_FAIR_GROUP_SCHED
1387static unsigned long 1331static unsigned long
1388load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1332load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1389 unsigned long max_load_move, 1333 unsigned long max_load_move,
1390 struct sched_domain *sd, enum cpu_idle_type idle, 1334 struct sched_domain *sd, enum cpu_idle_type idle,
1391 int *all_pinned, int *this_best_prio) 1335 int *all_pinned, int *this_best_prio)
1392{ 1336{
1337 struct cfs_rq *busy_cfs_rq;
1393 long rem_load_move = max_load_move; 1338 long rem_load_move = max_load_move;
1394 int busiest_cpu = cpu_of(busiest); 1339 struct rq_iterator cfs_rq_iterator;
1395 struct task_group *tg;
1396
1397 rcu_read_lock();
1398 list_for_each_entry(tg, &task_groups, list) {
1399 long imbalance;
1400 unsigned long this_weight, busiest_weight;
1401 long rem_load, max_load, moved_load;
1402
1403 /*
1404 * empty group
1405 */
1406 if (!aggregate(tg, sd)->task_weight)
1407 continue;
1408
1409 rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
1410 rem_load /= aggregate(tg, sd)->load + 1;
1411
1412 this_weight = tg->cfs_rq[this_cpu]->task_weight;
1413 busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
1414 1340
1415 imbalance = (busiest_weight - this_weight) / 2; 1341 cfs_rq_iterator.start = load_balance_start_fair;
1342 cfs_rq_iterator.next = load_balance_next_fair;
1416 1343
1417 if (imbalance < 0) 1344 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1418 imbalance = busiest_weight; 1345#ifdef CONFIG_FAIR_GROUP_SCHED
1346 struct cfs_rq *this_cfs_rq;
1347 long imbalance;
1348 unsigned long maxload;
1419 1349
1420 max_load = max(rem_load, imbalance); 1350 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
1421 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1422 max_load, sd, idle, all_pinned, this_best_prio,
1423 tg->cfs_rq[busiest_cpu]);
1424 1351
1425 if (!moved_load) 1352 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
1353 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
1354 if (imbalance <= 0)
1426 continue; 1355 continue;
1427 1356
1428 move_group_shares(tg, sd, busiest_cpu, this_cpu); 1357 /* Don't pull more than imbalance/2 */
1358 imbalance /= 2;
1359 maxload = min(rem_load_move, imbalance);
1429 1360
1430 moved_load *= aggregate(tg, sd)->load; 1361 *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
1431 moved_load /= aggregate(tg, sd)->rq_weight + 1; 1362#else
1363# define maxload rem_load_move
1364#endif
1365 /*
1366 * pass busy_cfs_rq argument into
1367 * load_balance_[start|next]_fair iterators
1368 */
1369 cfs_rq_iterator.arg = busy_cfs_rq;
1370 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1371 maxload, sd, idle, all_pinned,
1372 this_best_prio,
1373 &cfs_rq_iterator);
1432 1374
1433 rem_load_move -= moved_load; 1375 if (rem_load_move <= 0)
1434 if (rem_load_move < 0)
1435 break; 1376 break;
1436 } 1377 }
1437 rcu_read_unlock();
1438 1378
1439 return max_load_move - rem_load_move; 1379 return max_load_move - rem_load_move;
1440} 1380}
1441#else
1442static unsigned long
1443load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1444 unsigned long max_load_move,
1445 struct sched_domain *sd, enum cpu_idle_type idle,
1446 int *all_pinned, int *this_best_prio)
1447{
1448 return __load_balance_fair(this_rq, this_cpu, busiest,
1449 max_load_move, sd, idle, all_pinned,
1450 this_best_prio, &busiest->cfs);
1451}
1452#endif
1453 1381
1454static int 1382static int
1455move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1383move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,