Diffstat (limited to 'kernel/sched_fair.c')
 -rw-r--r--  kernel/sched_fair.c | 254
 1 file changed, 91 insertions(+), 163 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..08ae848b71d4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				se->load.weight, &cfs_rq_of(se)->load);
-	}
-
-	return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
-
-	return delta;
-}
-
-/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
| @@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running) | |||
| 390 | */ | 362 | */ |
| 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 363 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 392 | { | 364 | { |
| 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); | 365 | u64 slice = __sched_period(cfs_rq->nr_running); |
| 366 | |||
| 367 | for_each_sched_entity(se) { | ||
| 368 | cfs_rq = cfs_rq_of(se); | ||
| 369 | |||
| 370 | slice *= se->load.weight; | ||
| 371 | do_div(slice, cfs_rq->load.weight); | ||
| 372 | } | ||
| 373 | |||
| 374 | |||
| 375 | return slice; | ||
| 394 | } | 376 | } |
| 395 | 377 | ||
| 396 | /* | 378 | /* |
| 397 | * We calculate the vruntime slice of a to be inserted task | 379 | * We calculate the vruntime slice of a to be inserted task |
| 398 | * | 380 | * |
| 399 | * vs = s*rw/w = p | 381 | * vs = s/w = p/rw |
| 400 | */ | 382 | */ |
| 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 383 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 402 | { | 384 | { |
| 403 | unsigned long nr_running = cfs_rq->nr_running; | 385 | unsigned long nr_running = cfs_rq->nr_running; |
| 386 | unsigned long weight; | ||
| 387 | u64 vslice; | ||
| 404 | 388 | ||
| 405 | if (!se->on_rq) | 389 | if (!se->on_rq) |
| 406 | nr_running++; | 390 | nr_running++; |
| 407 | 391 | ||
| 408 | return __sched_period(nr_running); | 392 | vslice = __sched_period(nr_running); |
| 409 | } | ||
| 410 | |||
| 411 | /* | ||
| 412 | * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in | ||
| 413 | * that it favours >=0 over <0. | ||
| 414 | * | ||
| 415 | * -20 | | ||
| 416 | * | | ||
| 417 | * 0 --------+------- | ||
| 418 | * .' | ||
| 419 | * 19 .' | ||
| 420 | * | ||
| 421 | */ | ||
| 422 | static unsigned long | ||
| 423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
| 424 | { | ||
| 425 | struct load_weight lw = { | ||
| 426 | .weight = NICE_0_LOAD, | ||
| 427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
| 428 | }; | ||
| 429 | 393 | ||
| 430 | for_each_sched_entity(se) { | 394 | for_each_sched_entity(se) { |
| 431 | struct load_weight *se_lw = &se->load; | 395 | cfs_rq = cfs_rq_of(se); |
| 432 | 396 | ||
| 433 | if (se->load.weight < NICE_0_LOAD) | 397 | weight = cfs_rq->load.weight; |
| 434 | se_lw = &lw; | 398 | if (!se->on_rq) |
| 399 | weight += se->load.weight; | ||
| 435 | 400 | ||
| 436 | delta = calc_delta_mine(delta, | 401 | vslice *= NICE_0_LOAD; |
| 437 | cfs_rq_of(se)->load.weight, se_lw); | 402 | do_div(vslice, weight); |
| 438 | } | 403 | } |
| 439 | 404 | ||
| 440 | return delta; | 405 | return vslice; |
| 441 | } | 406 | } |
| 442 | 407 | ||
| 443 | /* | 408 | /* |
| @@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 454 | 419 | ||
| 455 | curr->sum_exec_runtime += delta_exec; | 420 | curr->sum_exec_runtime += delta_exec; |
| 456 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 421 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
| 457 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); | 422 | delta_exec_weighted = delta_exec; |
| 423 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
| 424 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
| 425 | &curr->load); | ||
| 426 | } | ||
| 458 | curr->vruntime += delta_exec_weighted; | 427 | curr->vruntime += delta_exec_weighted; |
| 459 | } | 428 | } |
| 460 | 429 | ||
| @@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 541 | * Scheduling class queueing methods: | 510 | * Scheduling class queueing methods: |
| 542 | */ | 511 | */ |
| 543 | 512 | ||
| 544 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
| 545 | static void | ||
| 546 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 547 | { | ||
| 548 | cfs_rq->task_weight += weight; | ||
| 549 | } | ||
| 550 | #else | ||
| 551 | static inline void | ||
| 552 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 553 | { | ||
| 554 | } | ||
| 555 | #endif | ||
| 556 | |||
| 557 | static void | 513 | static void |
| 558 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 514 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 559 | { | 515 | { |
| 560 | update_load_add(&cfs_rq->load, se->load.weight); | 516 | update_load_add(&cfs_rq->load, se->load.weight); |
| 561 | if (!parent_entity(se)) | ||
| 562 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 563 | if (entity_is_task(se)) | ||
| 564 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
| 565 | cfs_rq->nr_running++; | 517 | cfs_rq->nr_running++; |
| 566 | se->on_rq = 1; | 518 | se->on_rq = 1; |
| 567 | list_add(&se->group_node, &cfs_rq->tasks); | 519 | list_add(&se->group_node, &cfs_rq->tasks); |
| @@ -571,10 +523,6 @@ static void | |||
| 571 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 523 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 572 | { | 524 | { |
| 573 | update_load_sub(&cfs_rq->load, se->load.weight); | 525 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 574 | if (!parent_entity(se)) | ||
| 575 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 576 | if (entity_is_task(se)) | ||
| 577 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
| 578 | cfs_rq->nr_running--; | 526 | cfs_rq->nr_running--; |
| 579 | se->on_rq = 0; | 527 | se->on_rq = 0; |
| 580 | list_del_init(&se->group_node); | 528 | list_del_init(&se->group_node); |
| @@ -661,17 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 661 | 609 | ||
| 662 | if (!initial) { | 610 | if (!initial) { |
| 663 | /* sleeps upto a single latency don't count. */ | 611 | /* sleeps upto a single latency don't count. */ |
| 664 | if (sched_feat(NEW_FAIR_SLEEPERS)) { | 612 | if (sched_feat(NEW_FAIR_SLEEPERS)) |
| 665 | unsigned long thresh = sysctl_sched_latency; | 613 | vruntime -= sysctl_sched_latency; |
| 666 | |||
| 667 | /* | ||
| 668 | * convert the sleeper threshold into virtual time | ||
| 669 | */ | ||
| 670 | if (sched_feat(NORMALIZED_SLEEPER)) | ||
| 671 | thresh = calc_delta_fair(thresh, se); | ||
| 672 | |||
| 673 | vruntime -= thresh; | ||
| 674 | } | ||
| 675 | 614 | ||
| 676 | /* ensure we never gain time by being placed backwards. */ | 615 | /* ensure we never gain time by being placed backwards. */ |
| 677 | vruntime = max_vruntime(se->vruntime, vruntime); | 616 | vruntime = max_vruntime(se->vruntime, vruntime); |
| @@ -1057,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
| 1057 | struct task_struct *curr = this_rq->curr; | 996 | struct task_struct *curr = this_rq->curr; |
| 1058 | unsigned long tl = this_load; | 997 | unsigned long tl = this_load; |
| 1059 | unsigned long tl_per_task; | 998 | unsigned long tl_per_task; |
| 999 | int balanced; | ||
| 1060 | 1000 | ||
| 1061 | if (!(this_sd->flags & SD_WAKE_AFFINE)) | 1001 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) |
| 1062 | return 0; | 1002 | return 0; |
| 1063 | 1003 | ||
| 1064 | /* | 1004 | /* |
| 1005 | * If sync wakeup then subtract the (maximum possible) | ||
| 1006 | * effect of the currently running task from the load | ||
| 1007 | * of the current CPU: | ||
| 1008 | */ | ||
| 1009 | if (sync) | ||
| 1010 | tl -= current->se.load.weight; | ||
| 1011 | |||
| 1012 | balanced = 100*(tl + p->se.load.weight) <= imbalance*load; | ||
| 1013 | |||
| 1014 | /* | ||
| 1065 | * If the currently running task will sleep within | 1015 | * If the currently running task will sleep within |
| 1066 | * a reasonable amount of time then attract this newly | 1016 | * a reasonable amount of time then attract this newly |
| 1067 | * woken task: | 1017 | * woken task: |
| 1068 | */ | 1018 | */ |
| 1069 | if (sync && curr->sched_class == &fair_sched_class) { | 1019 | if (sync && balanced && curr->sched_class == &fair_sched_class) { |
| 1070 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && | 1020 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && |
| 1071 | p->se.avg_overlap < sysctl_sched_migration_cost) | 1021 | p->se.avg_overlap < sysctl_sched_migration_cost) |
| 1072 | return 1; | 1022 | return 1; |
| @@ -1075,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
| 1075 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | 1025 | schedstat_inc(p, se.nr_wakeups_affine_attempts); |
| 1076 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1026 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
| 1077 | 1027 | ||
| 1078 | /* | ||
| 1079 | * If sync wakeup then subtract the (maximum possible) | ||
| 1080 | * effect of the currently running task from the load | ||
| 1081 | * of the current CPU: | ||
| 1082 | */ | ||
| 1083 | if (sync) | ||
| 1084 | tl -= current->se.load.weight; | ||
| 1085 | |||
| 1086 | if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || | 1028 | if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || |
| 1087 | 100*(tl + p->se.load.weight) <= imbalance*load) { | 1029 | balanced) { |
| 1088 | /* | 1030 | /* |
| 1089 | * This domain has SD_WAKE_AFFINE and | 1031 | * This domain has SD_WAKE_AFFINE and |
| 1090 | * p is cache cold in this domain, and | 1032 | * p is cache cold in this domain, and |
| @@ -1169,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se) | |||
| 1169 | unsigned long gran = sysctl_sched_wakeup_granularity; | 1111 | unsigned long gran = sysctl_sched_wakeup_granularity; |
| 1170 | 1112 | ||
| 1171 | /* | 1113 | /* |
| 1172 | * More easily preempt - nice tasks, while not making it harder for | 1114 | * More easily preempt - nice tasks, while not making |
| 1173 | * + nice tasks. | 1115 | * it harder for + nice tasks. |
| 1174 | */ | 1116 | */ |
| 1175 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); | 1117 | if (unlikely(se->load.weight > NICE_0_LOAD)) |
| 1118 | gran = calc_delta_fair(gran, &se->load); | ||
| 1176 | 1119 | ||
| 1177 | return gran; | 1120 | return gran; |
| 1178 | } | 1121 | } |
| @@ -1366,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
| 1366 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); | 1309 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
| 1367 | } | 1310 | } |
| 1368 | 1311 | ||
| 1369 | static unsigned long | 1312 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1370 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1313 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) |
| 1371 | unsigned long max_load_move, struct sched_domain *sd, | ||
| 1372 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
| 1373 | struct cfs_rq *cfs_rq) | ||
| 1374 | { | 1314 | { |
| 1375 | struct rq_iterator cfs_rq_iterator; | 1315 | struct sched_entity *curr; |
| 1316 | struct task_struct *p; | ||
| 1376 | 1317 | ||
| 1377 | cfs_rq_iterator.start = load_balance_start_fair; | 1318 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) |
| 1378 | cfs_rq_iterator.next = load_balance_next_fair; | 1319 | return MAX_PRIO; |
| 1379 | cfs_rq_iterator.arg = cfs_rq; | 1320 | |
| 1321 | curr = cfs_rq->curr; | ||
| 1322 | if (!curr) | ||
| 1323 | curr = __pick_next_entity(cfs_rq); | ||
| 1324 | |||
| 1325 | p = task_of(curr); | ||
| 1380 | 1326 | ||
| 1381 | return balance_tasks(this_rq, this_cpu, busiest, | 1327 | return p->prio; |
| 1382 | max_load_move, sd, idle, all_pinned, | ||
| 1383 | this_best_prio, &cfs_rq_iterator); | ||
| 1384 | } | 1328 | } |
| 1329 | #endif | ||
| 1385 | 1330 | ||
| 1386 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1387 | static unsigned long | 1331 | static unsigned long |
| 1388 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1332 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1389 | unsigned long max_load_move, | 1333 | unsigned long max_load_move, |
| 1390 | struct sched_domain *sd, enum cpu_idle_type idle, | 1334 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 1391 | int *all_pinned, int *this_best_prio) | 1335 | int *all_pinned, int *this_best_prio) |
| 1392 | { | 1336 | { |
| 1337 | struct cfs_rq *busy_cfs_rq; | ||
| 1393 | long rem_load_move = max_load_move; | 1338 | long rem_load_move = max_load_move; |
| 1394 | int busiest_cpu = cpu_of(busiest); | 1339 | struct rq_iterator cfs_rq_iterator; |
| 1395 | struct task_group *tg; | ||
| 1396 | |||
| 1397 | rcu_read_lock(); | ||
| 1398 | list_for_each_entry(tg, &task_groups, list) { | ||
| 1399 | long imbalance; | ||
| 1400 | unsigned long this_weight, busiest_weight; | ||
| 1401 | long rem_load, max_load, moved_load; | ||
| 1402 | |||
| 1403 | /* | ||
| 1404 | * empty group | ||
| 1405 | */ | ||
| 1406 | if (!aggregate(tg, sd)->task_weight) | ||
| 1407 | continue; | ||
| 1408 | |||
| 1409 | rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; | ||
| 1410 | rem_load /= aggregate(tg, sd)->load + 1; | ||
| 1411 | |||
| 1412 | this_weight = tg->cfs_rq[this_cpu]->task_weight; | ||
| 1413 | busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; | ||
| 1414 | 1340 | ||
| 1415 | imbalance = (busiest_weight - this_weight) / 2; | 1341 | cfs_rq_iterator.start = load_balance_start_fair; |
| 1342 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1416 | 1343 | ||
| 1417 | if (imbalance < 0) | 1344 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
| 1418 | imbalance = busiest_weight; | 1345 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1346 | struct cfs_rq *this_cfs_rq; | ||
| 1347 | long imbalance; | ||
| 1348 | unsigned long maxload; | ||
| 1419 | 1349 | ||
| 1420 | max_load = max(rem_load, imbalance); | 1350 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); |
| 1421 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, | ||
| 1422 | max_load, sd, idle, all_pinned, this_best_prio, | ||
| 1423 | tg->cfs_rq[busiest_cpu]); | ||
| 1424 | 1351 | ||
| 1425 | if (!moved_load) | 1352 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; |
| 1353 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | ||
| 1354 | if (imbalance <= 0) | ||
| 1426 | continue; | 1355 | continue; |
| 1427 | 1356 | ||
| 1428 | move_group_shares(tg, sd, busiest_cpu, this_cpu); | 1357 | /* Don't pull more than imbalance/2 */ |
| 1358 | imbalance /= 2; | ||
| 1359 | maxload = min(rem_load_move, imbalance); | ||
| 1429 | 1360 | ||
| 1430 | moved_load *= aggregate(tg, sd)->load; | 1361 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); |
| 1431 | moved_load /= aggregate(tg, sd)->rq_weight + 1; | 1362 | #else |
| 1363 | # define maxload rem_load_move | ||
| 1364 | #endif | ||
| 1365 | /* | ||
| 1366 | * pass busy_cfs_rq argument into | ||
| 1367 | * load_balance_[start|next]_fair iterators | ||
| 1368 | */ | ||
| 1369 | cfs_rq_iterator.arg = busy_cfs_rq; | ||
| 1370 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
| 1371 | maxload, sd, idle, all_pinned, | ||
| 1372 | this_best_prio, | ||
| 1373 | &cfs_rq_iterator); | ||
| 1432 | 1374 | ||
| 1433 | rem_load_move -= moved_load; | 1375 | if (rem_load_move <= 0) |
| 1434 | if (rem_load_move < 0) | ||
| 1435 | break; | 1376 | break; |
| 1436 | } | 1377 | } |
| 1437 | rcu_read_unlock(); | ||
| 1438 | 1378 | ||
| 1439 | return max_load_move - rem_load_move; | 1379 | return max_load_move - rem_load_move; |
| 1440 | } | 1380 | } |
| 1441 | #else | ||
| 1442 | static unsigned long | ||
| 1443 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1444 | unsigned long max_load_move, | ||
| 1445 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1446 | int *all_pinned, int *this_best_prio) | ||
| 1447 | { | ||
| 1448 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
| 1449 | max_load_move, sd, idle, all_pinned, | ||
| 1450 | this_best_prio, &busiest->cfs); | ||
| 1451 | } | ||
| 1452 | #endif | ||
| 1453 | 1381 | ||
| 1454 | static int | 1382 | static int |
| 1455 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1383 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
