Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 413
1 file changed, 290 insertions, 123 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..f2aa987027d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
| @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * SCHED_OTHER wake-up granularity. | 65 | * SCHED_OTHER wake-up granularity. |
| 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) | 66 | * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 67 | * | 67 | * |
| 68 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
| 69 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
| 70 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
| 71 | */ | 71 | */ |
| 72 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; | 72 | unsigned int sysctl_sched_wakeup_granularity = 5000000UL; |
| 73 | 73 | ||
| 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 74 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
| 75 | 75 | ||
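The comment above gives the default as 5 msec * (1 + ilog(ncpus)). A standalone sketch of that arithmetic, purely illustrative (the ilog2 helper and the loop are not the kernel's boot-time scaling code):

#include <stdio.h>

/* stand-in for the kernel's ilog2() */
static unsigned int ilog2_approx(unsigned int n)
{
        unsigned int l = 0;

        while (n >>= 1)
                l++;
        return l;
}

int main(void)
{
        unsigned int ncpus;

        /* 5 msec * (1 + ilog(ncpus)), in nanoseconds */
        for (ncpus = 1; ncpus <= 16; ncpus *= 2)
                printf("%2u cpus -> %lu ns\n",
                       ncpus, 5000000UL * (1 + ilog2_approx(ncpus)));
        return 0;
}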
| @@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
| 334 | #endif | 334 | #endif |
| 335 | 335 | ||
| 336 | /* | 336 | /* |
| 337 | * delta *= w / rw | ||
| 338 | */ | ||
| 339 | static inline unsigned long | ||
| 340 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
| 341 | { | ||
| 342 | for_each_sched_entity(se) { | ||
| 343 | delta = calc_delta_mine(delta, | ||
| 344 | se->load.weight, &cfs_rq_of(se)->load); | ||
| 345 | } | ||
| 346 | |||
| 347 | return delta; | ||
| 348 | } | ||
| 349 | |||
| 350 | /* | ||
| 351 | * delta *= rw / w | ||
| 352 | */ | ||
| 353 | static inline unsigned long | ||
| 354 | calc_delta_fair(unsigned long delta, struct sched_entity *se) | ||
| 355 | { | ||
| 356 | for_each_sched_entity(se) { | ||
| 357 | delta = calc_delta_mine(delta, | ||
| 358 | cfs_rq_of(se)->load.weight, &se->load); | ||
| 359 | } | ||
| 360 | |||
| 361 | return delta; | ||
| 362 | } | ||
| 363 | |||
| 364 | /* | ||
| 337 | * The idea is to set a period in which each task runs once. | 365 | * The idea is to set a period in which each task runs once. |
| 338 | * | 366 | * |
| 339 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 367 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
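A rough, single-level illustration of the two helpers introduced in this hunk, using plain 64-bit division in place of calc_delta_mine()'s fixed-point math; the weights are invented example values:

#include <stdio.h>

int main(void)
{
        unsigned long long delta = 10000000ULL;  /* 10 ms of wall-clock time */
        unsigned long long w     = 1024;         /* this entity's weight (nice 0) */
        unsigned long long rw    = 3072;         /* total weight on its cfs_rq */

        /* calc_delta_weight(): delta *= w / rw */
        printf("weighted share of the period: %llu ns\n", delta * w / rw);

        /* calc_delta_fair(): delta *= rw / w */
        printf("virtual time for that delta:  %llu ns\n", delta * rw / w);
        return 0;
}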
| @@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running) | |||
| 362 | */ | 390 | */ |
| 363 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 391 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 364 | { | 392 | { |
| 365 | u64 slice = __sched_period(cfs_rq->nr_running); | 393 | return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); |
| 366 | |||
| 367 | for_each_sched_entity(se) { | ||
| 368 | cfs_rq = cfs_rq_of(se); | ||
| 369 | |||
| 370 | slice *= se->load.weight; | ||
| 371 | do_div(slice, cfs_rq->load.weight); | ||
| 372 | } | ||
| 373 | |||
| 374 | |||
| 375 | return slice; | ||
| 376 | } | 394 | } |
| 377 | 395 | ||
| 378 | /* | 396 | /* |
| 379 | * We calculate the vruntime slice of a to-be-inserted task | 397 | * We calculate the vruntime slice of a to-be-inserted task |
| 380 | * | 398 | * |
| 381 | * vs = s/w = p/rw | 399 | * vs = s*rw/w = p |
| 382 | */ | 400 | */ |
| 383 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) | 401 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 384 | { | 402 | { |
| 385 | unsigned long nr_running = cfs_rq->nr_running; | 403 | unsigned long nr_running = cfs_rq->nr_running; |
| 386 | unsigned long weight; | ||
| 387 | u64 vslice; | ||
| 388 | 404 | ||
| 389 | if (!se->on_rq) | 405 | if (!se->on_rq) |
| 390 | nr_running++; | 406 | nr_running++; |
| 391 | 407 | ||
| 392 | vslice = __sched_period(nr_running); | 408 | return __sched_period(nr_running); |
| 409 | } | ||
| 410 | |||
| 411 | /* | ||
| 412 | * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in | ||
| 413 | * that it favours >=0 over <0. | ||
| 414 | * | ||
| 415 | * -20 | | ||
| 416 | * | | ||
| 417 | * 0 --------+------- | ||
| 418 | * .' | ||
| 419 | * 19 .' | ||
| 420 | * | ||
| 421 | */ | ||
| 422 | static unsigned long | ||
| 423 | calc_delta_asym(unsigned long delta, struct sched_entity *se) | ||
| 424 | { | ||
| 425 | struct load_weight lw = { | ||
| 426 | .weight = NICE_0_LOAD, | ||
| 427 | .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) | ||
| 428 | }; | ||
| 393 | 429 | ||
| 394 | for_each_sched_entity(se) { | 430 | for_each_sched_entity(se) { |
| 395 | cfs_rq = cfs_rq_of(se); | 431 | struct load_weight *se_lw = &se->load; |
| 432 | unsigned long rw = cfs_rq_of(se)->load.weight; | ||
| 433 | |||
| 434 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 435 | struct cfs_rq *cfs_rq = se->my_q; | ||
| 436 | struct task_group *tg = NULL; | ||
| 437 | |||
| 438 | if (cfs_rq) | ||
| 439 | tg = cfs_rq->tg; | ||
| 440 | |||
| 441 | if (tg && tg->shares < NICE_0_LOAD) { | ||
| 442 | /* | ||
| 443 | * scale shares to what it would have been had | ||
| 444 | * tg->weight been NICE_0_LOAD: | ||
| 445 | * | ||
| 446 | * weight = 1024 * shares / tg->weight | ||
| 447 | */ | ||
| 448 | lw.weight *= se->load.weight; | ||
| 449 | lw.weight /= tg->shares; | ||
| 450 | |||
| 451 | lw.inv_weight = 0; | ||
| 452 | |||
| 453 | se_lw = &lw; | ||
| 454 | rw += lw.weight - se->load.weight; | ||
| 455 | } else | ||
| 456 | #endif | ||
| 396 | 457 | ||
| 397 | weight = cfs_rq->load.weight; | 458 | if (se->load.weight < NICE_0_LOAD) { |
| 398 | if (!se->on_rq) | 459 | se_lw = &lw; |
| 399 | weight += se->load.weight; | 460 | rw += NICE_0_LOAD - se->load.weight; |
| 461 | } | ||
| 400 | 462 | ||
| 401 | vslice *= NICE_0_LOAD; | 463 | delta = calc_delta_mine(delta, rw, se_lw); |
| 402 | do_div(vslice, weight); | ||
| 403 | } | 464 | } |
| 404 | 465 | ||
| 405 | return vslice; | 466 | return delta; |
| 406 | } | 467 | } |
| 407 | 468 | ||
| 408 | /* | 469 | /* |
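A toy, single-level version of the clamping done by calc_delta_asym() above (group scaling and the fixed-point details of calc_delta_mine() are left out; the nice -5 and nice +5 weights, 3121 and 335, are quoted from the usual prio_to_weight table purely for illustration):

#include <stdio.h>

#define NICE_0_LOAD 1024UL

static unsigned long toy_delta_asym(unsigned long delta,
                                    unsigned long w, unsigned long rw)
{
        /* entities lighter than NICE_0_LOAD are treated as if they weighed NICE_0_LOAD */
        if (w < NICE_0_LOAD) {
                rw += NICE_0_LOAD - w;
                w = NICE_0_LOAD;
        }
        /* delta *= rw / w, as calc_delta_mine(delta, rw, se_lw) would do */
        return (unsigned long)((unsigned long long)delta * rw / w);
}

int main(void)
{
        unsigned long gran = 5000000UL;        /* wakeup granularity, ns */

        /* each entity shares the runqueue with one nice-0 task */
        printf("nice -5: %lu ns\n", toy_delta_asym(gran, 3121, 3121 + 1024));
        printf("nice +5: %lu ns\n", toy_delta_asym(gran,  335,  335 + 1024));
        return 0;
}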
| @@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 419 | 480 | ||
| 420 | curr->sum_exec_runtime += delta_exec; | 481 | curr->sum_exec_runtime += delta_exec; |
| 421 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 482 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
| 422 | delta_exec_weighted = delta_exec; | 483 | delta_exec_weighted = calc_delta_fair(delta_exec, curr); |
| 423 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { | ||
| 424 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, | ||
| 425 | &curr->load); | ||
| 426 | } | ||
| 427 | curr->vruntime += delta_exec_weighted; | 484 | curr->vruntime += delta_exec_weighted; |
| 428 | } | 485 | } |
| 429 | 486 | ||
| @@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 510 | * Scheduling class queueing methods: | 567 | * Scheduling class queueing methods: |
| 511 | */ | 568 | */ |
| 512 | 569 | ||
| 570 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
| 571 | static void | ||
| 572 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 573 | { | ||
| 574 | cfs_rq->task_weight += weight; | ||
| 575 | } | ||
| 576 | #else | ||
| 577 | static inline void | ||
| 578 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
| 579 | { | ||
| 580 | } | ||
| 581 | #endif | ||
| 582 | |||
| 513 | static void | 583 | static void |
| 514 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 584 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 515 | { | 585 | { |
| 516 | update_load_add(&cfs_rq->load, se->load.weight); | 586 | update_load_add(&cfs_rq->load, se->load.weight); |
| 587 | if (!parent_entity(se)) | ||
| 588 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 589 | if (entity_is_task(se)) | ||
| 590 | add_cfs_task_weight(cfs_rq, se->load.weight); | ||
| 517 | cfs_rq->nr_running++; | 591 | cfs_rq->nr_running++; |
| 518 | se->on_rq = 1; | 592 | se->on_rq = 1; |
| 519 | list_add(&se->group_node, &cfs_rq->tasks); | 593 | list_add(&se->group_node, &cfs_rq->tasks); |
| @@ -523,6 +597,10 @@ static void | |||
| 523 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 597 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 524 | { | 598 | { |
| 525 | update_load_sub(&cfs_rq->load, se->load.weight); | 599 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 600 | if (!parent_entity(se)) | ||
| 601 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | ||
| 602 | if (entity_is_task(se)) | ||
| 603 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
| 526 | cfs_rq->nr_running--; | 604 | cfs_rq->nr_running--; |
| 527 | se->on_rq = 0; | 605 | se->on_rq = 0; |
| 528 | list_del_init(&se->group_node); | 606 | list_del_init(&se->group_node); |
| @@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 609 | 687 | ||
| 610 | if (!initial) { | 688 | if (!initial) { |
| 611 | /* sleeps up to a single latency don't count. */ | 689 | /* sleeps up to a single latency don't count. */ |
| 612 | if (sched_feat(NEW_FAIR_SLEEPERS)) | 690 | if (sched_feat(NEW_FAIR_SLEEPERS)) { |
| 613 | vruntime -= sysctl_sched_latency; | 691 | unsigned long thresh = sysctl_sched_latency; |
| 692 | |||
| 693 | /* | ||
| 694 | * convert the sleeper threshold into virtual time | ||
| 695 | */ | ||
| 696 | if (sched_feat(NORMALIZED_SLEEPER)) | ||
| 697 | thresh = calc_delta_fair(thresh, se); | ||
| 698 | |||
| 699 | vruntime -= thresh; | ||
| 700 | } | ||
| 614 | 701 | ||
| 615 | /* ensure we never gain time by being placed backwards. */ | 702 | /* ensure we never gain time by being placed backwards. */ |
| 616 | vruntime = max_vruntime(se->vruntime, vruntime); | 703 | vruntime = max_vruntime(se->vruntime, vruntime); |
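A toy calculation of the NORMALIZED_SLEEPER conversion added above, assuming a 20 ms sysctl_sched_latency and this patch's calc_delta_fair() scaling (delta *= rw / w); the runqueue weights are examples only:

#include <stdio.h>

int main(void)
{
        unsigned long long latency = 20000000ULL;  /* assumed sysctl_sched_latency, ns */
        unsigned long long w = 1024;               /* nice-0 sleeper */
        unsigned long long rw;

        /* the wall-clock threshold expressed in the virtual time vruntime runs in */
        for (rw = 1024; rw <= 4096; rw += 1024)
                printf("rq weight %4llu -> sleeper credit %llu virtual ns\n",
                       rw, latency * rw / w);
        return 0;
}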
| @@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | |||
| 639 | __enqueue_entity(cfs_rq, se); | 726 | __enqueue_entity(cfs_rq, se); |
| 640 | } | 727 | } |
| 641 | 728 | ||
| 642 | static void update_avg(u64 *avg, u64 sample) | ||
| 643 | { | ||
| 644 | s64 diff = sample - *avg; | ||
| 645 | *avg += diff >> 3; | ||
| 646 | } | ||
| 647 | |||
| 648 | static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 649 | { | ||
| 650 | if (!se->last_wakeup) | ||
| 651 | return; | ||
| 652 | |||
| 653 | update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); | ||
| 654 | se->last_wakeup = 0; | ||
| 655 | } | ||
| 656 | |||
| 657 | static void | 729 | static void |
| 658 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 730 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
| 659 | { | 731 | { |
| @@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
| 664 | 736 | ||
| 665 | update_stats_dequeue(cfs_rq, se); | 737 | update_stats_dequeue(cfs_rq, se); |
| 666 | if (sleep) { | 738 | if (sleep) { |
| 667 | update_avg_stats(cfs_rq, se); | ||
| 668 | #ifdef CONFIG_SCHEDSTATS | 739 | #ifdef CONFIG_SCHEDSTATS |
| 669 | if (entity_is_task(se)) { | 740 | if (entity_is_task(se)) { |
| 670 | struct task_struct *tsk = task_of(se); | 741 | struct task_struct *tsk = task_of(se); |
| @@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 726 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 797 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 727 | } | 798 | } |
| 728 | 799 | ||
| 729 | static int | ||
| 730 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | ||
| 731 | |||
| 732 | static struct sched_entity * | 800 | static struct sched_entity * |
| 733 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) | 801 | pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 734 | { | 802 | { |
| 735 | if (!cfs_rq->next) | 803 | struct rq *rq = rq_of(cfs_rq); |
| 736 | return se; | 804 | u64 pair_slice = rq->clock - cfs_rq->pair_start; |
| 737 | 805 | ||
| 738 | if (wakeup_preempt_entity(cfs_rq->next, se) != 0) | 806 | if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { |
| 807 | cfs_rq->pair_start = rq->clock; | ||
| 739 | return se; | 808 | return se; |
| 809 | } | ||
| 740 | 810 | ||
| 741 | return cfs_rq->next; | 811 | return cfs_rq->next; |
| 742 | } | 812 | } |
| @@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 835 | hrtick_start(rq, delta, requeue); | 905 | hrtick_start(rq, delta, requeue); |
| 836 | } | 906 | } |
| 837 | } | 907 | } |
| 838 | #else | 908 | #else /* !CONFIG_SCHED_HRTICK */ |
| 839 | static inline void | 909 | static inline void |
| 840 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | 910 | hrtick_start_fair(struct rq *rq, struct task_struct *p) |
| 841 | { | 911 | { |
| @@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
| 976 | } | 1046 | } |
| 977 | return cpu; | 1047 | return cpu; |
| 978 | } | 1048 | } |
| 979 | #else | 1049 | #else /* !ARCH_HAS_SCHED_WAKE_IDLE */ |
| 980 | static inline int wake_idle(int cpu, struct task_struct *p) | 1050 | static inline int wake_idle(int cpu, struct task_struct *p) |
| 981 | { | 1051 | { |
| 982 | return cpu; | 1052 | return cpu; |
| @@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p) | |||
| 987 | 1057 | ||
| 988 | static const struct sched_class fair_sched_class; | 1058 | static const struct sched_class fair_sched_class; |
| 989 | 1059 | ||
| 1060 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1061 | /* | ||
| 1062 | * effective_load() calculates the load change as seen from the root_task_group | ||
| 1063 | * | ||
| 1064 | * Adding load to a group doesn't make a group heavier, but can cause movement | ||
| 1065 | * of group shares between cpus. Assuming the shares were perfectly aligned one | ||
| 1066 | * can calculate the shift in shares. | ||
| 1067 | * | ||
| 1068 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
| 1069 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
| 1070 | * this change. | ||
| 1071 | * | ||
| 1072 | * We compensate for this by not only taking the current delta into account, but | ||
| 1073 | * also considering the delta between when the shares were last adjusted and | ||
| 1074 | * now. | ||
| 1075 | * | ||
| 1076 | * We still saw a performance dip; some tracing showed us that between | ||
| 1077 | * cgroup:/ and cgroup:/foo balancing, the number of affine wakeups increased | ||
| 1078 | * significantly. Therefore try to bias the error in the direction of failing | ||
| 1079 | * the affine wakeup. | ||
| 1080 | * | ||
| 1081 | */ | ||
| 1082 | static long effective_load(struct task_group *tg, int cpu, | ||
| 1083 | long wl, long wg) | ||
| 1084 | { | ||
| 1085 | struct sched_entity *se = tg->se[cpu]; | ||
| 1086 | long more_w; | ||
| 1087 | |||
| 1088 | if (!tg->parent) | ||
| 1089 | return wl; | ||
| 1090 | |||
| 1091 | /* | ||
| 1092 | * By not taking the decrease of shares on the other cpu into | ||
| 1093 | * account our error leans towards reducing the affine wakeups. | ||
| 1094 | */ | ||
| 1095 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
| 1096 | return wl; | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * Instead of using this increment, also add the difference | ||
| 1100 | * between when the shares were last updated and now. | ||
| 1101 | */ | ||
| 1102 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
| 1103 | wl += more_w; | ||
| 1104 | wg += more_w; | ||
| 1105 | |||
| 1106 | for_each_sched_entity(se) { | ||
| 1107 | #define D(n) (likely(n) ? (n) : 1) | ||
| 1108 | |||
| 1109 | long S, rw, s, a, b; | ||
| 1110 | |||
| 1111 | S = se->my_q->tg->shares; | ||
| 1112 | s = se->my_q->shares; | ||
| 1113 | rw = se->my_q->rq_weight; | ||
| 1114 | |||
| 1115 | a = S*(rw + wl); | ||
| 1116 | b = S*rw + s*wg; | ||
| 1117 | |||
| 1118 | wl = s*(a-b)/D(b); | ||
| 1119 | /* | ||
| 1120 | * Assume the group is already running and will | ||
| 1121 | * thus already be accounted for in the weight. | ||
| 1122 | * | ||
| 1123 | * That is, moving shares between CPUs, does not | ||
| 1124 | * alter the group weight. | ||
| 1125 | */ | ||
| 1126 | wg = 0; | ||
| 1127 | #undef D | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | return wl; | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | #else | ||
| 1134 | |||
| 1135 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | ||
| 1136 | unsigned long wl, unsigned long wg) | ||
| 1137 | { | ||
| 1138 | return wl; | ||
| 1139 | } | ||
| 1140 | |||
| 1141 | #endif | ||
| 1142 | |||
| 990 | static int | 1143 | static int |
| 991 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | 1144 | wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, |
| 992 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | 1145 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, |
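One iteration of the effective_load() loop worked through with invented numbers (S, s, rw, wl and wg are all illustrative; the D() guard against a zero divisor is kept):

#include <stdio.h>

int main(void)
{
        long S  = 1024;   /* tg->shares: the group's weight at the parent level */
        long s  = 256;    /* shares currently held by this cpu's part of the group */
        long rw = 512;    /* weight queued on this cpu's group cfs_rq */
        long wl = 512;    /* weight being added on this cpu */
        long wg = 512;    /* change of the group's total weight */
        long a, b;

        a = S * (rw + wl);
        b = S * rw + s * wg;

        /* wl = s*(a-b)/D(b): the share shift as seen one level up */
        printf("load change at the parent level: %ld\n", s * (a - b) / (b ? b : 1));
        return 0;
}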
| @@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
| 994 | unsigned int imbalance) | 1147 | unsigned int imbalance) |
| 995 | { | 1148 | { |
| 996 | struct task_struct *curr = this_rq->curr; | 1149 | struct task_struct *curr = this_rq->curr; |
| 1150 | struct task_group *tg; | ||
| 997 | unsigned long tl = this_load; | 1151 | unsigned long tl = this_load; |
| 998 | unsigned long tl_per_task; | 1152 | unsigned long tl_per_task; |
| 1153 | unsigned long weight; | ||
| 999 | int balanced; | 1154 | int balanced; |
| 1000 | 1155 | ||
| 1001 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) | 1156 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) |
| @@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, | |||
| 1006 | * effect of the currently running task from the load | 1161 | * effect of the currently running task from the load |
| 1007 | * of the current CPU: | 1162 | * of the current CPU: |
| 1008 | */ | 1163 | */ |
| 1009 | if (sync) | 1164 | if (sync) { |
| 1010 | tl -= current->se.load.weight; | 1165 | tg = task_group(current); |
| 1166 | weight = current->se.load.weight; | ||
| 1167 | |||
| 1168 | tl += effective_load(tg, this_cpu, -weight, -weight); | ||
| 1169 | load += effective_load(tg, prev_cpu, 0, -weight); | ||
| 1170 | } | ||
| 1011 | 1171 | ||
| 1012 | balanced = 100*(tl + p->se.load.weight) <= imbalance*load; | 1172 | tg = task_group(p); |
| 1173 | weight = p->se.load.weight; | ||
| 1174 | |||
| 1175 | balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | ||
| 1176 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); | ||
| 1013 | 1177 | ||
| 1014 | /* | 1178 | /* |
| 1015 | * If the currently running task will sleep within | 1179 | * If the currently running task will sleep within |
| 1016 | * a reasonable amount of time then attract this newly | 1180 | * a reasonable amount of time then attract this newly |
| 1017 | * woken task: | 1181 | * woken task: |
| 1018 | */ | 1182 | */ |
| 1019 | if (sync && balanced && curr->sched_class == &fair_sched_class) { | 1183 | if (sync && balanced) { |
| 1020 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && | 1184 | if (curr->se.avg_overlap < sysctl_sched_migration_cost && |
| 1021 | p->se.avg_overlap < sysctl_sched_migration_cost) | 1185 | p->se.avg_overlap < sysctl_sched_migration_cost) |
| 1022 | return 1; | 1186 | return 1; |
| 1023 | } | 1187 | } |
| 1024 | 1188 | ||
| @@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se) | |||
| 1111 | unsigned long gran = sysctl_sched_wakeup_granularity; | 1275 | unsigned long gran = sysctl_sched_wakeup_granularity; |
| 1112 | 1276 | ||
| 1113 | /* | 1277 | /* |
| 1114 | * More easily preempt - nice tasks, while not making | 1278 | * More easily preempt - nice tasks, while not making it harder for |
| 1115 | * it harder for + nice tasks. | 1279 | * + nice tasks. |
| 1116 | */ | 1280 | */ |
| 1117 | if (unlikely(se->load.weight > NICE_0_LOAD)) | 1281 | if (sched_feat(ASYM_GRAN)) |
| 1118 | gran = calc_delta_fair(gran, &se->load); | 1282 | gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); |
| 1283 | else | ||
| 1284 | gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); | ||
| 1119 | 1285 | ||
| 1120 | return gran; | 1286 | return gran; |
| 1121 | } | 1287 | } |
| @@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1177 | return; | 1343 | return; |
| 1178 | } | 1344 | } |
| 1179 | 1345 | ||
| 1180 | se->last_wakeup = se->sum_exec_runtime; | ||
| 1181 | if (unlikely(se == pse)) | 1346 | if (unlikely(se == pse)) |
| 1182 | return; | 1347 | return; |
| 1183 | 1348 | ||
| @@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | |||
| 1275 | struct task_struct *p = NULL; | 1440 | struct task_struct *p = NULL; |
| 1276 | struct sched_entity *se; | 1441 | struct sched_entity *se; |
| 1277 | 1442 | ||
| 1278 | if (next == &cfs_rq->tasks) | 1443 | while (next != &cfs_rq->tasks) { |
| 1279 | return NULL; | ||
| 1280 | |||
| 1281 | /* Skip over entities that are not tasks */ | ||
| 1282 | do { | ||
| 1283 | se = list_entry(next, struct sched_entity, group_node); | 1444 | se = list_entry(next, struct sched_entity, group_node); |
| 1284 | next = next->next; | 1445 | next = next->next; |
| 1285 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
| 1286 | 1446 | ||
| 1287 | if (next == &cfs_rq->tasks) | 1447 | /* Skip over entities that are not tasks */ |
| 1288 | return NULL; | 1448 | if (entity_is_task(se)) { |
| 1449 | p = task_of(se); | ||
| 1450 | break; | ||
| 1451 | } | ||
| 1452 | } | ||
| 1289 | 1453 | ||
| 1290 | cfs_rq->balance_iterator = next; | 1454 | cfs_rq->balance_iterator = next; |
| 1291 | |||
| 1292 | if (entity_is_task(se)) | ||
| 1293 | p = task_of(se); | ||
| 1294 | |||
| 1295 | return p; | 1455 | return p; |
| 1296 | } | 1456 | } |
| 1297 | 1457 | ||
| @@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
| 1309 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); | 1469 | return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); |
| 1310 | } | 1470 | } |
| 1311 | 1471 | ||
| 1312 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1472 | static unsigned long |
| 1313 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 1473 | __load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1474 | unsigned long max_load_move, struct sched_domain *sd, | ||
| 1475 | enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, | ||
| 1476 | struct cfs_rq *cfs_rq) | ||
| 1314 | { | 1477 | { |
| 1315 | struct sched_entity *curr; | 1478 | struct rq_iterator cfs_rq_iterator; |
| 1316 | struct task_struct *p; | ||
| 1317 | |||
| 1318 | if (!cfs_rq->nr_running || !first_fair(cfs_rq)) | ||
| 1319 | return MAX_PRIO; | ||
| 1320 | |||
| 1321 | curr = cfs_rq->curr; | ||
| 1322 | if (!curr) | ||
| 1323 | curr = __pick_next_entity(cfs_rq); | ||
| 1324 | 1479 | ||
| 1325 | p = task_of(curr); | 1480 | cfs_rq_iterator.start = load_balance_start_fair; |
| 1481 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1482 | cfs_rq_iterator.arg = cfs_rq; | ||
| 1326 | 1483 | ||
| 1327 | return p->prio; | 1484 | return balance_tasks(this_rq, this_cpu, busiest, |
| 1485 | max_load_move, sd, idle, all_pinned, | ||
| 1486 | this_best_prio, &cfs_rq_iterator); | ||
| 1328 | } | 1487 | } |
| 1329 | #endif | ||
| 1330 | 1488 | ||
| 1489 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1331 | static unsigned long | 1490 | static unsigned long |
| 1332 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1491 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1333 | unsigned long max_load_move, | 1492 | unsigned long max_load_move, |
| 1334 | struct sched_domain *sd, enum cpu_idle_type idle, | 1493 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 1335 | int *all_pinned, int *this_best_prio) | 1494 | int *all_pinned, int *this_best_prio) |
| 1336 | { | 1495 | { |
| 1337 | struct cfs_rq *busy_cfs_rq; | ||
| 1338 | long rem_load_move = max_load_move; | 1496 | long rem_load_move = max_load_move; |
| 1339 | struct rq_iterator cfs_rq_iterator; | 1497 | int busiest_cpu = cpu_of(busiest); |
| 1340 | 1498 | struct task_group *tg; | |
| 1341 | cfs_rq_iterator.start = load_balance_start_fair; | ||
| 1342 | cfs_rq_iterator.next = load_balance_next_fair; | ||
| 1343 | 1499 | ||
| 1344 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1500 | rcu_read_lock(); |
| 1345 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1501 | update_h_load(busiest_cpu); |
| 1346 | struct cfs_rq *this_cfs_rq; | ||
| 1347 | long imbalance; | ||
| 1348 | unsigned long maxload; | ||
| 1349 | 1502 | ||
| 1350 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1503 | list_for_each_entry(tg, &task_groups, list) { |
| 1504 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; | ||
| 1505 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | ||
| 1506 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | ||
| 1507 | u64 rem_load, moved_load; | ||
| 1351 | 1508 | ||
| 1352 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1509 | /* |
| 1353 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1510 | * empty group |
| 1354 | if (imbalance <= 0) | 1511 | */ |
| 1512 | if (!busiest_cfs_rq->task_weight) | ||
| 1355 | continue; | 1513 | continue; |
| 1356 | 1514 | ||
| 1357 | /* Don't pull more than imbalance/2 */ | 1515 | rem_load = (u64)rem_load_move * busiest_weight; |
| 1358 | imbalance /= 2; | 1516 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
| 1359 | maxload = min(rem_load_move, imbalance); | ||
| 1360 | 1517 | ||
| 1361 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1518 | moved_load = __load_balance_fair(this_rq, this_cpu, busiest, |
| 1362 | #else | 1519 | rem_load, sd, idle, all_pinned, this_best_prio, |
| 1363 | # define maxload rem_load_move | 1520 | tg->cfs_rq[busiest_cpu]); |
| 1364 | #endif | 1521 | |
| 1365 | /* | 1522 | if (!moved_load) |
| 1366 | * pass busy_cfs_rq argument into | 1523 | continue; |
| 1367 | * load_balance_[start|next]_fair iterators | 1524 | |
| 1368 | */ | 1525 | moved_load *= busiest_h_load; |
| 1369 | cfs_rq_iterator.arg = busy_cfs_rq; | 1526 | moved_load = div_u64(moved_load, busiest_weight + 1); |
| 1370 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | ||
| 1371 | maxload, sd, idle, all_pinned, | ||
| 1372 | this_best_prio, | ||
| 1373 | &cfs_rq_iterator); | ||
| 1374 | 1527 | ||
| 1375 | if (rem_load_move <= 0) | 1528 | rem_load_move -= moved_load; |
| 1529 | if (rem_load_move < 0) | ||
| 1376 | break; | 1530 | break; |
| 1377 | } | 1531 | } |
| 1532 | rcu_read_unlock(); | ||
| 1378 | 1533 | ||
| 1379 | return max_load_move - rem_load_move; | 1534 | return max_load_move - rem_load_move; |
| 1380 | } | 1535 | } |
| 1536 | #else | ||
| 1537 | static unsigned long | ||
| 1538 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 1539 | unsigned long max_load_move, | ||
| 1540 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 1541 | int *all_pinned, int *this_best_prio) | ||
| 1542 | { | ||
| 1543 | return __load_balance_fair(this_rq, this_cpu, busiest, | ||
| 1544 | max_load_move, sd, idle, all_pinned, | ||
| 1545 | this_best_prio, &busiest->cfs); | ||
| 1546 | } | ||
| 1547 | #endif | ||
| 1381 | 1548 | ||
| 1382 | static int | 1549 | static int |
| 1383 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1550 | move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
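A back-of-the-envelope run of the two scalings in the group-scheduling path above (h_load and the weights are invented; the +1 terms guard against division by zero exactly as in the patch):

#include <stdio.h>

int main(void)
{
        unsigned long long rem_load_move  = 150;   /* load still to move, in root units */
        unsigned long long busiest_h_load = 300;   /* the group's contribution to root load */
        unsigned long long busiest_weight = 2048;  /* weight inside the group's cfs_rq */
        unsigned long long rem_load, moved_inside, moved_load;

        /* translate the remaining root-level load into the group's own units */
        rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1);

        /* pretend balance_tasks() managed to move this much group-internal weight */
        moved_inside = 1024;

        /* translate what was actually moved back into root-level units */
        moved_load = moved_inside * busiest_h_load / (busiest_weight + 1);

        printf("may pull up to %llu group units; moved %llu -> %llu root units\n",
               rem_load, moved_inside, moved_load);
        return 0;
}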
| @@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 1402 | 1569 | ||
| 1403 | return 0; | 1570 | return 0; |
| 1404 | } | 1571 | } |
| 1405 | #endif | 1572 | #endif /* CONFIG_SMP */ |
| 1406 | 1573 | ||
| 1407 | /* | 1574 | /* |
| 1408 | * scheduler tick hitting a task of our scheduling class: | 1575 | * scheduler tick hitting a task of our scheduling class: |
