diff options
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 350 |
1 files changed, 148 insertions, 202 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5a5ea2cd924f..217e4a9393e4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -35,8 +35,8 @@ | |||
35 | * (to see the precise effective timeslice length of your workload, | 35 | * (to see the precise effective timeslice length of your workload, |
36 | * run vmstat and monitor the context-switches (cs) field) | 36 | * run vmstat and monitor the context-switches (cs) field) |
37 | */ | 37 | */ |
38 | unsigned int sysctl_sched_latency = 5000000ULL; | 38 | unsigned int sysctl_sched_latency = 6000000ULL; |
39 | unsigned int normalized_sysctl_sched_latency = 5000000ULL; | 39 | unsigned int normalized_sysctl_sched_latency = 6000000ULL; |
40 | 40 | ||
41 | /* | 41 | /* |
42 | * The initial- and re-scaling of tunables is configurable | 42 | * The initial- and re-scaling of tunables is configurable |
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling | |||
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Minimal preemption granularity for CPU-bound tasks: | 54 | * Minimal preemption granularity for CPU-bound tasks: |
55 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | 55 | * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) |
56 | */ | 56 | */ |
57 | unsigned int sysctl_sched_min_granularity = 1000000ULL; | 57 | unsigned int sysctl_sched_min_granularity = 2000000ULL; |
58 | unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; | 58 | unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; |
59 | 59 | ||
60 | /* | 60 | /* |
61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
62 | */ | 62 | */ |
63 | static unsigned int sched_nr_latency = 5; | 63 | static unsigned int sched_nr_latency = 3; |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * After fork, child runs first. If set to 0 (default) then | 66 | * After fork, child runs first. If set to 0 (default) then |
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
505 | { | 505 | { |
506 | unsigned long delta_exec_weighted; | 506 | unsigned long delta_exec_weighted; |
507 | 507 | ||
508 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); | 508 | schedstat_set(curr->statistics.exec_max, |
509 | max((u64)delta_exec, curr->statistics.exec_max)); | ||
509 | 510 | ||
510 | curr->sum_exec_runtime += delta_exec; | 511 | curr->sum_exec_runtime += delta_exec; |
511 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 512 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
548 | static inline void | 549 | static inline void |
549 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 550 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
550 | { | 551 | { |
551 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); | 552 | schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); |
552 | } | 553 | } |
553 | 554 | ||
554 | /* | 555 | /* |
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
567 | static void | 568 | static void |
568 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 569 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
569 | { | 570 | { |
570 | schedstat_set(se->wait_max, max(se->wait_max, | 571 | schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, |
571 | rq_of(cfs_rq)->clock - se->wait_start)); | 572 | rq_of(cfs_rq)->clock - se->statistics.wait_start)); |
572 | schedstat_set(se->wait_count, se->wait_count + 1); | 573 | schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); |
573 | schedstat_set(se->wait_sum, se->wait_sum + | 574 | schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + |
574 | rq_of(cfs_rq)->clock - se->wait_start); | 575 | rq_of(cfs_rq)->clock - se->statistics.wait_start); |
575 | #ifdef CONFIG_SCHEDSTATS | 576 | #ifdef CONFIG_SCHEDSTATS |
576 | if (entity_is_task(se)) { | 577 | if (entity_is_task(se)) { |
577 | trace_sched_stat_wait(task_of(se), | 578 | trace_sched_stat_wait(task_of(se), |
578 | rq_of(cfs_rq)->clock - se->wait_start); | 579 | rq_of(cfs_rq)->clock - se->statistics.wait_start); |
579 | } | 580 | } |
580 | #endif | 581 | #endif |
581 | schedstat_set(se->wait_start, 0); | 582 | schedstat_set(se->statistics.wait_start, 0); |
582 | } | 583 | } |
583 | 584 | ||
584 | static inline void | 585 | static inline void |
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
657 | if (entity_is_task(se)) | 658 | if (entity_is_task(se)) |
658 | tsk = task_of(se); | 659 | tsk = task_of(se); |
659 | 660 | ||
660 | if (se->sleep_start) { | 661 | if (se->statistics.sleep_start) { |
661 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 662 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; |
662 | 663 | ||
663 | if ((s64)delta < 0) | 664 | if ((s64)delta < 0) |
664 | delta = 0; | 665 | delta = 0; |
665 | 666 | ||
666 | if (unlikely(delta > se->sleep_max)) | 667 | if (unlikely(delta > se->statistics.sleep_max)) |
667 | se->sleep_max = delta; | 668 | se->statistics.sleep_max = delta; |
668 | 669 | ||
669 | se->sleep_start = 0; | 670 | se->statistics.sleep_start = 0; |
670 | se->sum_sleep_runtime += delta; | 671 | se->statistics.sum_sleep_runtime += delta; |
671 | 672 | ||
672 | if (tsk) { | 673 | if (tsk) { |
673 | account_scheduler_latency(tsk, delta >> 10, 1); | 674 | account_scheduler_latency(tsk, delta >> 10, 1); |
674 | trace_sched_stat_sleep(tsk, delta); | 675 | trace_sched_stat_sleep(tsk, delta); |
675 | } | 676 | } |
676 | } | 677 | } |
677 | if (se->block_start) { | 678 | if (se->statistics.block_start) { |
678 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 679 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; |
679 | 680 | ||
680 | if ((s64)delta < 0) | 681 | if ((s64)delta < 0) |
681 | delta = 0; | 682 | delta = 0; |
682 | 683 | ||
683 | if (unlikely(delta > se->block_max)) | 684 | if (unlikely(delta > se->statistics.block_max)) |
684 | se->block_max = delta; | 685 | se->statistics.block_max = delta; |
685 | 686 | ||
686 | se->block_start = 0; | 687 | se->statistics.block_start = 0; |
687 | se->sum_sleep_runtime += delta; | 688 | se->statistics.sum_sleep_runtime += delta; |
688 | 689 | ||
689 | if (tsk) { | 690 | if (tsk) { |
690 | if (tsk->in_iowait) { | 691 | if (tsk->in_iowait) { |
691 | se->iowait_sum += delta; | 692 | se->statistics.iowait_sum += delta; |
692 | se->iowait_count++; | 693 | se->statistics.iowait_count++; |
693 | trace_sched_stat_iowait(tsk, delta); | 694 | trace_sched_stat_iowait(tsk, delta); |
694 | } | 695 | } |
695 | 696 | ||
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
737 | vruntime += sched_vslice(cfs_rq, se); | 738 | vruntime += sched_vslice(cfs_rq, se); |
738 | 739 | ||
739 | /* sleeps up to a single latency don't count. */ | 740 | /* sleeps up to a single latency don't count. */ |
740 | if (!initial && sched_feat(FAIR_SLEEPERS)) { | 741 | if (!initial) { |
741 | unsigned long thresh = sysctl_sched_latency; | 742 | unsigned long thresh = sysctl_sched_latency; |
742 | 743 | ||
743 | /* | 744 | /* |
744 | * Convert the sleeper threshold into virtual time. | ||
745 | * SCHED_IDLE is a special sub-class. We care about | ||
746 | * fairness only relative to other SCHED_IDLE tasks, | ||
747 | * all of which have the same weight. | ||
748 | */ | ||
749 | if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || | ||
750 | task_of(se)->policy != SCHED_IDLE)) | ||
751 | thresh = calc_delta_fair(thresh, se); | ||
752 | |||
753 | /* | ||
754 | * Halve their sleep time's effect, to allow | 745 | * Halve their sleep time's effect, to allow |
755 | * for a gentler effect of sleepers: | 746 | * for a gentler effect of sleepers: |
756 | */ | 747 | */ |
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
766 | se->vruntime = vruntime; | 757 | se->vruntime = vruntime; |
767 | } | 758 | } |
768 | 759 | ||
769 | #define ENQUEUE_WAKEUP 1 | ||
770 | #define ENQUEUE_MIGRATE 2 | ||
771 | |||
772 | static void | 760 | static void |
773 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 761 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
774 | { | 762 | { |
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
776 | * Update the normalized vruntime before updating min_vruntime | 764 | * Update the normalized vruntime before updating min_vruntime |
777 | * through calling update_curr(). | 765 | * through calling update_curr(). |
778 | */ | 766 | */ |
779 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) | 767 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) |
780 | se->vruntime += cfs_rq->min_vruntime; | 768 | se->vruntime += cfs_rq->min_vruntime; |
781 | 769 | ||
782 | /* | 770 | /* |
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
812 | } | 800 | } |
813 | 801 | ||
814 | static void | 802 | static void |
815 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 803 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
816 | { | 804 | { |
817 | /* | 805 | /* |
818 | * Update run-time statistics of the 'current'. | 806 | * Update run-time statistics of the 'current'. |
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
820 | update_curr(cfs_rq); | 808 | update_curr(cfs_rq); |
821 | 809 | ||
822 | update_stats_dequeue(cfs_rq, se); | 810 | update_stats_dequeue(cfs_rq, se); |
823 | if (sleep) { | 811 | if (flags & DEQUEUE_SLEEP) { |
824 | #ifdef CONFIG_SCHEDSTATS | 812 | #ifdef CONFIG_SCHEDSTATS |
825 | if (entity_is_task(se)) { | 813 | if (entity_is_task(se)) { |
826 | struct task_struct *tsk = task_of(se); | 814 | struct task_struct *tsk = task_of(se); |
827 | 815 | ||
828 | if (tsk->state & TASK_INTERRUPTIBLE) | 816 | if (tsk->state & TASK_INTERRUPTIBLE) |
829 | se->sleep_start = rq_of(cfs_rq)->clock; | 817 | se->statistics.sleep_start = rq_of(cfs_rq)->clock; |
830 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 818 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
831 | se->block_start = rq_of(cfs_rq)->clock; | 819 | se->statistics.block_start = rq_of(cfs_rq)->clock; |
832 | } | 820 | } |
833 | #endif | 821 | #endif |
834 | } | 822 | } |
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
845 | * update can refer to the ->curr item and we need to reflect this | 833 | * update can refer to the ->curr item and we need to reflect this |
846 | * movement in our normalized position. | 834 | * movement in our normalized position. |
847 | */ | 835 | */ |
848 | if (!sleep) | 836 | if (!(flags & DEQUEUE_SLEEP)) |
849 | se->vruntime -= cfs_rq->min_vruntime; | 837 | se->vruntime -= cfs_rq->min_vruntime; |
850 | } | 838 | } |
851 | 839 | ||
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
912 | * when there are only lesser-weight tasks around): | 900 | * when there are only lesser-weight tasks around): |
913 | */ | 901 | */ |
914 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 902 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
915 | se->slice_max = max(se->slice_max, | 903 | se->statistics.slice_max = max(se->statistics.slice_max, |
916 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 904 | se->sum_exec_runtime - se->prev_sum_exec_runtime); |
917 | } | 905 | } |
918 | #endif | 906 | #endif |
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq) | |||
1054 | * then put the task into the rbtree: | 1042 | * then put the task into the rbtree: |
1055 | */ | 1043 | */ |
1056 | static void | 1044 | static void |
1057 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) | 1045 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) |
1058 | { | 1046 | { |
1059 | struct cfs_rq *cfs_rq; | 1047 | struct cfs_rq *cfs_rq; |
1060 | struct sched_entity *se = &p->se; | 1048 | struct sched_entity *se = &p->se; |
1061 | int flags = 0; | ||
1062 | |||
1063 | if (wakeup) | ||
1064 | flags |= ENQUEUE_WAKEUP; | ||
1065 | if (p->state == TASK_WAKING) | ||
1066 | flags |= ENQUEUE_MIGRATE; | ||
1067 | 1049 | ||
1068 | for_each_sched_entity(se) { | 1050 | for_each_sched_entity(se) { |
1069 | if (se->on_rq) | 1051 | if (se->on_rq) |
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) | |||
1081 | * decreased. We remove the task from the rbtree and | 1063 | * decreased. We remove the task from the rbtree and |
1082 | * update the fair scheduling stats: | 1064 | * update the fair scheduling stats: |
1083 | */ | 1065 | */ |
1084 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | 1066 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) |
1085 | { | 1067 | { |
1086 | struct cfs_rq *cfs_rq; | 1068 | struct cfs_rq *cfs_rq; |
1087 | struct sched_entity *se = &p->se; | 1069 | struct sched_entity *se = &p->se; |
1088 | 1070 | ||
1089 | for_each_sched_entity(se) { | 1071 | for_each_sched_entity(se) { |
1090 | cfs_rq = cfs_rq_of(se); | 1072 | cfs_rq = cfs_rq_of(se); |
1091 | dequeue_entity(cfs_rq, se, sleep); | 1073 | dequeue_entity(cfs_rq, se, flags); |
1092 | /* Don't dequeue parent if it has other entities besides us */ | 1074 | /* Don't dequeue parent if it has other entities besides us */ |
1093 | if (cfs_rq->load.weight) | 1075 | if (cfs_rq->load.weight) |
1094 | break; | 1076 | break; |
1095 | sleep = 1; | 1077 | flags |= DEQUEUE_SLEEP; |
1096 | } | 1078 | } |
1097 | 1079 | ||
1098 | hrtick_update(rq); | 1080 | hrtick_update(rq); |
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
1240 | 1222 | ||
1241 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 1223 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
1242 | { | 1224 | { |
1243 | struct task_struct *curr = current; | ||
1244 | unsigned long this_load, load; | 1225 | unsigned long this_load, load; |
1245 | int idx, this_cpu, prev_cpu; | 1226 | int idx, this_cpu, prev_cpu; |
1246 | unsigned long tl_per_task; | 1227 | unsigned long tl_per_task; |
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1255 | load = source_load(prev_cpu, idx); | 1236 | load = source_load(prev_cpu, idx); |
1256 | this_load = target_load(this_cpu, idx); | 1237 | this_load = target_load(this_cpu, idx); |
1257 | 1238 | ||
1258 | if (sync) { | ||
1259 | if (sched_feat(SYNC_LESS) && | ||
1260 | (curr->se.avg_overlap > sysctl_sched_migration_cost || | ||
1261 | p->se.avg_overlap > sysctl_sched_migration_cost)) | ||
1262 | sync = 0; | ||
1263 | } else { | ||
1264 | if (sched_feat(SYNC_MORE) && | ||
1265 | (curr->se.avg_overlap < sysctl_sched_migration_cost && | ||
1266 | p->se.avg_overlap < sysctl_sched_migration_cost)) | ||
1267 | sync = 1; | ||
1268 | } | ||
1269 | |||
1270 | /* | 1239 | /* |
1271 | * If sync wakeup then subtract the (maximum possible) | 1240 | * If sync wakeup then subtract the (maximum possible) |
1272 | * effect of the currently running task from the load | 1241 | * effect of the currently running task from the load |
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1306 | if (sync && balanced) | 1275 | if (sync && balanced) |
1307 | return 1; | 1276 | return 1; |
1308 | 1277 | ||
1309 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | 1278 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); |
1310 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1279 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
1311 | 1280 | ||
1312 | if (balanced || | 1281 | if (balanced || |
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1318 | * there is no bad imbalance. | 1287 | * there is no bad imbalance. |
1319 | */ | 1288 | */ |
1320 | schedstat_inc(sd, ttwu_move_affine); | 1289 | schedstat_inc(sd, ttwu_move_affine); |
1321 | schedstat_inc(p, se.nr_wakeups_affine); | 1290 | schedstat_inc(p, se.statistics.nr_wakeups_affine); |
1322 | 1291 | ||
1323 | return 1; | 1292 | return 1; |
1324 | } | 1293 | } |
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1406 | /* | 1375 | /* |
1407 | * Try and locate an idle CPU in the sched_domain. | 1376 | * Try and locate an idle CPU in the sched_domain. |
1408 | */ | 1377 | */ |
1409 | static int | 1378 | static int select_idle_sibling(struct task_struct *p, int target) |
1410 | select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) | ||
1411 | { | 1379 | { |
1412 | int cpu = smp_processor_id(); | 1380 | int cpu = smp_processor_id(); |
1413 | int prev_cpu = task_cpu(p); | 1381 | int prev_cpu = task_cpu(p); |
1382 | struct sched_domain *sd; | ||
1414 | int i; | 1383 | int i; |
1415 | 1384 | ||
1416 | /* | 1385 | /* |
1417 | * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE | 1386 | * If the task is going to be woken-up on this cpu and if it is |
1418 | * test in select_task_rq_fair) and the prev_cpu is idle then that's | 1387 | * already idle, then it is the right target. |
1419 | * always a better target than the current cpu. | ||
1420 | */ | 1388 | */ |
1421 | if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) | 1389 | if (target == cpu && idle_cpu(cpu)) |
1390 | return cpu; | ||
1391 | |||
1392 | /* | ||
1393 | * If the task is going to be woken-up on the cpu where it previously | ||
1394 | * ran and if it is currently idle, then it the right target. | ||
1395 | */ | ||
1396 | if (target == prev_cpu && idle_cpu(prev_cpu)) | ||
1422 | return prev_cpu; | 1397 | return prev_cpu; |
1423 | 1398 | ||
1424 | /* | 1399 | /* |
1425 | * Otherwise, iterate the domain and find an eligible idle cpu. | 1400 | * Otherwise, iterate the domains and find an eligible idle cpu. |
1426 | */ | 1401 | */ |
1427 | for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { | 1402 | for_each_domain(target, sd) { |
1428 | if (!cpu_rq(i)->cfs.nr_running) { | 1403 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1429 | target = i; | ||
1430 | break; | 1404 | break; |
1405 | |||
1406 | for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { | ||
1407 | if (idle_cpu(i)) { | ||
1408 | target = i; | ||
1409 | break; | ||
1410 | } | ||
1431 | } | 1411 | } |
1412 | |||
1413 | /* | ||
1414 | * Let's stop looking for an idle sibling when we reached | ||
1415 | * the domain that spans the current cpu and prev_cpu. | ||
1416 | */ | ||
1417 | if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && | ||
1418 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | ||
1419 | break; | ||
1432 | } | 1420 | } |
1433 | 1421 | ||
1434 | return target; | 1422 | return target; |
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) | |||
1445 | * | 1433 | * |
1446 | * preempt must be disabled. | 1434 | * preempt must be disabled. |
1447 | */ | 1435 | */ |
1448 | static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | 1436 | static int |
1437 | select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) | ||
1449 | { | 1438 | { |
1450 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 1439 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
1451 | int cpu = smp_processor_id(); | 1440 | int cpu = smp_processor_id(); |
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1456 | int sync = wake_flags & WF_SYNC; | 1445 | int sync = wake_flags & WF_SYNC; |
1457 | 1446 | ||
1458 | if (sd_flag & SD_BALANCE_WAKE) { | 1447 | if (sd_flag & SD_BALANCE_WAKE) { |
1459 | if (sched_feat(AFFINE_WAKEUPS) && | 1448 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) |
1460 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | ||
1461 | want_affine = 1; | 1449 | want_affine = 1; |
1462 | new_cpu = prev_cpu; | 1450 | new_cpu = prev_cpu; |
1463 | } | 1451 | } |
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1491 | } | 1479 | } |
1492 | 1480 | ||
1493 | /* | 1481 | /* |
1494 | * While iterating the domains looking for a spanning | 1482 | * If both cpu and prev_cpu are part of this domain, |
1495 | * WAKE_AFFINE domain, adjust the affine target to any idle cpu | 1483 | * cpu is a valid SD_WAKE_AFFINE target. |
1496 | * in cache sharing domains along the way. | ||
1497 | */ | 1484 | */ |
1498 | if (want_affine) { | 1485 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
1499 | int target = -1; | 1486 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { |
1500 | 1487 | affine_sd = tmp; | |
1501 | /* | 1488 | want_affine = 0; |
1502 | * If both cpu and prev_cpu are part of this domain, | ||
1503 | * cpu is a valid SD_WAKE_AFFINE target. | ||
1504 | */ | ||
1505 | if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) | ||
1506 | target = cpu; | ||
1507 | |||
1508 | /* | ||
1509 | * If there's an idle sibling in this domain, make that | ||
1510 | * the wake_affine target instead of the current cpu. | ||
1511 | */ | ||
1512 | if (tmp->flags & SD_SHARE_PKG_RESOURCES) | ||
1513 | target = select_idle_sibling(p, tmp, target); | ||
1514 | |||
1515 | if (target >= 0) { | ||
1516 | if (tmp->flags & SD_WAKE_AFFINE) { | ||
1517 | affine_sd = tmp; | ||
1518 | want_affine = 0; | ||
1519 | } | ||
1520 | cpu = target; | ||
1521 | } | ||
1522 | } | 1489 | } |
1523 | 1490 | ||
1524 | if (!want_sd && !want_affine) | 1491 | if (!want_sd && !want_affine) |
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1531 | sd = tmp; | 1498 | sd = tmp; |
1532 | } | 1499 | } |
1533 | 1500 | ||
1501 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1534 | if (sched_feat(LB_SHARES_UPDATE)) { | 1502 | if (sched_feat(LB_SHARES_UPDATE)) { |
1535 | /* | 1503 | /* |
1536 | * Pick the largest domain to update shares over | 1504 | * Pick the largest domain to update shares over |
1537 | */ | 1505 | */ |
1538 | tmp = sd; | 1506 | tmp = sd; |
1539 | if (affine_sd && (!tmp || | 1507 | if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) |
1540 | cpumask_weight(sched_domain_span(affine_sd)) > | ||
1541 | cpumask_weight(sched_domain_span(sd)))) | ||
1542 | tmp = affine_sd; | 1508 | tmp = affine_sd; |
1543 | 1509 | ||
1544 | if (tmp) | 1510 | if (tmp) { |
1511 | raw_spin_unlock(&rq->lock); | ||
1545 | update_shares(tmp); | 1512 | update_shares(tmp); |
1513 | raw_spin_lock(&rq->lock); | ||
1514 | } | ||
1546 | } | 1515 | } |
1516 | #endif | ||
1547 | 1517 | ||
1548 | if (affine_sd && wake_affine(affine_sd, p, sync)) | 1518 | if (affine_sd) { |
1549 | return cpu; | 1519 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1520 | return select_idle_sibling(p, cpu); | ||
1521 | else | ||
1522 | return select_idle_sibling(p, prev_cpu); | ||
1523 | } | ||
1550 | 1524 | ||
1551 | while (sd) { | 1525 | while (sd) { |
1552 | int load_idx = sd->forkexec_idx; | 1526 | int load_idx = sd->forkexec_idx; |
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1576 | 1550 | ||
1577 | /* Now try balancing at a lower domain level of new_cpu */ | 1551 | /* Now try balancing at a lower domain level of new_cpu */ |
1578 | cpu = new_cpu; | 1552 | cpu = new_cpu; |
1579 | weight = cpumask_weight(sched_domain_span(sd)); | 1553 | weight = sd->span_weight; |
1580 | sd = NULL; | 1554 | sd = NULL; |
1581 | for_each_domain(cpu, tmp) { | 1555 | for_each_domain(cpu, tmp) { |
1582 | if (weight <= cpumask_weight(sched_domain_span(tmp))) | 1556 | if (weight <= tmp->span_weight) |
1583 | break; | 1557 | break; |
1584 | if (tmp->flags & sd_flag) | 1558 | if (tmp->flags & sd_flag) |
1585 | sd = tmp; | 1559 | sd = tmp; |
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag | |||
1591 | } | 1565 | } |
1592 | #endif /* CONFIG_SMP */ | 1566 | #endif /* CONFIG_SMP */ |
1593 | 1567 | ||
1594 | /* | ||
1595 | * Adaptive granularity | ||
1596 | * | ||
1597 | * se->avg_wakeup gives the average time a task runs until it does a wakeup, | ||
1598 | * with the limit of wakeup_gran -- when it never does a wakeup. | ||
1599 | * | ||
1600 | * So the smaller avg_wakeup is the faster we want this task to preempt, | ||
1601 | * but we don't want to treat the preemptee unfairly and therefore allow it | ||
1602 | * to run for at least the amount of time we'd like to run. | ||
1603 | * | ||
1604 | * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one | ||
1605 | * | ||
1606 | * NOTE: we use *nr_running to scale with load, this nicely matches the | ||
1607 | * degrading latency on load. | ||
1608 | */ | ||
1609 | static unsigned long | ||
1610 | adaptive_gran(struct sched_entity *curr, struct sched_entity *se) | ||
1611 | { | ||
1612 | u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | ||
1613 | u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; | ||
1614 | u64 gran = 0; | ||
1615 | |||
1616 | if (this_run < expected_wakeup) | ||
1617 | gran = expected_wakeup - this_run; | ||
1618 | |||
1619 | return min_t(s64, gran, sysctl_sched_wakeup_granularity); | ||
1620 | } | ||
1621 | |||
1622 | static unsigned long | 1568 | static unsigned long |
1623 | wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | 1569 | wakeup_gran(struct sched_entity *curr, struct sched_entity *se) |
1624 | { | 1570 | { |
1625 | unsigned long gran = sysctl_sched_wakeup_granularity; | 1571 | unsigned long gran = sysctl_sched_wakeup_granularity; |
1626 | 1572 | ||
1627 | if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) | ||
1628 | gran = adaptive_gran(curr, se); | ||
1629 | |||
1630 | /* | 1573 | /* |
1631 | * Since its curr running now, convert the gran from real-time | 1574 | * Since its curr running now, convert the gran from real-time |
1632 | * to virtual-time in his units. | 1575 | * to virtual-time in his units. |
1576 | * | ||
1577 | * By using 'se' instead of 'curr' we penalize light tasks, so | ||
1578 | * they get preempted easier. That is, if 'se' < 'curr' then | ||
1579 | * the resulting gran will be larger, therefore penalizing the | ||
1580 | * lighter, if otoh 'se' > 'curr' then the resulting gran will | ||
1581 | * be smaller, again penalizing the lighter task. | ||
1582 | * | ||
1583 | * This is especially important for buddies when the leftmost | ||
1584 | * task is higher priority than the buddy. | ||
1633 | */ | 1585 | */ |
1634 | if (sched_feat(ASYM_GRAN)) { | 1586 | if (unlikely(se->load.weight != NICE_0_LOAD)) |
1635 | /* | 1587 | gran = calc_delta_fair(gran, se); |
1636 | * By using 'se' instead of 'curr' we penalize light tasks, so | ||
1637 | * they get preempted easier. That is, if 'se' < 'curr' then | ||
1638 | * the resulting gran will be larger, therefore penalizing the | ||
1639 | * lighter, if otoh 'se' > 'curr' then the resulting gran will | ||
1640 | * be smaller, again penalizing the lighter task. | ||
1641 | * | ||
1642 | * This is especially important for buddies when the leftmost | ||
1643 | * task is higher priority than the buddy. | ||
1644 | */ | ||
1645 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
1646 | gran = calc_delta_fair(gran, se); | ||
1647 | } else { | ||
1648 | if (unlikely(curr->load.weight != NICE_0_LOAD)) | ||
1649 | gran = calc_delta_fair(gran, curr); | ||
1650 | } | ||
1651 | 1588 | ||
1652 | return gran; | 1589 | return gran; |
1653 | } | 1590 | } |
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1705 | struct task_struct *curr = rq->curr; | 1642 | struct task_struct *curr = rq->curr; |
1706 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1643 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1707 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1644 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1708 | int sync = wake_flags & WF_SYNC; | ||
1709 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1645 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
1710 | 1646 | ||
1711 | if (unlikely(rt_prio(p->prio))) | 1647 | if (unlikely(rt_prio(p->prio))) |
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1738 | if (unlikely(curr->policy == SCHED_IDLE)) | 1674 | if (unlikely(curr->policy == SCHED_IDLE)) |
1739 | goto preempt; | 1675 | goto preempt; |
1740 | 1676 | ||
1741 | if (sched_feat(WAKEUP_SYNC) && sync) | ||
1742 | goto preempt; | ||
1743 | |||
1744 | if (sched_feat(WAKEUP_OVERLAP) && | ||
1745 | se->avg_overlap < sysctl_sched_migration_cost && | ||
1746 | pse->avg_overlap < sysctl_sched_migration_cost) | ||
1747 | goto preempt; | ||
1748 | |||
1749 | if (!sched_feat(WAKEUP_PREEMPT)) | 1677 | if (!sched_feat(WAKEUP_PREEMPT)) |
1750 | return; | 1678 | return; |
1751 | 1679 | ||
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
1844 | * 3) are cache-hot on their current CPU. | 1772 | * 3) are cache-hot on their current CPU. |
1845 | */ | 1773 | */ |
1846 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | 1774 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { |
1847 | schedstat_inc(p, se.nr_failed_migrations_affine); | 1775 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
1848 | return 0; | 1776 | return 0; |
1849 | } | 1777 | } |
1850 | *all_pinned = 0; | 1778 | *all_pinned = 0; |
1851 | 1779 | ||
1852 | if (task_running(rq, p)) { | 1780 | if (task_running(rq, p)) { |
1853 | schedstat_inc(p, se.nr_failed_migrations_running); | 1781 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); |
1854 | return 0; | 1782 | return 0; |
1855 | } | 1783 | } |
1856 | 1784 | ||
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
1866 | #ifdef CONFIG_SCHEDSTATS | 1794 | #ifdef CONFIG_SCHEDSTATS |
1867 | if (tsk_cache_hot) { | 1795 | if (tsk_cache_hot) { |
1868 | schedstat_inc(sd, lb_hot_gained[idle]); | 1796 | schedstat_inc(sd, lb_hot_gained[idle]); |
1869 | schedstat_inc(p, se.nr_forced_migrations); | 1797 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
1870 | } | 1798 | } |
1871 | #endif | 1799 | #endif |
1872 | return 1; | 1800 | return 1; |
1873 | } | 1801 | } |
1874 | 1802 | ||
1875 | if (tsk_cache_hot) { | 1803 | if (tsk_cache_hot) { |
1876 | schedstat_inc(p, se.nr_failed_migrations_hot); | 1804 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); |
1877 | return 0; | 1805 | return 0; |
1878 | } | 1806 | } |
1879 | return 1; | 1807 | return 1; |
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | |||
2311 | 2239 | ||
2312 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | 2240 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) |
2313 | { | 2241 | { |
2314 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | 2242 | unsigned long weight = sd->span_weight; |
2315 | unsigned long smt_gain = sd->smt_gain; | 2243 | unsigned long smt_gain = sd->smt_gain; |
2316 | 2244 | ||
2317 | smt_gain /= weight; | 2245 | smt_gain /= weight; |
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu) | |||
2344 | 2272 | ||
2345 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 2273 | static void update_cpu_power(struct sched_domain *sd, int cpu) |
2346 | { | 2274 | { |
2347 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | 2275 | unsigned long weight = sd->span_weight; |
2348 | unsigned long power = SCHED_LOAD_SCALE; | 2276 | unsigned long power = SCHED_LOAD_SCALE; |
2349 | struct sched_group *sdg = sd->groups; | 2277 | struct sched_group *sdg = sd->groups; |
2350 | 2278 | ||
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | |||
2870 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 2798 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
2871 | } | 2799 | } |
2872 | 2800 | ||
2801 | static int active_load_balance_cpu_stop(void *data); | ||
2802 | |||
2873 | /* | 2803 | /* |
2874 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2804 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2875 | * tasks if there is an imbalance. | 2805 | * tasks if there is an imbalance. |
@@ -2959,8 +2889,9 @@ redo: | |||
2959 | if (need_active_balance(sd, sd_idle, idle)) { | 2889 | if (need_active_balance(sd, sd_idle, idle)) { |
2960 | raw_spin_lock_irqsave(&busiest->lock, flags); | 2890 | raw_spin_lock_irqsave(&busiest->lock, flags); |
2961 | 2891 | ||
2962 | /* don't kick the migration_thread, if the curr | 2892 | /* don't kick the active_load_balance_cpu_stop, |
2963 | * task on busiest cpu can't be moved to this_cpu | 2893 | * if the curr task on busiest cpu can't be |
2894 | * moved to this_cpu | ||
2964 | */ | 2895 | */ |
2965 | if (!cpumask_test_cpu(this_cpu, | 2896 | if (!cpumask_test_cpu(this_cpu, |
2966 | &busiest->curr->cpus_allowed)) { | 2897 | &busiest->curr->cpus_allowed)) { |
@@ -2970,14 +2901,22 @@ redo: | |||
2970 | goto out_one_pinned; | 2901 | goto out_one_pinned; |
2971 | } | 2902 | } |
2972 | 2903 | ||
2904 | /* | ||
2905 | * ->active_balance synchronizes accesses to | ||
2906 | * ->active_balance_work. Once set, it's cleared | ||
2907 | * only after active load balance is finished. | ||
2908 | */ | ||
2973 | if (!busiest->active_balance) { | 2909 | if (!busiest->active_balance) { |
2974 | busiest->active_balance = 1; | 2910 | busiest->active_balance = 1; |
2975 | busiest->push_cpu = this_cpu; | 2911 | busiest->push_cpu = this_cpu; |
2976 | active_balance = 1; | 2912 | active_balance = 1; |
2977 | } | 2913 | } |
2978 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 2914 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
2915 | |||
2979 | if (active_balance) | 2916 | if (active_balance) |
2980 | wake_up_process(busiest->migration_thread); | 2917 | stop_one_cpu_nowait(cpu_of(busiest), |
2918 | active_load_balance_cpu_stop, busiest, | ||
2919 | &busiest->active_balance_work); | ||
2981 | 2920 | ||
2982 | /* | 2921 | /* |
2983 | * We've kicked active balancing, reset the failure | 2922 | * We've kicked active balancing, reset the failure |
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3084 | } | 3023 | } |
3085 | 3024 | ||
3086 | /* | 3025 | /* |
3087 | * active_load_balance is run by migration threads. It pushes running tasks | 3026 | * active_load_balance_cpu_stop is run by cpu stopper. It pushes |
3088 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | 3027 | * running tasks off the busiest CPU onto idle CPUs. It requires at |
3089 | * running on each physical CPU where possible, and avoids physical / | 3028 | * least 1 task to be running on each physical CPU where possible, and |
3090 | * logical imbalances. | 3029 | * avoids physical / logical imbalances. |
3091 | * | ||
3092 | * Called with busiest_rq locked. | ||
3093 | */ | 3030 | */ |
3094 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | 3031 | static int active_load_balance_cpu_stop(void *data) |
3095 | { | 3032 | { |
3033 | struct rq *busiest_rq = data; | ||
3034 | int busiest_cpu = cpu_of(busiest_rq); | ||
3096 | int target_cpu = busiest_rq->push_cpu; | 3035 | int target_cpu = busiest_rq->push_cpu; |
3036 | struct rq *target_rq = cpu_rq(target_cpu); | ||
3097 | struct sched_domain *sd; | 3037 | struct sched_domain *sd; |
3098 | struct rq *target_rq; | 3038 | |
3039 | raw_spin_lock_irq(&busiest_rq->lock); | ||
3040 | |||
3041 | /* make sure the requested cpu hasn't gone down in the meantime */ | ||
3042 | if (unlikely(busiest_cpu != smp_processor_id() || | ||
3043 | !busiest_rq->active_balance)) | ||
3044 | goto out_unlock; | ||
3099 | 3045 | ||
3100 | /* Is there any task to move? */ | 3046 | /* Is there any task to move? */ |
3101 | if (busiest_rq->nr_running <= 1) | 3047 | if (busiest_rq->nr_running <= 1) |
3102 | return; | 3048 | goto out_unlock; |
3103 | |||
3104 | target_rq = cpu_rq(target_cpu); | ||
3105 | 3049 | ||
3106 | /* | 3050 | /* |
3107 | * This condition is "impossible", if it occurs | 3051 | * This condition is "impossible", if it occurs |
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3112 | 3056 | ||
3113 | /* move a task from busiest_rq to target_rq */ | 3057 | /* move a task from busiest_rq to target_rq */ |
3114 | double_lock_balance(busiest_rq, target_rq); | 3058 | double_lock_balance(busiest_rq, target_rq); |
3115 | update_rq_clock(busiest_rq); | ||
3116 | update_rq_clock(target_rq); | ||
3117 | 3059 | ||
3118 | /* Search for an sd spanning us and the target CPU. */ | 3060 | /* Search for an sd spanning us and the target CPU. */ |
3119 | for_each_domain(target_cpu, sd) { | 3061 | for_each_domain(target_cpu, sd) { |
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3132 | schedstat_inc(sd, alb_failed); | 3074 | schedstat_inc(sd, alb_failed); |
3133 | } | 3075 | } |
3134 | double_unlock_balance(busiest_rq, target_rq); | 3076 | double_unlock_balance(busiest_rq, target_rq); |
3077 | out_unlock: | ||
3078 | busiest_rq->active_balance = 0; | ||
3079 | raw_spin_unlock_irq(&busiest_rq->lock); | ||
3080 | return 0; | ||
3135 | } | 3081 | } |
3136 | 3082 | ||
3137 | #ifdef CONFIG_NO_HZ | 3083 | #ifdef CONFIG_NO_HZ |