Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 430
1 file changed, 250 insertions, 180 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..6fa833ab2cb8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
 
 #include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 /*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
-/*
  * SCHED_OTHER wake-up granularity.
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
@@ -419,7 +412,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = cfs_rq->rb_leftmost;
 
@@ -429,6 +422,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 	return rb_entry(left, struct sched_entity, run_node);
 }
 
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+	struct rb_node *next = rb_next(&se->run_node);
+
+	if (!next)
+		return NULL;
+
+	return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
 static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +447,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
-#ifdef CONFIG_SCHED_DEBUG
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -540,7 +543,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
  * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +736,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	    now - cfs_rq->load_last > 4 * period) {
 		cfs_rq->load_period = 0;
 		cfs_rq->load_avg = 0;
+		delta = period - 1;
 	}
 
 	cfs_rq->load_stamp = now;
@@ -763,16 +767,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	long load_weight, load, shares;
 
-	load = cfs_rq->load.weight + weight_delta;
+	load = cfs_rq->load.weight;
 
 	load_weight = atomic_read(&tg->load_weight);
-	load_weight -= cfs_rq->load_contribution;
 	load_weight += load;
+	load_weight -= cfs_rq->load_contribution;
 
 	shares = (tg->shares * load);
 	if (load_weight)
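For illustration only, not part of the patch above: a minimal, self-contained user-space sketch of the arithmetic the reworked calc_cfs_shares() performs (the MIN/MAX clamping the kernel applies is ignored); all names and numbers below are made up.

	#include <stdio.h>

	/* Hypothetical stand-in value; the kernel reads this from tg->shares. */
	#define TG_SHARES	1024L

	/*
	 * Mirrors the reworked calc_cfs_shares(): the group entity's weight is
	 * tg->shares scaled by this cpu's portion of the group's total load.
	 */
	static long calc_shares(long cfs_rq_load, long tg_load_weight,
				long load_contribution)
	{
		long load_weight = tg_load_weight;

		load_weight += cfs_rq_load;		/* same order as the new code */
		load_weight -= load_contribution;	/* drop this cpu's stale contribution */

		long shares = TG_SHARES * cfs_rq_load;
		if (load_weight)
			shares /= load_weight;

		return shares;
	}

	int main(void)
	{
		/* a cpu holding 2048 of an 8192-weight group -> 256 of 1024 shares */
		printf("%ld\n", calc_shares(2048, 8192, 2048));
		return 0;
	}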
@@ -790,7 +793,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 }
 # else /* CONFIG_SMP */
@@ -798,8 +801,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
@@ -824,7 +826,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		account_entity_enqueue(cfs_rq, se);
 }
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
@@ -838,7 +840,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	if (likely(se->load.weight == tg->shares))
 		return;
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+	shares = calc_cfs_shares(cfs_rq, tg);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -847,7 +849,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
 
@@ -978,8 +980,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -996,19 +998,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		list_add_leaf_cfs_rq(cfs_rq);
 }
 
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
 {
-	if (!se || cfs_rq->last == se)
-		cfs_rq->last = NULL;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->last == se)
+			cfs_rq->last = NULL;
+		else
+			break;
+	}
+}
 
-	if (!se || cfs_rq->next == se)
-		cfs_rq->next = NULL;
+static void __clear_buddies_next(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->next == se)
+			cfs_rq->next = NULL;
+		else
+			break;
+	}
+}
+
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->skip == se)
+			cfs_rq->skip = NULL;
+		else
+			break;
+	}
 }
 
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	for_each_sched_entity(se)
-		__clear_buddies(cfs_rq_of(se), se);
+	if (cfs_rq->last == se)
+		__clear_buddies_last(se);
+
+	if (cfs_rq->next == se)
+		__clear_buddies_next(se);
+
+	if (cfs_rq->skip == se)
+		__clear_buddies_skip(se);
 }
 
 static void
@@ -1041,7 +1073,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1116,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (cfs_rq->nr_running > 1) {
-		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		struct sched_entity *se = __pick_first_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
 		if (delta < 0)
@@ -1128,13 +1160,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
-		se = cfs_rq->next;
+	/*
+	 * Avoid running the skip buddy, if running something else can
+	 * be done without getting too unfair.
+	 */
+	if (cfs_rq->skip == se) {
+		struct sched_entity *second = __pick_next_entity(se);
+		if (second && wakeup_preempt_entity(second, left) < 1)
+			se = second;
+	}
 
 	/*
 	 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1188,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 		se = cfs_rq->last;
 
+	/*
+	 * Someone really wants this to run. If it's not unfair, run it.
+	 */
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
+
 	clear_buddies(cfs_rq, se);
 
 	return se;
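Illustration only, not part of the patch: a self-contained sketch of the selection order the reworked pick_next_entity() implements -- start from the leftmost (fairest) entity, step off it if it is the skip buddy, then prefer the last and finally the next buddy when that is not too unfair. The vruntime_gap() helper is a simplified stand-in for wakeup_preempt_entity() with a zero granularity.

	#include <stdio.h>

	struct entity { long vruntime; };

	/*
	 * Toy stand-in for wakeup_preempt_entity(buddy, left): the result is < 1
	 * when the buddy is not ahead of the leftmost entity by more than the
	 * wakeup granularity (taken as 0 here), i.e. running it is fair enough.
	 */
	static long vruntime_gap(const struct entity *buddy, const struct entity *left)
	{
		return buddy->vruntime - left->vruntime;
	}

	static const struct entity *
	pick(const struct entity *leftmost, const struct entity *second,
	     const struct entity *last, const struct entity *next,
	     const struct entity *skip)
	{
		const struct entity *se = leftmost;
		const struct entity *left = leftmost;

		/* 4) avoid the skip buddy if the runner-up is close enough */
		if (skip == se && second && vruntime_gap(second, left) < 1)
			se = second;

		/* 3) prefer the last buddy, for cache locality */
		if (last && vruntime_gap(last, left) < 1)
			se = last;

		/* 2) the next buddy wins over everything else, if it is not unfair */
		if (next && vruntime_gap(next, left) < 1)
			se = next;

		return se;
	}

	int main(void)
	{
		struct entity a = { .vruntime = 100 };	/* leftmost, but marked "skip" */
		struct entity b = { .vruntime = 100 };	/* runner-up in the rbtree */

		printf("picked %s\n", pick(&a, &b, 0, 0, &a) == &b ? "runner-up" : "leftmost");
		return 0;
	}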
@@ -1282,7 +1334,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -1312,58 +1364,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
 }
 
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
-	struct task_struct *curr = rq->curr;
-	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	struct sched_entity *rightmost, *se = &curr->se;
-
-	/*
-	 * Are we the only task in the tree?
-	 */
-	if (unlikely(cfs_rq->nr_running == 1))
-		return;
-
-	clear_buddies(cfs_rq, se);
-
-	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-		update_rq_clock(rq);
-		/*
-		 * Update run-time statistics of the 'current'.
-		 */
-		update_curr(cfs_rq);
-
-		return;
-	}
-	/*
-	 * Find the rightmost entry in the rbtree:
-	 */
-	rightmost = __pick_last_entity(cfs_rq);
-	/*
-	 * Already in the rightmost position?
-	 */
-	if (unlikely(!rightmost || entity_before(rightmost, se)))
-		return;
-
-	/*
-	 * Minimally necessary key value to be last in the tree:
-	 * Upon rescheduling, sched_class::put_prev_task() will place
-	 * 'current' within the tree based on its new key value.
-	 */
-	se->vruntime = rightmost->vruntime + 1;
-}
-
 #ifdef CONFIG_SMP
 
 static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1840,14 @@ static void set_next_buddy(struct sched_entity *se)
 	}
 }
 
+static void set_skip_buddy(struct sched_entity *se)
+{
+	if (likely(task_of(se)->policy != SCHED_IDLE)) {
+		for_each_sched_entity(se)
+			cfs_rq_of(se)->skip = se;
+	}
+}
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1857,16 +1871,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (test_tsk_need_resched(curr))
 		return;
 
+	/* Idle tasks are by definition preempted by non-idle tasks. */
+	if (unlikely(curr->policy == SCHED_IDLE) &&
+	    likely(p->policy != SCHED_IDLE))
+		goto preempt;
+
 	/*
-	 * Batch and idle tasks do not preempt (their preemption is driven by
-	 * the tick):
+	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
+	 * is driven by the tick):
 	 */
 	if (unlikely(p->policy != SCHED_NORMAL))
 		return;
 
-	/* Idle tasks are by definition preempted by everybody. */
-	if (unlikely(curr->policy == SCHED_IDLE))
-		goto preempt;
 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
@@ -1932,6 +1948,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	struct sched_entity *se = &curr->se;
+
+	/*
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(rq->nr_running == 1))
+		return;
+
+	clear_buddies(cfs_rq, se);
+
+	if (curr->policy != SCHED_BATCH) {
+		update_rq_clock(rq);
+		/*
+		 * Update run-time statistics of the 'current'.
+		 */
+		update_curr(cfs_rq);
+	}
+
+	set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+	struct sched_entity *se = &p->se;
+
+	if (!se->on_rq)
+		return false;
+
+	/* Tell the scheduler that we'd really like pse to run next. */
+	set_next_buddy(se);
+
+	yield_task_fair(rq);
+
+	return true;
+}
+
 #ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:
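Illustration only, not part of the patch: a stripped-down user-space model of how the two entry points above combine -- yield_to marks the target as the "next" buddy and then yields, which marks the caller as the "skip" buddy; the pick logic shown earlier does the rest. The toy_rq structure and names are assumptions for this sketch only.

	#include <stdbool.h>
	#include <stdio.h>

	struct toy_rq {
		int nr_running;
		const char *next_buddy;	/* task we would really like to run next */
		const char *skip_buddy;	/* task we would rather not run */
	};

	static void toy_yield(struct toy_rq *rq, const char *curr)
	{
		if (rq->nr_running == 1)
			return;			/* nothing else to run anyway */
		rq->skip_buddy = curr;		/* mirrors set_skip_buddy() */
	}

	static bool toy_yield_to(struct toy_rq *rq, const char *curr, const char *target)
	{
		rq->next_buddy = target;	/* mirrors set_next_buddy() */
		toy_yield(rq, curr);
		return true;
	}

	int main(void)
	{
		struct toy_rq rq = { .nr_running = 2 };

		toy_yield_to(&rq, "vcpu0", "vcpu1");
		printf("skip=%s next=%s\n", rq.skip_buddy, rq.next_buddy);
		return 0;
	}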
@@ -2043,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
 {
-	int loops = 0, pulled = 0, pinned = 0;
+	int loops = 0, pulled = 0;
 	long rem_load_move = max_load_move;
 	struct task_struct *p, *n;
 
 	if (max_load_move == 0)
 		goto out;
 
-	pinned = 1;
-
 	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate)
 			break;
 
 		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
+				      all_pinned))
 			continue;
 
 		pull_task(busiest, p, this_rq, this_cpu);
@@ -2092,9 +2152,6 @@ out:
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
 
-	if (all_pinned)
-		*all_pinned = pinned;
-
 	return max_load_move - rem_load_move;
 }
 
@@ -2123,7 +2180,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 	 * We need to update shares after updating tg->load_weight in
 	 * order to adjust the weight of groups with long running tasks.
 	 */
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -2610,7 +2667,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
  * @local_group: Does group contain this_cpu.
  * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
@@ -2618,7 +2674,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  */
 static inline void update_sg_lb_stats(struct sched_domain *sd,
 			struct sched_group *group, int this_cpu,
-			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			enum cpu_idle_type idle, int load_idx,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
@@ -2638,9 +2694,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if (*sd_idle && rq->nr_running)
-			*sd_idle = 0;
-
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
 			if (idle_cpu(i) && !first_idle_cpu) {
@@ -2685,7 +2738,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of two tasks.
+	 * than the average weight of a task.
 	 *
 	 * APZ: with cgroup the avg task weight can vary wildly and
 	 * might not be a suitable number - should we keep a
@@ -2695,7 +2748,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
+	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -2755,15 +2808,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 * @sd: sched_domain whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing sg.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-			enum cpu_idle_type idle, int *sd_idle,
-			const struct cpumask *cpus, int *balance,
-			struct sd_lb_stats *sds)
+			enum cpu_idle_type idle, const struct cpumask *cpus,
+			int *balance, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg = sd->groups;
@@ -2781,7 +2832,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
 				   local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
@@ -3007,7 +3058,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
-	 * there is no gaurantee that any tasks will be moved so we'll have
+	 * there is no guarantee that any tasks will be moved so we'll have
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
@@ -3033,7 +3084,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 * @imbalance: Variable which stores amount of weighted load which should
 *		be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3096,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+		   const struct cpumask *cpus, int *balance)
 {
 	struct sd_lb_stats sds;
 
@@ -3056,22 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
-					balance, &sds);
-
-	/* Cases where imbalance does not exist from POV of this_cpu */
-	/* 1) this_cpu is not the appropriate cpu to perform load balancing
-	 *    at this level.
-	 * 2) There is no busy sibling group to pull from.
-	 * 3) This group is the busiest group.
-	 * 4) This group is more busy than the avg busieness at this
-	 *    sched_domain.
-	 * 5) The imbalance is within the specified limit.
-	 *
-	 * Note: when doing newidle balance, if the local group has excess
-	 * capacity (i.e. nr_running < group_capacity) and the busiest group
-	 * does not have any capacity, we force a load balance to pull tasks
-	 * to the local group. In this case, we skip past checks 3, 4 and 5.
+	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+
+	/*
+	 * this_cpu is not the appropriate cpu to perform load balancing at
+	 * this level.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -3080,41 +3119,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	    check_asym_packing(sd, &sds, this_cpu, imbalance))
 		return sds.busiest;
 
+	/* There is no busy sibling group to pull tasks from */
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
-	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+	/*
+	 * If the busiest group is imbalanced the below checks don't
+	 * work because they assumes all things are equal, which typically
+	 * isn't true due to cpus_allowed constraints and the like.
+	 */
+	if (sds.group_imb)
+		goto force_balance;
+
+	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
 	    !sds.busiest_has_capacity)
 		goto force_balance;
 
+	/*
+	 * If the local group is more busy than the selected busiest group
+	 * don't try and pull any tasks.
+	 */
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
-	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
+	/*
+	 * Don't pull any tasks if this group is already above the domain
+	 * average load.
+	 */
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-	/*
-	 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
-	 * And to check for busy balance use !idle_cpu instead of
-	 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
-	 * even when they are idle.
-	 */
-	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
-		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-			goto out_balanced;
-	} else {
+	if (idle == CPU_IDLE) {
 		/*
 		 * This cpu is idle. If the busiest group load doesn't
 		 * have more tasks than the number of available cpu's and
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
 		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
 		    sds.busiest_nr_running <= sds.busiest_group_weight)
			goto out_balanced;
+	} else {
+		/*
+		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+		 * imbalance_pct to be conservative.
+		 */
+		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+			goto out_balanced;
 	}
 
 force_balance:
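Illustration only, not part of the patch: the imbalance_pct test kept for the non-idle cases is plain integer arithmetic. Assuming the common default of imbalance_pct = 125, balancing is skipped unless the busiest group carries at least 25% more load than the local group:

	#include <stdbool.h>
	#include <stdio.h>

	/* Returns true when the load gap is too small to bother balancing. */
	static bool balanced_enough(unsigned long max_load, unsigned long this_load,
				    unsigned int imbalance_pct)
	{
		return 100 * max_load <= imbalance_pct * this_load;
	}

	int main(void)
	{
		/* 1200 vs 1000 with imbalance_pct=125: 120000 <= 125000 -> balanced */
		printf("%d\n", balanced_enough(1200, 1000, 125));
		/* 1300 vs 1000: 130000 <= 125000 fails -> go balance */
		printf("%d\n", balanced_enough(1300, 1000, 125));
		return 0;
	}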
@@ -3193,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+static int need_active_balance(struct sched_domain *sd, int idle,
 			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
 		 * move_tasks() will succeed.  ld_moved will be true and this
 		 * active balance code will not be triggered.
 		 */
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-			return 0;
-
 		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
 			return 0;
 	}
@@ -3246,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+	int ld_moved, all_pinned = 0, active_balance = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
@@ -3255,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpumask_copy(cpus, cpu_active_mask);
 
-	/*
-	 * When power savings policy is enabled for the parent domain, idle
-	 * sibling can pick up load irrespective of busy siblings. In this case,
-	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
-	 * portraying it as CPU_NOT_IDLE.
-	 */
-	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		sd_idle = 1;
-
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
 				   cpus, balance);
 
 	if (*balance == 0)
@@ -3297,6 +3337,7 @@ redo:
 	 * still unbalanced. ld_moved simply stays zero, so it is
 	 * correctly treated as an imbalance.
 	 */
+	all_pinned = 1;
 	local_irq_save(flags);
 	double_rq_lock(this_rq, busiest);
 	ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3330,8 +3371,7 @@ redo:
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
-					this_cpu)) {
+		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3426,6 @@ redo:
 			sd->balance_interval *= 2;
 	}
 
-	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
 	goto out;
 
 out_balanced:
@@ -3403,11 +3439,7 @@ out_one_pinned:
 	    (sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
+	ld_moved = 0;
 out:
 	return ld_moved;
 }
@@ -3786,6 +3818,17 @@ void select_nohz_load_balancer(int stop_tick)
 
 static DEFINE_SPINLOCK(balancing);
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+	max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
 /*
  * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
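Illustration only, not part of the patch: the new interval ceiling is simple arithmetic. Assuming HZ = 1000 and 8 online CPUs, max_load_balance_interval becomes 800 jiffies, and rebalance_domains() (next hunk) clamps each domain's interval into [1, 800]:

	#include <stdio.h>

	#define HZ 1000UL	/* assumed CONFIG_HZ for this example */

	static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
	{
		return v < lo ? lo : (v > hi ? hi : v);
	}

	int main(void)
	{
		unsigned long max_interval = HZ * 8 /* online cpus */ / 10;	/* 800 */

		printf("%lu\n", clamp_ul(0,     1, max_interval));	/* -> 1   */
		printf("%lu\n", clamp_ul(64,    1, max_interval));	/* -> 64  */
		printf("%lu\n", clamp_ul(12800, 1, max_interval));	/* -> 800 */
		return 0;
	}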
@@ -3815,10 +3858,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
 		/* scale ms to jiffies */
 		interval = msecs_to_jiffies(interval);
-		if (unlikely(!interval))
-			interval = 1;
-		if (interval > HZ*NR_CPUS/10)
-			interval = HZ*NR_CPUS/10;
+		interval = clamp(interval, 1UL, max_load_balance_interval);
 
 		need_serialize = sd->flags & SD_SERIALIZE;
 
@@ -3831,8 +3871,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (load_balance(cpu, rq, sd, idle, &balance)) {
 			/*
 			 * We've pulled tasks over so either we're no
-			 * longer idle, or one of our SMT siblings is
-			 * not idle.
+			 * longer idle.
 			 */
 			idle = CPU_NOT_IDLE;
 		}
@@ -4079,33 +4118,62 @@ static void task_fork_fair(struct task_struct *p)
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * Reschedule if we are currently running on this runqueue and
 	 * our priority decreased, or if we are not currently running on
 	 * this runqueue and our priority is higher than the current's
 	 */
-	if (running) {
+	if (rq->curr == p) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
 
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Ensure the task's vruntime is normalized, so that when its
+	 * switched back to the fair class the enqueue_entity(.flags=0) will
+	 * do the right thing.
+	 *
+	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * the task is sleeping will it still have non-normalized vruntime.
+	 */
+	if (!se->on_rq && p->state != TASK_RUNNING) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+}
+
 /*
 * We switched to the sched_fair class.
 */
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * We were most likely switched from sched_rt, so
 	 * kick off the schedule if running, otherwise just see
 	 * if we can still preempt the current task.
 	 */
-	if (running)
+	if (rq->curr == p)
 		resched_task(rq->curr);
 	else
 		check_preempt_curr(rq, p, 0);
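Illustration only, not part of the patch: the normalization done by switched_from_fair() is relative arithmetic -- a sleeping task's vruntime is stored relative to its old queue's min_vruntime so it can be re-based on whatever queue it is enqueued on later (the place_entity() fixup is ignored here, and the numbers are made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long min_vruntime = 1000000;	/* old cfs_rq->min_vruntime */
		unsigned long long se_vruntime  = 1002500;	/* task's absolute vruntime */

		/* switched_from_fair(): store it relative to the old queue */
		long long rel = (long long)(se_vruntime - min_vruntime);	/* 2500 */

		/* later, enqueue_entity(.flags=0) re-bases it on the new queue */
		unsigned long long new_min_vruntime = 5000000;
		unsigned long long new_vruntime = new_min_vruntime + rel;	/* 5002500 */

		printf("relative=%lld re-based=%llu\n", rel, new_vruntime);
		return 0;
	}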
@@ -4171,6 +4239,7 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
+	.yield_to_task		= yield_to_task_fair,
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
@@ -4191,6 +4260,7 @@ static const struct sched_class fair_sched_class = {
 	.task_fork		= task_fork_fair,
 
 	.prio_changed		= prio_changed_fair,
+	.switched_from		= switched_from_fair,
 	.switched_to		= switched_to_fair,
 
 	.get_rr_interval	= get_rr_interval_fair,