author     Andrea Bastoni <bastoni@cs.unc.edu>    2011-08-27 09:43:54 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>    2011-08-27 10:06:11 -0400
commit     7b1bb388bc879ffcc6c69b567816d5c354afe42b (patch)
tree       5a217fdfb0b5e5a327bdcd624506337c1ae1fe32 /kernel/sched_fair.c
parent     7d754596756240fa918b94cd0c3011c77a638987 (diff)
parent     02f8c6aee8df3cdc935e9bdd4f2d020306035dbe (diff)
Merge 'Linux v3.0' into Litmus
Some notes:
* Litmus^RT scheduling class is the topmost scheduling class
(above stop_sched_class).
* The scheduler_ipi() function (called, e.g., from smp_reschedule_interrupt())
  may increase IPI latencies.
* Added a path into schedule() to quickly re-evaluate the scheduling
  decision without becoming preemptible again. This used to be a
  standard path before the removal of the BKL.
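
For illustration only, here is a minimal, self-contained C sketch (not the
actual kernel or LITMUS^RT code) of the "quickly re-evaluate the scheduling
decision" pattern described in the last note: the decision is re-checked
inside the scheduler itself, before preemption would be re-enabled, so a
request that arrived during the switch loops straight back into the
pick-next path instead of taking another preemption round trip. All names
(fake_schedule, fake_pick_and_switch, need_resched_flag) are made up for
the example; in the kernel the flag corresponds to TIF_NEED_RESCHED and the
loop sits at the end of schedule().

#include <stdbool.h>
#include <stdio.h>

/* Stands in for the per-task TIF_NEED_RESCHED flag. */
static bool need_resched_flag;

/* Pretend to pick the next task and context-switch to it. */
static void fake_pick_and_switch(int pass)
{
	printf("schedule(): pick_next_task, pass %d\n", pass);
	/* Simulate a wakeup arriving while the first switch was in flight. */
	need_resched_flag = (pass == 0);
}

/*
 * The re-check loop: if the scheduling decision went stale while we were
 * switching, loop back into the pick-next path directly instead of
 * returning, getting preempted, and re-entering schedule() from scratch.
 */
static void fake_schedule(void)
{
	int pass = 0;

	do {
		fake_pick_and_switch(pass++);
	} while (need_resched_flag);
}

int main(void)
{
	need_resched_flag = true;
	fake_schedule();
	return 0;
}
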
Conflicts:
Makefile
arch/arm/kernel/calls.S
arch/arm/kernel/smp.c
arch/x86/include/asm/unistd_32.h
arch/x86/kernel/smp.c
arch/x86/kernel/syscall_table_32.S
include/linux/hrtimer.h
kernel/printk.c
kernel/sched.c
kernel/sched_fair.c
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 1084
1 files changed, 761 insertions, 323 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e0e8d5ca3c98..334eb474af93 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,10 +22,11 @@ | |||
22 | 22 | ||
23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/cpumask.h> | ||
25 | 26 | ||
26 | /* | 27 | /* |
27 | * Targeted preemption latency for CPU-bound tasks: | 28 | * Targeted preemption latency for CPU-bound tasks: |
28 | * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) | 29 | * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) |
29 | * | 30 | * |
30 | * NOTE: this latency value is not the same as the concept of | 31 | * NOTE: this latency value is not the same as the concept of |
31 | * 'timeslice length' - timeslices in CFS are of variable length | 32 | * 'timeslice length' - timeslices in CFS are of variable length |
@@ -52,7 +53,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling | |||
52 | 53 | ||
53 | /* | 54 | /* |
54 | * Minimal preemption granularity for CPU-bound tasks: | 55 | * Minimal preemption granularity for CPU-bound tasks: |
55 | * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) | 56 | * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) |
56 | */ | 57 | */ |
57 | unsigned int sysctl_sched_min_granularity = 750000ULL; | 58 | unsigned int sysctl_sched_min_granularity = 750000ULL; |
58 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; | 59 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; |
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8; | |||
69 | unsigned int sysctl_sched_child_runs_first __read_mostly; | 70 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
70 | 71 | ||
71 | /* | 72 | /* |
72 | * sys_sched_yield() compat mode | ||
73 | * | ||
74 | * This option switches the agressive yield implementation of the | ||
75 | * old scheduler back on. | ||
76 | */ | ||
77 | unsigned int __read_mostly sysctl_sched_compat_yield; | ||
78 | |||
79 | /* | ||
80 | * SCHED_OTHER wake-up granularity. | 73 | * SCHED_OTHER wake-up granularity. |
81 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | 74 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
82 | * | 75 | * |
@@ -89,6 +82,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; | |||
89 | 82 | ||
90 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 83 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
91 | 84 | ||
85 | /* | ||
86 | * The exponential sliding window over which load is averaged for shares | ||
87 | * distribution. | ||
88 | * (default: 10msec) | ||
89 | */ | ||
90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | ||
91 | |||
92 | static const struct sched_class fair_sched_class; | 92 | static const struct sched_class fair_sched_class; |
93 | 93 | ||
94 | /************************************************************** | 94 | /************************************************************** |
@@ -143,6 +143,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
143 | return cfs_rq->tg->cfs_rq[this_cpu]; | 143 | return cfs_rq->tg->cfs_rq[this_cpu]; |
144 | } | 144 | } |
145 | 145 | ||
146 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
147 | { | ||
148 | if (!cfs_rq->on_list) { | ||
149 | /* | ||
150 | * Ensure we either appear before our parent (if already | ||
151 | * enqueued) or force our parent to appear after us when it is | ||
152 | * enqueued. The fact that we always enqueue bottom-up | ||
153 | * reduces this to two cases. | ||
154 | */ | ||
155 | if (cfs_rq->tg->parent && | ||
156 | cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { | ||
157 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
158 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
159 | } else { | ||
160 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
161 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | ||
162 | } | ||
163 | |||
164 | cfs_rq->on_list = 1; | ||
165 | } | ||
166 | } | ||
167 | |||
168 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
169 | { | ||
170 | if (cfs_rq->on_list) { | ||
171 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
172 | cfs_rq->on_list = 0; | ||
173 | } | ||
174 | } | ||
175 | |||
146 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 176 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
147 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 177 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
148 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 178 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
@@ -246,6 +276,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
246 | return &cpu_rq(this_cpu)->cfs; | 276 | return &cpu_rq(this_cpu)->cfs; |
247 | } | 277 | } |
248 | 278 | ||
279 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
280 | { | ||
281 | } | ||
282 | |||
283 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | ||
284 | { | ||
285 | } | ||
286 | |||
249 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 287 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
250 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 288 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
251 | 289 | ||
@@ -320,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
320 | } | 358 | } |
321 | 359 | ||
322 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
361 | #ifndef CONFIG_64BIT | ||
362 | smp_wmb(); | ||
363 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
364 | #endif | ||
323 | } | 365 | } |
324 | 366 | ||
325 | /* | 367 | /* |
@@ -374,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
374 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 416 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
375 | } | 417 | } |
376 | 418 | ||
377 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | 419 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
378 | { | 420 | { |
379 | struct rb_node *left = cfs_rq->rb_leftmost; | 421 | struct rb_node *left = cfs_rq->rb_leftmost; |
380 | 422 | ||
@@ -384,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
384 | return rb_entry(left, struct sched_entity, run_node); | 426 | return rb_entry(left, struct sched_entity, run_node); |
385 | } | 427 | } |
386 | 428 | ||
429 | static struct sched_entity *__pick_next_entity(struct sched_entity *se) | ||
430 | { | ||
431 | struct rb_node *next = rb_next(&se->run_node); | ||
432 | |||
433 | if (!next) | ||
434 | return NULL; | ||
435 | |||
436 | return rb_entry(next, struct sched_entity, run_node); | ||
437 | } | ||
438 | |||
439 | #ifdef CONFIG_SCHED_DEBUG | ||
387 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 440 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
388 | { | 441 | { |
389 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 442 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
@@ -398,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
398 | * Scheduling class statistics methods: | 451 | * Scheduling class statistics methods: |
399 | */ | 452 | */ |
400 | 453 | ||
401 | #ifdef CONFIG_SCHED_DEBUG | ||
402 | int sched_proc_update_handler(struct ctl_table *table, int write, | 454 | int sched_proc_update_handler(struct ctl_table *table, int write, |
403 | void __user *buffer, size_t *lenp, | 455 | void __user *buffer, size_t *lenp, |
404 | loff_t *ppos) | 456 | loff_t *ppos) |
@@ -417,7 +469,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
417 | WRT_SYSCTL(sched_min_granularity); | 469 | WRT_SYSCTL(sched_min_granularity); |
418 | WRT_SYSCTL(sched_latency); | 470 | WRT_SYSCTL(sched_latency); |
419 | WRT_SYSCTL(sched_wakeup_granularity); | 471 | WRT_SYSCTL(sched_wakeup_granularity); |
420 | WRT_SYSCTL(sched_shares_ratelimit); | ||
421 | #undef WRT_SYSCTL | 472 | #undef WRT_SYSCTL |
422 | 473 | ||
423 | return 0; | 474 | return 0; |
@@ -495,6 +546,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
495 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 546 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
496 | } | 547 | } |
497 | 548 | ||
549 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | ||
550 | static void update_cfs_shares(struct cfs_rq *cfs_rq); | ||
551 | |||
498 | /* | 552 | /* |
499 | * Update the current task's runtime statistics. Skip current tasks that | 553 | * Update the current task's runtime statistics. Skip current tasks that |
500 | * are not in our scheduling class. | 554 | * are not in our scheduling class. |
@@ -514,12 +568,16 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
514 | 568 | ||
515 | curr->vruntime += delta_exec_weighted; | 569 | curr->vruntime += delta_exec_weighted; |
516 | update_min_vruntime(cfs_rq); | 570 | update_min_vruntime(cfs_rq); |
571 | |||
572 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
573 | cfs_rq->load_unacc_exec_time += delta_exec; | ||
574 | #endif | ||
517 | } | 575 | } |
518 | 576 | ||
519 | static void update_curr(struct cfs_rq *cfs_rq) | 577 | static void update_curr(struct cfs_rq *cfs_rq) |
520 | { | 578 | { |
521 | struct sched_entity *curr = cfs_rq->curr; | 579 | struct sched_entity *curr = cfs_rq->curr; |
522 | u64 now = rq_of(cfs_rq)->clock; | 580 | u64 now = rq_of(cfs_rq)->clock_task; |
523 | unsigned long delta_exec; | 581 | unsigned long delta_exec; |
524 | 582 | ||
525 | if (unlikely(!curr)) | 583 | if (unlikely(!curr)) |
@@ -602,7 +660,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
602 | /* | 660 | /* |
603 | * We are starting a new run period: | 661 | * We are starting a new run period: |
604 | */ | 662 | */ |
605 | se->exec_start = rq_of(cfs_rq)->clock; | 663 | se->exec_start = rq_of(cfs_rq)->clock_task; |
606 | } | 664 | } |
607 | 665 | ||
608 | /************************************************** | 666 | /************************************************** |
@@ -633,7 +691,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
633 | list_add(&se->group_node, &cfs_rq->tasks); | 691 | list_add(&se->group_node, &cfs_rq->tasks); |
634 | } | 692 | } |
635 | cfs_rq->nr_running++; | 693 | cfs_rq->nr_running++; |
636 | se->on_rq = 1; | ||
637 | } | 694 | } |
638 | 695 | ||
639 | static void | 696 | static void |
@@ -647,9 +704,164 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
647 | list_del_init(&se->group_node); | 704 | list_del_init(&se->group_node); |
648 | } | 705 | } |
649 | cfs_rq->nr_running--; | 706 | cfs_rq->nr_running--; |
650 | se->on_rq = 0; | ||
651 | } | 707 | } |
652 | 708 | ||
709 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
710 | # ifdef CONFIG_SMP | ||
711 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | ||
712 | int global_update) | ||
713 | { | ||
714 | struct task_group *tg = cfs_rq->tg; | ||
715 | long load_avg; | ||
716 | |||
717 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); | ||
718 | load_avg -= cfs_rq->load_contribution; | ||
719 | |||
720 | if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { | ||
721 | atomic_add(load_avg, &tg->load_weight); | ||
722 | cfs_rq->load_contribution += load_avg; | ||
723 | } | ||
724 | } | ||
725 | |||
726 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
727 | { | ||
728 | u64 period = sysctl_sched_shares_window; | ||
729 | u64 now, delta; | ||
730 | unsigned long load = cfs_rq->load.weight; | ||
731 | |||
732 | if (cfs_rq->tg == &root_task_group) | ||
733 | return; | ||
734 | |||
735 | now = rq_of(cfs_rq)->clock_task; | ||
736 | delta = now - cfs_rq->load_stamp; | ||
737 | |||
738 | /* truncate load history at 4 idle periods */ | ||
739 | if (cfs_rq->load_stamp > cfs_rq->load_last && | ||
740 | now - cfs_rq->load_last > 4 * period) { | ||
741 | cfs_rq->load_period = 0; | ||
742 | cfs_rq->load_avg = 0; | ||
743 | delta = period - 1; | ||
744 | } | ||
745 | |||
746 | cfs_rq->load_stamp = now; | ||
747 | cfs_rq->load_unacc_exec_time = 0; | ||
748 | cfs_rq->load_period += delta; | ||
749 | if (load) { | ||
750 | cfs_rq->load_last = now; | ||
751 | cfs_rq->load_avg += delta * load; | ||
752 | } | ||
753 | |||
754 | /* consider updating load contribution on each fold or truncate */ | ||
755 | if (global_update || cfs_rq->load_period > period | ||
756 | || !cfs_rq->load_period) | ||
757 | update_cfs_rq_load_contribution(cfs_rq, global_update); | ||
758 | |||
759 | while (cfs_rq->load_period > period) { | ||
760 | /* | ||
761 | * Inline assembly required to prevent the compiler | ||
762 | * optimising this loop into a divmod call. | ||
763 | * See __iter_div_u64_rem() for another example of this. | ||
764 | */ | ||
765 | asm("" : "+rm" (cfs_rq->load_period)); | ||
766 | cfs_rq->load_period /= 2; | ||
767 | cfs_rq->load_avg /= 2; | ||
768 | } | ||
769 | |||
770 | if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) | ||
771 | list_del_leaf_cfs_rq(cfs_rq); | ||
772 | } | ||
773 | |||
774 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
775 | { | ||
776 | long load_weight, load, shares; | ||
777 | |||
778 | load = cfs_rq->load.weight; | ||
779 | |||
780 | load_weight = atomic_read(&tg->load_weight); | ||
781 | load_weight += load; | ||
782 | load_weight -= cfs_rq->load_contribution; | ||
783 | |||
784 | shares = (tg->shares * load); | ||
785 | if (load_weight) | ||
786 | shares /= load_weight; | ||
787 | |||
788 | if (shares < MIN_SHARES) | ||
789 | shares = MIN_SHARES; | ||
790 | if (shares > tg->shares) | ||
791 | shares = tg->shares; | ||
792 | |||
793 | return shares; | ||
794 | } | ||
795 | |||
796 | static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
797 | { | ||
798 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | ||
799 | update_cfs_load(cfs_rq, 0); | ||
800 | update_cfs_shares(cfs_rq); | ||
801 | } | ||
802 | } | ||
803 | # else /* CONFIG_SMP */ | ||
804 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
805 | { | ||
806 | } | ||
807 | |||
808 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
809 | { | ||
810 | return tg->shares; | ||
811 | } | ||
812 | |||
813 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
814 | { | ||
815 | } | ||
816 | # endif /* CONFIG_SMP */ | ||
817 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
818 | unsigned long weight) | ||
819 | { | ||
820 | if (se->on_rq) { | ||
821 | /* commit outstanding execution time */ | ||
822 | if (cfs_rq->curr == se) | ||
823 | update_curr(cfs_rq); | ||
824 | account_entity_dequeue(cfs_rq, se); | ||
825 | } | ||
826 | |||
827 | update_load_set(&se->load, weight); | ||
828 | |||
829 | if (se->on_rq) | ||
830 | account_entity_enqueue(cfs_rq, se); | ||
831 | } | ||
832 | |||
833 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | ||
834 | { | ||
835 | struct task_group *tg; | ||
836 | struct sched_entity *se; | ||
837 | long shares; | ||
838 | |||
839 | tg = cfs_rq->tg; | ||
840 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | ||
841 | if (!se) | ||
842 | return; | ||
843 | #ifndef CONFIG_SMP | ||
844 | if (likely(se->load.weight == tg->shares)) | ||
845 | return; | ||
846 | #endif | ||
847 | shares = calc_cfs_shares(cfs_rq, tg); | ||
848 | |||
849 | reweight_entity(cfs_rq_of(se), se, shares); | ||
850 | } | ||
851 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
852 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | ||
853 | { | ||
854 | } | ||
855 | |||
856 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | ||
857 | { | ||
858 | } | ||
859 | |||
860 | static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) | ||
861 | { | ||
862 | } | ||
863 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
864 | |||
653 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 865 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
654 | { | 866 | { |
655 | #ifdef CONFIG_SCHEDSTATS | 867 | #ifdef CONFIG_SCHEDSTATS |
@@ -771,7 +983,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
771 | * Update run-time statistics of the 'current'. | 983 | * Update run-time statistics of the 'current'. |
772 | */ | 984 | */ |
773 | update_curr(cfs_rq); | 985 | update_curr(cfs_rq); |
986 | update_cfs_load(cfs_rq, 0); | ||
774 | account_entity_enqueue(cfs_rq, se); | 987 | account_entity_enqueue(cfs_rq, se); |
988 | update_cfs_shares(cfs_rq); | ||
775 | 989 | ||
776 | if (flags & ENQUEUE_WAKEUP) { | 990 | if (flags & ENQUEUE_WAKEUP) { |
777 | place_entity(cfs_rq, se, 0); | 991 | place_entity(cfs_rq, se, 0); |
@@ -782,21 +996,55 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
782 | check_spread(cfs_rq, se); | 996 | check_spread(cfs_rq, se); |
783 | if (se != cfs_rq->curr) | 997 | if (se != cfs_rq->curr) |
784 | __enqueue_entity(cfs_rq, se); | 998 | __enqueue_entity(cfs_rq, se); |
999 | se->on_rq = 1; | ||
1000 | |||
1001 | if (cfs_rq->nr_running == 1) | ||
1002 | list_add_leaf_cfs_rq(cfs_rq); | ||
1003 | } | ||
1004 | |||
1005 | static void __clear_buddies_last(struct sched_entity *se) | ||
1006 | { | ||
1007 | for_each_sched_entity(se) { | ||
1008 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1009 | if (cfs_rq->last == se) | ||
1010 | cfs_rq->last = NULL; | ||
1011 | else | ||
1012 | break; | ||
1013 | } | ||
785 | } | 1014 | } |
786 | 1015 | ||
787 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1016 | static void __clear_buddies_next(struct sched_entity *se) |
788 | { | 1017 | { |
789 | if (!se || cfs_rq->last == se) | 1018 | for_each_sched_entity(se) { |
790 | cfs_rq->last = NULL; | 1019 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1020 | if (cfs_rq->next == se) | ||
1021 | cfs_rq->next = NULL; | ||
1022 | else | ||
1023 | break; | ||
1024 | } | ||
1025 | } | ||
791 | 1026 | ||
792 | if (!se || cfs_rq->next == se) | 1027 | static void __clear_buddies_skip(struct sched_entity *se) |
793 | cfs_rq->next = NULL; | 1028 | { |
1029 | for_each_sched_entity(se) { | ||
1030 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1031 | if (cfs_rq->skip == se) | ||
1032 | cfs_rq->skip = NULL; | ||
1033 | else | ||
1034 | break; | ||
1035 | } | ||
794 | } | 1036 | } |
795 | 1037 | ||
796 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1038 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
797 | { | 1039 | { |
798 | for_each_sched_entity(se) | 1040 | if (cfs_rq->last == se) |
799 | __clear_buddies(cfs_rq_of(se), se); | 1041 | __clear_buddies_last(se); |
1042 | |||
1043 | if (cfs_rq->next == se) | ||
1044 | __clear_buddies_next(se); | ||
1045 | |||
1046 | if (cfs_rq->skip == se) | ||
1047 | __clear_buddies_skip(se); | ||
800 | } | 1048 | } |
801 | 1049 | ||
802 | static void | 1050 | static void |
@@ -825,8 +1073,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
825 | 1073 | ||
826 | if (se != cfs_rq->curr) | 1074 | if (se != cfs_rq->curr) |
827 | __dequeue_entity(cfs_rq, se); | 1075 | __dequeue_entity(cfs_rq, se); |
1076 | se->on_rq = 0; | ||
1077 | update_cfs_load(cfs_rq, 0); | ||
828 | account_entity_dequeue(cfs_rq, se); | 1078 | account_entity_dequeue(cfs_rq, se); |
829 | update_min_vruntime(cfs_rq); | ||
830 | 1079 | ||
831 | /* | 1080 | /* |
832 | * Normalize the entity after updating the min_vruntime because the | 1081 | * Normalize the entity after updating the min_vruntime because the |
@@ -835,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
835 | */ | 1084 | */ |
836 | if (!(flags & DEQUEUE_SLEEP)) | 1085 | if (!(flags & DEQUEUE_SLEEP)) |
837 | se->vruntime -= cfs_rq->min_vruntime; | 1086 | se->vruntime -= cfs_rq->min_vruntime; |
1087 | |||
1088 | update_min_vruntime(cfs_rq); | ||
1089 | update_cfs_shares(cfs_rq); | ||
838 | } | 1090 | } |
839 | 1091 | ||
840 | /* | 1092 | /* |
@@ -869,9 +1121,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
869 | return; | 1121 | return; |
870 | 1122 | ||
871 | if (cfs_rq->nr_running > 1) { | 1123 | if (cfs_rq->nr_running > 1) { |
872 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1124 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
873 | s64 delta = curr->vruntime - se->vruntime; | 1125 | s64 delta = curr->vruntime - se->vruntime; |
874 | 1126 | ||
1127 | if (delta < 0) | ||
1128 | return; | ||
1129 | |||
875 | if (delta > ideal_runtime) | 1130 | if (delta > ideal_runtime) |
876 | resched_task(rq_of(cfs_rq)->curr); | 1131 | resched_task(rq_of(cfs_rq)->curr); |
877 | } | 1132 | } |
@@ -910,13 +1165,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
910 | static int | 1165 | static int |
911 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | 1166 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); |
912 | 1167 | ||
1168 | /* | ||
1169 | * Pick the next process, keeping these things in mind, in this order: | ||
1170 | * 1) keep things fair between processes/task groups | ||
1171 | * 2) pick the "next" process, since someone really wants that to run | ||
1172 | * 3) pick the "last" process, for cache locality | ||
1173 | * 4) do not run the "skip" process, if something else is available | ||
1174 | */ | ||
913 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 1175 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
914 | { | 1176 | { |
915 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1177 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
916 | struct sched_entity *left = se; | 1178 | struct sched_entity *left = se; |
917 | 1179 | ||
918 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | 1180 | /* |
919 | se = cfs_rq->next; | 1181 | * Avoid running the skip buddy, if running something else can |
1182 | * be done without getting too unfair. | ||
1183 | */ | ||
1184 | if (cfs_rq->skip == se) { | ||
1185 | struct sched_entity *second = __pick_next_entity(se); | ||
1186 | if (second && wakeup_preempt_entity(second, left) < 1) | ||
1187 | se = second; | ||
1188 | } | ||
920 | 1189 | ||
921 | /* | 1190 | /* |
922 | * Prefer last buddy, try to return the CPU to a preempted task. | 1191 | * Prefer last buddy, try to return the CPU to a preempted task. |
@@ -924,6 +1193,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
924 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | 1193 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) |
925 | se = cfs_rq->last; | 1194 | se = cfs_rq->last; |
926 | 1195 | ||
1196 | /* | ||
1197 | * Someone really wants this to run. If it's not unfair, run it. | ||
1198 | */ | ||
1199 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | ||
1200 | se = cfs_rq->next; | ||
1201 | |||
927 | clear_buddies(cfs_rq, se); | 1202 | clear_buddies(cfs_rq, se); |
928 | 1203 | ||
929 | return se; | 1204 | return se; |
@@ -955,6 +1230,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
955 | */ | 1230 | */ |
956 | update_curr(cfs_rq); | 1231 | update_curr(cfs_rq); |
957 | 1232 | ||
1233 | /* | ||
1234 | * Update share accounting for long-running entities. | ||
1235 | */ | ||
1236 | update_entity_shares_tick(cfs_rq); | ||
1237 | |||
958 | #ifdef CONFIG_SCHED_HRTICK | 1238 | #ifdef CONFIG_SCHED_HRTICK |
959 | /* | 1239 | /* |
960 | * queued ticks are scheduled to match the slice, so don't bother | 1240 | * queued ticks are scheduled to match the slice, so don't bother |
@@ -1055,9 +1335,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1055 | flags = ENQUEUE_WAKEUP; | 1335 | flags = ENQUEUE_WAKEUP; |
1056 | } | 1336 | } |
1057 | 1337 | ||
1338 | for_each_sched_entity(se) { | ||
1339 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1340 | |||
1341 | update_cfs_load(cfs_rq, 0); | ||
1342 | update_cfs_shares(cfs_rq); | ||
1343 | } | ||
1344 | |||
1058 | hrtick_update(rq); | 1345 | hrtick_update(rq); |
1059 | } | 1346 | } |
1060 | 1347 | ||
1348 | static void set_next_buddy(struct sched_entity *se); | ||
1349 | |||
1061 | /* | 1350 | /* |
1062 | * The dequeue_task method is called before nr_running is | 1351 | * The dequeue_task method is called before nr_running is |
1063 | * decreased. We remove the task from the rbtree and | 1352 | * decreased. We remove the task from the rbtree and |
@@ -1067,73 +1356,56 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1067 | { | 1356 | { |
1068 | struct cfs_rq *cfs_rq; | 1357 | struct cfs_rq *cfs_rq; |
1069 | struct sched_entity *se = &p->se; | 1358 | struct sched_entity *se = &p->se; |
1359 | int task_sleep = flags & DEQUEUE_SLEEP; | ||
1070 | 1360 | ||
1071 | for_each_sched_entity(se) { | 1361 | for_each_sched_entity(se) { |
1072 | cfs_rq = cfs_rq_of(se); | 1362 | cfs_rq = cfs_rq_of(se); |
1073 | dequeue_entity(cfs_rq, se, flags); | 1363 | dequeue_entity(cfs_rq, se, flags); |
1364 | |||
1074 | /* Don't dequeue parent if it has other entities besides us */ | 1365 | /* Don't dequeue parent if it has other entities besides us */ |
1075 | if (cfs_rq->load.weight) | 1366 | if (cfs_rq->load.weight) { |
1367 | /* | ||
1368 | * Bias pick_next to pick a task from this cfs_rq, as | ||
1369 | * p is sleeping when it is within its sched_slice. | ||
1370 | */ | ||
1371 | if (task_sleep && parent_entity(se)) | ||
1372 | set_next_buddy(parent_entity(se)); | ||
1076 | break; | 1373 | break; |
1374 | } | ||
1077 | flags |= DEQUEUE_SLEEP; | 1375 | flags |= DEQUEUE_SLEEP; |
1078 | } | 1376 | } |
1079 | 1377 | ||
1080 | hrtick_update(rq); | 1378 | for_each_sched_entity(se) { |
1081 | } | 1379 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1082 | |||
1083 | /* | ||
1084 | * sched_yield() support is very simple - we dequeue and enqueue. | ||
1085 | * | ||
1086 | * If compat_yield is turned on then we requeue to the end of the tree. | ||
1087 | */ | ||
1088 | static void yield_task_fair(struct rq *rq) | ||
1089 | { | ||
1090 | struct task_struct *curr = rq->curr; | ||
1091 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
1092 | struct sched_entity *rightmost, *se = &curr->se; | ||
1093 | |||
1094 | /* | ||
1095 | * Are we the only task in the tree? | ||
1096 | */ | ||
1097 | if (unlikely(cfs_rq->nr_running == 1)) | ||
1098 | return; | ||
1099 | |||
1100 | clear_buddies(cfs_rq, se); | ||
1101 | |||
1102 | if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { | ||
1103 | update_rq_clock(rq); | ||
1104 | /* | ||
1105 | * Update run-time statistics of the 'current'. | ||
1106 | */ | ||
1107 | update_curr(cfs_rq); | ||
1108 | 1380 | ||
1109 | return; | 1381 | update_cfs_load(cfs_rq, 0); |
1382 | update_cfs_shares(cfs_rq); | ||
1110 | } | 1383 | } |
1111 | /* | ||
1112 | * Find the rightmost entry in the rbtree: | ||
1113 | */ | ||
1114 | rightmost = __pick_last_entity(cfs_rq); | ||
1115 | /* | ||
1116 | * Already in the rightmost position? | ||
1117 | */ | ||
1118 | if (unlikely(!rightmost || entity_before(rightmost, se))) | ||
1119 | return; | ||
1120 | 1384 | ||
1121 | /* | 1385 | hrtick_update(rq); |
1122 | * Minimally necessary key value to be last in the tree: | ||
1123 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
1124 | * 'current' within the tree based on its new key value. | ||
1125 | */ | ||
1126 | se->vruntime = rightmost->vruntime + 1; | ||
1127 | } | 1386 | } |
1128 | 1387 | ||
1129 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
1130 | 1389 | ||
1131 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1390 | static void task_waking_fair(struct task_struct *p) |
1132 | { | 1391 | { |
1133 | struct sched_entity *se = &p->se; | 1392 | struct sched_entity *se = &p->se; |
1134 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1393 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1394 | u64 min_vruntime; | ||
1135 | 1395 | ||
1136 | se->vruntime -= cfs_rq->min_vruntime; | 1396 | #ifndef CONFIG_64BIT |
1397 | u64 min_vruntime_copy; | ||
1398 | |||
1399 | do { | ||
1400 | min_vruntime_copy = cfs_rq->min_vruntime_copy; | ||
1401 | smp_rmb(); | ||
1402 | min_vruntime = cfs_rq->min_vruntime; | ||
1403 | } while (min_vruntime != min_vruntime_copy); | ||
1404 | #else | ||
1405 | min_vruntime = cfs_rq->min_vruntime; | ||
1406 | #endif | ||
1407 | |||
1408 | se->vruntime -= min_vruntime; | ||
1137 | } | 1409 | } |
1138 | 1410 | ||
1139 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1411 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1143,67 +1415,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) | |||
1143 | * Adding load to a group doesn't make a group heavier, but can cause movement | 1415 | * Adding load to a group doesn't make a group heavier, but can cause movement |
1144 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 1416 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
1145 | * can calculate the shift in shares. | 1417 | * can calculate the shift in shares. |
1146 | * | ||
1147 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
1148 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
1149 | * this change. | ||
1150 | * | ||
1151 | * We compensate this by not only taking the current delta into account, but | ||
1152 | * also considering the delta between when the shares were last adjusted and | ||
1153 | * now. | ||
1154 | * | ||
1155 | * We still saw a performance dip, some tracing learned us that between | ||
1156 | * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased | ||
1157 | * significantly. Therefore try to bias the error in direction of failing | ||
1158 | * the affine wakeup. | ||
1159 | * | ||
1160 | */ | 1418 | */ |
1161 | static long effective_load(struct task_group *tg, int cpu, | 1419 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
1162 | long wl, long wg) | ||
1163 | { | 1420 | { |
1164 | struct sched_entity *se = tg->se[cpu]; | 1421 | struct sched_entity *se = tg->se[cpu]; |
1165 | 1422 | ||
1166 | if (!tg->parent) | 1423 | if (!tg->parent) |
1167 | return wl; | 1424 | return wl; |
1168 | 1425 | ||
1169 | /* | ||
1170 | * By not taking the decrease of shares on the other cpu into | ||
1171 | * account our error leans towards reducing the affine wakeups. | ||
1172 | */ | ||
1173 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
1174 | return wl; | ||
1175 | |||
1176 | for_each_sched_entity(se) { | 1426 | for_each_sched_entity(se) { |
1177 | long S, rw, s, a, b; | 1427 | long lw, w; |
1178 | long more_w; | ||
1179 | 1428 | ||
1180 | /* | 1429 | tg = se->my_q->tg; |
1181 | * Instead of using this increment, also add the difference | 1430 | w = se->my_q->load.weight; |
1182 | * between when the shares were last updated and now. | ||
1183 | */ | ||
1184 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1185 | wl += more_w; | ||
1186 | wg += more_w; | ||
1187 | 1431 | ||
1188 | S = se->my_q->tg->shares; | 1432 | /* use this cpu's instantaneous contribution */ |
1189 | s = se->my_q->shares; | 1433 | lw = atomic_read(&tg->load_weight); |
1190 | rw = se->my_q->rq_weight; | 1434 | lw -= se->my_q->load_contribution; |
1435 | lw += w + wg; | ||
1191 | 1436 | ||
1192 | a = S*(rw + wl); | 1437 | wl += w; |
1193 | b = S*rw + s*wg; | ||
1194 | 1438 | ||
1195 | wl = s*(a-b); | 1439 | if (lw > 0 && wl < lw) |
1196 | 1440 | wl = (wl * tg->shares) / lw; | |
1197 | if (likely(b)) | 1441 | else |
1198 | wl /= b; | 1442 | wl = tg->shares; |
1199 | 1443 | ||
1200 | /* | 1444 | /* zero point is MIN_SHARES */ |
1201 | * Assume the group is already running and will | 1445 | if (wl < MIN_SHARES) |
1202 | * thus already be accounted for in the weight. | 1446 | wl = MIN_SHARES; |
1203 | * | 1447 | wl -= se->load.weight; |
1204 | * That is, moving shares between CPUs, does not | ||
1205 | * alter the group weight. | ||
1206 | */ | ||
1207 | wg = 0; | 1448 | wg = 0; |
1208 | } | 1449 | } |
1209 | 1450 | ||
@@ -1222,7 +1463,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
1222 | 1463 | ||
1223 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 1464 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
1224 | { | 1465 | { |
1225 | unsigned long this_load, load; | 1466 | s64 this_load, load; |
1226 | int idx, this_cpu, prev_cpu; | 1467 | int idx, this_cpu, prev_cpu; |
1227 | unsigned long tl_per_task; | 1468 | unsigned long tl_per_task; |
1228 | struct task_group *tg; | 1469 | struct task_group *tg; |
@@ -1261,8 +1502,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1261 | * Otherwise check if either cpus are near enough in load to allow this | 1502 | * Otherwise check if either cpus are near enough in load to allow this |
1262 | * task to be woken on this_cpu. | 1503 | * task to be woken on this_cpu. |
1263 | */ | 1504 | */ |
1264 | if (this_load) { | 1505 | if (this_load > 0) { |
1265 | unsigned long this_eff_load, prev_eff_load; | 1506 | s64 this_eff_load, prev_eff_load; |
1266 | 1507 | ||
1267 | this_eff_load = 100; | 1508 | this_eff_load = 100; |
1268 | this_eff_load *= power_of(prev_cpu); | 1509 | this_eff_load *= power_of(prev_cpu); |
@@ -1344,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1344 | } | 1585 | } |
1345 | 1586 | ||
1346 | /* Adjust by relative CPU power of the group */ | 1587 | /* Adjust by relative CPU power of the group */ |
1347 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1588 | avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; |
1348 | 1589 | ||
1349 | if (local_group) { | 1590 | if (local_group) { |
1350 | this_load = avg_load; | 1591 | this_load = avg_load; |
@@ -1409,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1409 | /* | 1650 | /* |
1410 | * Otherwise, iterate the domains and find an elegible idle cpu. | 1651 | * Otherwise, iterate the domains and find an elegible idle cpu. |
1411 | */ | 1652 | */ |
1653 | rcu_read_lock(); | ||
1412 | for_each_domain(target, sd) { | 1654 | for_each_domain(target, sd) { |
1413 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 1655 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1414 | break; | 1656 | break; |
@@ -1428,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1428 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 1670 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) |
1429 | break; | 1671 | break; |
1430 | } | 1672 | } |
1673 | rcu_read_unlock(); | ||
1431 | 1674 | ||
1432 | return target; | 1675 | return target; |
1433 | } | 1676 | } |
@@ -1444,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1444 | * preempt must be disabled. | 1687 | * preempt must be disabled. |
1445 | */ | 1688 | */ |
1446 | static int | 1689 | static int |
1447 | select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) | 1690 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) |
1448 | { | 1691 | { |
1449 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 1692 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
1450 | int cpu = smp_processor_id(); | 1693 | int cpu = smp_processor_id(); |
@@ -1460,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1460 | new_cpu = prev_cpu; | 1703 | new_cpu = prev_cpu; |
1461 | } | 1704 | } |
1462 | 1705 | ||
1706 | rcu_read_lock(); | ||
1463 | for_each_domain(cpu, tmp) { | 1707 | for_each_domain(cpu, tmp) { |
1464 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 1708 | if (!(tmp->flags & SD_LOAD_BALANCE)) |
1465 | continue; | 1709 | continue; |
@@ -1479,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1479 | nr_running += cpu_rq(i)->cfs.nr_running; | 1723 | nr_running += cpu_rq(i)->cfs.nr_running; |
1480 | } | 1724 | } |
1481 | 1725 | ||
1482 | capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 1726 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
1483 | 1727 | ||
1484 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1728 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
1485 | nr_running /= 2; | 1729 | nr_running /= 2; |
@@ -1508,28 +1752,12 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1508 | sd = tmp; | 1752 | sd = tmp; |
1509 | } | 1753 | } |
1510 | 1754 | ||
1511 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1512 | if (sched_feat(LB_SHARES_UPDATE)) { | ||
1513 | /* | ||
1514 | * Pick the largest domain to update shares over | ||
1515 | */ | ||
1516 | tmp = sd; | ||
1517 | if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) | ||
1518 | tmp = affine_sd; | ||
1519 | |||
1520 | if (tmp) { | ||
1521 | raw_spin_unlock(&rq->lock); | ||
1522 | update_shares(tmp); | ||
1523 | raw_spin_lock(&rq->lock); | ||
1524 | } | ||
1525 | } | ||
1526 | #endif | ||
1527 | |||
1528 | if (affine_sd) { | 1755 | if (affine_sd) { |
1529 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1756 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1530 | return select_idle_sibling(p, cpu); | 1757 | prev_cpu = cpu; |
1531 | else | 1758 | |
1532 | return select_idle_sibling(p, prev_cpu); | 1759 | new_cpu = select_idle_sibling(p, prev_cpu); |
1760 | goto unlock; | ||
1533 | } | 1761 | } |
1534 | 1762 | ||
1535 | while (sd) { | 1763 | while (sd) { |
@@ -1570,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1570 | } | 1798 | } |
1571 | /* while loop will break here if sd == NULL */ | 1799 | /* while loop will break here if sd == NULL */ |
1572 | } | 1800 | } |
1801 | unlock: | ||
1802 | rcu_read_unlock(); | ||
1573 | 1803 | ||
1574 | return new_cpu; | 1804 | return new_cpu; |
1575 | } | 1805 | } |
@@ -1593,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | |||
1593 | * This is especially important for buddies when the leftmost | 1823 | * This is especially important for buddies when the leftmost |
1594 | * task is higher priority than the buddy. | 1824 | * task is higher priority than the buddy. |
1595 | */ | 1825 | */ |
1596 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1826 | return calc_delta_fair(gran, se); |
1597 | gran = calc_delta_fair(gran, se); | ||
1598 | |||
1599 | return gran; | ||
1600 | } | 1827 | } |
1601 | 1828 | ||
1602 | /* | 1829 | /* |
@@ -1630,18 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
1630 | 1857 | ||
1631 | static void set_last_buddy(struct sched_entity *se) | 1858 | static void set_last_buddy(struct sched_entity *se) |
1632 | { | 1859 | { |
1633 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1860 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1634 | for_each_sched_entity(se) | 1861 | return; |
1635 | cfs_rq_of(se)->last = se; | 1862 | |
1636 | } | 1863 | for_each_sched_entity(se) |
1864 | cfs_rq_of(se)->last = se; | ||
1637 | } | 1865 | } |
1638 | 1866 | ||
1639 | static void set_next_buddy(struct sched_entity *se) | 1867 | static void set_next_buddy(struct sched_entity *se) |
1640 | { | 1868 | { |
1641 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1869 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1642 | for_each_sched_entity(se) | 1870 | return; |
1643 | cfs_rq_of(se)->next = se; | 1871 | |
1644 | } | 1872 | for_each_sched_entity(se) |
1873 | cfs_rq_of(se)->next = se; | ||
1874 | } | ||
1875 | |||
1876 | static void set_skip_buddy(struct sched_entity *se) | ||
1877 | { | ||
1878 | for_each_sched_entity(se) | ||
1879 | cfs_rq_of(se)->skip = se; | ||
1645 | } | 1880 | } |
1646 | 1881 | ||
1647 | /* | 1882 | /* |
@@ -1653,18 +1888,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1653 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1888 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1654 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1889 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1655 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1890 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
1891 | int next_buddy_marked = 0; | ||
1656 | 1892 | ||
1657 | if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) | 1893 | if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) |
1658 | goto preempt; | 1894 | goto preempt; |
1659 | 1895 | ||
1660 | if (unlikely(p->sched_class != &fair_sched_class)) | ||
1661 | return; | ||
1662 | |||
1663 | if (unlikely(se == pse)) | 1896 | if (unlikely(se == pse)) |
1664 | return; | 1897 | return; |
1665 | 1898 | ||
1666 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) | 1899 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1667 | set_next_buddy(pse); | 1900 | set_next_buddy(pse); |
1901 | next_buddy_marked = 1; | ||
1902 | } | ||
1668 | 1903 | ||
1669 | /* | 1904 | /* |
1670 | * We can come here with TIF_NEED_RESCHED already set from new task | 1905 | * We can come here with TIF_NEED_RESCHED already set from new task |
@@ -1673,16 +1908,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1673 | if (test_tsk_need_resched(curr)) | 1908 | if (test_tsk_need_resched(curr)) |
1674 | return; | 1909 | return; |
1675 | 1910 | ||
1911 | /* Idle tasks are by definition preempted by non-idle tasks. */ | ||
1912 | if (unlikely(curr->policy == SCHED_IDLE) && | ||
1913 | likely(p->policy != SCHED_IDLE)) | ||
1914 | goto preempt; | ||
1915 | |||
1676 | /* | 1916 | /* |
1677 | * Batch and idle tasks do not preempt (their preemption is driven by | 1917 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
1678 | * the tick): | 1918 | * is driven by the tick): |
1679 | */ | 1919 | */ |
1680 | if (unlikely(p->policy != SCHED_NORMAL)) | 1920 | if (unlikely(p->policy != SCHED_NORMAL)) |
1681 | return; | 1921 | return; |
1682 | 1922 | ||
1683 | /* Idle tasks are by definition preempted by everybody. */ | ||
1684 | if (unlikely(curr->policy == SCHED_IDLE)) | ||
1685 | goto preempt; | ||
1686 | 1923 | ||
1687 | if (!sched_feat(WAKEUP_PREEMPT)) | 1924 | if (!sched_feat(WAKEUP_PREEMPT)) |
1688 | return; | 1925 | return; |
@@ -1690,8 +1927,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1690 | update_curr(cfs_rq); | 1927 | update_curr(cfs_rq); |
1691 | find_matching_se(&se, &pse); | 1928 | find_matching_se(&se, &pse); |
1692 | BUG_ON(!pse); | 1929 | BUG_ON(!pse); |
1693 | if (wakeup_preempt_entity(se, pse) == 1) | 1930 | if (wakeup_preempt_entity(se, pse) == 1) { |
1931 | /* | ||
1932 | * Bias pick_next to pick the sched entity that is | ||
1933 | * triggering this preemption. | ||
1934 | */ | ||
1935 | if (!next_buddy_marked) | ||
1936 | set_next_buddy(pse); | ||
1694 | goto preempt; | 1937 | goto preempt; |
1938 | } | ||
1695 | 1939 | ||
1696 | return; | 1940 | return; |
1697 | 1941 | ||
@@ -1748,6 +1992,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1748 | } | 1992 | } |
1749 | } | 1993 | } |
1750 | 1994 | ||
1995 | /* | ||
1996 | * sched_yield() is very simple | ||
1997 | * | ||
1998 | * The magic of dealing with the ->skip buddy is in pick_next_entity. | ||
1999 | */ | ||
2000 | static void yield_task_fair(struct rq *rq) | ||
2001 | { | ||
2002 | struct task_struct *curr = rq->curr; | ||
2003 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
2004 | struct sched_entity *se = &curr->se; | ||
2005 | |||
2006 | /* | ||
2007 | * Are we the only task in the tree? | ||
2008 | */ | ||
2009 | if (unlikely(rq->nr_running == 1)) | ||
2010 | return; | ||
2011 | |||
2012 | clear_buddies(cfs_rq, se); | ||
2013 | |||
2014 | if (curr->policy != SCHED_BATCH) { | ||
2015 | update_rq_clock(rq); | ||
2016 | /* | ||
2017 | * Update run-time statistics of the 'current'. | ||
2018 | */ | ||
2019 | update_curr(cfs_rq); | ||
2020 | } | ||
2021 | |||
2022 | set_skip_buddy(se); | ||
2023 | } | ||
2024 | |||
2025 | static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) | ||
2026 | { | ||
2027 | struct sched_entity *se = &p->se; | ||
2028 | |||
2029 | if (!se->on_rq) | ||
2030 | return false; | ||
2031 | |||
2032 | /* Tell the scheduler that we'd really like pse to run next. */ | ||
2033 | set_next_buddy(se); | ||
2034 | |||
2035 | yield_task_fair(rq); | ||
2036 | |||
2037 | return true; | ||
2038 | } | ||
2039 | |||
1751 | #ifdef CONFIG_SMP | 2040 | #ifdef CONFIG_SMP |
1752 | /************************************************** | 2041 | /************************************************** |
1753 | * Fair scheduling class load-balancing methods: | 2042 | * Fair scheduling class load-balancing methods: |
@@ -1798,7 +2087,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
1798 | * 2) too many balance attempts have failed. | 2087 | * 2) too many balance attempts have failed. |
1799 | */ | 2088 | */ |
1800 | 2089 | ||
1801 | tsk_cache_hot = task_hot(p, rq->clock, sd); | 2090 | tsk_cache_hot = task_hot(p, rq->clock_task, sd); |
1802 | if (!tsk_cache_hot || | 2091 | if (!tsk_cache_hot || |
1803 | sd->nr_balance_failed > sd->cache_nice_tries) { | 2092 | sd->nr_balance_failed > sd->cache_nice_tries) { |
1804 | #ifdef CONFIG_SCHEDSTATS | 2093 | #ifdef CONFIG_SCHEDSTATS |
@@ -1857,23 +2146,22 @@ static unsigned long | |||
1857 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2146 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1858 | unsigned long max_load_move, struct sched_domain *sd, | 2147 | unsigned long max_load_move, struct sched_domain *sd, |
1859 | enum cpu_idle_type idle, int *all_pinned, | 2148 | enum cpu_idle_type idle, int *all_pinned, |
1860 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) | 2149 | struct cfs_rq *busiest_cfs_rq) |
1861 | { | 2150 | { |
1862 | int loops = 0, pulled = 0, pinned = 0; | 2151 | int loops = 0, pulled = 0; |
1863 | long rem_load_move = max_load_move; | 2152 | long rem_load_move = max_load_move; |
1864 | struct task_struct *p, *n; | 2153 | struct task_struct *p, *n; |
1865 | 2154 | ||
1866 | if (max_load_move == 0) | 2155 | if (max_load_move == 0) |
1867 | goto out; | 2156 | goto out; |
1868 | 2157 | ||
1869 | pinned = 1; | ||
1870 | |||
1871 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 2158 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
1872 | if (loops++ > sysctl_sched_nr_migrate) | 2159 | if (loops++ > sysctl_sched_nr_migrate) |
1873 | break; | 2160 | break; |
1874 | 2161 | ||
1875 | if ((p->se.load.weight >> 1) > rem_load_move || | 2162 | if ((p->se.load.weight >> 1) > rem_load_move || |
1876 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) | 2163 | !can_migrate_task(p, busiest, this_cpu, sd, idle, |
2164 | all_pinned)) | ||
1877 | continue; | 2165 | continue; |
1878 | 2166 | ||
1879 | pull_task(busiest, p, this_rq, this_cpu); | 2167 | pull_task(busiest, p, this_rq, this_cpu); |
@@ -1896,9 +2184,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1896 | */ | 2184 | */ |
1897 | if (rem_load_move <= 0) | 2185 | if (rem_load_move <= 0) |
1898 | break; | 2186 | break; |
1899 | |||
1900 | if (p->prio < *this_best_prio) | ||
1901 | *this_best_prio = p->prio; | ||
1902 | } | 2187 | } |
1903 | out: | 2188 | out: |
1904 | /* | 2189 | /* |
@@ -1908,18 +2193,57 @@ out: | |||
1908 | */ | 2193 | */ |
1909 | schedstat_add(sd, lb_gained[idle], pulled); | 2194 | schedstat_add(sd, lb_gained[idle], pulled); |
1910 | 2195 | ||
1911 | if (all_pinned) | ||
1912 | *all_pinned = pinned; | ||
1913 | |||
1914 | return max_load_move - rem_load_move; | 2196 | return max_load_move - rem_load_move; |
1915 | } | 2197 | } |
1916 | 2198 | ||
1917 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2199 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2200 | /* | ||
2201 | * update tg->load_weight by folding this cpu's load_avg | ||
2202 | */ | ||
2203 | static int update_shares_cpu(struct task_group *tg, int cpu) | ||
2204 | { | ||
2205 | struct cfs_rq *cfs_rq; | ||
2206 | unsigned long flags; | ||
2207 | struct rq *rq; | ||
2208 | |||
2209 | if (!tg->se[cpu]) | ||
2210 | return 0; | ||
2211 | |||
2212 | rq = cpu_rq(cpu); | ||
2213 | cfs_rq = tg->cfs_rq[cpu]; | ||
2214 | |||
2215 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
2216 | |||
2217 | update_rq_clock(rq); | ||
2218 | update_cfs_load(cfs_rq, 1); | ||
2219 | |||
2220 | /* | ||
2221 | * We need to update shares after updating tg->load_weight in | ||
2222 | * order to adjust the weight of groups with long running tasks. | ||
2223 | */ | ||
2224 | update_cfs_shares(cfs_rq); | ||
2225 | |||
2226 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
2227 | |||
2228 | return 0; | ||
2229 | } | ||
2230 | |||
2231 | static void update_shares(int cpu) | ||
2232 | { | ||
2233 | struct cfs_rq *cfs_rq; | ||
2234 | struct rq *rq = cpu_rq(cpu); | ||
2235 | |||
2236 | rcu_read_lock(); | ||
2237 | for_each_leaf_cfs_rq(rq, cfs_rq) | ||
2238 | update_shares_cpu(cfs_rq->tg, cpu); | ||
2239 | rcu_read_unlock(); | ||
2240 | } | ||
2241 | |||
1918 | static unsigned long | 2242 | static unsigned long |
1919 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2243 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1920 | unsigned long max_load_move, | 2244 | unsigned long max_load_move, |
1921 | struct sched_domain *sd, enum cpu_idle_type idle, | 2245 | struct sched_domain *sd, enum cpu_idle_type idle, |
1922 | int *all_pinned, int *this_best_prio) | 2246 | int *all_pinned) |
1923 | { | 2247 | { |
1924 | long rem_load_move = max_load_move; | 2248 | long rem_load_move = max_load_move; |
1925 | int busiest_cpu = cpu_of(busiest); | 2249 | int busiest_cpu = cpu_of(busiest); |
@@ -1944,7 +2268,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1944 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2268 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
1945 | 2269 | ||
1946 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 2270 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
1947 | rem_load, sd, idle, all_pinned, this_best_prio, | 2271 | rem_load, sd, idle, all_pinned, |
1948 | busiest_cfs_rq); | 2272 | busiest_cfs_rq); |
1949 | 2273 | ||
1950 | if (!moved_load) | 2274 | if (!moved_load) |
@@ -1962,15 +2286,19 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1962 | return max_load_move - rem_load_move; | 2286 | return max_load_move - rem_load_move; |
1963 | } | 2287 | } |
1964 | #else | 2288 | #else |
2289 | static inline void update_shares(int cpu) | ||
2290 | { | ||
2291 | } | ||
2292 | |||
1965 | static unsigned long | 2293 | static unsigned long |
1966 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2294 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1967 | unsigned long max_load_move, | 2295 | unsigned long max_load_move, |
1968 | struct sched_domain *sd, enum cpu_idle_type idle, | 2296 | struct sched_domain *sd, enum cpu_idle_type idle, |
1969 | int *all_pinned, int *this_best_prio) | 2297 | int *all_pinned) |
1970 | { | 2298 | { |
1971 | return balance_tasks(this_rq, this_cpu, busiest, | 2299 | return balance_tasks(this_rq, this_cpu, busiest, |
1972 | max_load_move, sd, idle, all_pinned, | 2300 | max_load_move, sd, idle, all_pinned, |
1973 | this_best_prio, &busiest->cfs); | 2301 | &busiest->cfs); |
1974 | } | 2302 | } |
1975 | #endif | 2303 | #endif |
1976 | 2304 | ||
@@ -1987,12 +2315,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1987 | int *all_pinned) | 2315 | int *all_pinned) |
1988 | { | 2316 | { |
1989 | unsigned long total_load_moved = 0, load_moved; | 2317 | unsigned long total_load_moved = 0, load_moved; |
1990 | int this_best_prio = this_rq->curr->prio; | ||
1991 | 2318 | ||
1992 | do { | 2319 | do { |
1993 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 2320 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
1994 | max_load_move - total_load_moved, | 2321 | max_load_move - total_load_moved, |
1995 | sd, idle, all_pinned, &this_best_prio); | 2322 | sd, idle, all_pinned); |
1996 | 2323 | ||
1997 | total_load_moved += load_moved; | 2324 | total_load_moved += load_moved; |
1998 | 2325 | ||
@@ -2030,12 +2357,17 @@ struct sd_lb_stats { | |||
2030 | unsigned long this_load; | 2357 | unsigned long this_load; |
2031 | unsigned long this_load_per_task; | 2358 | unsigned long this_load_per_task; |
2032 | unsigned long this_nr_running; | 2359 | unsigned long this_nr_running; |
2360 | unsigned long this_has_capacity; | ||
2361 | unsigned int this_idle_cpus; | ||
2033 | 2362 | ||
2034 | /* Statistics of the busiest group */ | 2363 | /* Statistics of the busiest group */ |
2364 | unsigned int busiest_idle_cpus; | ||
2035 | unsigned long max_load; | 2365 | unsigned long max_load; |
2036 | unsigned long busiest_load_per_task; | 2366 | unsigned long busiest_load_per_task; |
2037 | unsigned long busiest_nr_running; | 2367 | unsigned long busiest_nr_running; |
2038 | unsigned long busiest_group_capacity; | 2368 | unsigned long busiest_group_capacity; |
2369 | unsigned long busiest_has_capacity; | ||
2370 | unsigned int busiest_group_weight; | ||
2039 | 2371 | ||
2040 | int group_imb; /* Is there imbalance in this sd */ | 2372 | int group_imb; /* Is there imbalance in this sd */ |
2041 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2373 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -2057,7 +2389,10 @@ struct sg_lb_stats { | |||
2057 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | 2389 | unsigned long sum_nr_running; /* Nr tasks running in the group */ |
2058 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 2390 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
2059 | unsigned long group_capacity; | 2391 | unsigned long group_capacity; |
2392 | unsigned long idle_cpus; | ||
2393 | unsigned long group_weight; | ||
2060 | int group_imb; /* Is there an imbalance in the group ? */ | 2394 | int group_imb; /* Is there an imbalance in the group ? */ |
2395 | int group_has_capacity; /* Is there extra capacity in the group? */ | ||
2061 | }; | 2396 | }; |
2062 | 2397 | ||
2063 | /** | 2398 | /** |
@@ -2239,7 +2574,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
2239 | 2574 | ||
2240 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 2575 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
2241 | { | 2576 | { |
2242 | return SCHED_LOAD_SCALE; | 2577 | return SCHED_POWER_SCALE; |
2243 | } | 2578 | } |
2244 | 2579 | ||
2245 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | 2580 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) |
@@ -2268,12 +2603,18 @@ unsigned long scale_rt_power(int cpu) | |||
2268 | u64 total, available; | 2603 | u64 total, available; |
2269 | 2604 | ||
2270 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 2605 | total = sched_avg_period() + (rq->clock - rq->age_stamp); |
2271 | available = total - rq->rt_avg; | ||
2272 | 2606 | ||
2273 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | 2607 | if (unlikely(total < rq->rt_avg)) { |
2274 | total = SCHED_LOAD_SCALE; | 2608 | /* Ensures that power won't end up being negative */ |
2609 | available = 0; | ||
2610 | } else { | ||
2611 | available = total - rq->rt_avg; | ||
2612 | } | ||
2613 | |||
2614 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | ||
2615 | total = SCHED_POWER_SCALE; | ||
2275 | 2616 | ||
2276 | total >>= SCHED_LOAD_SHIFT; | 2617 | total >>= SCHED_POWER_SHIFT; |
2277 | 2618 | ||
2278 | return div_u64(available, total); | 2619 | return div_u64(available, total); |
2279 | } | 2620 | } |
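The scale_rt_power() change above first clamps 'available' so that heavy realtime load can never drive it negative, then divides in SCHED_POWER_SCALE fixed point. A standalone sketch of that arithmetic, assuming the usual SCHED_POWER_SHIFT of 10 (a scale of 1024):

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_POWER_SHIFT 10
    #define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

    /* Fraction of CPU time left over for fair tasks, in 1/1024 units, given
     * a measurement window 'total' and the time 'rt_avg' consumed by
     * realtime work inside that window. */
    static uint64_t scale_rt_power_model(uint64_t total, uint64_t rt_avg)
    {
        uint64_t available;

        if (total < rt_avg)             /* clamp: power must never go negative */
            available = 0;
        else
            available = total - rt_avg;

        if (total < SCHED_POWER_SCALE)  /* avoid dividing by zero after the shift */
            total = SCHED_POWER_SCALE;

        total >>= SCHED_POWER_SHIFT;

        return available / total;
    }

    int main(void)
    {
        /* 25% of the window eaten by RT work -> about 768/1024 left */
        printf("%llu\n", (unsigned long long)scale_rt_power_model(4000000, 1000000));
        return 0;
    }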
@@ -2281,7 +2622,7 @@ unsigned long scale_rt_power(int cpu) | |||
2281 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 2622 | static void update_cpu_power(struct sched_domain *sd, int cpu) |
2282 | { | 2623 | { |
2283 | unsigned long weight = sd->span_weight; | 2624 | unsigned long weight = sd->span_weight; |
2284 | unsigned long power = SCHED_LOAD_SCALE; | 2625 | unsigned long power = SCHED_POWER_SCALE; |
2285 | struct sched_group *sdg = sd->groups; | 2626 | struct sched_group *sdg = sd->groups; |
2286 | 2627 | ||
2287 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2628 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
@@ -2290,26 +2631,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2290 | else | 2631 | else |
2291 | power *= default_scale_smt_power(sd, cpu); | 2632 | power *= default_scale_smt_power(sd, cpu); |
2292 | 2633 | ||
2293 | power >>= SCHED_LOAD_SHIFT; | 2634 | power >>= SCHED_POWER_SHIFT; |
2294 | } | 2635 | } |
2295 | 2636 | ||
2296 | sdg->cpu_power_orig = power; | 2637 | sdg->sgp->power_orig = power; |
2297 | 2638 | ||
2298 | if (sched_feat(ARCH_POWER)) | 2639 | if (sched_feat(ARCH_POWER)) |
2299 | power *= arch_scale_freq_power(sd, cpu); | 2640 | power *= arch_scale_freq_power(sd, cpu); |
2300 | else | 2641 | else |
2301 | power *= default_scale_freq_power(sd, cpu); | 2642 | power *= default_scale_freq_power(sd, cpu); |
2302 | 2643 | ||
2303 | power >>= SCHED_LOAD_SHIFT; | 2644 | power >>= SCHED_POWER_SHIFT; |
2304 | 2645 | ||
2305 | power *= scale_rt_power(cpu); | 2646 | power *= scale_rt_power(cpu); |
2306 | power >>= SCHED_LOAD_SHIFT; | 2647 | power >>= SCHED_POWER_SHIFT; |
2307 | 2648 | ||
2308 | if (!power) | 2649 | if (!power) |
2309 | power = 1; | 2650 | power = 1; |
2310 | 2651 | ||
2311 | cpu_rq(cpu)->cpu_power = power; | 2652 | cpu_rq(cpu)->cpu_power = power; |
2312 | sdg->cpu_power = power; | 2653 | sdg->sgp->power = power; |
2313 | } | 2654 | } |
2314 | 2655 | ||
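update_cpu_power() now chains several factors, each expressed in SCHED_POWER_SCALE units and folded in with a multiply followed by a right shift. A toy model of that fixed-point pipeline; the factor values are invented purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_POWER_SHIFT 10
    #define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

    /* Apply one fixed-point scaling factor (factor is in 1/1024 units). */
    static uint64_t scale(uint64_t power, uint64_t factor)
    {
        return (power * factor) >> SCHED_POWER_SHIFT;
    }

    int main(void)
    {
        uint64_t power = SCHED_POWER_SCALE;   /* nominal CPU power: 1024 */

        power = scale(power, 589);    /* e.g. an SMT sibling discount        */
        power = scale(power, 1024);   /* frequency-invariance factor (none)  */
        power = scale(power, 768);    /* 25% of time consumed by RT tasks    */

        if (!power)                   /* the kernel never lets power hit 0   */
            power = 1;

        printf("effective cpu_power = %llu\n", (unsigned long long)power);
        return 0;
    }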
2315 | static void update_group_power(struct sched_domain *sd, int cpu) | 2656 | static void update_group_power(struct sched_domain *sd, int cpu) |
@@ -2327,11 +2668,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
2327 | 2668 | ||
2328 | group = child->groups; | 2669 | group = child->groups; |
2329 | do { | 2670 | do { |
2330 | power += group->cpu_power; | 2671 | power += group->sgp->power; |
2331 | group = group->next; | 2672 | group = group->next; |
2332 | } while (group != child->groups); | 2673 | } while (group != child->groups); |
2333 | 2674 | ||
2334 | sdg->cpu_power = power; | 2675 | sdg->sgp->power = power; |
2335 | } | 2676 | } |
2336 | 2677 | ||
2337 | /* | 2678 | /* |
@@ -2345,15 +2686,15 @@ static inline int | |||
2345 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 2686 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) |
2346 | { | 2687 | { |
2347 | /* | 2688 | /* |
2348 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | 2689 | * Only siblings can have significantly less than SCHED_POWER_SCALE |
2349 | */ | 2690 | */ |
2350 | if (sd->level != SD_LV_SIBLING) | 2691 | if (!(sd->flags & SD_SHARE_CPUPOWER)) |
2351 | return 0; | 2692 | return 0; |
2352 | 2693 | ||
2353 | /* | 2694 | /* |
2354 | * If ~90% of the cpu_power is still there, we're good. | 2695 | * If ~90% of the cpu_power is still there, we're good. |
2355 | */ | 2696 | */ |
2356 | if (group->cpu_power * 32 > group->cpu_power_orig * 29) | 2697 | if (group->sgp->power * 32 > group->sgp->power_orig * 29) |
2357 | return 1; | 2698 | return 1; |
2358 | 2699 | ||
2359 | return 0; | 2700 | return 0; |
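fix_small_capacity() keeps a group usable if roughly 90% of its original power survives; the 32/29 comparison above is an integer shortcut for power/power_orig > 29/32 ≈ 0.906, avoiding a division. A tiny sketch:

    #include <stdio.h>

    /* "still has ~90% of its original power" without a division:
     * power/power_orig > 29/32  <=>  power*32 > power_orig*29 */
    static int mostly_intact(unsigned long power, unsigned long power_orig)
    {
        return power * 32 > power_orig * 29;
    }

    int main(void)
    {
        printf("%d %d\n", mostly_intact(950, 1024), mostly_intact(900, 1024));
        return 0;
    }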
@@ -2366,7 +2707,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2366 | * @this_cpu: Cpu for which load balance is currently performed. | 2707 | * @this_cpu: Cpu for which load balance is currently performed. |
2367 | * @idle: Idle status of this_cpu | 2708 | * @idle: Idle status of this_cpu |
2368 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 2709 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
2369 | * @sd_idle: Idle status of the sched_domain containing group. | ||
2370 | * @local_group: Does group contain this_cpu. | 2710 | * @local_group: Does group contain this_cpu. |
2371 | * @cpus: Set of cpus considered for load balancing. | 2711 | * @cpus: Set of cpus considered for load balancing. |
2372 | * @balance: Should we balance. | 2712 | * @balance: Should we balance. |
@@ -2374,11 +2714,11 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2374 | */ | 2714 | */ |
2375 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 2715 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
2376 | struct sched_group *group, int this_cpu, | 2716 | struct sched_group *group, int this_cpu, |
2377 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 2717 | enum cpu_idle_type idle, int load_idx, |
2378 | int local_group, const struct cpumask *cpus, | 2718 | int local_group, const struct cpumask *cpus, |
2379 | int *balance, struct sg_lb_stats *sgs) | 2719 | int *balance, struct sg_lb_stats *sgs) |
2380 | { | 2720 | { |
2381 | unsigned long load, max_cpu_load, min_cpu_load; | 2721 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; |
2382 | int i; | 2722 | int i; |
2383 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 2723 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2384 | unsigned long avg_load_per_task = 0; | 2724 | unsigned long avg_load_per_task = 0; |
@@ -2389,13 +2729,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2389 | /* Tally up the load of all CPUs in the group */ | 2729 | /* Tally up the load of all CPUs in the group */ |
2390 | max_cpu_load = 0; | 2730 | max_cpu_load = 0; |
2391 | min_cpu_load = ~0UL; | 2731 | min_cpu_load = ~0UL; |
2732 | max_nr_running = 0; | ||
2392 | 2733 | ||
2393 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 2734 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
2394 | struct rq *rq = cpu_rq(i); | 2735 | struct rq *rq = cpu_rq(i); |
2395 | 2736 | ||
2396 | if (*sd_idle && rq->nr_running) | ||
2397 | *sd_idle = 0; | ||
2398 | |||
2399 | /* Bias balancing toward cpus of our domain */ | 2737 | /* Bias balancing toward cpus of our domain */ |
2400 | if (local_group) { | 2738 | if (local_group) { |
2401 | if (idle_cpu(i) && !first_idle_cpu) { | 2739 | if (idle_cpu(i) && !first_idle_cpu) { |
@@ -2406,8 +2744,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2406 | load = target_load(i, load_idx); | 2744 | load = target_load(i, load_idx); |
2407 | } else { | 2745 | } else { |
2408 | load = source_load(i, load_idx); | 2746 | load = source_load(i, load_idx); |
2409 | if (load > max_cpu_load) | 2747 | if (load > max_cpu_load) { |
2410 | max_cpu_load = load; | 2748 | max_cpu_load = load; |
2749 | max_nr_running = rq->nr_running; | ||
2750 | } | ||
2411 | if (min_cpu_load > load) | 2751 | if (min_cpu_load > load) |
2412 | min_cpu_load = load; | 2752 | min_cpu_load = load; |
2413 | } | 2753 | } |
@@ -2415,7 +2755,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2415 | sgs->group_load += load; | 2755 | sgs->group_load += load; |
2416 | sgs->sum_nr_running += rq->nr_running; | 2756 | sgs->sum_nr_running += rq->nr_running; |
2417 | sgs->sum_weighted_load += weighted_cpuload(i); | 2757 | sgs->sum_weighted_load += weighted_cpuload(i); |
2418 | 2758 | if (idle_cpu(i)) | |
2759 | sgs->idle_cpus++; | ||
2419 | } | 2760 | } |
2420 | 2761 | ||
2421 | /* | 2762 | /* |
@@ -2433,11 +2774,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2433 | } | 2774 | } |
2434 | 2775 | ||
2435 | /* Adjust by relative CPU power of the group */ | 2776 | /* Adjust by relative CPU power of the group */ |
2436 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2777 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; |
2437 | 2778 | ||
2438 | /* | 2779 | /* |
2439 | * Consider the group unbalanced when the imbalance is larger | 2780 | * Consider the group unbalanced when the imbalance is larger |
2440 | * than the average weight of two tasks. | 2781 | * than the average weight of a task. |
2441 | * | 2782 | * |
2442 | * APZ: with cgroup the avg task weight can vary wildly and | 2783 | * APZ: with cgroup the avg task weight can vary wildly and |
2443 | * might not be a suitable number - should we keep a | 2784 | * might not be a suitable number - should we keep a |
@@ -2447,13 +2788,17 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2447 | if (sgs->sum_nr_running) | 2788 | if (sgs->sum_nr_running) |
2448 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 2789 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
2449 | 2790 | ||
2450 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 2791 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) |
2451 | sgs->group_imb = 1; | 2792 | sgs->group_imb = 1; |
2452 | 2793 | ||
2453 | sgs->group_capacity = | 2794 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
2454 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2795 | SCHED_POWER_SCALE); |
2455 | if (!sgs->group_capacity) | 2796 | if (!sgs->group_capacity) |
2456 | sgs->group_capacity = fix_small_capacity(sd, group); | 2797 | sgs->group_capacity = fix_small_capacity(sd, group); |
2798 | sgs->group_weight = group->group_weight; | ||
2799 | |||
2800 | if (sgs->group_capacity > sgs->sum_nr_running) | ||
2801 | sgs->group_has_capacity = 1; | ||
2457 | } | 2802 | } |
2458 | 2803 | ||
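update_sg_lb_stats() now derives group_capacity by rounding the group's total power to the nearest multiple of SCHED_POWER_SCALE, and flags spare capacity when that exceeds the number of runnable tasks; the imbalance flag likewise requires both a large load spread and more than one task on the loaded CPU. A small model with made-up numbers:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL
    #define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

    int main(void)
    {
        /* invented per-group numbers */
        unsigned long group_power = 2989;          /* ~3 CPUs, one partly eaten by RT */
        unsigned long sum_nr_running = 2;
        unsigned long max_cpu_load = 2048, min_cpu_load = 512;
        unsigned long avg_load_per_task = 1024, max_nr_running = 2;

        /* nearest whole number of "full" CPUs worth of power */
        unsigned long group_capacity = DIV_ROUND_CLOSEST(group_power, SCHED_POWER_SCALE);
        int group_has_capacity = group_capacity > sum_nr_running;
        int group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
                        max_nr_running > 1;

        printf("capacity=%lu has_capacity=%d imb=%d\n",
               group_capacity, group_has_capacity, group_imb);
        return 0;
    }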
2459 | /** | 2804 | /** |
@@ -2504,15 +2849,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
2504 | * @sd: sched_domain whose statistics are to be updated. | 2849 | * @sd: sched_domain whose statistics are to be updated. |
2505 | * @this_cpu: Cpu for which load balance is currently performed. | 2850 | * @this_cpu: Cpu for which load balance is currently performed. |
2506 | * @idle: Idle status of this_cpu | 2851 | * @idle: Idle status of this_cpu |
2507 | * @sd_idle: Idle status of the sched_domain containing sg. | ||
2508 | * @cpus: Set of cpus considered for load balancing. | 2852 | * @cpus: Set of cpus considered for load balancing. |
2509 | * @balance: Should we balance. | 2853 | * @balance: Should we balance. |
2510 | * @sds: variable to hold the statistics for this sched_domain. | 2854 | * @sds: variable to hold the statistics for this sched_domain. |
2511 | */ | 2855 | */ |
2512 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 2856 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, |
2513 | enum cpu_idle_type idle, int *sd_idle, | 2857 | enum cpu_idle_type idle, const struct cpumask *cpus, |
2514 | const struct cpumask *cpus, int *balance, | 2858 | int *balance, struct sd_lb_stats *sds) |
2515 | struct sd_lb_stats *sds) | ||
2516 | { | 2859 | { |
2517 | struct sched_domain *child = sd->child; | 2860 | struct sched_domain *child = sd->child; |
2518 | struct sched_group *sg = sd->groups; | 2861 | struct sched_group *sg = sd->groups; |
@@ -2530,21 +2873,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2530 | 2873 | ||
2531 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 2874 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
2532 | memset(&sgs, 0, sizeof(sgs)); | 2875 | memset(&sgs, 0, sizeof(sgs)); |
2533 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, | 2876 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, |
2534 | local_group, cpus, balance, &sgs); | 2877 | local_group, cpus, balance, &sgs); |
2535 | 2878 | ||
2536 | if (local_group && !(*balance)) | 2879 | if (local_group && !(*balance)) |
2537 | return; | 2880 | return; |
2538 | 2881 | ||
2539 | sds->total_load += sgs.group_load; | 2882 | sds->total_load += sgs.group_load; |
2540 | sds->total_pwr += sg->cpu_power; | 2883 | sds->total_pwr += sg->sgp->power; |
2541 | 2884 | ||
2542 | /* | 2885 | /* |
2543 | * In case the child domain prefers tasks go to siblings | 2886 | * In case the child domain prefers tasks go to siblings |
2544 | * first, lower the sg capacity to one so that we'll try | 2887 | * first, lower the sg capacity to one so that we'll try |
2545 | * and move all the excess tasks away. | 2888 | * and move all the excess tasks away. We lower the capacity |
2889 | * of a group only if the local group has the capacity to fit | ||
2890 | * these excess tasks, i.e. nr_running < group_capacity. The | ||
2891 | * extra check prevents the case where you always pull from the | ||
2892 | * heaviest group when it is already under-utilized (possible | ||
2893 | * with a large weight task outweighs the tasks on the system). | ||
2546 | */ | 2894 | */ |
2547 | if (prefer_sibling) | 2895 | if (prefer_sibling && !local_group && sds->this_has_capacity) |
2548 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 2896 | sgs.group_capacity = min(sgs.group_capacity, 1UL); |
2549 | 2897 | ||
2550 | if (local_group) { | 2898 | if (local_group) { |
@@ -2552,12 +2900,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2552 | sds->this = sg; | 2900 | sds->this = sg; |
2553 | sds->this_nr_running = sgs.sum_nr_running; | 2901 | sds->this_nr_running = sgs.sum_nr_running; |
2554 | sds->this_load_per_task = sgs.sum_weighted_load; | 2902 | sds->this_load_per_task = sgs.sum_weighted_load; |
2903 | sds->this_has_capacity = sgs.group_has_capacity; | ||
2904 | sds->this_idle_cpus = sgs.idle_cpus; | ||
2555 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 2905 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
2556 | sds->max_load = sgs.avg_load; | 2906 | sds->max_load = sgs.avg_load; |
2557 | sds->busiest = sg; | 2907 | sds->busiest = sg; |
2558 | sds->busiest_nr_running = sgs.sum_nr_running; | 2908 | sds->busiest_nr_running = sgs.sum_nr_running; |
2909 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
2559 | sds->busiest_group_capacity = sgs.group_capacity; | 2910 | sds->busiest_group_capacity = sgs.group_capacity; |
2560 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2911 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
2912 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
2913 | sds->busiest_group_weight = sgs.group_weight; | ||
2561 | sds->group_imb = sgs.group_imb; | 2914 | sds->group_imb = sgs.group_imb; |
2562 | } | 2915 | } |
2563 | 2916 | ||
@@ -2612,8 +2965,8 @@ static int check_asym_packing(struct sched_domain *sd, | |||
2612 | if (this_cpu > busiest_cpu) | 2965 | if (this_cpu > busiest_cpu) |
2613 | return 0; | 2966 | return 0; |
2614 | 2967 | ||
2615 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | 2968 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, |
2616 | SCHED_LOAD_SCALE); | 2969 | SCHED_POWER_SCALE); |
2617 | return 1; | 2970 | return 1; |
2618 | } | 2971 | } |
2619 | 2972 | ||
@@ -2642,8 +2995,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
2642 | cpu_avg_load_per_task(this_cpu); | 2995 | cpu_avg_load_per_task(this_cpu); |
2643 | 2996 | ||
2644 | scaled_busy_load_per_task = sds->busiest_load_per_task | 2997 | scaled_busy_load_per_task = sds->busiest_load_per_task |
2645 | * SCHED_LOAD_SCALE; | 2998 | * SCHED_POWER_SCALE; |
2646 | scaled_busy_load_per_task /= sds->busiest->cpu_power; | 2999 | scaled_busy_load_per_task /= sds->busiest->sgp->power; |
2647 | 3000 | ||
2648 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3001 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
2649 | (scaled_busy_load_per_task * imbn)) { | 3002 | (scaled_busy_load_per_task * imbn)) { |
@@ -2657,30 +3010,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
2657 | * moving them. | 3010 | * moving them. |
2658 | */ | 3011 | */ |
2659 | 3012 | ||
2660 | pwr_now += sds->busiest->cpu_power * | 3013 | pwr_now += sds->busiest->sgp->power * |
2661 | min(sds->busiest_load_per_task, sds->max_load); | 3014 | min(sds->busiest_load_per_task, sds->max_load); |
2662 | pwr_now += sds->this->cpu_power * | 3015 | pwr_now += sds->this->sgp->power * |
2663 | min(sds->this_load_per_task, sds->this_load); | 3016 | min(sds->this_load_per_task, sds->this_load); |
2664 | pwr_now /= SCHED_LOAD_SCALE; | 3017 | pwr_now /= SCHED_POWER_SCALE; |
2665 | 3018 | ||
2666 | /* Amount of load we'd subtract */ | 3019 | /* Amount of load we'd subtract */ |
2667 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3020 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
2668 | sds->busiest->cpu_power; | 3021 | sds->busiest->sgp->power; |
2669 | if (sds->max_load > tmp) | 3022 | if (sds->max_load > tmp) |
2670 | pwr_move += sds->busiest->cpu_power * | 3023 | pwr_move += sds->busiest->sgp->power * |
2671 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 3024 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
2672 | 3025 | ||
2673 | /* Amount of load we'd add */ | 3026 | /* Amount of load we'd add */ |
2674 | if (sds->max_load * sds->busiest->cpu_power < | 3027 | if (sds->max_load * sds->busiest->sgp->power < |
2675 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3028 | sds->busiest_load_per_task * SCHED_POWER_SCALE) |
2676 | tmp = (sds->max_load * sds->busiest->cpu_power) / | 3029 | tmp = (sds->max_load * sds->busiest->sgp->power) / |
2677 | sds->this->cpu_power; | 3030 | sds->this->sgp->power; |
2678 | else | 3031 | else |
2679 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3032 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
2680 | sds->this->cpu_power; | 3033 | sds->this->sgp->power; |
2681 | pwr_move += sds->this->cpu_power * | 3034 | pwr_move += sds->this->sgp->power * |
2682 | min(sds->this_load_per_task, sds->this_load + tmp); | 3035 | min(sds->this_load_per_task, sds->this_load + tmp); |
2683 | pwr_move /= SCHED_LOAD_SCALE; | 3036 | pwr_move /= SCHED_POWER_SCALE; |
2684 | 3037 | ||
2685 | /* Move if we gain throughput */ | 3038 | /* Move if we gain throughput */ |
2686 | if (pwr_move > pwr_now) | 3039 | if (pwr_move > pwr_now) |
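fix_small_imbalance() compares the work the two groups can serve now (pwr_now) against what they could serve after moving a single task (pwr_move), all in SCHED_POWER_SCALE fixed point. A simplified standalone model of that comparison; the figures in main() are invented and several corner cases of the real function are omitted:

    #include <stdio.h>

    #define SCALE 1024UL
    static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

    /* Would moving exactly one task from the busiest group to this group
     * increase the total amount of load the two groups can serve? */
    static int worth_moving_one_task(unsigned long busiest_power, unsigned long max_load,
                                     unsigned long busiest_lpt,  /* load per task there */
                                     unsigned long this_power,   unsigned long this_load,
                                     unsigned long this_lpt)     /* load per task here  */
    {
        unsigned long pwr_now = 0, pwr_move = 0, tmp;

        pwr_now += busiest_power * min_ul(busiest_lpt, max_load);
        pwr_now += this_power    * min_ul(this_lpt, this_load);
        pwr_now /= SCALE;

        /* load the busiest group sheds, expressed in its own power units */
        tmp = busiest_lpt * SCALE / busiest_power;
        if (max_load > tmp)
            pwr_move += busiest_power * min_ul(busiest_lpt, max_load - tmp);

        /* load this group gains, expressed in its power units */
        if (max_load * busiest_power < busiest_lpt * SCALE)
            tmp = max_load * busiest_power / this_power;
        else
            tmp = busiest_lpt * SCALE / this_power;
        pwr_move += this_power * min_ul(this_lpt, this_load + tmp);
        pwr_move /= SCALE;

        return pwr_move > pwr_now;
    }

    int main(void)
    {
        /* two tasks stacked on one CPU, an equally powerful CPU idle -> move */
        printf("%d\n", worth_moving_one_task(1024, 2048, 1024, 1024, 0, 1024));
        return 0;
    }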
@@ -2722,9 +3075,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2722 | load_above_capacity = (sds->busiest_nr_running - | 3075 | load_above_capacity = (sds->busiest_nr_running - |
2723 | sds->busiest_group_capacity); | 3076 | sds->busiest_group_capacity); |
2724 | 3077 | ||
2725 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); | 3078 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
2726 | 3079 | ||
2727 | load_above_capacity /= sds->busiest->cpu_power; | 3080 | load_above_capacity /= sds->busiest->sgp->power; |
2728 | } | 3081 | } |
2729 | 3082 | ||
2730 | /* | 3083 | /* |
@@ -2740,13 +3093,13 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2740 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 3093 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
2741 | 3094 | ||
2742 | /* How much load to actually move to equalise the imbalance */ | 3095 | /* How much load to actually move to equalise the imbalance */ |
2743 | *imbalance = min(max_pull * sds->busiest->cpu_power, | 3096 | *imbalance = min(max_pull * sds->busiest->sgp->power, |
2744 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | 3097 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
2745 | / SCHED_LOAD_SCALE; | 3098 | / SCHED_POWER_SCALE; |
2746 | 3099 | ||
2747 | /* | 3100 | /* |
2748 | * if *imbalance is less than the average load per runnable task | 3101 | * if *imbalance is less than the average load per runnable task |
2749 | * there is no gaurantee that any tasks will be moved so we'll have | 3102 | * there is no guarantee that any tasks will be moved so we'll have |
2750 | * a think about bumping its value to force at least one task to be | 3103 | * a think about bumping its value to force at least one task to be |
2751 | * moved | 3104 | * moved |
2752 | */ | 3105 | */ |
@@ -2754,6 +3107,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2754 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3107 | return fix_small_imbalance(sds, this_cpu, imbalance); |
2755 | 3108 | ||
2756 | } | 3109 | } |
3110 | |||
2757 | /******* find_busiest_group() helpers end here *********************/ | 3111 | /******* find_busiest_group() helpers end here *********************/ |
2758 | 3112 | ||
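calculate_imbalance() caps the amount to pull by both the busiest group's excess over the domain average and, on the other side, by how far this group sits below that average. A sketch of that min() of the two power-scaled quantities, with illustrative loads:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL
    static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

    int main(void)
    {
        unsigned long max_load = 2048, this_load = 512, avg_load = 1280;
        unsigned long busiest_power = 1024, this_power = 1024;
        unsigned long load_above_capacity = ~0UL;   /* busiest group not over capacity */

        /* don't pull more than the busiest group's excess over the average ... */
        unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);

        /* ... and don't push this group past the average either */
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                  / SCHED_POWER_SCALE;

        printf("imbalance = %lu\n", imbalance);
        return 0;
    }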
2759 | /** | 3113 | /** |
@@ -2771,7 +3125,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2771 | * @imbalance: Variable which stores amount of weighted load which should | 3125 | * @imbalance: Variable which stores amount of weighted load which should |
2772 | * be moved to restore balance/put a group to idle. | 3126 | * be moved to restore balance/put a group to idle. |
2773 | * @idle: The idle status of this_cpu. | 3127 | * @idle: The idle status of this_cpu. |
2774 | * @sd_idle: The idleness of sd | ||
2775 | * @cpus: The set of CPUs under consideration for load-balancing. | 3128 | * @cpus: The set of CPUs under consideration for load-balancing. |
2776 | * @balance: Pointer to a variable indicating if this_cpu | 3129 | * @balance: Pointer to a variable indicating if this_cpu |
2777 | * is the appropriate cpu to perform load balancing at this_level. | 3130 | * is the appropriate cpu to perform load balancing at this_level. |
@@ -2784,7 +3137,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
2784 | static struct sched_group * | 3137 | static struct sched_group * |
2785 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3138 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2786 | unsigned long *imbalance, enum cpu_idle_type idle, | 3139 | unsigned long *imbalance, enum cpu_idle_type idle, |
2787 | int *sd_idle, const struct cpumask *cpus, int *balance) | 3140 | const struct cpumask *cpus, int *balance) |
2788 | { | 3141 | { |
2789 | struct sd_lb_stats sds; | 3142 | struct sd_lb_stats sds; |
2790 | 3143 | ||
@@ -2794,17 +3147,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2794 | * Compute the various statistics relavent for load balancing at | 3147 | * Compute the various statistics relavent for load balancing at |
2795 | * this level. | 3148 | * this level. |
2796 | */ | 3149 | */ |
2797 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | 3150 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); |
2798 | balance, &sds); | ||
2799 | 3151 | ||
2800 | /* Cases where imbalance does not exist from POV of this_cpu */ | 3152 | /* |
2801 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | 3153 | * this_cpu is not the appropriate cpu to perform load balancing at |
2802 | * at this level. | 3154 | * this level. |
2803 | * 2) There is no busy sibling group to pull from. | ||
2804 | * 3) This group is the busiest group. | ||
2805 | * 4) This group is more busy than the avg busieness at this | ||
2806 | * sched_domain. | ||
2807 | * 5) The imbalance is within the specified limit. | ||
2808 | */ | 3155 | */ |
2809 | if (!(*balance)) | 3156 | if (!(*balance)) |
2810 | goto ret; | 3157 | goto ret; |
@@ -2813,20 +3160,59 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2813 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 3160 | check_asym_packing(sd, &sds, this_cpu, imbalance)) |
2814 | return sds.busiest; | 3161 | return sds.busiest; |
2815 | 3162 | ||
3163 | /* There is no busy sibling group to pull tasks from */ | ||
2816 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3164 | if (!sds.busiest || sds.busiest_nr_running == 0) |
2817 | goto out_balanced; | 3165 | goto out_balanced; |
2818 | 3166 | ||
3167 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | ||
3168 | |||
3169 | /* | ||
3170 | * If the busiest group is imbalanced the below checks don't | ||
3171 | * work because they assumes all things are equal, which typically | ||
3172 | * isn't true due to cpus_allowed constraints and the like. | ||
3173 | */ | ||
3174 | if (sds.group_imb) | ||
3175 | goto force_balance; | ||
3176 | |||
3177 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | ||
3178 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | ||
3179 | !sds.busiest_has_capacity) | ||
3180 | goto force_balance; | ||
3181 | |||
3182 | /* | ||
3183 | * If the local group is more busy than the selected busiest group | ||
3184 | * don't try and pull any tasks. | ||
3185 | */ | ||
2819 | if (sds.this_load >= sds.max_load) | 3186 | if (sds.this_load >= sds.max_load) |
2820 | goto out_balanced; | 3187 | goto out_balanced; |
2821 | 3188 | ||
2822 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | 3189 | /* |
2823 | 3190 | * Don't pull any tasks if this group is already above the domain | |
3191 | * average load. | ||
3192 | */ | ||
2824 | if (sds.this_load >= sds.avg_load) | 3193 | if (sds.this_load >= sds.avg_load) |
2825 | goto out_balanced; | 3194 | goto out_balanced; |
2826 | 3195 | ||
2827 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 3196 | if (idle == CPU_IDLE) { |
2828 | goto out_balanced; | 3197 | /* |
3198 | * This cpu is idle. If the busiest group load doesn't | ||
3199 | * have more tasks than the number of available cpu's and | ||
3200 | * there is no imbalance between this and busiest group | ||
3201 | * wrt to idle cpu's, it is balanced. | ||
3202 | */ | ||
3203 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | ||
3204 | sds.busiest_nr_running <= sds.busiest_group_weight) | ||
3205 | goto out_balanced; | ||
3206 | } else { | ||
3207 | /* | ||
3208 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | ||
3209 | * imbalance_pct to be conservative. | ||
3210 | */ | ||
3211 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
3212 | goto out_balanced; | ||
3213 | } | ||
2829 | 3214 | ||
3215 | force_balance: | ||
2830 | /* Looks like there is an imbalance. Compute it */ | 3216 | /* Looks like there is an imbalance. Compute it */ |
2831 | calculate_imbalance(&sds, this_cpu, imbalance); | 3217 | calculate_imbalance(&sds, this_cpu, imbalance); |
2832 | return sds.busiest; | 3218 | return sds.busiest; |
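The rewritten find_busiest_group() tail above turns the old comment list into an explicit chain of early-outs. A compact model of that decision order (field names mirror sd_lb_stats; the numbers and the 125 imbalance_pct are only examples):

    #include <stdio.h>

    enum idle { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

    struct stats {
        unsigned long this_load, max_load, avg_load;
        unsigned int  this_idle_cpus, busiest_idle_cpus;
        unsigned long busiest_nr_running, busiest_group_weight;
        int group_imb, this_has_capacity, busiest_has_capacity, have_busiest;
    };

    /* Mirror of the decision chain above: 1 = compute an imbalance, 0 = balanced. */
    static int should_balance(const struct stats *s, enum idle idle,
                              unsigned int imbalance_pct)
    {
        if (!s->have_busiest || s->busiest_nr_running == 0)
            return 0;
        if (s->group_imb)                       /* cpus_allowed skews the numbers */
            return 1;
        if (idle == CPU_NEWLY_IDLE && s->this_has_capacity && !s->busiest_has_capacity)
            return 1;                           /* newidle trumps SMP nice        */
        if (s->this_load >= s->max_load || s->this_load >= s->avg_load)
            return 0;
        if (idle == CPU_IDLE)
            return !(s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
                     s->busiest_nr_running <= s->busiest_group_weight);
        return 100 * s->max_load > imbalance_pct * s->this_load;
    }

    int main(void)
    {
        struct stats s = { .this_load = 512, .max_load = 2048, .avg_load = 1280,
                           .this_idle_cpus = 2, .busiest_idle_cpus = 0,
                           .busiest_nr_running = 4, .busiest_group_weight = 2,
                           .have_busiest = 1 };
        printf("%d\n", should_balance(&s, CPU_NOT_IDLE, 125));
        return 0;
    }

The CPU_IDLE branch is the genuinely new policy here: an idle CPU only pulls when the busiest group has more tasks than CPUs or noticeably fewer idle CPUs than the local group.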
@@ -2857,7 +3243,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
2857 | 3243 | ||
2858 | for_each_cpu(i, sched_group_cpus(group)) { | 3244 | for_each_cpu(i, sched_group_cpus(group)) { |
2859 | unsigned long power = power_of(i); | 3245 | unsigned long power = power_of(i); |
2860 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 3246 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
3247 | SCHED_POWER_SCALE); | ||
2861 | unsigned long wl; | 3248 | unsigned long wl; |
2862 | 3249 | ||
2863 | if (!capacity) | 3250 | if (!capacity) |
@@ -2882,7 +3269,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
2882 | * the load can be moved away from the cpu that is potentially | 3269 | * the load can be moved away from the cpu that is potentially |
2883 | * running at a lower capacity. | 3270 | * running at a lower capacity. |
2884 | */ | 3271 | */ |
2885 | wl = (wl * SCHED_LOAD_SCALE) / power; | 3272 | wl = (wl * SCHED_POWER_SCALE) / power; |
2886 | 3273 | ||
2887 | if (wl > max_load) { | 3274 | if (wl > max_load) { |
2888 | max_load = wl; | 3275 | max_load = wl; |
@@ -2902,7 +3289,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
2902 | /* Working cpumask for load_balance and load_balance_newidle. */ | 3289 | /* Working cpumask for load_balance and load_balance_newidle. */ |
2903 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 3290 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
2904 | 3291 | ||
2905 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | 3292 | static int need_active_balance(struct sched_domain *sd, int idle, |
2906 | int busiest_cpu, int this_cpu) | 3293 | int busiest_cpu, int this_cpu) |
2907 | { | 3294 | { |
2908 | if (idle == CPU_NEWLY_IDLE) { | 3295 | if (idle == CPU_NEWLY_IDLE) { |
@@ -2934,10 +3321,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | |||
2934 | * move_tasks() will succeed. ld_moved will be true and this | 3321 | * move_tasks() will succeed. ld_moved will be true and this |
2935 | * active balance code will not be triggered. | 3322 | * active balance code will not be triggered. |
2936 | */ | 3323 | */ |
2937 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
2938 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2939 | return 0; | ||
2940 | |||
2941 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | 3324 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) |
2942 | return 0; | 3325 | return 0; |
2943 | } | 3326 | } |
@@ -2955,7 +3338,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2955 | struct sched_domain *sd, enum cpu_idle_type idle, | 3338 | struct sched_domain *sd, enum cpu_idle_type idle, |
2956 | int *balance) | 3339 | int *balance) |
2957 | { | 3340 | { |
2958 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3341 | int ld_moved, all_pinned = 0, active_balance = 0; |
2959 | struct sched_group *group; | 3342 | struct sched_group *group; |
2960 | unsigned long imbalance; | 3343 | unsigned long imbalance; |
2961 | struct rq *busiest; | 3344 | struct rq *busiest; |
@@ -2964,21 +3347,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2964 | 3347 | ||
2965 | cpumask_copy(cpus, cpu_active_mask); | 3348 | cpumask_copy(cpus, cpu_active_mask); |
2966 | 3349 | ||
2967 | /* | ||
2968 | * When power savings policy is enabled for the parent domain, idle | ||
2969 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
2970 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
2971 | * portraying it as CPU_NOT_IDLE. | ||
2972 | */ | ||
2973 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
2974 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
2975 | sd_idle = 1; | ||
2976 | |||
2977 | schedstat_inc(sd, lb_count[idle]); | 3350 | schedstat_inc(sd, lb_count[idle]); |
2978 | 3351 | ||
2979 | redo: | 3352 | redo: |
2980 | update_shares(sd); | 3353 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, |
2981 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
2982 | cpus, balance); | 3354 | cpus, balance); |
2983 | 3355 | ||
2984 | if (*balance == 0) | 3356 | if (*balance == 0) |
@@ -3007,6 +3379,7 @@ redo: | |||
3007 | * still unbalanced. ld_moved simply stays zero, so it is | 3379 | * still unbalanced. ld_moved simply stays zero, so it is |
3008 | * correctly treated as an imbalance. | 3380 | * correctly treated as an imbalance. |
3009 | */ | 3381 | */ |
3382 | all_pinned = 1; | ||
3010 | local_irq_save(flags); | 3383 | local_irq_save(flags); |
3011 | double_rq_lock(this_rq, busiest); | 3384 | double_rq_lock(this_rq, busiest); |
3012 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3385 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
@@ -3031,10 +3404,16 @@ redo: | |||
3031 | 3404 | ||
3032 | if (!ld_moved) { | 3405 | if (!ld_moved) { |
3033 | schedstat_inc(sd, lb_failed[idle]); | 3406 | schedstat_inc(sd, lb_failed[idle]); |
3034 | sd->nr_balance_failed++; | 3407 | /* |
3408 | * Increment the failure counter only on periodic balance. | ||
3409 | * We do not want newidle balance, which can be very | ||
3410 | * frequent, pollute the failure counter causing | ||
3411 | * excessive cache_hot migrations and active balances. | ||
3412 | */ | ||
3413 | if (idle != CPU_NEWLY_IDLE) | ||
3414 | sd->nr_balance_failed++; | ||
3035 | 3415 | ||
3036 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), | 3416 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { |
3037 | this_cpu)) { | ||
3038 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3417 | raw_spin_lock_irqsave(&busiest->lock, flags); |
3039 | 3418 | ||
3040 | /* don't kick the active_load_balance_cpu_stop, | 3419 | /* don't kick the active_load_balance_cpu_stop, |
@@ -3089,10 +3468,6 @@ redo: | |||
3089 | sd->balance_interval *= 2; | 3468 | sd->balance_interval *= 2; |
3090 | } | 3469 | } |
3091 | 3470 | ||
3092 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3093 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3094 | ld_moved = -1; | ||
3095 | |||
3096 | goto out; | 3471 | goto out; |
3097 | 3472 | ||
3098 | out_balanced: | 3473 | out_balanced: |
@@ -3106,14 +3481,8 @@ out_one_pinned: | |||
3106 | (sd->balance_interval < sd->max_interval)) | 3481 | (sd->balance_interval < sd->max_interval)) |
3107 | sd->balance_interval *= 2; | 3482 | sd->balance_interval *= 2; |
3108 | 3483 | ||
3109 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3484 | ld_moved = 0; |
3110 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3111 | ld_moved = -1; | ||
3112 | else | ||
3113 | ld_moved = 0; | ||
3114 | out: | 3485 | out: |
3115 | if (ld_moved) | ||
3116 | update_shares(sd); | ||
3117 | return ld_moved; | 3486 | return ld_moved; |
3118 | } | 3487 | } |
3119 | 3488 | ||
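The failure-accounting change above means only periodic balancing can push nr_balance_failed toward the active-balance threshold. A sketch of that idea; the 'cache_nice_tries + 2' trigger is an assumption about the companion check in need_active_balance(), not something shown in this hunk:

    #include <stdio.h>

    enum idle { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

    struct dom { unsigned int nr_balance_failed, cache_nice_tries; };

    /* Only periodic balancing is allowed to age the failure counter; newidle
     * balancing runs far too often and would trigger active balance spuriously. */
    static void note_balance_failure(struct dom *sd, enum idle idle)
    {
        if (idle != CPU_NEWLY_IDLE)
            sd->nr_balance_failed++;
    }

    static int want_active_balance(const struct dom *sd)
    {
        return sd->nr_balance_failed > sd->cache_nice_tries + 2;  /* assumed threshold */
    }

    int main(void)
    {
        struct dom sd = { .nr_balance_failed = 0, .cache_nice_tries = 1 };
        for (int i = 0; i < 10; i++)
            note_balance_failure(&sd, CPU_NEWLY_IDLE);   /* ignored */
        for (int i = 0; i < 4; i++)
            note_balance_failure(&sd, CPU_NOT_IDLE);
        printf("active balance now? %d\n", want_active_balance(&sd));
        return 0;
    }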
@@ -3137,6 +3506,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3137 | */ | 3506 | */ |
3138 | raw_spin_unlock(&this_rq->lock); | 3507 | raw_spin_unlock(&this_rq->lock); |
3139 | 3508 | ||
3509 | update_shares(this_cpu); | ||
3510 | rcu_read_lock(); | ||
3140 | for_each_domain(this_cpu, sd) { | 3511 | for_each_domain(this_cpu, sd) { |
3141 | unsigned long interval; | 3512 | unsigned long interval; |
3142 | int balance = 1; | 3513 | int balance = 1; |
@@ -3158,6 +3529,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3158 | break; | 3529 | break; |
3159 | } | 3530 | } |
3160 | } | 3531 | } |
3532 | rcu_read_unlock(); | ||
3161 | 3533 | ||
3162 | raw_spin_lock(&this_rq->lock); | 3534 | raw_spin_lock(&this_rq->lock); |
3163 | 3535 | ||
@@ -3206,6 +3578,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3206 | double_lock_balance(busiest_rq, target_rq); | 3578 | double_lock_balance(busiest_rq, target_rq); |
3207 | 3579 | ||
3208 | /* Search for an sd spanning us and the target CPU. */ | 3580 | /* Search for an sd spanning us and the target CPU. */ |
3581 | rcu_read_lock(); | ||
3209 | for_each_domain(target_cpu, sd) { | 3582 | for_each_domain(target_cpu, sd) { |
3210 | if ((sd->flags & SD_LOAD_BALANCE) && | 3583 | if ((sd->flags & SD_LOAD_BALANCE) && |
3211 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | 3584 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) |
@@ -3221,6 +3594,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3221 | else | 3594 | else |
3222 | schedstat_inc(sd, alb_failed); | 3595 | schedstat_inc(sd, alb_failed); |
3223 | } | 3596 | } |
3597 | rcu_read_unlock(); | ||
3224 | double_unlock_balance(busiest_rq, target_rq); | 3598 | double_unlock_balance(busiest_rq, target_rq); |
3225 | out_unlock: | 3599 | out_unlock: |
3226 | busiest_rq->active_balance = 0; | 3600 | busiest_rq->active_balance = 0; |
@@ -3347,6 +3721,7 @@ static int find_new_ilb(int cpu) | |||
3347 | { | 3721 | { |
3348 | struct sched_domain *sd; | 3722 | struct sched_domain *sd; |
3349 | struct sched_group *ilb_group; | 3723 | struct sched_group *ilb_group; |
3724 | int ilb = nr_cpu_ids; | ||
3350 | 3725 | ||
3351 | /* | 3726 | /* |
3352 | * Have idle load balancer selection from semi-idle packages only | 3727 | * Have idle load balancer selection from semi-idle packages only |
@@ -3362,20 +3737,25 @@ static int find_new_ilb(int cpu) | |||
3362 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | 3737 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3363 | goto out_done; | 3738 | goto out_done; |
3364 | 3739 | ||
3740 | rcu_read_lock(); | ||
3365 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3741 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
3366 | ilb_group = sd->groups; | 3742 | ilb_group = sd->groups; |
3367 | 3743 | ||
3368 | do { | 3744 | do { |
3369 | if (is_semi_idle_group(ilb_group)) | 3745 | if (is_semi_idle_group(ilb_group)) { |
3370 | return cpumask_first(nohz.grp_idle_mask); | 3746 | ilb = cpumask_first(nohz.grp_idle_mask); |
3747 | goto unlock; | ||
3748 | } | ||
3371 | 3749 | ||
3372 | ilb_group = ilb_group->next; | 3750 | ilb_group = ilb_group->next; |
3373 | 3751 | ||
3374 | } while (ilb_group != sd->groups); | 3752 | } while (ilb_group != sd->groups); |
3375 | } | 3753 | } |
3754 | unlock: | ||
3755 | rcu_read_unlock(); | ||
3376 | 3756 | ||
3377 | out_done: | 3757 | out_done: |
3378 | return nr_cpu_ids; | 3758 | return ilb; |
3379 | } | 3759 | } |
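find_new_ilb() is restructured so the function never returns while inside the RCU read-side critical section: the result is stored and control jumps to a single unlock label. A generic sketch of that lock-then-goto-unlock shape, with a mutex standing in for rcu_read_lock()/rcu_read_unlock() (build with -pthread):

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  /* stand-in for rcu_read_lock() */

    /* Record the answer, break out to one unlock site, then return. */
    static int find_first_match(const int *v, int n, int wanted)
    {
        int found = -1;                     /* default, like ilb = nr_cpu_ids */

        pthread_mutex_lock(&lock);
        for (int i = 0; i < n; i++) {
            if (v[i] == wanted) {
                found = i;
                goto unlock;                /* never 'return' with the lock held */
            }
        }
    unlock:
        pthread_mutex_unlock(&lock);
        return found;
    }

    int main(void)
    {
        int cpus[] = { 3, 5, 7, 9 };
        printf("%d\n", find_first_match(cpus, 4, 7));
        return 0;
    }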
3380 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3760 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3381 | static inline int find_new_ilb(int call_cpu) | 3761 | static inline int find_new_ilb(int call_cpu) |
@@ -3490,6 +3870,17 @@ void select_nohz_load_balancer(int stop_tick) | |||
3490 | 3870 | ||
3491 | static DEFINE_SPINLOCK(balancing); | 3871 | static DEFINE_SPINLOCK(balancing); |
3492 | 3872 | ||
3873 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
3874 | |||
3875 | /* | ||
3876 | * Scale the max load_balance interval with the number of CPUs in the system. | ||
3877 | * This trades load-balance latency on larger machines for less cross talk. | ||
3878 | */ | ||
3879 | static void update_max_interval(void) | ||
3880 | { | ||
3881 | max_load_balance_interval = HZ*num_online_cpus()/10; | ||
3882 | } | ||
3883 | |||
3493 | /* | 3884 | /* |
3494 | * It checks each scheduling domain to see if it is due to be balanced, | 3885 | * It checks each scheduling domain to see if it is due to be balanced, |
3495 | * and initiates a balancing operation if so. | 3886 | * and initiates a balancing operation if so. |
@@ -3507,6 +3898,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3507 | int update_next_balance = 0; | 3898 | int update_next_balance = 0; |
3508 | int need_serialize; | 3899 | int need_serialize; |
3509 | 3900 | ||
3901 | update_shares(cpu); | ||
3902 | |||
3903 | rcu_read_lock(); | ||
3510 | for_each_domain(cpu, sd) { | 3904 | for_each_domain(cpu, sd) { |
3511 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3905 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3512 | continue; | 3906 | continue; |
@@ -3517,10 +3911,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3517 | 3911 | ||
3518 | /* scale ms to jiffies */ | 3912 | /* scale ms to jiffies */ |
3519 | interval = msecs_to_jiffies(interval); | 3913 | interval = msecs_to_jiffies(interval); |
3520 | if (unlikely(!interval)) | 3914 | interval = clamp(interval, 1UL, max_load_balance_interval); |
3521 | interval = 1; | ||
3522 | if (interval > HZ*NR_CPUS/10) | ||
3523 | interval = HZ*NR_CPUS/10; | ||
3524 | 3915 | ||
3525 | need_serialize = sd->flags & SD_SERIALIZE; | 3916 | need_serialize = sd->flags & SD_SERIALIZE; |
3526 | 3917 | ||
@@ -3533,8 +3924,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3533 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 3924 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
3534 | /* | 3925 | /* |
3535 | * We've pulled tasks over so either we're no | 3926 | * We've pulled tasks over so either we're no |
3536 | * longer idle, or one of our SMT siblings is | 3927 | * longer idle. |
3537 | * not idle. | ||
3538 | */ | 3928 | */ |
3539 | idle = CPU_NOT_IDLE; | 3929 | idle = CPU_NOT_IDLE; |
3540 | } | 3930 | } |
@@ -3556,6 +3946,7 @@ out: | |||
3556 | if (!balance) | 3946 | if (!balance) |
3557 | break; | 3947 | break; |
3558 | } | 3948 | } |
3949 | rcu_read_unlock(); | ||
3559 | 3950 | ||
3560 | /* | 3951 | /* |
3561 | * next_balance will be updated only when there is a need. | 3952 | * next_balance will be updated only when there is a need. |
@@ -3751,8 +4142,11 @@ static void task_fork_fair(struct task_struct *p) | |||
3751 | 4142 | ||
3752 | update_rq_clock(rq); | 4143 | update_rq_clock(rq); |
3753 | 4144 | ||
3754 | if (unlikely(task_cpu(p) != this_cpu)) | 4145 | if (unlikely(task_cpu(p) != this_cpu)) { |
4146 | rcu_read_lock(); | ||
3755 | __set_task_cpu(p, this_cpu); | 4147 | __set_task_cpu(p, this_cpu); |
4148 | rcu_read_unlock(); | ||
4149 | } | ||
3756 | 4150 | ||
3757 | update_curr(cfs_rq); | 4151 | update_curr(cfs_rq); |
3758 | 4152 | ||
@@ -3778,33 +4172,62 @@ static void task_fork_fair(struct task_struct *p) | |||
3778 | * Priority of the task has changed. Check to see if we preempt | 4172 | * Priority of the task has changed. Check to see if we preempt |
3779 | * the current task. | 4173 | * the current task. |
3780 | */ | 4174 | */ |
3781 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | 4175 | static void |
3782 | int oldprio, int running) | 4176 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
3783 | { | 4177 | { |
4178 | if (!p->se.on_rq) | ||
4179 | return; | ||
4180 | |||
3784 | /* | 4181 | /* |
3785 | * Reschedule if we are currently running on this runqueue and | 4182 | * Reschedule if we are currently running on this runqueue and |
3786 | * our priority decreased, or if we are not currently running on | 4183 | * our priority decreased, or if we are not currently running on |
3787 | * this runqueue and our priority is higher than the current's | 4184 | * this runqueue and our priority is higher than the current's |
3788 | */ | 4185 | */ |
3789 | if (running) { | 4186 | if (rq->curr == p) { |
3790 | if (p->prio > oldprio) | 4187 | if (p->prio > oldprio) |
3791 | resched_task(rq->curr); | 4188 | resched_task(rq->curr); |
3792 | } else | 4189 | } else |
3793 | check_preempt_curr(rq, p, 0); | 4190 | check_preempt_curr(rq, p, 0); |
3794 | } | 4191 | } |
3795 | 4192 | ||
4193 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
4194 | { | ||
4195 | struct sched_entity *se = &p->se; | ||
4196 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
4197 | |||
4198 | /* | ||
4199 | * Ensure the task's vruntime is normalized, so that when its | ||
4200 | * switched back to the fair class the enqueue_entity(.flags=0) will | ||
4201 | * do the right thing. | ||
4202 | * | ||
4203 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | ||
4204 | * have normalized the vruntime, if it was !on_rq, then only when | ||
4205 | * the task is sleeping will it still have non-normalized vruntime. | ||
4206 | */ | ||
4207 | if (!se->on_rq && p->state != TASK_RUNNING) { | ||
4208 | /* | ||
4209 | * Fix up our vruntime so that the current sleep doesn't | ||
4210 | * cause 'unlimited' sleep bonus. | ||
4211 | */ | ||
4212 | place_entity(cfs_rq, se, 0); | ||
4213 | se->vruntime -= cfs_rq->min_vruntime; | ||
4214 | } | ||
4215 | } | ||
4216 | |||
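switched_from_fair() above (and task_move_group_fair() further down) rely on the same trick: a task's vruntime is kept relative to its queue's min_vruntime while it is off the fair runqueue, so that re-enqueueing on a queue with a different min_vruntime just adds the new base back. A toy model:

    #include <stdio.h>

    /* Toy model: while a task is off the fair runqueue its vruntime is kept
     * relative; enqueueing adds the destination queue's min_vruntime back. */
    struct cfs_rq_model { unsigned long long min_vruntime; };
    struct task_model   { unsigned long long vruntime; };

    static void make_relative(struct task_model *p, struct cfs_rq_model *rq)
    {
        p->vruntime -= rq->min_vruntime;
    }

    static void make_absolute(struct task_model *p, struct cfs_rq_model *rq)
    {
        p->vruntime += rq->min_vruntime;
    }

    int main(void)
    {
        struct cfs_rq_model old_rq = { .min_vruntime = 1000000 };
        struct cfs_rq_model new_rq = { .min_vruntime = 9000000 };
        struct task_model p = { .vruntime = 1000500 };   /* 500ns ahead of old queue */

        make_relative(&p, &old_rq);      /* what switched_from_fair() arranges      */
        make_absolute(&p, &new_rq);      /* what a later enqueue on new_rq would do */

        printf("vruntime=%llu (still 500 ahead of min)\n", p.vruntime);
        return 0;
    }

task_move_group_fair() below applies the same subtract-then-add around set_task_rq() when a sleeping task is moved to another cgroup's runqueue.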
3796 | /* | 4217 | /* |
3797 | * We switched to the sched_fair class. | 4218 | * We switched to the sched_fair class. |
3798 | */ | 4219 | */ |
3799 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | 4220 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
3800 | int running) | ||
3801 | { | 4221 | { |
4222 | if (!p->se.on_rq) | ||
4223 | return; | ||
4224 | |||
3802 | /* | 4225 | /* |
3803 | * We were most likely switched from sched_rt, so | 4226 | * We were most likely switched from sched_rt, so |
3804 | * kick off the schedule if running, otherwise just see | 4227 | * kick off the schedule if running, otherwise just see |
3805 | * if we can still preempt the current task. | 4228 | * if we can still preempt the current task. |
3806 | */ | 4229 | */ |
3807 | if (running) | 4230 | if (rq->curr == p) |
3808 | resched_task(rq->curr); | 4231 | resched_task(rq->curr); |
3809 | else | 4232 | else |
3810 | check_preempt_curr(rq, p, 0); | 4233 | check_preempt_curr(rq, p, 0); |
@@ -3824,13 +4247,26 @@ static void set_curr_task_fair(struct rq *rq) | |||
3824 | } | 4247 | } |
3825 | 4248 | ||
3826 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4249 | #ifdef CONFIG_FAIR_GROUP_SCHED |
3827 | static void moved_group_fair(struct task_struct *p, int on_rq) | 4250 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
3828 | { | 4251 | { |
3829 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4252 | /* |
3830 | 4253 | * If the task was not on the rq at the time of this cgroup movement | |
3831 | update_curr(cfs_rq); | 4254 | * it must have been asleep, sleeping tasks keep their ->vruntime |
4255 | * absolute on their old rq until wakeup (needed for the fair sleeper | ||
4256 | * bonus in place_entity()). | ||
4257 | * | ||
4258 | * If it was on the rq, we've just 'preempted' it, which does convert | ||
4259 | * ->vruntime to a relative base. | ||
4260 | * | ||
4261 | * Make sure both cases convert their relative position when migrating | ||
4262 | * to another cgroup's rq. This does somewhat interfere with the | ||
4263 | * fair sleeper stuff for the first placement, but who cares. | ||
4264 | */ | ||
4265 | if (!on_rq) | ||
4266 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | ||
4267 | set_task_rq(p, task_cpu(p)); | ||
3832 | if (!on_rq) | 4268 | if (!on_rq) |
3833 | place_entity(cfs_rq, &p->se, 1); | 4269 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; |
3834 | } | 4270 | } |
3835 | #endif | 4271 | #endif |
3836 | 4272 | ||
@@ -3857,6 +4293,7 @@ static const struct sched_class fair_sched_class = { | |||
3857 | .enqueue_task = enqueue_task_fair, | 4293 | .enqueue_task = enqueue_task_fair, |
3858 | .dequeue_task = dequeue_task_fair, | 4294 | .dequeue_task = dequeue_task_fair, |
3859 | .yield_task = yield_task_fair, | 4295 | .yield_task = yield_task_fair, |
4296 | .yield_to_task = yield_to_task_fair, | ||
3860 | 4297 | ||
3861 | .check_preempt_curr = check_preempt_wakeup, | 4298 | .check_preempt_curr = check_preempt_wakeup, |
3862 | 4299 | ||
@@ -3877,12 +4314,13 @@ static const struct sched_class fair_sched_class = { | |||
3877 | .task_fork = task_fork_fair, | 4314 | .task_fork = task_fork_fair, |
3878 | 4315 | ||
3879 | .prio_changed = prio_changed_fair, | 4316 | .prio_changed = prio_changed_fair, |
4317 | .switched_from = switched_from_fair, | ||
3880 | .switched_to = switched_to_fair, | 4318 | .switched_to = switched_to_fair, |
3881 | 4319 | ||
3882 | .get_rr_interval = get_rr_interval_fair, | 4320 | .get_rr_interval = get_rr_interval_fair, |
3883 | 4321 | ||
3884 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4322 | #ifdef CONFIG_FAIR_GROUP_SCHED |
3885 | .moved_group = moved_group_fair, | 4323 | .task_move_group = task_move_group_fair, |
3886 | #endif | 4324 | #endif |
3887 | }; | 4325 | }; |
3888 | 4326 | ||