Diffstat (limited to 'kernel/sched_fair.c')

 -rw-r--r--  kernel/sched_fair.c | 391
 1 file changed, 349 insertions(+), 42 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da7c061e7206..72e25c7a3a18 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -20,6 +20,8 @@
  * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 
+#include <linux/latencytop.h>
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running)
         unsigned long nr_latency = sched_nr_latency;
 
         if (unlikely(nr_running > nr_latency)) {
+                period = sysctl_sched_min_granularity;
                 period *= nr_running;
-                do_div(period, nr_latency);
         }
 
         return period;
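Illustration (not part of the diff): with the documented defaults of a 20 ms latency target and a 4 ms minimum granularity, sched_nr_latency is 20/4 = 5, so the removed do_div() path and the new one both come out to 4 ms per runnable task once nr_running exceeds nr_latency; the rewrite simply avoids a 64-bit division here. A standalone sketch under those assumed default values (the real tunables are further scaled by 1 + ilog(ncpus)):

#include <stdio.h>
#include <stdint.h>

/* Assumed defaults for illustration only (unscaled). */
static const uint64_t sched_latency_ns      = 20000000ULL; /* 20 ms  */
static const uint64_t sched_min_gran_ns     =  4000000ULL; /*  4 ms  */
static const unsigned long sched_nr_latency = 5;           /* 20 / 4 */

/* Old path: scale the latency target, then divide (the do_div() call). */
static uint64_t period_old(unsigned long nr_running)
{
        uint64_t period = sched_latency_ns;

        if (nr_running > sched_nr_latency) {
                period *= nr_running;
                period /= sched_nr_latency;
        }
        return period;
}

/* New path: start from the minimum granularity, no division needed. */
static uint64_t period_new(unsigned long nr_running)
{
        uint64_t period = sched_latency_ns;

        if (nr_running > sched_nr_latency) {
                period = sched_min_gran_ns;
                period *= nr_running;
        }
        return period;
}

int main(void)
{
        for (unsigned long n = 1; n <= 8; n++)
                printf("nr_running=%lu old=%llu new=%llu\n", n,
                       (unsigned long long)period_old(n),
                       (unsigned long long)period_new(n));
        return 0;
}

With these defaults both columns agree (for example 32 ms for eight runnable tasks); they can diverge slightly only when the latency target is not an exact multiple of the minimum granularity.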
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         schedstat_set(se->wait_max, max(se->wait_max,
                         rq_of(cfs_rq)->clock - se->wait_start));
+        schedstat_set(se->wait_count, se->wait_count + 1);
+        schedstat_set(se->wait_sum, se->wait_sum +
+                        rq_of(cfs_rq)->clock - se->wait_start);
         schedstat_set(se->wait_start, 0);
 }
 
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #ifdef CONFIG_SCHEDSTATS
         if (se->sleep_start) {
                 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+                struct task_struct *tsk = task_of(se);
 
                 if ((s64)delta < 0)
                         delta = 0;
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
                 se->sleep_start = 0;
                 se->sum_sleep_runtime += delta;
+
+                account_scheduler_latency(tsk, delta >> 10, 1);
         }
         if (se->block_start) {
                 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+                struct task_struct *tsk = task_of(se);
 
                 if ((s64)delta < 0)
                         delta = 0;
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                  * time that the task spent sleeping:
                  */
                 if (unlikely(prof_on == SLEEP_PROFILING)) {
-                        struct task_struct *tsk = task_of(se);
 
                         profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
                                         delta >> 20);
                 }
+                account_scheduler_latency(tsk, delta >> 10, 0);
         }
 #endif
 }
@@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
         cfs_rq->curr = NULL;
 }
 
-static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 {
         /*
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
 
+#ifdef CONFIG_SCHED_HRTICK
+        /*
+         * queued ticks are scheduled to match the slice, so don't bother
+         * validating it and just reschedule.
+         */
+        if (queued)
+                return resched_task(rq_of(cfs_rq)->curr);
+        /*
+         * don't let the period tick interfere with the hrtick preemption
+         */
+        if (!sched_feat(DOUBLE_TICK) &&
+                        hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+                return;
+#endif
+
         if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
                 check_preempt_tick(cfs_rq, curr);
 }
@@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
-        list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+        list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline int
@@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
 }
 
+#define GROUP_IMBALANCE_PCT 20
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+        int requeue = rq->curr == p;
+        struct sched_entity *se = &p->se;
+        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+        WARN_ON(task_rq(p) != rq);
+
+        if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+                u64 slice = sched_slice(cfs_rq, se);
+                u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+                s64 delta = slice - ran;
+
+                if (delta < 0) {
+                        if (rq->curr == p)
+                                resched_task(p);
+                        return;
+                }
+
+                /*
+                 * Don't schedule slices shorter than 10000ns, that just
+                 * doesn't make sense. Rely on vruntime for fairness.
+                 */
+                if (!requeue)
+                        delta = max(10000LL, delta);
+
+                hrtick_start(rq, delta, requeue);
+        }
+}
+#else
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
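Illustration (not part of the diff): hrtick_start_fair() above arms the runqueue's high-resolution timer to fire when the currently running entity has consumed its sched_slice(), instead of waiting for the next periodic tick. A minimal sketch of just the remaining-slice arithmetic, with made-up nanosecond values (sched_slice() and the timer plumbing are not reproduced):

#include <stdio.h>
#include <stdint.h>

/*
 * remaining = slice - (sum_exec_runtime - prev_sum_exec_runtime),
 * clamped to at least 10us unless the task is being requeued.
 * A negative result means the slice is already used up.
 */
static int64_t remaining_slice(uint64_t slice_ns, uint64_t sum_exec_ns,
                               uint64_t prev_sum_exec_ns, int requeue)
{
        uint64_t ran = sum_exec_ns - prev_sum_exec_ns;
        int64_t delta = (int64_t)(slice_ns - ran);

        if (delta < 0)
                return -1;      /* caller should just reschedule now */

        if (!requeue && delta < 10000)
                delta = 10000;  /* don't arm sub-10us timers */

        return delta;
}

int main(void)
{
        /* hypothetical: a 6 ms slice of which 4.5 ms has already run */
        printf("%lld ns until the hrtick should fire\n",
               (long long)remaining_slice(6000000, 14500000, 10000000, 0));
        return 0;
}

The printed 1500000 ns mirrors the delta that hrtick_start() would be asked to program in the code above.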
@@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
         struct cfs_rq *cfs_rq;
-        struct sched_entity *se = &p->se;
+        struct sched_entity *se = &p->se,
+                            *topse = NULL;      /* Highest schedulable entity */
+        int incload = 1;
 
         for_each_sched_entity(se) {
-                if (se->on_rq)
+                topse = se;
+                if (se->on_rq) {
+                        incload = 0;
                         break;
+                }
                 cfs_rq = cfs_rq_of(se);
                 enqueue_entity(cfs_rq, se, wakeup);
                 wakeup = 1;
         }
+        /* Increment cpu load if we just enqueued the first task of a group on
+         * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
+         * at the highest grouping level.
+         */
+        if (incload)
+                inc_cpu_load(rq, topse->load.weight);
+
+        hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 {
         struct cfs_rq *cfs_rq;
-        struct sched_entity *se = &p->se;
+        struct sched_entity *se = &p->se,
+                            *topse = NULL;      /* Highest schedulable entity */
+        int decload = 1;
 
         for_each_sched_entity(se) {
+                topse = se;
                 cfs_rq = cfs_rq_of(se);
                 dequeue_entity(cfs_rq, se, sleep);
                 /* Don't dequeue parent if it has other entities besides us */
-                if (cfs_rq->load.weight)
+                if (cfs_rq->load.weight) {
+                        if (parent_entity(se))
+                                decload = 0;
                         break;
+                }
                 sleep = 1;
         }
+        /* Decrement cpu load if we just dequeued the last task of a group on
+         * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
+         * at the highest grouping level.
+         */
+        if (decload)
+                dec_cpu_load(rq, topse->load.weight);
+
+        hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -836,6 +927,154 @@ static void yield_task_fair(struct rq *rq)
 }
 
 /*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available. The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, struct task_struct *p)
+{
+        cpumask_t tmp;
+        struct sched_domain *sd;
+        int i;
+
+        /*
+         * If it is idle, then it is the best cpu to run this task.
+         *
+         * This cpu is also the best, if it has more than one task already.
+         * Siblings must be also busy(in most cases) as they didn't already
+         * pickup the extra load from this cpu and hence we need not check
+         * sibling runqueue info. This will avoid the checks and cache miss
+         * penalities associated with that.
+         */
+        if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+                return cpu;
+
+        for_each_domain(cpu, sd) {
+                if (sd->flags & SD_WAKE_IDLE) {
+                        cpus_and(tmp, sd->span, p->cpus_allowed);
+                        for_each_cpu_mask(i, tmp) {
+                                if (idle_cpu(i)) {
+                                        if (i != task_cpu(p)) {
+                                                schedstat_inc(p,
+                                                        se.nr_wakeups_idle);
+                                        }
+                                        return i;
+                                }
+                        }
+                } else {
+                        break;
+                }
+        }
+        return cpu;
+}
+#else
+static inline int wake_idle(int cpu, struct task_struct *p)
+{
+        return cpu;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+        int cpu, this_cpu;
+        struct rq *rq;
+        struct sched_domain *sd, *this_sd = NULL;
+        int new_cpu;
+
+        cpu = task_cpu(p);
+        rq = task_rq(p);
+        this_cpu = smp_processor_id();
+        new_cpu = cpu;
+
+        if (cpu == this_cpu)
+                goto out_set_cpu;
+
+        for_each_domain(this_cpu, sd) {
+                if (cpu_isset(cpu, sd->span)) {
+                        this_sd = sd;
+                        break;
+                }
+        }
+
+        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+                goto out_set_cpu;
+
+        /*
+         * Check for affine wakeup and passive balancing possibilities.
+         */
+        if (this_sd) {
+                int idx = this_sd->wake_idx;
+                unsigned int imbalance;
+                unsigned long load, this_load;
+
+                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+                load = source_load(cpu, idx);
+                this_load = target_load(this_cpu, idx);
+
+                new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+                if (this_sd->flags & SD_WAKE_AFFINE) {
+                        unsigned long tl = this_load;
+                        unsigned long tl_per_task;
+
+                        /*
+                         * Attract cache-cold tasks on sync wakeups:
+                         */
+                        if (sync && !task_hot(p, rq->clock, this_sd))
+                                goto out_set_cpu;
+
+                        schedstat_inc(p, se.nr_wakeups_affine_attempts);
+                        tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+                        /*
+                         * If sync wakeup then subtract the (maximum possible)
+                         * effect of the currently running task from the load
+                         * of the current CPU:
+                         */
+                        if (sync)
+                                tl -= current->se.load.weight;
+
+                        if ((tl <= load &&
+                                tl + target_load(cpu, idx) <= tl_per_task) ||
+                               100*(tl + p->se.load.weight) <= imbalance*load) {
+                                /*
+                                 * This domain has SD_WAKE_AFFINE and
+                                 * p is cache cold in this domain, and
+                                 * there is no bad imbalance.
+                                 */
+                                schedstat_inc(this_sd, ttwu_move_affine);
+                                schedstat_inc(p, se.nr_wakeups_affine);
+                                goto out_set_cpu;
+                        }
+                }
+
+                /*
+                 * Start passive balancing when half the imbalance_pct
+                 * limit is reached.
+                 */
+                if (this_sd->flags & SD_WAKE_BALANCE) {
+                        if (imbalance*this_load <= 100*load) {
+                                schedstat_inc(this_sd, ttwu_move_balance);
+                                schedstat_inc(p, se.nr_wakeups_passive);
+                                goto out_set_cpu;
+                        }
+                }
+        }
+
+        new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+        return wake_idle(new_cpu, p);
+}
+#endif /* CONFIG_SMP */
+
+
+/*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
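Illustration (not part of the diff): the wake-affine test in select_task_rq_fair() above is done with scaled integer comparisons, 100*(tl + p->se.load.weight) <= imbalance*load, where imbalance = 100 + (imbalance_pct - 100)/2. A worked example with hypothetical numbers (the imbalance_pct of 125 and the nice-0 load weight of 1024 are assumptions, not values taken from this patch):

#include <stdio.h>

int main(void)
{
        unsigned int imbalance_pct = 125;                          /* assumed domain tunable */
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;  /* = 112 */

        unsigned long load     = 2048; /* load on the wakee's previous cpu */
        unsigned long tl       = 1024; /* load on the waking cpu           */
        unsigned long p_weight = 1024; /* nice-0 wakee                     */

        int affine = 100 * (tl + p_weight) <= (unsigned long)imbalance * load;

        printf("lhs=%lu rhs=%lu -> %s wakeup\n",
               100 * (tl + p_weight), (unsigned long)imbalance * load,
               affine ? "affine" : "no affine");
        return 0;
}

Here 204800 <= 229376, so the wakeup would be pulled to the waking cpu (subject to wake_idle() afterwards).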
@@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
 {
+        struct task_struct *p;
         struct cfs_rq *cfs_rq = &rq->cfs;
         struct sched_entity *se;
 
@@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
                 cfs_rq = group_cfs_rq(se);
         } while (cfs_rq);
 
-        return task_of(se);
+        p = task_of(se);
+        hrtick_start_fair(rq, p);
+
+        return p;
 }
 
 /*
@@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-        struct sched_entity *curr;
-        struct task_struct *p;
-
-        if (!cfs_rq->nr_running)
-                return MAX_PRIO;
-
-        curr = cfs_rq->curr;
-        if (!curr)
-                curr = __pick_next_entity(cfs_rq);
-
-        p = task_of(curr);
-
-        return p->prio;
-}
-#endif
-
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
         struct rq_iterator cfs_rq_iterator;
+        unsigned long load_moved;
 
         cfs_rq_iterator.start = load_balance_start_fair;
         cfs_rq_iterator.next = load_balance_next_fair;
 
         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-                struct cfs_rq *this_cfs_rq;
-                long imbalance;
-                unsigned long maxload;
+                struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+                unsigned long maxload, task_load, group_weight;
+                unsigned long thisload, per_task_load;
+                struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
+
+                task_load = busy_cfs_rq->load.weight;
+                group_weight = se->load.weight;
 
-                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+                /*
+                 * 'group_weight' is contributed by tasks of total weight
+                 * 'task_load'. To move 'rem_load_move' worth of weight only,
+                 * we need to move a maximum task load of:
+                 *
+                 * maxload = (remload / group_weight) * task_load;
+                 */
+                maxload = (rem_load_move * task_load) / group_weight;
 
-                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-                if (imbalance <= 0)
+                if (!maxload || !task_load)
                         continue;
 
-                /* Don't pull more than imbalance/2 */
-                imbalance /= 2;
-                maxload = min(rem_load_move, imbalance);
+                per_task_load = task_load / busy_cfs_rq->nr_running;
+                /*
+                 * balance_tasks will try to forcibly move atleast one task if
+                 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+                 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
+                 */
+                if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+                        continue;
 
-                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+                /* Disable priority-based load balance */
+                *this_best_prio = 0;
+                thisload = this_cfs_rq->load.weight;
 #else
 # define maxload rem_load_move
 #endif
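Illustration (not part of the diff): the maxload computation above converts the remaining amount of group weight to move into an equivalent amount of task weight on the busiest cpu's group runqueue, and GROUP_IMBALANCE_PCT then skips groups whose share of the imbalance is small compared with one of their tasks. A worked example with hypothetical weights:

#include <stdio.h>

#define GROUP_IMBALANCE_PCT 20

int main(void)
{
        unsigned long group_weight = 1024; /* se->load.weight on the busiest cpu */
        unsigned long task_load    = 3072; /* busy_cfs_rq->load.weight (3 tasks) */
        unsigned long nr_running   = 3;
        unsigned long rem_load     = 512;  /* rem_load_move still to be moved    */

        unsigned long maxload       = rem_load * task_load / group_weight; /* 1536 */
        unsigned long per_task_load = task_load / nr_running;              /* 1024 */

        printf("maxload=%lu per_task_load=%lu -> %s\n", maxload, per_task_load,
               100 * maxload < GROUP_IMBALANCE_PCT * per_task_load ?
               "skip this group" : "balance up to maxload from this group");
        return 0;
}

With these numbers 153600 >= 20480, so balance_tasks() would be allowed to move up to 1536 units of task weight from this group.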
@@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  * load_balance_[start|next]_fair iterators
                  */
                 cfs_rq_iterator.arg = busy_cfs_rq;
-                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+                load_moved = balance_tasks(this_rq, this_cpu, busiest,
                                 maxload, sd, idle, all_pinned,
                                 this_best_prio,
                                 &cfs_rq_iterator);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+                /*
+                 * load_moved holds the task load that was moved. The
+                 * effective (group) weight moved would be:
+                 * load_moved_eff = load_moved/task_load * group_weight;
+                 */
+                load_moved = (group_weight * load_moved) / task_load;
+
+                /* Adjust shares on both cpus to reflect load_moved */
+                group_weight -= load_moved;
+                set_se_shares(se, group_weight);
+
+                se = busy_cfs_rq->tg->se[this_cpu];
+                if (!thisload)
+                        group_weight = load_moved;
+                else
+                        group_weight = se->load.weight + load_moved;
+                set_se_shares(se, group_weight);
+#endif
+
+                rem_load_move -= load_moved;
+
                 if (rem_load_move <= 0)
                         break;
         }
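Illustration (not part of the diff): after balance_tasks() reports how much task weight actually moved, the block above converts that back into group weight and shifts the corresponding shares from the busiest cpu's group entity to this cpu's. Continuing the hypothetical numbers from the previous note:

#include <stdio.h>

int main(void)
{
        unsigned long group_weight = 1024; /* shares on the busiest cpu        */
        unsigned long task_load    = 3072; /* task weight queued there         */
        unsigned long this_shares  = 0;    /* group currently idle on this cpu */
        unsigned long load_moved   = 1024; /* one nice-0 task was moved        */

        /* load_moved_eff = load_moved / task_load * group_weight */
        unsigned long moved_eff = group_weight * load_moved / task_load; /* 341 */

        printf("busiest cpu shares: %lu -> %lu\n", group_weight, group_weight - moved_eff);
        printf("this cpu shares:    %lu -> %lu\n", this_shares, this_shares + moved_eff);
        return 0;
}

The 1024 units of task weight that moved correspond to roughly a third of the group's load, which is what set_se_shares() is asked to transfer here.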
@@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
-static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &curr->se;
 
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
-                entity_tick(cfs_rq, se);
+                entity_tick(cfs_rq, se, queued);
         }
 }
 
@@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
         resched_task(rq->curr);
 }
 
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ */
+static void prio_changed_fair(struct rq *rq, struct task_struct *p,
+                              int oldprio, int running)
+{
+        /*
+         * Reschedule if we are currently running on this runqueue and
+         * our priority decreased, or if we are not currently running on
+         * this runqueue and our priority is higher than the current's
+         */
+        if (running) {
+                if (p->prio > oldprio)
+                        resched_task(rq->curr);
+        } else
+                check_preempt_curr(rq, p);
+}
+
+/*
+ * We switched to the sched_fair class.
+ */
+static void switched_to_fair(struct rq *rq, struct task_struct *p,
+                             int running)
+{
+        /*
+         * We were most likely switched from sched_rt, so
+         * kick off the schedule if running, otherwise just see
+         * if we can still preempt the current task.
+         */
+        if (running)
+                resched_task(rq->curr);
+        else
+                check_preempt_curr(rq, p);
+}
+
 /* Account for a task changing its policy or group.
  *
  * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = {
         .enqueue_task = enqueue_task_fair,
         .dequeue_task = dequeue_task_fair,
         .yield_task = yield_task_fair,
+#ifdef CONFIG_SMP
+        .select_task_rq = select_task_rq_fair,
+#endif /* CONFIG_SMP */
 
         .check_preempt_curr = check_preempt_wakeup,
 
@@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = {
         .set_curr_task = set_curr_task_fair,
         .task_tick = task_tick_fair,
         .task_new = task_new_fair,
+
+        .prio_changed = prio_changed_fair,
+        .switched_to = switched_to_fair,
 };
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
 #ifdef CONFIG_FAIR_GROUP_SCHED
         print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
 #endif
+        rcu_read_lock();
         for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
                 print_cfs_rq(m, cpu, cfs_rq);
+        rcu_read_unlock();
 }
 #endif