author     Ingo Molnar <mingo@elte.hu>  2007-07-09 12:51:59 -0400
committer  Ingo Molnar <mingo@elte.hu>  2007-07-09 12:51:59 -0400
commit     dd41f596cda0d7d6e4a8b139ffdfabcefdd46528
tree       6f0e3677b348c3038f60c9d0cf165301771ece48
parent     f3479f10c5d667e591f4417a0bba78e221924206
sched: cfs core code

apply the CFS core code.

this change switches over the scheduler core to CFS's modular
design and makes use of kernel/sched_fair/rt/idletask.c to implement
Linux's scheduling policies.

thanks to Andrew Morton and Thomas Gleixner for lots of detailed review
feedback and for fixlets.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
-rw-r--r--  kernel/sched.c | 1532
1 file changed, 758 insertions, 774 deletions
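
Orientation note before the diff: the "modular design" the changelog refers to is a per-policy hook
table, struct sched_class, which kernel/sched_rt.c, sched_fair.c and sched_idletask.c each fill in;
the core now dispatches through these hooks instead of manipulating priority arrays directly. The
sketch below is a simplified, abridged reconstruction of that interface, not the verbatim kernel
declaration: the field list is trimmed and the signatures are taken from the call sites visible in
this diff (enqueue_task, dequeue_task, check_preempt_curr, task_new, class->next). The authoritative
definition lives in include/linux/sched.h of the same patch series and carries additional hooks
(pick_next_task, put_prev_task, load_balance, task_tick) that are omitted here.

    #include <linux/types.h>	/* u64 */

    struct rq;
    struct task_struct;

    /*
     * Simplified sketch of the per-policy hook table (abridged; see
     * include/linux/sched.h in this series for the real definition).
     * Signatures mirror the call sites in the diff below.
     */
    struct sched_class {
    	/* classes are chained in priority order: rt -> fair -> idle;
    	 * move_tasks() walks this chain via class->next */
    	struct sched_class *next;

    	/* called from the core enqueue_task()/dequeue_task() wrappers */
    	void (*enqueue_task)(struct rq *rq, struct task_struct *p,
    			     int wakeup, u64 now);
    	void (*dequeue_task)(struct rq *rq, struct task_struct *p,
    			     int sleep, u64 now);

    	/* wakeup-preemption decision, see check_preempt_curr() */
    	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p);

    	/* new-task startup hook used by wake_up_new_task() */
    	void (*task_new)(struct rq *rq, struct task_struct *p);

    	/* pick_next_task, put_prev_task, load_balance, task_tick ...
    	 * omitted in this sketch */
    };

The new prio_to_weight[] table introduced below is roughly a geometric series with ratio ~1.25 per
nice level (1024 at nice 0, 819 at nice 1, 655 at nice 2), which is what produces the cumulative
"10% effect" described in the new comment block, and prio_to_wmult[] stores 2^32/weight for each
entry (4294967296/1024 = 4194304, the nice-0 value) so that divisions by the weight can be replaced
with a multiply and a shift on the hot path.
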
diff --git a/kernel/sched.c b/kernel/sched.c
index f5a204b46655..01ba4b1848a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -391,6 +391,11 @@ struct rq { | |||
391 | static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; | 391 | static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; |
392 | static DEFINE_MUTEX(sched_hotcpu_mutex); | 392 | static DEFINE_MUTEX(sched_hotcpu_mutex); |
393 | 393 | ||
394 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | ||
395 | { | ||
396 | rq->curr->sched_class->check_preempt_curr(rq, p); | ||
397 | } | ||
398 | |||
394 | static inline int cpu_of(struct rq *rq) | 399 | static inline int cpu_of(struct rq *rq) |
395 | { | 400 | { |
396 | #ifdef CONFIG_SMP | 401 | #ifdef CONFIG_SMP |
@@ -669,8 +674,6 @@ static inline void resched_task(struct task_struct *p) | |||
669 | } | 674 | } |
670 | #endif | 675 | #endif |
671 | 676 | ||
672 | #include "sched_stats.h" | ||
673 | |||
674 | static u64 div64_likely32(u64 divident, unsigned long divisor) | 677 | static u64 div64_likely32(u64 divident, unsigned long divisor) |
675 | { | 678 | { |
676 | #if BITS_PER_LONG == 32 | 679 | #if BITS_PER_LONG == 32 |
@@ -788,120 +791,146 @@ static void update_curr_load(struct rq *rq, u64 now) | |||
788 | * this code will need modification | 791 | * this code will need modification |
789 | */ | 792 | */ |
790 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | 793 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE |
791 | #define LOAD_WEIGHT(lp) \ | 794 | #define LOAD_WEIGHT(lp) \ |
792 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | 795 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) |
793 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | 796 | #define PRIO_TO_LOAD_WEIGHT(prio) \ |
794 | LOAD_WEIGHT(static_prio_timeslice(prio)) | 797 | LOAD_WEIGHT(static_prio_timeslice(prio)) |
795 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | 798 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
796 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | 799 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) |
800 | |||
801 | #define WEIGHT_IDLEPRIO 2 | ||
802 | #define WMULT_IDLEPRIO (1 << 31) | ||
803 | |||
804 | /* | ||
805 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
806 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
807 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
808 | * that remained on nice 0. | ||
809 | * | ||
810 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
811 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
812 | * it's +10% CPU usage. | ||
813 | */ | ||
814 | static const int prio_to_weight[40] = { | ||
815 | /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, | ||
816 | /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, | ||
817 | /* 0 */ NICE_0_LOAD /* 1024 */, | ||
818 | /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, | ||
819 | /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, | ||
820 | }; | ||
821 | |||
822 | static const u32 prio_to_wmult[40] = { | ||
823 | 48356, 60446, 75558, 94446, 118058, 147573, | ||
824 | 184467, 230589, 288233, 360285, 450347, | ||
825 | 562979, 703746, 879575, 1099582, 1374389, | ||
826 | 1717986, 2147483, 2684354, 3355443, 4194304, | ||
827 | 5244160, 6557201, 8196502, 10250518, 12782640, | ||
828 | 16025997, 19976592, 24970740, 31350126, 39045157, | ||
829 | 49367440, 61356675, 76695844, 95443717, 119304647, | ||
830 | 148102320, 186737708, 238609294, 286331153, | ||
831 | }; | ||
797 | 832 | ||
798 | static inline void | 833 | static inline void |
799 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 834 | inc_load(struct rq *rq, const struct task_struct *p, u64 now) |
800 | { | 835 | { |
801 | rq->raw_weighted_load += p->load_weight; | 836 | update_curr_load(rq, now); |
837 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
802 | } | 838 | } |
803 | 839 | ||
804 | static inline void | 840 | static inline void |
805 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 841 | dec_load(struct rq *rq, const struct task_struct *p, u64 now) |
806 | { | 842 | { |
807 | rq->raw_weighted_load -= p->load_weight; | 843 | update_curr_load(rq, now); |
844 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
808 | } | 845 | } |
809 | 846 | ||
810 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | 847 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
811 | { | 848 | { |
812 | rq->nr_running++; | 849 | rq->nr_running++; |
813 | inc_raw_weighted_load(rq, p); | 850 | inc_load(rq, p, now); |
814 | } | 851 | } |
815 | 852 | ||
816 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | 853 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
817 | { | 854 | { |
818 | rq->nr_running--; | 855 | rq->nr_running--; |
819 | dec_raw_weighted_load(rq, p); | 856 | dec_load(rq, p, now); |
820 | } | 857 | } |
821 | 858 | ||
859 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | ||
860 | |||
861 | /* | ||
862 | * runqueue iterator, to support SMP load-balancing between different | ||
863 | * scheduling classes, without having to expose their internal data | ||
864 | * structures to the load-balancing proper: | ||
865 | */ | ||
866 | struct rq_iterator { | ||
867 | void *arg; | ||
868 | struct task_struct *(*start)(void *); | ||
869 | struct task_struct *(*next)(void *); | ||
870 | }; | ||
871 | |||
872 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
873 | unsigned long max_nr_move, unsigned long max_load_move, | ||
874 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
875 | int *all_pinned, unsigned long *load_moved, | ||
876 | int this_best_prio, int best_prio, int best_prio_seen, | ||
877 | struct rq_iterator *iterator); | ||
878 | |||
879 | #include "sched_stats.h" | ||
880 | #include "sched_rt.c" | ||
881 | #include "sched_fair.c" | ||
882 | #include "sched_idletask.c" | ||
883 | #ifdef CONFIG_SCHED_DEBUG | ||
884 | # include "sched_debug.c" | ||
885 | #endif | ||
886 | |||
887 | #define sched_class_highest (&rt_sched_class) | ||
888 | |||
822 | static void set_load_weight(struct task_struct *p) | 889 | static void set_load_weight(struct task_struct *p) |
823 | { | 890 | { |
891 | task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; | ||
892 | p->se.wait_runtime = 0; | ||
893 | |||
824 | if (task_has_rt_policy(p)) { | 894 | if (task_has_rt_policy(p)) { |
825 | #ifdef CONFIG_SMP | 895 | p->se.load.weight = prio_to_weight[0] * 2; |
826 | if (p == task_rq(p)->migration_thread) | 896 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
827 | /* | 897 | return; |
828 | * The migration thread does the actual balancing. | 898 | } |
829 | * Giving its load any weight will skew balancing | ||
830 | * adversely. | ||
831 | */ | ||
832 | p->load_weight = 0; | ||
833 | else | ||
834 | #endif | ||
835 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
836 | } else | ||
837 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
838 | } | ||
839 | 899 | ||
840 | /* | 900 | /* |
841 | * Adding/removing a task to/from a priority array: | 901 | * SCHED_IDLE tasks get minimal weight: |
842 | */ | 902 | */ |
843 | static void dequeue_task(struct task_struct *p, struct prio_array *array) | 903 | if (p->policy == SCHED_IDLE) { |
844 | { | 904 | p->se.load.weight = WEIGHT_IDLEPRIO; |
845 | array->nr_active--; | 905 | p->se.load.inv_weight = WMULT_IDLEPRIO; |
846 | list_del(&p->run_list); | 906 | return; |
847 | if (list_empty(array->queue + p->prio)) | 907 | } |
848 | __clear_bit(p->prio, array->bitmap); | ||
849 | } | ||
850 | 908 | ||
851 | static void enqueue_task(struct task_struct *p, struct prio_array *array) | 909 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; |
852 | { | 910 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
853 | sched_info_queued(p); | ||
854 | list_add_tail(&p->run_list, array->queue + p->prio); | ||
855 | __set_bit(p->prio, array->bitmap); | ||
856 | array->nr_active++; | ||
857 | p->array = array; | ||
858 | } | 911 | } |
859 | 912 | ||
860 | /* | 913 | static void |
861 | * Put task to the end of the run list without the overhead of dequeue | 914 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) |
862 | * followed by enqueue. | ||
863 | */ | ||
864 | static void requeue_task(struct task_struct *p, struct prio_array *array) | ||
865 | { | 915 | { |
866 | list_move_tail(&p->run_list, array->queue + p->prio); | 916 | sched_info_queued(p); |
917 | p->sched_class->enqueue_task(rq, p, wakeup, now); | ||
918 | p->se.on_rq = 1; | ||
867 | } | 919 | } |
868 | 920 | ||
869 | static inline void | 921 | static void |
870 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | 922 | dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) |
871 | { | 923 | { |
872 | list_add(&p->run_list, array->queue + p->prio); | 924 | p->sched_class->dequeue_task(rq, p, sleep, now); |
873 | __set_bit(p->prio, array->bitmap); | 925 | p->se.on_rq = 0; |
874 | array->nr_active++; | ||
875 | p->array = array; | ||
876 | } | 926 | } |
877 | 927 | ||
878 | /* | 928 | /* |
879 | * __normal_prio - return the priority that is based on the static | 929 | * __normal_prio - return the priority that is based on the static prio |
880 | * priority but is modified by bonuses/penalties. | ||
881 | * | ||
882 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | ||
883 | * into the -5 ... 0 ... +5 bonus/penalty range. | ||
884 | * | ||
885 | * We use 25% of the full 0...39 priority range so that: | ||
886 | * | ||
887 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | ||
888 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | ||
889 | * | ||
890 | * Both properties are important to certain workloads. | ||
891 | */ | 930 | */ |
892 | |||
893 | static inline int __normal_prio(struct task_struct *p) | 931 | static inline int __normal_prio(struct task_struct *p) |
894 | { | 932 | { |
895 | int bonus, prio; | 933 | return p->static_prio; |
896 | |||
897 | bonus = 0; | ||
898 | |||
899 | prio = p->static_prio - bonus; | ||
900 | if (prio < MAX_RT_PRIO) | ||
901 | prio = MAX_RT_PRIO; | ||
902 | if (prio > MAX_PRIO-1) | ||
903 | prio = MAX_PRIO-1; | ||
904 | return prio; | ||
905 | } | 934 | } |
906 | 935 | ||
907 | /* | 936 | /* |
@@ -943,84 +972,45 @@ static int effective_prio(struct task_struct *p) | |||
943 | } | 972 | } |
944 | 973 | ||
945 | /* | 974 | /* |
946 | * __activate_task - move a task to the runqueue. | 975 | * activate_task - move a task to the runqueue. |
947 | */ | 976 | */ |
948 | static void __activate_task(struct task_struct *p, struct rq *rq) | 977 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) |
949 | { | 978 | { |
950 | struct prio_array *target = rq->active; | 979 | u64 now = rq_clock(rq); |
951 | 980 | ||
952 | if (batch_task(p)) | 981 | if (p->state == TASK_UNINTERRUPTIBLE) |
953 | target = rq->expired; | 982 | rq->nr_uninterruptible--; |
954 | enqueue_task(p, target); | ||
955 | inc_nr_running(p, rq); | ||
956 | } | ||
957 | |||
958 | /* | ||
959 | * __activate_idle_task - move idle task to the _front_ of runqueue. | ||
960 | */ | ||
961 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) | ||
962 | { | ||
963 | enqueue_task_head(p, rq->active); | ||
964 | inc_nr_running(p, rq); | ||
965 | } | ||
966 | 983 | ||
967 | /* | 984 | enqueue_task(rq, p, wakeup, now); |
968 | * Recalculate p->normal_prio and p->prio after having slept, | 985 | inc_nr_running(p, rq, now); |
969 | * updating the sleep-average too: | ||
970 | */ | ||
971 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
972 | { | ||
973 | return effective_prio(p); | ||
974 | } | 986 | } |
975 | 987 | ||
976 | /* | 988 | /* |
977 | * activate_task - move a task to the runqueue and do priority recalculation | 989 | * activate_idle_task - move idle task to the _front_ of runqueue. |
978 | * | ||
979 | * Update all the scheduling statistics stuff. (sleep average | ||
980 | * calculation, priority modifiers, etc.) | ||
981 | */ | 990 | */ |
982 | static void activate_task(struct task_struct *p, struct rq *rq, int local) | 991 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) |
983 | { | 992 | { |
984 | unsigned long long now; | 993 | u64 now = rq_clock(rq); |
985 | |||
986 | if (rt_task(p)) | ||
987 | goto out; | ||
988 | |||
989 | now = sched_clock(); | ||
990 | #ifdef CONFIG_SMP | ||
991 | if (!local) { | ||
992 | /* Compensate for drifting sched_clock */ | ||
993 | struct rq *this_rq = this_rq(); | ||
994 | now = (now - this_rq->most_recent_timestamp) | ||
995 | + rq->most_recent_timestamp; | ||
996 | } | ||
997 | #endif | ||
998 | 994 | ||
999 | /* | 995 | if (p->state == TASK_UNINTERRUPTIBLE) |
1000 | * Sleep time is in units of nanosecs, so shift by 20 to get a | 996 | rq->nr_uninterruptible--; |
1001 | * milliseconds-range estimation of the amount of time that the task | ||
1002 | * spent sleeping: | ||
1003 | */ | ||
1004 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
1005 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
1006 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | ||
1007 | (now - p->timestamp) >> 20); | ||
1008 | } | ||
1009 | 997 | ||
1010 | p->prio = recalc_task_prio(p, now); | 998 | enqueue_task(rq, p, 0, now); |
1011 | p->timestamp = now; | 999 | inc_nr_running(p, rq, now); |
1012 | out: | ||
1013 | __activate_task(p, rq); | ||
1014 | } | 1000 | } |
1015 | 1001 | ||
1016 | /* | 1002 | /* |
1017 | * deactivate_task - remove a task from the runqueue. | 1003 | * deactivate_task - remove a task from the runqueue. |
1018 | */ | 1004 | */ |
1019 | static void deactivate_task(struct task_struct *p, struct rq *rq) | 1005 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
1020 | { | 1006 | { |
1021 | dec_nr_running(p, rq); | 1007 | u64 now = rq_clock(rq); |
1022 | dequeue_task(p, p->array); | 1008 | |
1023 | p->array = NULL; | 1009 | if (p->state == TASK_UNINTERRUPTIBLE) |
1010 | rq->nr_uninterruptible++; | ||
1011 | |||
1012 | dequeue_task(rq, p, sleep, now); | ||
1013 | dec_nr_running(p, rq, now); | ||
1024 | } | 1014 | } |
1025 | 1015 | ||
1026 | /** | 1016 | /** |
@@ -1035,14 +1025,40 @@ inline int task_curr(const struct task_struct *p) | |||
1035 | /* Used instead of source_load when we know the type == 0 */ | 1025 | /* Used instead of source_load when we know the type == 0 */ |
1036 | unsigned long weighted_cpuload(const int cpu) | 1026 | unsigned long weighted_cpuload(const int cpu) |
1037 | { | 1027 | { |
1038 | return cpu_rq(cpu)->raw_weighted_load; | 1028 | return cpu_rq(cpu)->ls.load.weight; |
1029 | } | ||
1030 | |||
1031 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
1032 | { | ||
1033 | #ifdef CONFIG_SMP | ||
1034 | task_thread_info(p)->cpu = cpu; | ||
1035 | set_task_cfs_rq(p); | ||
1036 | #endif | ||
1039 | } | 1037 | } |
1040 | 1038 | ||
1041 | #ifdef CONFIG_SMP | 1039 | #ifdef CONFIG_SMP |
1042 | 1040 | ||
1043 | void set_task_cpu(struct task_struct *p, unsigned int cpu) | 1041 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1044 | { | 1042 | { |
1045 | task_thread_info(p)->cpu = cpu; | 1043 | int old_cpu = task_cpu(p); |
1044 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | ||
1045 | u64 clock_offset, fair_clock_offset; | ||
1046 | |||
1047 | clock_offset = old_rq->clock - new_rq->clock; | ||
1048 | fair_clock_offset = old_rq->cfs.fair_clock - | ||
1049 | new_rq->cfs.fair_clock; | ||
1050 | if (p->se.wait_start) | ||
1051 | p->se.wait_start -= clock_offset; | ||
1052 | if (p->se.wait_start_fair) | ||
1053 | p->se.wait_start_fair -= fair_clock_offset; | ||
1054 | if (p->se.sleep_start) | ||
1055 | p->se.sleep_start -= clock_offset; | ||
1056 | if (p->se.block_start) | ||
1057 | p->se.block_start -= clock_offset; | ||
1058 | if (p->se.sleep_start_fair) | ||
1059 | p->se.sleep_start_fair -= fair_clock_offset; | ||
1060 | |||
1061 | __set_task_cpu(p, new_cpu); | ||
1046 | } | 1062 | } |
1047 | 1063 | ||
1048 | struct migration_req { | 1064 | struct migration_req { |
@@ -1067,7 +1083,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
1067 | * If the task is not on a runqueue (and not running), then | 1083 | * If the task is not on a runqueue (and not running), then |
1068 | * it is sufficient to simply update the task's cpu field. | 1084 | * it is sufficient to simply update the task's cpu field. |
1069 | */ | 1085 | */ |
1070 | if (!p->array && !task_running(rq, p)) { | 1086 | if (!p->se.on_rq && !task_running(rq, p)) { |
1071 | set_task_cpu(p, dest_cpu); | 1087 | set_task_cpu(p, dest_cpu); |
1072 | return 0; | 1088 | return 0; |
1073 | } | 1089 | } |
@@ -1092,9 +1108,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
1092 | void wait_task_inactive(struct task_struct *p) | 1108 | void wait_task_inactive(struct task_struct *p) |
1093 | { | 1109 | { |
1094 | unsigned long flags; | 1110 | unsigned long flags; |
1111 | int running, on_rq; | ||
1095 | struct rq *rq; | 1112 | struct rq *rq; |
1096 | struct prio_array *array; | ||
1097 | int running; | ||
1098 | 1113 | ||
1099 | repeat: | 1114 | repeat: |
1100 | /* | 1115 | /* |
@@ -1126,7 +1141,7 @@ repeat: | |||
1126 | */ | 1141 | */ |
1127 | rq = task_rq_lock(p, &flags); | 1142 | rq = task_rq_lock(p, &flags); |
1128 | running = task_running(rq, p); | 1143 | running = task_running(rq, p); |
1129 | array = p->array; | 1144 | on_rq = p->se.on_rq; |
1130 | task_rq_unlock(rq, &flags); | 1145 | task_rq_unlock(rq, &flags); |
1131 | 1146 | ||
1132 | /* | 1147 | /* |
@@ -1149,7 +1164,7 @@ repeat: | |||
1149 | * running right now), it's preempted, and we should | 1164 | * running right now), it's preempted, and we should |
1150 | * yield - it could be a while. | 1165 | * yield - it could be a while. |
1151 | */ | 1166 | */ |
1152 | if (unlikely(array)) { | 1167 | if (unlikely(on_rq)) { |
1153 | yield(); | 1168 | yield(); |
1154 | goto repeat; | 1169 | goto repeat; |
1155 | } | 1170 | } |
@@ -1195,11 +1210,12 @@ void kick_process(struct task_struct *p) | |||
1195 | static inline unsigned long source_load(int cpu, int type) | 1210 | static inline unsigned long source_load(int cpu, int type) |
1196 | { | 1211 | { |
1197 | struct rq *rq = cpu_rq(cpu); | 1212 | struct rq *rq = cpu_rq(cpu); |
1213 | unsigned long total = weighted_cpuload(cpu); | ||
1198 | 1214 | ||
1199 | if (type == 0) | 1215 | if (type == 0) |
1200 | return rq->raw_weighted_load; | 1216 | return total; |
1201 | 1217 | ||
1202 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); | 1218 | return min(rq->cpu_load[type-1], total); |
1203 | } | 1219 | } |
1204 | 1220 | ||
1205 | /* | 1221 | /* |
@@ -1209,11 +1225,12 @@ static inline unsigned long source_load(int cpu, int type) | |||
1209 | static inline unsigned long target_load(int cpu, int type) | 1225 | static inline unsigned long target_load(int cpu, int type) |
1210 | { | 1226 | { |
1211 | struct rq *rq = cpu_rq(cpu); | 1227 | struct rq *rq = cpu_rq(cpu); |
1228 | unsigned long total = weighted_cpuload(cpu); | ||
1212 | 1229 | ||
1213 | if (type == 0) | 1230 | if (type == 0) |
1214 | return rq->raw_weighted_load; | 1231 | return total; |
1215 | 1232 | ||
1216 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | 1233 | return max(rq->cpu_load[type-1], total); |
1217 | } | 1234 | } |
1218 | 1235 | ||
1219 | /* | 1236 | /* |
@@ -1222,9 +1239,10 @@ static inline unsigned long target_load(int cpu, int type) | |||
1222 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1239 | static inline unsigned long cpu_avg_load_per_task(int cpu) |
1223 | { | 1240 | { |
1224 | struct rq *rq = cpu_rq(cpu); | 1241 | struct rq *rq = cpu_rq(cpu); |
1242 | unsigned long total = weighted_cpuload(cpu); | ||
1225 | unsigned long n = rq->nr_running; | 1243 | unsigned long n = rq->nr_running; |
1226 | 1244 | ||
1227 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | 1245 | return n ? total / n : SCHED_LOAD_SCALE; |
1228 | } | 1246 | } |
1229 | 1247 | ||
1230 | /* | 1248 | /* |
@@ -1455,7 +1473,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1455 | if (!(old_state & state)) | 1473 | if (!(old_state & state)) |
1456 | goto out; | 1474 | goto out; |
1457 | 1475 | ||
1458 | if (p->array) | 1476 | if (p->se.on_rq) |
1459 | goto out_running; | 1477 | goto out_running; |
1460 | 1478 | ||
1461 | cpu = task_cpu(p); | 1479 | cpu = task_cpu(p); |
@@ -1510,11 +1528,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1510 | * of the current CPU: | 1528 | * of the current CPU: |
1511 | */ | 1529 | */ |
1512 | if (sync) | 1530 | if (sync) |
1513 | tl -= current->load_weight; | 1531 | tl -= current->se.load.weight; |
1514 | 1532 | ||
1515 | if ((tl <= load && | 1533 | if ((tl <= load && |
1516 | tl + target_load(cpu, idx) <= tl_per_task) || | 1534 | tl + target_load(cpu, idx) <= tl_per_task) || |
1517 | 100*(tl + p->load_weight) <= imbalance*load) { | 1535 | 100*(tl + p->se.load.weight) <= imbalance*load) { |
1518 | /* | 1536 | /* |
1519 | * This domain has SD_WAKE_AFFINE and | 1537 | * This domain has SD_WAKE_AFFINE and |
1520 | * p is cache cold in this domain, and | 1538 | * p is cache cold in this domain, and |
@@ -1548,7 +1566,7 @@ out_set_cpu: | |||
1548 | old_state = p->state; | 1566 | old_state = p->state; |
1549 | if (!(old_state & state)) | 1567 | if (!(old_state & state)) |
1550 | goto out; | 1568 | goto out; |
1551 | if (p->array) | 1569 | if (p->se.on_rq) |
1552 | goto out_running; | 1570 | goto out_running; |
1553 | 1571 | ||
1554 | this_cpu = smp_processor_id(); | 1572 | this_cpu = smp_processor_id(); |
@@ -1557,10 +1575,7 @@ out_set_cpu: | |||
1557 | 1575 | ||
1558 | out_activate: | 1576 | out_activate: |
1559 | #endif /* CONFIG_SMP */ | 1577 | #endif /* CONFIG_SMP */ |
1560 | if (old_state == TASK_UNINTERRUPTIBLE) | 1578 | activate_task(rq, p, 1); |
1561 | rq->nr_uninterruptible--; | ||
1562 | |||
1563 | activate_task(p, rq, cpu == this_cpu); | ||
1564 | /* | 1579 | /* |
1565 | * Sync wakeups (i.e. those types of wakeups where the waker | 1580 | * Sync wakeups (i.e. those types of wakeups where the waker |
1566 | * has indicated that it will leave the CPU in short order) | 1581 | * has indicated that it will leave the CPU in short order) |
@@ -1569,10 +1584,8 @@ out_activate: | |||
1569 | * the waker guarantees that the freshly woken up task is going | 1584 | * the waker guarantees that the freshly woken up task is going |
1570 | * to be considered on this CPU.) | 1585 | * to be considered on this CPU.) |
1571 | */ | 1586 | */ |
1572 | if (!sync || cpu != this_cpu) { | 1587 | if (!sync || cpu != this_cpu) |
1573 | if (TASK_PREEMPTS_CURR(p, rq)) | 1588 | check_preempt_curr(rq, p); |
1574 | resched_task(rq->curr); | ||
1575 | } | ||
1576 | success = 1; | 1589 | success = 1; |
1577 | 1590 | ||
1578 | out_running: | 1591 | out_running: |
@@ -1595,19 +1608,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
1595 | return try_to_wake_up(p, state, 0); | 1608 | return try_to_wake_up(p, state, 0); |
1596 | } | 1609 | } |
1597 | 1610 | ||
1598 | static void task_running_tick(struct rq *rq, struct task_struct *p); | ||
1599 | /* | 1611 | /* |
1600 | * Perform scheduler related setup for a newly forked process p. | 1612 | * Perform scheduler related setup for a newly forked process p. |
1601 | * p is forked by current. | 1613 | * p is forked by current. |
1602 | */ | 1614 | * |
1603 | void fastcall sched_fork(struct task_struct *p, int clone_flags) | 1615 | * __sched_fork() is basic setup used by init_idle() too: |
1604 | { | 1616 | */ |
1605 | int cpu = get_cpu(); | 1617 | static void __sched_fork(struct task_struct *p) |
1618 | { | ||
1619 | p->se.wait_start_fair = 0; | ||
1620 | p->se.wait_start = 0; | ||
1621 | p->se.exec_start = 0; | ||
1622 | p->se.sum_exec_runtime = 0; | ||
1623 | p->se.delta_exec = 0; | ||
1624 | p->se.delta_fair_run = 0; | ||
1625 | p->se.delta_fair_sleep = 0; | ||
1626 | p->se.wait_runtime = 0; | ||
1627 | p->se.sum_wait_runtime = 0; | ||
1628 | p->se.sum_sleep_runtime = 0; | ||
1629 | p->se.sleep_start = 0; | ||
1630 | p->se.sleep_start_fair = 0; | ||
1631 | p->se.block_start = 0; | ||
1632 | p->se.sleep_max = 0; | ||
1633 | p->se.block_max = 0; | ||
1634 | p->se.exec_max = 0; | ||
1635 | p->se.wait_max = 0; | ||
1636 | p->se.wait_runtime_overruns = 0; | ||
1637 | p->se.wait_runtime_underruns = 0; | ||
1606 | 1638 | ||
1607 | #ifdef CONFIG_SMP | 1639 | INIT_LIST_HEAD(&p->run_list); |
1608 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1640 | p->se.on_rq = 0; |
1609 | #endif | ||
1610 | set_task_cpu(p, cpu); | ||
1611 | 1641 | ||
1612 | /* | 1642 | /* |
1613 | * We mark the process as running here, but have not actually | 1643 | * We mark the process as running here, but have not actually |
@@ -1616,16 +1646,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
1616 | * event cannot wake it up and insert it on the runqueue either. | 1646 | * event cannot wake it up and insert it on the runqueue either. |
1617 | */ | 1647 | */ |
1618 | p->state = TASK_RUNNING; | 1648 | p->state = TASK_RUNNING; |
1649 | } | ||
1650 | |||
1651 | /* | ||
1652 | * fork()/clone()-time setup: | ||
1653 | */ | ||
1654 | void sched_fork(struct task_struct *p, int clone_flags) | ||
1655 | { | ||
1656 | int cpu = get_cpu(); | ||
1657 | |||
1658 | __sched_fork(p); | ||
1659 | |||
1660 | #ifdef CONFIG_SMP | ||
1661 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
1662 | #endif | ||
1663 | __set_task_cpu(p, cpu); | ||
1619 | 1664 | ||
1620 | /* | 1665 | /* |
1621 | * Make sure we do not leak PI boosting priority to the child: | 1666 | * Make sure we do not leak PI boosting priority to the child: |
1622 | */ | 1667 | */ |
1623 | p->prio = current->normal_prio; | 1668 | p->prio = current->normal_prio; |
1624 | 1669 | ||
1625 | INIT_LIST_HEAD(&p->run_list); | ||
1626 | p->array = NULL; | ||
1627 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1670 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1628 | if (unlikely(sched_info_on())) | 1671 | if (likely(sched_info_on())) |
1629 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1672 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1630 | #endif | 1673 | #endif |
1631 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1674 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
@@ -1635,34 +1678,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
1635 | /* Want to start with kernel preemption disabled. */ | 1678 | /* Want to start with kernel preemption disabled. */ |
1636 | task_thread_info(p)->preempt_count = 1; | 1679 | task_thread_info(p)->preempt_count = 1; |
1637 | #endif | 1680 | #endif |
1638 | /* | ||
1639 | * Share the timeslice between parent and child, thus the | ||
1640 | * total amount of pending timeslices in the system doesn't change, | ||
1641 | * resulting in more scheduling fairness. | ||
1642 | */ | ||
1643 | local_irq_disable(); | ||
1644 | p->time_slice = (current->time_slice + 1) >> 1; | ||
1645 | /* | ||
1646 | * The remainder of the first timeslice might be recovered by | ||
1647 | * the parent if the child exits early enough. | ||
1648 | */ | ||
1649 | p->first_time_slice = 1; | ||
1650 | current->time_slice >>= 1; | ||
1651 | p->timestamp = sched_clock(); | ||
1652 | if (unlikely(!current->time_slice)) { | ||
1653 | /* | ||
1654 | * This case is rare, it happens when the parent has only | ||
1655 | * a single jiffy left from its timeslice. Taking the | ||
1656 | * runqueue lock is not a problem. | ||
1657 | */ | ||
1658 | current->time_slice = 1; | ||
1659 | task_running_tick(cpu_rq(cpu), current); | ||
1660 | } | ||
1661 | local_irq_enable(); | ||
1662 | put_cpu(); | 1681 | put_cpu(); |
1663 | } | 1682 | } |
1664 | 1683 | ||
1665 | /* | 1684 | /* |
1685 | * After fork, child runs first. (default) If set to 0 then | ||
1686 | * parent will (try to) run first. | ||
1687 | */ | ||
1688 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | ||
1689 | |||
1690 | /* | ||
1666 | * wake_up_new_task - wake up a newly created task for the first time. | 1691 | * wake_up_new_task - wake up a newly created task for the first time. |
1667 | * | 1692 | * |
1668 | * This function will do some initial scheduler statistics housekeeping | 1693 | * This function will do some initial scheduler statistics housekeeping |
@@ -1671,77 +1696,28 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
1671 | */ | 1696 | */ |
1672 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 1697 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
1673 | { | 1698 | { |
1674 | struct rq *rq, *this_rq; | ||
1675 | unsigned long flags; | 1699 | unsigned long flags; |
1676 | int this_cpu, cpu; | 1700 | struct rq *rq; |
1701 | int this_cpu; | ||
1677 | 1702 | ||
1678 | rq = task_rq_lock(p, &flags); | 1703 | rq = task_rq_lock(p, &flags); |
1679 | BUG_ON(p->state != TASK_RUNNING); | 1704 | BUG_ON(p->state != TASK_RUNNING); |
1680 | this_cpu = smp_processor_id(); | 1705 | this_cpu = smp_processor_id(); /* parent's CPU */ |
1681 | cpu = task_cpu(p); | ||
1682 | |||
1683 | /* | ||
1684 | * We decrease the sleep average of forking parents | ||
1685 | * and children as well, to keep max-interactive tasks | ||
1686 | * from forking tasks that are max-interactive. The parent | ||
1687 | * (current) is done further down, under its lock. | ||
1688 | */ | ||
1689 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | ||
1690 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
1691 | 1706 | ||
1692 | p->prio = effective_prio(p); | 1707 | p->prio = effective_prio(p); |
1693 | 1708 | ||
1694 | if (likely(cpu == this_cpu)) { | 1709 | if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || |
1695 | if (!(clone_flags & CLONE_VM)) { | 1710 | task_cpu(p) != this_cpu || !current->se.on_rq) { |
1696 | /* | 1711 | activate_task(rq, p, 0); |
1697 | * The VM isn't cloned, so we're in a good position to | ||
1698 | * do child-runs-first in anticipation of an exec. This | ||
1699 | * usually avoids a lot of COW overhead. | ||
1700 | */ | ||
1701 | if (unlikely(!current->array)) | ||
1702 | __activate_task(p, rq); | ||
1703 | else { | ||
1704 | p->prio = current->prio; | ||
1705 | p->normal_prio = current->normal_prio; | ||
1706 | list_add_tail(&p->run_list, ¤t->run_list); | ||
1707 | p->array = current->array; | ||
1708 | p->array->nr_active++; | ||
1709 | inc_nr_running(p, rq); | ||
1710 | } | ||
1711 | set_need_resched(); | ||
1712 | } else | ||
1713 | /* Run child last */ | ||
1714 | __activate_task(p, rq); | ||
1715 | /* | ||
1716 | * We skip the following code due to cpu == this_cpu | ||
1717 | * | ||
1718 | * task_rq_unlock(rq, &flags); | ||
1719 | * this_rq = task_rq_lock(current, &flags); | ||
1720 | */ | ||
1721 | this_rq = rq; | ||
1722 | } else { | 1712 | } else { |
1723 | this_rq = cpu_rq(this_cpu); | ||
1724 | |||
1725 | /* | 1713 | /* |
1726 | * Not the local CPU - must adjust timestamp. This should | 1714 | * Let the scheduling class do new task startup |
1727 | * get optimised away in the !CONFIG_SMP case. | 1715 | * management (if any): |
1728 | */ | 1716 | */ |
1729 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) | 1717 | p->sched_class->task_new(rq, p); |
1730 | + rq->most_recent_timestamp; | ||
1731 | __activate_task(p, rq); | ||
1732 | if (TASK_PREEMPTS_CURR(p, rq)) | ||
1733 | resched_task(rq->curr); | ||
1734 | |||
1735 | /* | ||
1736 | * Parent and child are on different CPUs, now get the | ||
1737 | * parent runqueue to update the parent's ->sleep_avg: | ||
1738 | */ | ||
1739 | task_rq_unlock(rq, &flags); | ||
1740 | this_rq = task_rq_lock(current, &flags); | ||
1741 | } | 1718 | } |
1742 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | 1719 | check_preempt_curr(rq, p); |
1743 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | 1720 | task_rq_unlock(rq, &flags); |
1744 | task_rq_unlock(this_rq, &flags); | ||
1745 | } | 1721 | } |
1746 | 1722 | ||
1747 | /** | 1723 | /** |
@@ -1833,13 +1809,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
1833 | * context_switch - switch to the new MM and the new | 1809 | * context_switch - switch to the new MM and the new |
1834 | * thread's register state. | 1810 | * thread's register state. |
1835 | */ | 1811 | */ |
1836 | static inline struct task_struct * | 1812 | static inline void |
1837 | context_switch(struct rq *rq, struct task_struct *prev, | 1813 | context_switch(struct rq *rq, struct task_struct *prev, |
1838 | struct task_struct *next) | 1814 | struct task_struct *next) |
1839 | { | 1815 | { |
1840 | struct mm_struct *mm = next->mm; | 1816 | struct mm_struct *mm, *oldmm; |
1841 | struct mm_struct *oldmm = prev->active_mm; | ||
1842 | 1817 | ||
1818 | prepare_task_switch(rq, next); | ||
1819 | mm = next->mm; | ||
1820 | oldmm = prev->active_mm; | ||
1843 | /* | 1821 | /* |
1844 | * For paravirt, this is coupled with an exit in switch_to to | 1822 | * For paravirt, this is coupled with an exit in switch_to to |
1845 | * combine the page table reload and the switch backend into | 1823 | * combine the page table reload and the switch backend into |
@@ -1847,16 +1825,15 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1847 | */ | 1825 | */ |
1848 | arch_enter_lazy_cpu_mode(); | 1826 | arch_enter_lazy_cpu_mode(); |
1849 | 1827 | ||
1850 | if (!mm) { | 1828 | if (unlikely(!mm)) { |
1851 | next->active_mm = oldmm; | 1829 | next->active_mm = oldmm; |
1852 | atomic_inc(&oldmm->mm_count); | 1830 | atomic_inc(&oldmm->mm_count); |
1853 | enter_lazy_tlb(oldmm, next); | 1831 | enter_lazy_tlb(oldmm, next); |
1854 | } else | 1832 | } else |
1855 | switch_mm(oldmm, mm, next); | 1833 | switch_mm(oldmm, mm, next); |
1856 | 1834 | ||
1857 | if (!prev->mm) { | 1835 | if (unlikely(!prev->mm)) { |
1858 | prev->active_mm = NULL; | 1836 | prev->active_mm = NULL; |
1859 | WARN_ON(rq->prev_mm); | ||
1860 | rq->prev_mm = oldmm; | 1837 | rq->prev_mm = oldmm; |
1861 | } | 1838 | } |
1862 | /* | 1839 | /* |
@@ -1872,7 +1849,13 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1872 | /* Here we just switch the register state and the stack. */ | 1849 | /* Here we just switch the register state and the stack. */ |
1873 | switch_to(prev, next, prev); | 1850 | switch_to(prev, next, prev); |
1874 | 1851 | ||
1875 | return prev; | 1852 | barrier(); |
1853 | /* | ||
1854 | * this_rq must be evaluated again because prev may have moved | ||
1855 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
1856 | * frame will be invalid. | ||
1857 | */ | ||
1858 | finish_task_switch(this_rq(), prev); | ||
1876 | } | 1859 | } |
1877 | 1860 | ||
1878 | /* | 1861 | /* |
@@ -1945,17 +1928,65 @@ unsigned long nr_active(void) | |||
1945 | return running + uninterruptible; | 1928 | return running + uninterruptible; |
1946 | } | 1929 | } |
1947 | 1930 | ||
1948 | #ifdef CONFIG_SMP | ||
1949 | |||
1950 | /* | 1931 | /* |
1951 | * Is this task likely cache-hot: | 1932 | * Update rq->cpu_load[] statistics. This function is usually called every |
1933 | * scheduler tick (TICK_NSEC). | ||
1952 | */ | 1934 | */ |
1953 | static inline int | 1935 | static void update_cpu_load(struct rq *this_rq) |
1954 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | ||
1955 | { | 1936 | { |
1956 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | 1937 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; |
1938 | unsigned long total_load = this_rq->ls.load.weight; | ||
1939 | unsigned long this_load = total_load; | ||
1940 | struct load_stat *ls = &this_rq->ls; | ||
1941 | u64 now = __rq_clock(this_rq); | ||
1942 | int i, scale; | ||
1943 | |||
1944 | this_rq->nr_load_updates++; | ||
1945 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | ||
1946 | goto do_avg; | ||
1947 | |||
1948 | /* Update delta_fair/delta_exec fields first */ | ||
1949 | update_curr_load(this_rq, now); | ||
1950 | |||
1951 | fair_delta64 = ls->delta_fair + 1; | ||
1952 | ls->delta_fair = 0; | ||
1953 | |||
1954 | exec_delta64 = ls->delta_exec + 1; | ||
1955 | ls->delta_exec = 0; | ||
1956 | |||
1957 | sample_interval64 = now - ls->load_update_last; | ||
1958 | ls->load_update_last = now; | ||
1959 | |||
1960 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | ||
1961 | sample_interval64 = TICK_NSEC; | ||
1962 | |||
1963 | if (exec_delta64 > sample_interval64) | ||
1964 | exec_delta64 = sample_interval64; | ||
1965 | |||
1966 | idle_delta64 = sample_interval64 - exec_delta64; | ||
1967 | |||
1968 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | ||
1969 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | ||
1970 | |||
1971 | this_load = (unsigned long)tmp64; | ||
1972 | |||
1973 | do_avg: | ||
1974 | |||
1975 | /* Update our load: */ | ||
1976 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
1977 | unsigned long old_load, new_load; | ||
1978 | |||
1979 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
1980 | |||
1981 | old_load = this_rq->cpu_load[i]; | ||
1982 | new_load = this_load; | ||
1983 | |||
1984 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | ||
1985 | } | ||
1957 | } | 1986 | } |
1958 | 1987 | ||
1988 | #ifdef CONFIG_SMP | ||
1989 | |||
1959 | /* | 1990 | /* |
1960 | * double_rq_lock - safely lock two runqueues | 1991 | * double_rq_lock - safely lock two runqueues |
1961 | * | 1992 | * |
@@ -2072,23 +2103,17 @@ void sched_exec(void) | |||
2072 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2103 | * pull_task - move a task from a remote runqueue to the local runqueue. |
2073 | * Both runqueues must be locked. | 2104 | * Both runqueues must be locked. |
2074 | */ | 2105 | */ |
2075 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, | 2106 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
2076 | struct task_struct *p, struct rq *this_rq, | 2107 | struct rq *this_rq, int this_cpu) |
2077 | struct prio_array *this_array, int this_cpu) | ||
2078 | { | 2108 | { |
2079 | dequeue_task(p, src_array); | 2109 | deactivate_task(src_rq, p, 0); |
2080 | dec_nr_running(p, src_rq); | ||
2081 | set_task_cpu(p, this_cpu); | 2110 | set_task_cpu(p, this_cpu); |
2082 | inc_nr_running(p, this_rq); | 2111 | activate_task(this_rq, p, 0); |
2083 | enqueue_task(p, this_array); | ||
2084 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) | ||
2085 | + this_rq->most_recent_timestamp; | ||
2086 | /* | 2112 | /* |
2087 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2113 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2088 | * to be always true for them. | 2114 | * to be always true for them. |
2089 | */ | 2115 | */ |
2090 | if (TASK_PREEMPTS_CURR(p, this_rq)) | 2116 | check_preempt_curr(this_rq, p); |
2091 | resched_task(this_rq->curr); | ||
2092 | } | 2117 | } |
2093 | 2118 | ||
2094 | /* | 2119 | /* |
@@ -2113,132 +2138,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2113 | return 0; | 2138 | return 0; |
2114 | 2139 | ||
2115 | /* | 2140 | /* |
2116 | * Aggressive migration if: | 2141 | * Aggressive migration if too many balance attempts have failed: |
2117 | * 1) task is cache cold, or | ||
2118 | * 2) too many balance attempts have failed. | ||
2119 | */ | 2142 | */ |
2120 | 2143 | if (sd->nr_balance_failed > sd->cache_nice_tries) | |
2121 | if (sd->nr_balance_failed > sd->cache_nice_tries) { | ||
2122 | #ifdef CONFIG_SCHEDSTATS | ||
2123 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
2124 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2125 | #endif | ||
2126 | return 1; | 2144 | return 1; |
2127 | } | ||
2128 | 2145 | ||
2129 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
2130 | return 0; | ||
2131 | return 1; | 2146 | return 1; |
2132 | } | 2147 | } |
2133 | 2148 | ||
2134 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | 2149 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2135 | |||
2136 | /* | ||
2137 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
2138 | * load from busiest to this_rq, as part of a balancing operation within | ||
2139 | * "domain". Returns the number of tasks moved. | ||
2140 | * | ||
2141 | * Called with both runqueues locked. | ||
2142 | */ | ||
2143 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2144 | unsigned long max_nr_move, unsigned long max_load_move, | 2150 | unsigned long max_nr_move, unsigned long max_load_move, |
2145 | struct sched_domain *sd, enum cpu_idle_type idle, | 2151 | struct sched_domain *sd, enum cpu_idle_type idle, |
2146 | int *all_pinned) | 2152 | int *all_pinned, unsigned long *load_moved, |
2153 | int this_best_prio, int best_prio, int best_prio_seen, | ||
2154 | struct rq_iterator *iterator) | ||
2147 | { | 2155 | { |
2148 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, | 2156 | int pulled = 0, pinned = 0, skip_for_load; |
2149 | best_prio_seen, skip_for_load; | 2157 | struct task_struct *p; |
2150 | struct prio_array *array, *dst_array; | 2158 | long rem_load_move = max_load_move; |
2151 | struct list_head *head, *curr; | ||
2152 | struct task_struct *tmp; | ||
2153 | long rem_load_move; | ||
2154 | 2159 | ||
2155 | if (max_nr_move == 0 || max_load_move == 0) | 2160 | if (max_nr_move == 0 || max_load_move == 0) |
2156 | goto out; | 2161 | goto out; |
2157 | 2162 | ||
2158 | rem_load_move = max_load_move; | ||
2159 | pinned = 1; | 2163 | pinned = 1; |
2160 | this_best_prio = rq_best_prio(this_rq); | ||
2161 | best_prio = rq_best_prio(busiest); | ||
2162 | /* | ||
2163 | * Enable handling of the case where there is more than one task | ||
2164 | * with the best priority. If the current running task is one | ||
2165 | * of those with prio==best_prio we know it won't be moved | ||
2166 | * and therefore it's safe to override the skip (based on load) of | ||
2167 | * any task we find with that prio. | ||
2168 | */ | ||
2169 | best_prio_seen = best_prio == busiest->curr->prio; | ||
2170 | 2164 | ||
2171 | /* | 2165 | /* |
2172 | * We first consider expired tasks. Those will likely not be | 2166 | * Start the load-balancing iterator: |
2173 | * executed in the near future, and they are most likely to | ||
2174 | * be cache-cold, thus switching CPUs has the least effect | ||
2175 | * on them. | ||
2176 | */ | 2167 | */ |
2177 | if (busiest->expired->nr_active) { | 2168 | p = iterator->start(iterator->arg); |
2178 | array = busiest->expired; | 2169 | next: |
2179 | dst_array = this_rq->expired; | 2170 | if (!p) |
2180 | } else { | ||
2181 | array = busiest->active; | ||
2182 | dst_array = this_rq->active; | ||
2183 | } | ||
2184 | |||
2185 | new_array: | ||
2186 | /* Start searching at priority 0: */ | ||
2187 | idx = 0; | ||
2188 | skip_bitmap: | ||
2189 | if (!idx) | ||
2190 | idx = sched_find_first_bit(array->bitmap); | ||
2191 | else | ||
2192 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | ||
2193 | if (idx >= MAX_PRIO) { | ||
2194 | if (array == busiest->expired && busiest->active->nr_active) { | ||
2195 | array = busiest->active; | ||
2196 | dst_array = this_rq->active; | ||
2197 | goto new_array; | ||
2198 | } | ||
2199 | goto out; | 2171 | goto out; |
2200 | } | ||
2201 | |||
2202 | head = array->queue + idx; | ||
2203 | curr = head->prev; | ||
2204 | skip_queue: | ||
2205 | tmp = list_entry(curr, struct task_struct, run_list); | ||
2206 | |||
2207 | curr = curr->prev; | ||
2208 | |||
2209 | /* | 2172 | /* |
2210 | * To help distribute high priority tasks accross CPUs we don't | 2173 | * To help distribute high priority tasks accross CPUs we don't |
2211 | * skip a task if it will be the highest priority task (i.e. smallest | 2174 | * skip a task if it will be the highest priority task (i.e. smallest |
2212 | * prio value) on its new queue regardless of its load weight | 2175 | * prio value) on its new queue regardless of its load weight |
2213 | */ | 2176 | */ |
2214 | skip_for_load = tmp->load_weight > rem_load_move; | 2177 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + |
2215 | if (skip_for_load && idx < this_best_prio) | 2178 | SCHED_LOAD_SCALE_FUZZ; |
2216 | skip_for_load = !best_prio_seen && idx == best_prio; | 2179 | if (skip_for_load && p->prio < this_best_prio) |
2180 | skip_for_load = !best_prio_seen && p->prio == best_prio; | ||
2217 | if (skip_for_load || | 2181 | if (skip_for_load || |
2218 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2182 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2219 | 2183 | ||
2220 | best_prio_seen |= idx == best_prio; | 2184 | best_prio_seen |= p->prio == best_prio; |
2221 | if (curr != head) | 2185 | p = iterator->next(iterator->arg); |
2222 | goto skip_queue; | 2186 | goto next; |
2223 | idx++; | ||
2224 | goto skip_bitmap; | ||
2225 | } | 2187 | } |
2226 | 2188 | ||
2227 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2189 | pull_task(busiest, p, this_rq, this_cpu); |
2228 | pulled++; | 2190 | pulled++; |
2229 | rem_load_move -= tmp->load_weight; | 2191 | rem_load_move -= p->se.load.weight; |
2230 | 2192 | ||
2231 | /* | 2193 | /* |
2232 | * We only want to steal up to the prescribed number of tasks | 2194 | * We only want to steal up to the prescribed number of tasks |
2233 | * and the prescribed amount of weighted load. | 2195 | * and the prescribed amount of weighted load. |
2234 | */ | 2196 | */ |
2235 | if (pulled < max_nr_move && rem_load_move > 0) { | 2197 | if (pulled < max_nr_move && rem_load_move > 0) { |
2236 | if (idx < this_best_prio) | 2198 | if (p->prio < this_best_prio) |
2237 | this_best_prio = idx; | 2199 | this_best_prio = p->prio; |
2238 | if (curr != head) | 2200 | p = iterator->next(iterator->arg); |
2239 | goto skip_queue; | 2201 | goto next; |
2240 | idx++; | ||
2241 | goto skip_bitmap; | ||
2242 | } | 2202 | } |
2243 | out: | 2203 | out: |
2244 | /* | 2204 | /* |
@@ -2250,18 +2210,48 @@ out: | |||
2250 | 2210 | ||
2251 | if (all_pinned) | 2211 | if (all_pinned) |
2252 | *all_pinned = pinned; | 2212 | *all_pinned = pinned; |
2213 | *load_moved = max_load_move - rem_load_move; | ||
2253 | return pulled; | 2214 | return pulled; |
2254 | } | 2215 | } |
2255 | 2216 | ||
2256 | /* | 2217 | /* |
2218 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
2219 | * load from busiest to this_rq, as part of a balancing operation within | ||
2220 | * "domain". Returns the number of tasks moved. | ||
2221 | * | ||
2222 | * Called with both runqueues locked. | ||
2223 | */ | ||
2224 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2225 | unsigned long max_nr_move, unsigned long max_load_move, | ||
2226 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
2227 | int *all_pinned) | ||
2228 | { | ||
2229 | struct sched_class *class = sched_class_highest; | ||
2230 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | ||
2231 | long rem_load_move = max_load_move; | ||
2232 | |||
2233 | do { | ||
2234 | nr_moved = class->load_balance(this_rq, this_cpu, busiest, | ||
2235 | max_nr_move, (unsigned long)rem_load_move, | ||
2236 | sd, idle, all_pinned, &load_moved); | ||
2237 | total_nr_moved += nr_moved; | ||
2238 | max_nr_move -= nr_moved; | ||
2239 | rem_load_move -= load_moved; | ||
2240 | class = class->next; | ||
2241 | } while (class && max_nr_move && rem_load_move > 0); | ||
2242 | |||
2243 | return total_nr_moved; | ||
2244 | } | ||
2245 | |||
2246 | /* | ||
2257 | * find_busiest_group finds and returns the busiest CPU group within the | 2247 | * find_busiest_group finds and returns the busiest CPU group within the |
2258 | * domain. It calculates and returns the amount of weighted load which | 2248 | * domain. It calculates and returns the amount of weighted load which |
2259 | * should be moved to restore balance via the imbalance parameter. | 2249 | * should be moved to restore balance via the imbalance parameter. |
2260 | */ | 2250 | */ |
2261 | static struct sched_group * | 2251 | static struct sched_group * |
2262 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2252 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2263 | unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, | 2253 | unsigned long *imbalance, enum cpu_idle_type idle, |
2264 | cpumask_t *cpus, int *balance) | 2254 | int *sd_idle, cpumask_t *cpus, int *balance) |
2265 | { | 2255 | { |
2266 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2256 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2267 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2257 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
@@ -2325,7 +2315,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2325 | 2315 | ||
2326 | avg_load += load; | 2316 | avg_load += load; |
2327 | sum_nr_running += rq->nr_running; | 2317 | sum_nr_running += rq->nr_running; |
2328 | sum_weighted_load += rq->raw_weighted_load; | 2318 | sum_weighted_load += weighted_cpuload(i); |
2329 | } | 2319 | } |
2330 | 2320 | ||
2331 | /* | 2321 | /* |
@@ -2365,8 +2355,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2365 | * Busy processors will not participate in power savings | 2355 | * Busy processors will not participate in power savings |
2366 | * balance. | 2356 | * balance. |
2367 | */ | 2357 | */ |
2368 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2358 | if (idle == CPU_NOT_IDLE || |
2369 | goto group_next; | 2359 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2360 | goto group_next; | ||
2370 | 2361 | ||
2371 | /* | 2362 | /* |
2372 | * If the local group is idle or completely loaded | 2363 | * If the local group is idle or completely loaded |
@@ -2376,42 +2367,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2376 | !this_nr_running)) | 2367 | !this_nr_running)) |
2377 | power_savings_balance = 0; | 2368 | power_savings_balance = 0; |
2378 | 2369 | ||
2379 | /* | 2370 | /* |
2380 | * If a group is already running at full capacity or idle, | 2371 | * If a group is already running at full capacity or idle, |
2381 | * don't include that group in power savings calculations | 2372 | * don't include that group in power savings calculations |
2382 | */ | 2373 | */ |
2383 | if (!power_savings_balance || sum_nr_running >= group_capacity | 2374 | if (!power_savings_balance || sum_nr_running >= group_capacity |
2384 | || !sum_nr_running) | 2375 | || !sum_nr_running) |
2385 | goto group_next; | 2376 | goto group_next; |
2386 | 2377 | ||
2387 | /* | 2378 | /* |
2388 | * Calculate the group which has the least non-idle load. | 2379 | * Calculate the group which has the least non-idle load. |
2389 | * This is the group from where we need to pick up the load | 2380 | * This is the group from where we need to pick up the load |
2390 | * for saving power | 2381 | * for saving power |
2391 | */ | 2382 | */ |
2392 | if ((sum_nr_running < min_nr_running) || | 2383 | if ((sum_nr_running < min_nr_running) || |
2393 | (sum_nr_running == min_nr_running && | 2384 | (sum_nr_running == min_nr_running && |
2394 | first_cpu(group->cpumask) < | 2385 | first_cpu(group->cpumask) < |
2395 | first_cpu(group_min->cpumask))) { | 2386 | first_cpu(group_min->cpumask))) { |
2396 | group_min = group; | 2387 | group_min = group; |
2397 | min_nr_running = sum_nr_running; | 2388 | min_nr_running = sum_nr_running; |
2398 | min_load_per_task = sum_weighted_load / | 2389 | min_load_per_task = sum_weighted_load / |
2399 | sum_nr_running; | 2390 | sum_nr_running; |
2400 | } | 2391 | } |
2401 | 2392 | ||
2402 | /* | 2393 | /* |
2403 | * Calculate the group which is almost near its | 2394 | * Calculate the group which is almost near its |
2404 | * capacity but still has some space to pick up some load | 2395 | * capacity but still has some space to pick up some load |
2405 | * from other group and save more power | 2396 | * from other group and save more power |
2406 | */ | 2397 | */ |
2407 | if (sum_nr_running <= group_capacity - 1) { | 2398 | if (sum_nr_running <= group_capacity - 1) { |
2408 | if (sum_nr_running > leader_nr_running || | 2399 | if (sum_nr_running > leader_nr_running || |
2409 | (sum_nr_running == leader_nr_running && | 2400 | (sum_nr_running == leader_nr_running && |
2410 | first_cpu(group->cpumask) > | 2401 | first_cpu(group->cpumask) > |
2411 | first_cpu(group_leader->cpumask))) { | 2402 | first_cpu(group_leader->cpumask))) { |
2412 | group_leader = group; | 2403 | group_leader = group; |
2413 | leader_nr_running = sum_nr_running; | 2404 | leader_nr_running = sum_nr_running; |
2414 | } | 2405 | } |
2415 | } | 2406 | } |
2416 | group_next: | 2407 | group_next: |
2417 | #endif | 2408 | #endif |
@@ -2466,7 +2457,7 @@ group_next: | |||
2466 | * a think about bumping its value to force at least one task to be | 2457 | * a think about bumping its value to force at least one task to be |
2467 | * moved | 2458 | * moved |
2468 | */ | 2459 | */ |
2469 | if (*imbalance < busiest_load_per_task) { | 2460 | if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { |
2470 | unsigned long tmp, pwr_now, pwr_move; | 2461 | unsigned long tmp, pwr_now, pwr_move; |
2471 | unsigned int imbn; | 2462 | unsigned int imbn; |
2472 | 2463 | ||
@@ -2480,7 +2471,8 @@ small_imbalance: | |||
2480 | } else | 2471 | } else |
2481 | this_load_per_task = SCHED_LOAD_SCALE; | 2472 | this_load_per_task = SCHED_LOAD_SCALE; |
2482 | 2473 | ||
2483 | if (max_load - this_load >= busiest_load_per_task * imbn) { | 2474 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= |
2475 | busiest_load_per_task * imbn) { | ||
2484 | *imbalance = busiest_load_per_task; | 2476 | *imbalance = busiest_load_per_task; |
2485 | return busiest; | 2477 | return busiest; |
2486 | } | 2478 | } |
@@ -2552,17 +2544,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2552 | int i; | 2544 | int i; |
2553 | 2545 | ||
2554 | for_each_cpu_mask(i, group->cpumask) { | 2546 | for_each_cpu_mask(i, group->cpumask) { |
2547 | unsigned long wl; | ||
2555 | 2548 | ||
2556 | if (!cpu_isset(i, *cpus)) | 2549 | if (!cpu_isset(i, *cpus)) |
2557 | continue; | 2550 | continue; |
2558 | 2551 | ||
2559 | rq = cpu_rq(i); | 2552 | rq = cpu_rq(i); |
2553 | wl = weighted_cpuload(i); | ||
2560 | 2554 | ||
2561 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) | 2555 | if (rq->nr_running == 1 && wl > imbalance) |
2562 | continue; | 2556 | continue; |
2563 | 2557 | ||
2564 | if (rq->raw_weighted_load > max_load) { | 2558 | if (wl > max_load) { |
2565 | max_load = rq->raw_weighted_load; | 2559 | max_load = wl; |
2566 | busiest = rq; | 2560 | busiest = rq; |
2567 | } | 2561 | } |
2568 | } | 2562 | } |
@@ -2599,7 +2593,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2599 | /* | 2593 | /* |
2600 | * When power savings policy is enabled for the parent domain, idle | 2594 | * When power savings policy is enabled for the parent domain, idle |
2601 | * sibling can pick up load irrespective of busy siblings. In this case, | 2595 | * sibling can pick up load irrespective of busy siblings. In this case, |
2602 | * let the state of idle sibling percolate up as IDLE, instead of | 2596 | * let the state of idle sibling percolate up as CPU_IDLE, instead of |
2603 | * portraying it as CPU_NOT_IDLE. | 2597 | * portraying it as CPU_NOT_IDLE. |
2604 | */ | 2598 | */ |
2605 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | 2599 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
@@ -2822,8 +2816,8 @@ out_balanced: | |||
2822 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2816 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2823 | { | 2817 | { |
2824 | struct sched_domain *sd; | 2818 | struct sched_domain *sd; |
2825 | int pulled_task = 0; | 2819 | int pulled_task = -1; |
2826 | unsigned long next_balance = jiffies + 60 * HZ; | 2820 | unsigned long next_balance = jiffies + HZ; |
2827 | 2821 | ||
2828 | for_each_domain(this_cpu, sd) { | 2822 | for_each_domain(this_cpu, sd) { |
2829 | unsigned long interval; | 2823 | unsigned long interval; |
@@ -2842,12 +2836,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
2842 | if (pulled_task) | 2836 | if (pulled_task) |
2843 | break; | 2837 | break; |
2844 | } | 2838 | } |
2845 | if (!pulled_task) | 2839 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
2846 | /* | 2840 | /* |
2847 | * We are going idle. next_balance may be set based on | 2841 | * We are going idle. next_balance may be set based on |
2848 | * a busy processor. So reset next_balance. | 2842 | * a busy processor. So reset next_balance. |
2849 | */ | 2843 | */ |
2850 | this_rq->next_balance = next_balance; | 2844 | this_rq->next_balance = next_balance; |
2845 | } | ||
2851 | } | 2846 | } |
2852 | 2847 | ||
2853 | /* | 2848 | /* |
@@ -2900,32 +2895,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2900 | spin_unlock(&target_rq->lock); | 2895 | spin_unlock(&target_rq->lock); |
2901 | } | 2896 | } |
2902 | 2897 | ||
2903 | static void update_load(struct rq *this_rq) | ||
2904 | { | ||
2905 | unsigned long this_load; | ||
2906 | unsigned int i, scale; | ||
2907 | |||
2908 | this_load = this_rq->raw_weighted_load; | ||
2909 | |||
2910 | /* Update our load: */ | ||
2911 | for (i = 0, scale = 1; i < 3; i++, scale += scale) { | ||
2912 | unsigned long old_load, new_load; | ||
2913 | |||
2914 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
2915 | |||
2916 | old_load = this_rq->cpu_load[i]; | ||
2917 | new_load = this_load; | ||
2918 | /* | ||
2919 | * Round up the averaging division if load is increasing. This | ||
2920 | * prevents us from getting stuck on 9 if the load is 10, for | ||
2921 | * example. | ||
2922 | */ | ||
2923 | if (new_load > old_load) | ||
2924 | new_load += scale-1; | ||
2925 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | ||
2926 | } | ||
2927 | } | ||
2928 | |||
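update_load() is deleted here; the rewritten scheduler_tick() further down calls update_cpu_load() instead. The averaging rule itself is worth keeping in view, restated as a standalone helper:

```c
/*
 * cpu_load[i] is an exponential average whose smoothing grows with i
 * (scale == 1 << i). Rounding up when the load rises lets the average
 * actually reach the new, higher steady value instead of sticking just
 * below it (the "stuck on 9 if the load is 10" case noted above).
 */
static unsigned long decay_cpu_load(unsigned long old_load,
				    unsigned long new_load, unsigned int i)
{
	unsigned long scale = 1UL << i;

	if (new_load > old_load)
		new_load += scale - 1;

	return (old_load * (scale - 1) + new_load) >> i;
}
```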
2929 | #ifdef CONFIG_NO_HZ | 2898 | #ifdef CONFIG_NO_HZ |
2930 | static struct { | 2899 | static struct { |
2931 | atomic_t load_balancer; | 2900 | atomic_t load_balancer; |
@@ -3029,6 +2998,9 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3029 | interval = msecs_to_jiffies(interval); | 2998 | interval = msecs_to_jiffies(interval); |
3030 | if (unlikely(!interval)) | 2999 | if (unlikely(!interval)) |
3031 | interval = 1; | 3000 | interval = 1; |
3001 | if (interval > HZ*NR_CPUS/10) | ||
3002 | interval = HZ*NR_CPUS/10; | ||
3003 | |||
3032 | 3004 | ||
3033 | if (sd->flags & SD_SERIALIZE) { | 3005 | if (sd->flags & SD_SERIALIZE) { |
3034 | if (!spin_trylock(&balancing)) | 3006 | if (!spin_trylock(&balancing)) |
@@ -3070,11 +3042,12 @@ out: | |||
3070 | */ | 3042 | */ |
3071 | static void run_rebalance_domains(struct softirq_action *h) | 3043 | static void run_rebalance_domains(struct softirq_action *h) |
3072 | { | 3044 | { |
3073 | int local_cpu = smp_processor_id(); | 3045 | int this_cpu = smp_processor_id(); |
3074 | struct rq *local_rq = cpu_rq(local_cpu); | 3046 | struct rq *this_rq = cpu_rq(this_cpu); |
3075 | enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; | 3047 | enum cpu_idle_type idle = this_rq->idle_at_tick ? |
3048 | CPU_IDLE : CPU_NOT_IDLE; | ||
3076 | 3049 | ||
3077 | rebalance_domains(local_cpu, idle); | 3050 | rebalance_domains(this_cpu, idle); |
3078 | 3051 | ||
3079 | #ifdef CONFIG_NO_HZ | 3052 | #ifdef CONFIG_NO_HZ |
3080 | /* | 3053 | /* |
@@ -3082,13 +3055,13 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3082 | * balancing on behalf of the other idle cpus whose ticks are | 3055 | * balancing on behalf of the other idle cpus whose ticks are |
3083 | * stopped. | 3056 | * stopped. |
3084 | */ | 3057 | */ |
3085 | if (local_rq->idle_at_tick && | 3058 | if (this_rq->idle_at_tick && |
3086 | atomic_read(&nohz.load_balancer) == local_cpu) { | 3059 | atomic_read(&nohz.load_balancer) == this_cpu) { |
3087 | cpumask_t cpus = nohz.cpu_mask; | 3060 | cpumask_t cpus = nohz.cpu_mask; |
3088 | struct rq *rq; | 3061 | struct rq *rq; |
3089 | int balance_cpu; | 3062 | int balance_cpu; |
3090 | 3063 | ||
3091 | cpu_clear(local_cpu, cpus); | 3064 | cpu_clear(this_cpu, cpus); |
3092 | for_each_cpu_mask(balance_cpu, cpus) { | 3065 | for_each_cpu_mask(balance_cpu, cpus) { |
3093 | /* | 3066 | /* |
3094 | * If this cpu gets work to do, stop the load balancing | 3067 | * If this cpu gets work to do, stop the load balancing |
@@ -3098,11 +3071,11 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3098 | if (need_resched()) | 3071 | if (need_resched()) |
3099 | break; | 3072 | break; |
3100 | 3073 | ||
3101 | rebalance_domains(balance_cpu, CPU_IDLE); | 3074 | rebalance_domains(balance_cpu, SCHED_IDLE); |
3102 | 3075 | ||
3103 | rq = cpu_rq(balance_cpu); | 3076 | rq = cpu_rq(balance_cpu); |
3104 | if (time_after(local_rq->next_balance, rq->next_balance)) | 3077 | if (time_after(this_rq->next_balance, rq->next_balance)) |
3105 | local_rq->next_balance = rq->next_balance; | 3078 | this_rq->next_balance = rq->next_balance; |
3106 | } | 3079 | } |
3107 | } | 3080 | } |
3108 | #endif | 3081 | #endif |
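A standalone model of the NO_HZ loop above, with a plain 64-bit mask standing in for cpumask_t and the need_resched() early exit left out:

```c
#include <stdint.h>

#define NR_CPUS_MODEL 64

/* time_after(a, b): true if a is later than b, wrap-safe like jiffies. */
static int after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

/*
 * The elected idle load balancer walks every other tick-stopped CPU,
 * rebalances on its behalf, and keeps the earliest next_balance deadline
 * so it wakes up in time for it.
 */
static unsigned long nohz_balance(int this_cpu, uint64_t idle_mask,
				  const unsigned long *next_balance,
				  unsigned long this_next_balance,
				  void (*rebalance)(int cpu))
{
	int cpu;

	idle_mask &= ~(UINT64_C(1) << this_cpu);	/* cpu_clear() */

	for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++) {
		if (!(idle_mask & (UINT64_C(1) << cpu)))
			continue;

		rebalance(cpu);		/* rebalance_domains() in the kernel */

		if (after(this_next_balance, next_balance[cpu]))
			this_next_balance = next_balance[cpu];
	}
	return this_next_balance;
}
```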
@@ -3115,9 +3088,8 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3115 | * idle load balancing owner or decide to stop the periodic load balancing, | 3088 | * idle load balancing owner or decide to stop the periodic load balancing, |
3116 | * if the whole system is idle. | 3089 | * if the whole system is idle. |
3117 | */ | 3090 | */ |
3118 | static inline void trigger_load_balance(int cpu) | 3091 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3119 | { | 3092 | { |
3120 | struct rq *rq = cpu_rq(cpu); | ||
3121 | #ifdef CONFIG_NO_HZ | 3093 | #ifdef CONFIG_NO_HZ |
3122 | /* | 3094 | /* |
3123 | * If we were in the nohz mode recently and busy at the current | 3095 | * If we were in the nohz mode recently and busy at the current |
@@ -3169,13 +3141,29 @@ static inline void trigger_load_balance(int cpu) | |||
3169 | if (time_after_eq(jiffies, rq->next_balance)) | 3141 | if (time_after_eq(jiffies, rq->next_balance)) |
3170 | raise_softirq(SCHED_SOFTIRQ); | 3142 | raise_softirq(SCHED_SOFTIRQ); |
3171 | } | 3143 | } |
3172 | #else | 3144 | |
3145 | #else /* CONFIG_SMP */ | ||
3146 | |||
3173 | /* | 3147 | /* |
3174 | * on UP we do not need to balance between CPUs: | 3148 | * on UP we do not need to balance between CPUs: |
3175 | */ | 3149 | */ |
3176 | static inline void idle_balance(int cpu, struct rq *rq) | 3150 | static inline void idle_balance(int cpu, struct rq *rq) |
3177 | { | 3151 | { |
3178 | } | 3152 | } |
3153 | |||
3154 | /* Avoid "used but not defined" warning on UP */ | ||
3155 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3156 | unsigned long max_nr_move, unsigned long max_load_move, | ||
3157 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3158 | int *all_pinned, unsigned long *load_moved, | ||
3159 | int this_best_prio, int best_prio, int best_prio_seen, | ||
3160 | struct rq_iterator *iterator) | ||
3161 | { | ||
3162 | *load_moved = 0; | ||
3163 | |||
3164 | return 0; | ||
3165 | } | ||
3166 | |||
3179 | #endif | 3167 | #endif |
3180 | 3168 | ||
3181 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3169 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
@@ -3277,81 +3265,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
3277 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3265 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3278 | } | 3266 | } |
3279 | 3267 | ||
3280 | static void task_running_tick(struct rq *rq, struct task_struct *p) | ||
3281 | { | ||
3282 | if (p->array != rq->active) { | ||
3283 | /* Task has expired but was not scheduled yet */ | ||
3284 | set_tsk_need_resched(p); | ||
3285 | return; | ||
3286 | } | ||
3287 | spin_lock(&rq->lock); | ||
3288 | /* | ||
3289 | * The task was running during this tick - update the | ||
3290 | * time slice counter. Note: we do not update a thread's | ||
3291 | * priority until it either goes to sleep or uses up its | ||
3292 | * timeslice. This makes it possible for interactive tasks | ||
3293 | * to use up their timeslices at their highest priority levels. | ||
3294 | */ | ||
3295 | if (rt_task(p)) { | ||
3296 | /* | ||
3297 | * RR tasks need a special form of timeslice management. | ||
3298 | * FIFO tasks have no timeslices. | ||
3299 | */ | ||
3300 | if ((p->policy == SCHED_RR) && !--p->time_slice) { | ||
3301 | p->time_slice = task_timeslice(p); | ||
3302 | p->first_time_slice = 0; | ||
3303 | set_tsk_need_resched(p); | ||
3304 | |||
3305 | /* put it at the end of the queue: */ | ||
3306 | requeue_task(p, rq->active); | ||
3307 | } | ||
3308 | goto out_unlock; | ||
3309 | } | ||
3310 | if (!--p->time_slice) { | ||
3311 | dequeue_task(p, rq->active); | ||
3312 | set_tsk_need_resched(p); | ||
3313 | p->prio = effective_prio(p); | ||
3314 | p->time_slice = task_timeslice(p); | ||
3315 | p->first_time_slice = 0; | ||
3316 | |||
3317 | if (!rq->expired_timestamp) | ||
3318 | rq->expired_timestamp = jiffies; | ||
3319 | if (!TASK_INTERACTIVE(p)) { | ||
3320 | enqueue_task(p, rq->expired); | ||
3321 | if (p->static_prio < rq->best_expired_prio) | ||
3322 | rq->best_expired_prio = p->static_prio; | ||
3323 | } else | ||
3324 | enqueue_task(p, rq->active); | ||
3325 | } else { | ||
3326 | /* | ||
3327 | * Prevent a too long timeslice allowing a task to monopolize | ||
3328 | * the CPU. We do this by splitting up the timeslice into | ||
3329 | * smaller pieces. | ||
3330 | * | ||
3331 | * Note: this does not mean the task's timeslices expire or | ||
3332 | * get lost in any way, they just might be preempted by | ||
3333 | * another task of equal priority. (one with higher | ||
3334 | * priority would have preempted this task already.) We | ||
3335 | * requeue this task to the end of the list on this priority | ||
3336 | * level, which is in essence a round-robin of tasks with | ||
3337 | * equal priority. | ||
3338 | * | ||
3339 | * This only applies to tasks in the interactive | ||
3340 | * delta range with at least TIMESLICE_GRANULARITY to requeue. | ||
3341 | */ | ||
3342 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | ||
3343 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && | ||
3344 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | ||
3345 | (p->array == rq->active)) { | ||
3346 | |||
3347 | requeue_task(p, rq->active); | ||
3348 | set_tsk_need_resched(p); | ||
3349 | } | ||
3350 | } | ||
3351 | out_unlock: | ||
3352 | spin_unlock(&rq->lock); | ||
3353 | } | ||
3354 | |||
3355 | /* | 3268 | /* |
3356 | * This function gets called by the timer code, with HZ frequency. | 3269 | * This function gets called by the timer code, with HZ frequency. |
3357 | * We call it with interrupts disabled. | 3270 | * We call it with interrupts disabled. |
@@ -3361,17 +3274,19 @@ out_unlock: | |||
3361 | */ | 3274 | */ |
3362 | void scheduler_tick(void) | 3275 | void scheduler_tick(void) |
3363 | { | 3276 | { |
3364 | struct task_struct *p = current; | ||
3365 | int cpu = smp_processor_id(); | 3277 | int cpu = smp_processor_id(); |
3366 | int idle_at_tick = idle_cpu(cpu); | ||
3367 | struct rq *rq = cpu_rq(cpu); | 3278 | struct rq *rq = cpu_rq(cpu); |
3279 | struct task_struct *curr = rq->curr; | ||
3280 | |||
3281 | spin_lock(&rq->lock); | ||
3282 | if (curr != rq->idle) /* FIXME: needed? */ | ||
3283 | curr->sched_class->task_tick(rq, curr); | ||
3284 | update_cpu_load(rq); | ||
3285 | spin_unlock(&rq->lock); | ||
3368 | 3286 | ||
3369 | if (!idle_at_tick) | ||
3370 | task_running_tick(rq, p); | ||
3371 | #ifdef CONFIG_SMP | 3287 | #ifdef CONFIG_SMP |
3372 | update_load(rq); | 3288 | rq->idle_at_tick = idle_cpu(cpu); |
3373 | rq->idle_at_tick = idle_at_tick; | 3289 | trigger_load_balance(rq, cpu); |
3374 | trigger_load_balance(cpu); | ||
3375 | #endif | 3290 | #endif |
3376 | } | 3291 | } |
3377 | 3292 | ||
@@ -3414,140 +3329,128 @@ EXPORT_SYMBOL(sub_preempt_count); | |||
3414 | #endif | 3329 | #endif |
3415 | 3330 | ||
3416 | /* | 3331 | /* |
3417 | * schedule() is the main scheduler function. | 3332 | * Print scheduling while atomic bug: |
3418 | */ | 3333 | */ |
3419 | asmlinkage void __sched schedule(void) | 3334 | static noinline void __schedule_bug(struct task_struct *prev) |
3420 | { | 3335 | { |
3421 | struct task_struct *prev, *next; | 3336 | printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", |
3422 | struct prio_array *array; | 3337 | prev->comm, preempt_count(), prev->pid); |
3423 | struct list_head *queue; | 3338 | debug_show_held_locks(prev); |
3424 | unsigned long long now; | 3339 | if (irqs_disabled()) |
3425 | unsigned long run_time; | 3340 | print_irqtrace_events(prev); |
3426 | int cpu, idx; | 3341 | dump_stack(); |
3427 | long *switch_count; | 3342 | } |
3428 | struct rq *rq; | ||
3429 | 3343 | ||
3344 | /* | ||
3345 | * Various schedule()-time debugging checks and statistics: | ||
3346 | */ | ||
3347 | static inline void schedule_debug(struct task_struct *prev) | ||
3348 | { | ||
3430 | /* | 3349 | /* |
3431 | * Test if we are atomic. Since do_exit() needs to call into | 3350 | * Test if we are atomic. Since do_exit() needs to call into |
3432 | * schedule() atomically, we ignore that path for now. | 3351 | * schedule() atomically, we ignore that path for now. |
3433 | * Otherwise, whine if we are scheduling when we should not be. | 3352 | * Otherwise, whine if we are scheduling when we should not be. |
3434 | */ | 3353 | */ |
3435 | if (unlikely(in_atomic() && !current->exit_state)) { | 3354 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) |
3436 | printk(KERN_ERR "BUG: scheduling while atomic: " | 3355 | __schedule_bug(prev); |
3437 | "%s/0x%08x/%d\n", | 3356 | |
3438 | current->comm, preempt_count(), current->pid); | ||
3439 | debug_show_held_locks(current); | ||
3440 | if (irqs_disabled()) | ||
3441 | print_irqtrace_events(current); | ||
3442 | dump_stack(); | ||
3443 | } | ||
3444 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3357 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3445 | 3358 | ||
3446 | need_resched: | 3359 | schedstat_inc(this_rq(), sched_cnt); |
3447 | preempt_disable(); | 3360 | } |
3448 | prev = current; | 3361 | |
3449 | release_kernel_lock(prev); | 3362 | /* |
3450 | need_resched_nonpreemptible: | 3363 | * Pick up the highest-prio task: |
3451 | rq = this_rq(); | 3364 | */ |
3365 | static inline struct task_struct * | ||
3366 | pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) | ||
3367 | { | ||
3368 | struct sched_class *class; | ||
3369 | struct task_struct *p; | ||
3452 | 3370 | ||
3453 | /* | 3371 | /* |
3454 | * The idle thread is not allowed to schedule! | 3372 | * Optimization: we know that if all tasks are in |
3455 | * Remove this check after it has been exercised a bit. | 3373 | * the fair class we can call that function directly: |
3456 | */ | 3374 | */ |
3457 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { | 3375 | if (likely(rq->nr_running == rq->cfs.nr_running)) { |
3458 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | 3376 | p = fair_sched_class.pick_next_task(rq, now); |
3459 | dump_stack(); | 3377 | if (likely(p)) |
3378 | return p; | ||
3460 | } | 3379 | } |
3461 | 3380 | ||
3462 | schedstat_inc(rq, sched_cnt); | 3381 | class = sched_class_highest; |
3463 | now = sched_clock(); | 3382 | for ( ; ; ) { |
3464 | if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { | 3383 | p = class->pick_next_task(rq, now); |
3465 | run_time = now - prev->timestamp; | 3384 | if (p) |
3466 | if (unlikely((long long)(now - prev->timestamp) < 0)) | 3385 | return p; |
3467 | run_time = 0; | 3386 | /* |
3468 | } else | 3387 | * Will never be NULL as the idle class always |
3469 | run_time = NS_MAX_SLEEP_AVG; | 3388 | * returns a non-NULL p: |
3389 | */ | ||
3390 | class = class->next; | ||
3391 | } | ||
3392 | } | ||
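pick_next_task() is where the modular design shows: sched_init() later in this patch links rt_sched_class to fair_sched_class to idle_sched_class, and the first class that returns a task wins. A toy, self-contained model of that walk; the struct below is an illustrative subset, not the kernel's sched_class:

```c
#include <stddef.h>

struct rq;
struct task_struct;

struct sched_class_model {
	const struct sched_class_model *next;
	struct task_struct *(*pick_next_task)(struct rq *rq);
};

/*
 * Classes are linked from highest priority (RT) down to the idle class;
 * the first one that returns a task wins. The idle class never returns
 * NULL, so the walk terminates.
 */
static struct task_struct *
pick_next_model(const struct sched_class_model *highest, struct rq *rq)
{
	const struct sched_class_model *class;

	for (class = highest; class; class = class->next) {
		struct task_struct *p = class->pick_next_task(rq);

		if (p)
			return p;
	}
	return NULL;	/* unreachable when the last class always yields a task */
}
```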
3470 | 3393 | ||
3471 | /* | 3394 | /* |
3472 | * Tasks charged proportionately less run_time at high sleep_avg to | 3395 | * schedule() is the main scheduler function. |
3473 | * delay them losing their interactive status | 3396 | */ |
3474 | */ | 3397 | asmlinkage void __sched schedule(void) |
3475 | run_time /= (CURRENT_BONUS(prev) ? : 1); | 3398 | { |
3399 | struct task_struct *prev, *next; | ||
3400 | long *switch_count; | ||
3401 | struct rq *rq; | ||
3402 | u64 now; | ||
3403 | int cpu; | ||
3404 | |||
3405 | need_resched: | ||
3406 | preempt_disable(); | ||
3407 | cpu = smp_processor_id(); | ||
3408 | rq = cpu_rq(cpu); | ||
3409 | rcu_qsctr_inc(cpu); | ||
3410 | prev = rq->curr; | ||
3411 | switch_count = &prev->nivcsw; | ||
3412 | |||
3413 | release_kernel_lock(prev); | ||
3414 | need_resched_nonpreemptible: | ||
3415 | |||
3416 | schedule_debug(prev); | ||
3476 | 3417 | ||
3477 | spin_lock_irq(&rq->lock); | 3418 | spin_lock_irq(&rq->lock); |
3419 | clear_tsk_need_resched(prev); | ||
3478 | 3420 | ||
3479 | switch_count = &prev->nivcsw; | ||
3480 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3421 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3481 | switch_count = &prev->nvcsw; | ||
3482 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3422 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
3483 | unlikely(signal_pending(prev)))) | 3423 | unlikely(signal_pending(prev)))) { |
3484 | prev->state = TASK_RUNNING; | 3424 | prev->state = TASK_RUNNING; |
3485 | else { | 3425 | } else { |
3486 | if (prev->state == TASK_UNINTERRUPTIBLE) | 3426 | deactivate_task(rq, prev, 1); |
3487 | rq->nr_uninterruptible++; | ||
3488 | deactivate_task(prev, rq); | ||
3489 | } | 3427 | } |
3428 | switch_count = &prev->nvcsw; | ||
3490 | } | 3429 | } |
3491 | 3430 | ||
3492 | cpu = smp_processor_id(); | 3431 | if (unlikely(!rq->nr_running)) |
3493 | if (unlikely(!rq->nr_running)) { | ||
3494 | idle_balance(cpu, rq); | 3432 | idle_balance(cpu, rq); |
3495 | if (!rq->nr_running) { | ||
3496 | next = rq->idle; | ||
3497 | rq->expired_timestamp = 0; | ||
3498 | goto switch_tasks; | ||
3499 | } | ||
3500 | } | ||
3501 | 3433 | ||
3502 | array = rq->active; | 3434 | now = __rq_clock(rq); |
3503 | if (unlikely(!array->nr_active)) { | 3435 | prev->sched_class->put_prev_task(rq, prev, now); |
3504 | /* | 3436 | next = pick_next_task(rq, prev, now); |
3505 | * Switch the active and expired arrays. | ||
3506 | */ | ||
3507 | schedstat_inc(rq, sched_switch); | ||
3508 | rq->active = rq->expired; | ||
3509 | rq->expired = array; | ||
3510 | array = rq->active; | ||
3511 | rq->expired_timestamp = 0; | ||
3512 | rq->best_expired_prio = MAX_PRIO; | ||
3513 | } | ||
3514 | |||
3515 | idx = sched_find_first_bit(array->bitmap); | ||
3516 | queue = array->queue + idx; | ||
3517 | next = list_entry(queue->next, struct task_struct, run_list); | ||
3518 | |||
3519 | switch_tasks: | ||
3520 | if (next == rq->idle) | ||
3521 | schedstat_inc(rq, sched_goidle); | ||
3522 | prefetch(next); | ||
3523 | prefetch_stack(next); | ||
3524 | clear_tsk_need_resched(prev); | ||
3525 | rcu_qsctr_inc(task_cpu(prev)); | ||
3526 | |||
3527 | prev->timestamp = prev->last_ran = now; | ||
3528 | 3437 | ||
3529 | sched_info_switch(prev, next); | 3438 | sched_info_switch(prev, next); |
3439 | |||
3530 | if (likely(prev != next)) { | 3440 | if (likely(prev != next)) { |
3531 | next->timestamp = next->last_ran = now; | ||
3532 | rq->nr_switches++; | 3441 | rq->nr_switches++; |
3533 | rq->curr = next; | 3442 | rq->curr = next; |
3534 | ++*switch_count; | 3443 | ++*switch_count; |
3535 | 3444 | ||
3536 | prepare_task_switch(rq, next); | 3445 | context_switch(rq, prev, next); /* unlocks the rq */ |
3537 | prev = context_switch(rq, prev, next); | ||
3538 | barrier(); | ||
3539 | /* | ||
3540 | * this_rq must be evaluated again because prev may have moved | ||
3541 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
3542 | * frame will be invalid. | ||
3543 | */ | ||
3544 | finish_task_switch(this_rq(), prev); | ||
3545 | } else | 3446 | } else |
3546 | spin_unlock_irq(&rq->lock); | 3447 | spin_unlock_irq(&rq->lock); |
3547 | 3448 | ||
3548 | prev = current; | 3449 | if (unlikely(reacquire_kernel_lock(current) < 0)) { |
3549 | if (unlikely(reacquire_kernel_lock(prev) < 0)) | 3450 | cpu = smp_processor_id(); |
3451 | rq = cpu_rq(cpu); | ||
3550 | goto need_resched_nonpreemptible; | 3452 | goto need_resched_nonpreemptible; |
3453 | } | ||
3551 | preempt_enable_no_resched(); | 3454 | preempt_enable_no_resched(); |
3552 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3455 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3553 | goto need_resched; | 3456 | goto need_resched; |
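The rewritten schedule() keeps the old rule for when prev actually leaves the runqueue. Restated as a standalone predicate, with the state values mirrored here only for the sketch:

```c
#include <stdbool.h>

#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1

/*
 * Should schedule() take prev off the runqueue? Only if it really went
 * to sleep: not when it was merely preempted (PREEMPT_ACTIVE), and not
 * when an interruptible sleeper already has a signal pending; that one
 * is flipped back to TASK_RUNNING instead.
 */
static bool should_deactivate(long state, bool preempt_active,
			      bool signal_pending)
{
	if (state == TASK_RUNNING || preempt_active)
		return false;

	if ((state & TASK_INTERRUPTIBLE) && signal_pending)
		return false;

	return true;
}
```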
@@ -3959,29 +3862,30 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
3959 | */ | 3862 | */ |
3960 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3863 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3961 | { | 3864 | { |
3962 | struct prio_array *array; | ||
3963 | unsigned long flags; | 3865 | unsigned long flags; |
3866 | int oldprio, on_rq; | ||
3964 | struct rq *rq; | 3867 | struct rq *rq; |
3965 | int oldprio; | 3868 | u64 now; |
3966 | 3869 | ||
3967 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3870 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
3968 | 3871 | ||
3969 | rq = task_rq_lock(p, &flags); | 3872 | rq = task_rq_lock(p, &flags); |
3873 | now = rq_clock(rq); | ||
3970 | 3874 | ||
3971 | oldprio = p->prio; | 3875 | oldprio = p->prio; |
3972 | array = p->array; | 3876 | on_rq = p->se.on_rq; |
3973 | if (array) | 3877 | if (on_rq) |
3974 | dequeue_task(p, array); | 3878 | dequeue_task(rq, p, 0, now); |
3879 | |||
3880 | if (rt_prio(prio)) | ||
3881 | p->sched_class = &rt_sched_class; | ||
3882 | else | ||
3883 | p->sched_class = &fair_sched_class; | ||
3884 | |||
3975 | p->prio = prio; | 3885 | p->prio = prio; |
3976 | 3886 | ||
3977 | if (array) { | 3887 | if (on_rq) { |
3978 | /* | 3888 | enqueue_task(rq, p, 0, now); |
3979 | * If changing to an RT priority then queue it | ||
3980 | * in the active array! | ||
3981 | */ | ||
3982 | if (rt_task(p)) | ||
3983 | array = rq->active; | ||
3984 | enqueue_task(p, array); | ||
3985 | /* | 3889 | /* |
3986 | * Reschedule if we are currently running on this runqueue and | 3890 | * Reschedule if we are currently running on this runqueue and |
3987 | * our priority decreased, or if we are not currently running on | 3891 | * our priority decreased, or if we are not currently running on |
@@ -3990,8 +3894,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3990 | if (task_running(rq, p)) { | 3894 | if (task_running(rq, p)) { |
3991 | if (p->prio > oldprio) | 3895 | if (p->prio > oldprio) |
3992 | resched_task(rq->curr); | 3896 | resched_task(rq->curr); |
3993 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 3897 | } else { |
3994 | resched_task(rq->curr); | 3898 | check_preempt_curr(rq, p); |
3899 | } | ||
3995 | } | 3900 | } |
3996 | task_rq_unlock(rq, &flags); | 3901 | task_rq_unlock(rq, &flags); |
3997 | } | 3902 | } |
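rt_mutex_setprio() now follows a pattern that repeats in set_user_nice(), sched_setscheduler(), __migrate_task() and normalize_rt_tasks() below: dequeue if queued, change the task, re-enqueue, then let check_preempt_curr() decide whether to preempt. A neutral sketch of that pattern with stand-in hooks, not the kernel API:

```c
struct rq;
struct task;

struct requeue_ops {
	int  (*on_rq)(const struct task *p);
	void (*dequeue)(struct rq *rq, struct task *p);
	void (*enqueue)(struct rq *rq, struct task *p);
	void (*check_preempt)(struct rq *rq, struct task *p);
};

static void change_queued_task(struct rq *rq, struct task *p,
			       const struct requeue_ops *ops,
			       void (*apply_change)(struct task *p))
{
	int queued = ops->on_rq(p);	/* p->se.on_rq in the kernel */

	if (queued)
		ops->dequeue(rq, p);

	apply_change(p);		/* new prio, nice value, class, CPU, ... */

	if (queued) {
		ops->enqueue(rq, p);
		ops->check_preempt(rq, p);	/* may resched rq->curr */
	}
}
```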
@@ -4000,10 +3905,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4000 | 3905 | ||
4001 | void set_user_nice(struct task_struct *p, long nice) | 3906 | void set_user_nice(struct task_struct *p, long nice) |
4002 | { | 3907 | { |
4003 | struct prio_array *array; | 3908 | int old_prio, delta, on_rq; |
4004 | int old_prio, delta; | ||
4005 | unsigned long flags; | 3909 | unsigned long flags; |
4006 | struct rq *rq; | 3910 | struct rq *rq; |
3911 | u64 now; | ||
4007 | 3912 | ||
4008 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3913 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
4009 | return; | 3914 | return; |
@@ -4012,20 +3917,21 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4012 | * the task might be in the middle of scheduling on another CPU. | 3917 | * the task might be in the middle of scheduling on another CPU. |
4013 | */ | 3918 | */ |
4014 | rq = task_rq_lock(p, &flags); | 3919 | rq = task_rq_lock(p, &flags); |
3920 | now = rq_clock(rq); | ||
4015 | /* | 3921 | /* |
4016 | * The RT priorities are set via sched_setscheduler(), but we still | 3922 | * The RT priorities are set via sched_setscheduler(), but we still |
4017 | * allow the 'normal' nice value to be set - but as expected | 3923 | * allow the 'normal' nice value to be set - but as expected |
4018 | * it won't have any effect on scheduling until the task is | 3924 | * it won't have any effect on scheduling until the task is |
4019 | * not SCHED_NORMAL/SCHED_BATCH: | 3925 | * SCHED_FIFO/SCHED_RR: |
4020 | */ | 3926 | */ |
4021 | if (task_has_rt_policy(p)) { | 3927 | if (task_has_rt_policy(p)) { |
4022 | p->static_prio = NICE_TO_PRIO(nice); | 3928 | p->static_prio = NICE_TO_PRIO(nice); |
4023 | goto out_unlock; | 3929 | goto out_unlock; |
4024 | } | 3930 | } |
4025 | array = p->array; | 3931 | on_rq = p->se.on_rq; |
4026 | if (array) { | 3932 | if (on_rq) { |
4027 | dequeue_task(p, array); | 3933 | dequeue_task(rq, p, 0, now); |
4028 | dec_raw_weighted_load(rq, p); | 3934 | dec_load(rq, p, now); |
4029 | } | 3935 | } |
4030 | 3936 | ||
4031 | p->static_prio = NICE_TO_PRIO(nice); | 3937 | p->static_prio = NICE_TO_PRIO(nice); |
@@ -4034,9 +3940,9 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4034 | p->prio = effective_prio(p); | 3940 | p->prio = effective_prio(p); |
4035 | delta = p->prio - old_prio; | 3941 | delta = p->prio - old_prio; |
4036 | 3942 | ||
4037 | if (array) { | 3943 | if (on_rq) { |
4038 | enqueue_task(p, array); | 3944 | enqueue_task(rq, p, 0, now); |
4039 | inc_raw_weighted_load(rq, p); | 3945 | inc_load(rq, p, now); |
4040 | /* | 3946 | /* |
4041 | * If the task increased its priority or is running and | 3947 | * If the task increased its priority or is running and |
4042 | * lowered its priority, then reschedule its CPU: | 3948 | * lowered its priority, then reschedule its CPU: |
@@ -4156,11 +4062,24 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) | |||
4156 | } | 4062 | } |
4157 | 4063 | ||
4158 | /* Actually do priority change: must hold rq lock. */ | 4064 | /* Actually do priority change: must hold rq lock. */ |
4159 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4065 | static void |
4066 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | ||
4160 | { | 4067 | { |
4161 | BUG_ON(p->array); | 4068 | BUG_ON(p->se.on_rq); |
4162 | 4069 | ||
4163 | p->policy = policy; | 4070 | p->policy = policy; |
4071 | switch (p->policy) { | ||
4072 | case SCHED_NORMAL: | ||
4073 | case SCHED_BATCH: | ||
4074 | case SCHED_IDLE: | ||
4075 | p->sched_class = &fair_sched_class; | ||
4076 | break; | ||
4077 | case SCHED_FIFO: | ||
4078 | case SCHED_RR: | ||
4079 | p->sched_class = &rt_sched_class; | ||
4080 | break; | ||
4081 | } | ||
4082 | |||
4164 | p->rt_priority = prio; | 4083 | p->rt_priority = prio; |
4165 | p->normal_prio = normal_prio(p); | 4084 | p->normal_prio = normal_prio(p); |
4166 | /* we are holding p->pi_lock already */ | 4085 | /* we are holding p->pi_lock already */ |
@@ -4179,8 +4098,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
4179 | int sched_setscheduler(struct task_struct *p, int policy, | 4098 | int sched_setscheduler(struct task_struct *p, int policy, |
4180 | struct sched_param *param) | 4099 | struct sched_param *param) |
4181 | { | 4100 | { |
4182 | int retval, oldprio, oldpolicy = -1; | 4101 | int retval, oldprio, oldpolicy = -1, on_rq; |
4183 | struct prio_array *array; | ||
4184 | unsigned long flags; | 4102 | unsigned long flags; |
4185 | struct rq *rq; | 4103 | struct rq *rq; |
4186 | 4104 | ||
@@ -4191,12 +4109,13 @@ recheck: | |||
4191 | if (policy < 0) | 4109 | if (policy < 0) |
4192 | policy = oldpolicy = p->policy; | 4110 | policy = oldpolicy = p->policy; |
4193 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 4111 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
4194 | policy != SCHED_NORMAL && policy != SCHED_BATCH) | 4112 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
4113 | policy != SCHED_IDLE) | ||
4195 | return -EINVAL; | 4114 | return -EINVAL; |
4196 | /* | 4115 | /* |
4197 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 4116 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
4198 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and | 4117 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
4199 | * SCHED_BATCH is 0. | 4118 | * SCHED_BATCH and SCHED_IDLE is 0. |
4200 | */ | 4119 | */ |
4201 | if (param->sched_priority < 0 || | 4120 | if (param->sched_priority < 0 || |
4202 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4121 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
@@ -4211,7 +4130,6 @@ recheck: | |||
4211 | if (!capable(CAP_SYS_NICE)) { | 4130 | if (!capable(CAP_SYS_NICE)) { |
4212 | if (rt_policy(policy)) { | 4131 | if (rt_policy(policy)) { |
4213 | unsigned long rlim_rtprio; | 4132 | unsigned long rlim_rtprio; |
4214 | unsigned long flags; | ||
4215 | 4133 | ||
4216 | if (!lock_task_sighand(p, &flags)) | 4134 | if (!lock_task_sighand(p, &flags)) |
4217 | return -ESRCH; | 4135 | return -ESRCH; |
@@ -4227,6 +4145,12 @@ recheck: | |||
4227 | param->sched_priority > rlim_rtprio) | 4145 | param->sched_priority > rlim_rtprio) |
4228 | return -EPERM; | 4146 | return -EPERM; |
4229 | } | 4147 | } |
4148 | /* | ||
4149 | * Like positive nice levels, don't allow tasks to | ||
4150 | * move out of SCHED_IDLE either: | ||
4151 | */ | ||
4152 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | ||
4153 | return -EPERM; | ||
4230 | 4154 | ||
4231 | /* can't change other user's priorities */ | 4155 | /* can't change other user's priorities */ |
4232 | if ((current->euid != p->euid) && | 4156 | if ((current->euid != p->euid) && |
@@ -4254,13 +4178,13 @@ recheck: | |||
4254 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4178 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4255 | goto recheck; | 4179 | goto recheck; |
4256 | } | 4180 | } |
4257 | array = p->array; | 4181 | on_rq = p->se.on_rq; |
4258 | if (array) | 4182 | if (on_rq) |
4259 | deactivate_task(p, rq); | 4183 | deactivate_task(rq, p, 0); |
4260 | oldprio = p->prio; | 4184 | oldprio = p->prio; |
4261 | __setscheduler(p, policy, param->sched_priority); | 4185 | __setscheduler(rq, p, policy, param->sched_priority); |
4262 | if (array) { | 4186 | if (on_rq) { |
4263 | __activate_task(p, rq); | 4187 | activate_task(rq, p, 0); |
4264 | /* | 4188 | /* |
4265 | * Reschedule if we are currently running on this runqueue and | 4189 | * Reschedule if we are currently running on this runqueue and |
4266 | * our priority decreased, or if we are not currently running on | 4190 | * our priority decreased, or if we are not currently running on |
@@ -4269,8 +4193,9 @@ recheck: | |||
4269 | if (task_running(rq, p)) { | 4193 | if (task_running(rq, p)) { |
4270 | if (p->prio > oldprio) | 4194 | if (p->prio > oldprio) |
4271 | resched_task(rq->curr); | 4195 | resched_task(rq->curr); |
4272 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4196 | } else { |
4273 | resched_task(rq->curr); | 4197 | check_preempt_curr(rq, p); |
4198 | } | ||
4274 | } | 4199 | } |
4275 | __task_rq_unlock(rq); | 4200 | __task_rq_unlock(rq); |
4276 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4201 | spin_unlock_irqrestore(&p->pi_lock, flags); |
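For completeness, a minimal userspace illustration of the SCHED_IDLE policy these hunks accept; it assumes a libc that exposes sched_setscheduler(), and the fallback #define mirrors the value used by the kernel headers of this series:

```c
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_IDLE
#define SCHED_IDLE 5	/* assumed: matches this kernel series' headers */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_IDLE, &param) == -1) {
		perror("sched_setscheduler(SCHED_IDLE)");
		return 1;
	}
	/* This task now gets only a tiny share of CPU while others are runnable. */
	return 0;
}
```

As with SCHED_NORMAL and SCHED_BATCH, the only valid sched_priority is 0, and the -EPERM check added above keeps an unprivileged task from moving back out of SCHED_IDLE.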
@@ -4542,41 +4467,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
4542 | /** | 4467 | /** |
4543 | * sys_sched_yield - yield the current processor to other threads. | 4468 | * sys_sched_yield - yield the current processor to other threads. |
4544 | * | 4469 | * |
4545 | * This function yields the current CPU by moving the calling thread | 4470 | * This function yields the current CPU to other tasks. If there are no |
4546 | * to the expired array. If there are no other threads running on this | 4471 | * other threads running on this CPU then this function will return. |
4547 | * CPU then this function will return. | ||
4548 | */ | 4472 | */ |
4549 | asmlinkage long sys_sched_yield(void) | 4473 | asmlinkage long sys_sched_yield(void) |
4550 | { | 4474 | { |
4551 | struct rq *rq = this_rq_lock(); | 4475 | struct rq *rq = this_rq_lock(); |
4552 | struct prio_array *array = current->array, *target = rq->expired; | ||
4553 | 4476 | ||
4554 | schedstat_inc(rq, yld_cnt); | 4477 | schedstat_inc(rq, yld_cnt); |
4555 | /* | 4478 | if (unlikely(rq->nr_running == 1)) |
4556 | * We implement yielding by moving the task into the expired | ||
4557 | * queue. | ||
4558 | * | ||
4559 | * (special rule: RT tasks will just roundrobin in the active | ||
4560 | * array.) | ||
4561 | */ | ||
4562 | if (rt_task(current)) | ||
4563 | target = rq->active; | ||
4564 | |||
4565 | if (array->nr_active == 1) { | ||
4566 | schedstat_inc(rq, yld_act_empty); | 4479 | schedstat_inc(rq, yld_act_empty); |
4567 | if (!rq->expired->nr_active) | 4480 | else |
4568 | schedstat_inc(rq, yld_both_empty); | 4481 | current->sched_class->yield_task(rq, current); |
4569 | } else if (!rq->expired->nr_active) | ||
4570 | schedstat_inc(rq, yld_exp_empty); | ||
4571 | |||
4572 | if (array != target) { | ||
4573 | dequeue_task(current, array); | ||
4574 | enqueue_task(current, target); | ||
4575 | } else | ||
4576 | /* | ||
4577 | * requeue_task is cheaper so perform that if possible. | ||
4578 | */ | ||
4579 | requeue_task(current, array); | ||
4580 | 4482 | ||
4581 | /* | 4483 | /* |
4582 | * Since we are going to call schedule() anyway, there's | 4484 | * Since we are going to call schedule() anyway, there's |
@@ -4727,6 +4629,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) | |||
4727 | break; | 4629 | break; |
4728 | case SCHED_NORMAL: | 4630 | case SCHED_NORMAL: |
4729 | case SCHED_BATCH: | 4631 | case SCHED_BATCH: |
4632 | case SCHED_IDLE: | ||
4730 | ret = 0; | 4633 | ret = 0; |
4731 | break; | 4634 | break; |
4732 | } | 4635 | } |
@@ -4751,6 +4654,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
4751 | break; | 4654 | break; |
4752 | case SCHED_NORMAL: | 4655 | case SCHED_NORMAL: |
4753 | case SCHED_BATCH: | 4656 | case SCHED_BATCH: |
4657 | case SCHED_IDLE: | ||
4754 | ret = 0; | 4658 | ret = 0; |
4755 | } | 4659 | } |
4756 | return ret; | 4660 | return ret; |
@@ -4785,7 +4689,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4785 | goto out_unlock; | 4689 | goto out_unlock; |
4786 | 4690 | ||
4787 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4691 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4788 | 0 : task_timeslice(p), &t); | 4692 | 0 : static_prio_timeslice(p->static_prio), &t); |
4789 | read_unlock(&tasklist_lock); | 4693 | read_unlock(&tasklist_lock); |
4790 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4694 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
4791 | out_nounlock: | 4695 | out_nounlock: |
@@ -4860,6 +4764,9 @@ void show_state_filter(unsigned long state_filter) | |||
4860 | 4764 | ||
4861 | touch_all_softlockup_watchdogs(); | 4765 | touch_all_softlockup_watchdogs(); |
4862 | 4766 | ||
4767 | #ifdef CONFIG_SCHED_DEBUG | ||
4768 | sysrq_sched_debug_show(); | ||
4769 | #endif | ||
4863 | read_unlock(&tasklist_lock); | 4770 | read_unlock(&tasklist_lock); |
4864 | /* | 4771 | /* |
4865 | * Only show locks if all tasks are dumped: | 4772 | * Only show locks if all tasks are dumped: |
@@ -4870,7 +4777,7 @@ void show_state_filter(unsigned long state_filter) | |||
4870 | 4777 | ||
4871 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | 4778 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) |
4872 | { | 4779 | { |
4873 | /* nothing yet */ | 4780 | idle->sched_class = &idle_sched_class; |
4874 | } | 4781 | } |
4875 | 4782 | ||
4876 | /** | 4783 | /** |
@@ -4886,12 +4793,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4886 | struct rq *rq = cpu_rq(cpu); | 4793 | struct rq *rq = cpu_rq(cpu); |
4887 | unsigned long flags; | 4794 | unsigned long flags; |
4888 | 4795 | ||
4889 | idle->timestamp = sched_clock(); | 4796 | __sched_fork(idle); |
4890 | idle->array = NULL; | 4797 | idle->se.exec_start = sched_clock(); |
4798 | |||
4891 | idle->prio = idle->normal_prio = MAX_PRIO; | 4799 | idle->prio = idle->normal_prio = MAX_PRIO; |
4892 | idle->state = TASK_RUNNING; | ||
4893 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4800 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4894 | set_task_cpu(idle, cpu); | 4801 | __set_task_cpu(idle, cpu); |
4895 | 4802 | ||
4896 | spin_lock_irqsave(&rq->lock, flags); | 4803 | spin_lock_irqsave(&rq->lock, flags); |
4897 | rq->curr = rq->idle = idle; | 4804 | rq->curr = rq->idle = idle; |
@@ -4906,6 +4813,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4906 | #else | 4813 | #else |
4907 | task_thread_info(idle)->preempt_count = 0; | 4814 | task_thread_info(idle)->preempt_count = 0; |
4908 | #endif | 4815 | #endif |
4816 | /* | ||
4817 | * The idle tasks have their own, simple scheduling class: | ||
4818 | */ | ||
4819 | idle->sched_class = &idle_sched_class; | ||
4909 | } | 4820 | } |
4910 | 4821 | ||
4911 | /* | 4822 | /* |
@@ -4917,6 +4828,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4917 | */ | 4828 | */ |
4918 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4829 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
4919 | 4830 | ||
4831 | /* | ||
4832 | * Increase the granularity value when there are more CPUs, | ||
4833 | * because with more CPUs the 'effective latency' as visible | ||
4834 | * to users decreases. But the relationship is not linear, | ||
4835 | * so pick a second-best guess by going with the log2 of the | ||
4836 | * number of CPUs. | ||
4837 | * | ||
4838 | * This idea comes from the SD scheduler of Con Kolivas: | ||
4839 | */ | ||
4840 | static inline void sched_init_granularity(void) | ||
4841 | { | ||
4842 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
4843 | const unsigned long gran_limit = 10000000; | ||
4844 | |||
4845 | sysctl_sched_granularity *= factor; | ||
4846 | if (sysctl_sched_granularity > gran_limit) | ||
4847 | sysctl_sched_granularity = gran_limit; | ||
4848 | |||
4849 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; | ||
4850 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; | ||
4851 | } | ||
4852 | |||
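To make the scaling above concrete, a small userspace rework of the same arithmetic; the 2 ms base value is an assumption for illustration only (the real default lives in the sysctl, not in this hunk):

```c
#include <stdio.h>

/* Open-coded stand-in for the kernel's ilog2(). */
static unsigned int ilog2_model(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long base_gran = 2000000;	/* assumed base value */
	const unsigned long gran_limit = 10000000;	/* cap from the hunk above */
	unsigned int cpus;

	for (cpus = 1; cpus <= 64; cpus *= 2) {
		unsigned long gran = base_gran * (1 + ilog2_model(cpus));

		if (gran > gran_limit)
			gran = gran_limit;

		printf("%2u CPUs: granularity %8lu, runtime limit %8lu\n",
		       cpus, gran, gran * 4);
	}
	return 0;
}
```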
4920 | #ifdef CONFIG_SMP | 4853 | #ifdef CONFIG_SMP |
4921 | /* | 4854 | /* |
4922 | * This is how migration works: | 4855 | * This is how migration works: |
@@ -4990,7 +4923,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
4990 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4923 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4991 | { | 4924 | { |
4992 | struct rq *rq_dest, *rq_src; | 4925 | struct rq *rq_dest, *rq_src; |
4993 | int ret = 0; | 4926 | int ret = 0, on_rq; |
4994 | 4927 | ||
4995 | if (unlikely(cpu_is_offline(dest_cpu))) | 4928 | if (unlikely(cpu_is_offline(dest_cpu))) |
4996 | return ret; | 4929 | return ret; |
@@ -5006,20 +4939,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5006 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 4939 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
5007 | goto out; | 4940 | goto out; |
5008 | 4941 | ||
4942 | on_rq = p->se.on_rq; | ||
4943 | if (on_rq) | ||
4944 | deactivate_task(rq_src, p, 0); | ||
5009 | set_task_cpu(p, dest_cpu); | 4945 | set_task_cpu(p, dest_cpu); |
5010 | if (p->array) { | 4946 | if (on_rq) { |
5011 | /* | 4947 | activate_task(rq_dest, p, 0); |
5012 | * Sync timestamp with rq_dest's before activating. | 4948 | check_preempt_curr(rq_dest, p); |
5013 | * The same thing could be achieved by doing this step | ||
5014 | * afterwards, and pretending it was a local activate. | ||
5015 | * This way is cleaner and logically correct. | ||
5016 | */ | ||
5017 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp | ||
5018 | + rq_dest->most_recent_timestamp; | ||
5019 | deactivate_task(p, rq_src); | ||
5020 | __activate_task(p, rq_dest); | ||
5021 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | ||
5022 | resched_task(rq_dest->curr); | ||
5023 | } | 4949 | } |
5024 | ret = 1; | 4950 | ret = 1; |
5025 | out: | 4951 | out: |
@@ -5171,7 +5097,8 @@ static void migrate_live_tasks(int src_cpu) | |||
5171 | write_unlock_irq(&tasklist_lock); | 5097 | write_unlock_irq(&tasklist_lock); |
5172 | } | 5098 | } |
5173 | 5099 | ||
5174 | /* Schedules idle task to be the next runnable task on current CPU. | 5100 | /* |
5101 | * Schedules idle task to be the next runnable task on current CPU. | ||
5175 | * It does so by boosting its priority to highest possible and adding it to | 5102 | * It does so by boosting its priority to highest possible and adding it to |
5176 | * the _front_ of the runqueue. Used by CPU offline code. | 5103 | * the _front_ of the runqueue. Used by CPU offline code. |
5177 | */ | 5104 | */ |
@@ -5191,10 +5118,10 @@ void sched_idle_next(void) | |||
5191 | */ | 5118 | */ |
5192 | spin_lock_irqsave(&rq->lock, flags); | 5119 | spin_lock_irqsave(&rq->lock, flags); |
5193 | 5120 | ||
5194 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5121 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5195 | 5122 | ||
5196 | /* Add idle task to the _front_ of its priority queue: */ | 5123 | /* Add idle task to the _front_ of its priority queue: */ |
5197 | __activate_idle_task(p, rq); | 5124 | activate_idle_task(p, rq); |
5198 | 5125 | ||
5199 | spin_unlock_irqrestore(&rq->lock, flags); | 5126 | spin_unlock_irqrestore(&rq->lock, flags); |
5200 | } | 5127 | } |
@@ -5244,16 +5171,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
5244 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5171 | static void migrate_dead_tasks(unsigned int dead_cpu) |
5245 | { | 5172 | { |
5246 | struct rq *rq = cpu_rq(dead_cpu); | 5173 | struct rq *rq = cpu_rq(dead_cpu); |
5247 | unsigned int arr, i; | 5174 | struct task_struct *next; |
5248 | |||
5249 | for (arr = 0; arr < 2; arr++) { | ||
5250 | for (i = 0; i < MAX_PRIO; i++) { | ||
5251 | struct list_head *list = &rq->arrays[arr].queue[i]; | ||
5252 | 5175 | ||
5253 | while (!list_empty(list)) | 5176 | for ( ; ; ) { |
5254 | migrate_dead(dead_cpu, list_entry(list->next, | 5177 | if (!rq->nr_running) |
5255 | struct task_struct, run_list)); | 5178 | break; |
5256 | } | 5179 | next = pick_next_task(rq, rq->curr, rq_clock(rq)); |
5180 | if (!next) | ||
5181 | break; | ||
5182 | migrate_dead(dead_cpu, next); | ||
5257 | } | 5183 | } |
5258 | } | 5184 | } |
5259 | #endif /* CONFIG_HOTPLUG_CPU */ | 5185 | #endif /* CONFIG_HOTPLUG_CPU */ |
@@ -5277,14 +5203,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5277 | 5203 | ||
5278 | case CPU_UP_PREPARE: | 5204 | case CPU_UP_PREPARE: |
5279 | case CPU_UP_PREPARE_FROZEN: | 5205 | case CPU_UP_PREPARE_FROZEN: |
5280 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); | 5206 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); |
5281 | if (IS_ERR(p)) | 5207 | if (IS_ERR(p)) |
5282 | return NOTIFY_BAD; | 5208 | return NOTIFY_BAD; |
5283 | p->flags |= PF_NOFREEZE; | 5209 | p->flags |= PF_NOFREEZE; |
5284 | kthread_bind(p, cpu); | 5210 | kthread_bind(p, cpu); |
5285 | /* Must be high prio: stop_machine expects to yield to it. */ | 5211 | /* Must be high prio: stop_machine expects to yield to it. */ |
5286 | rq = task_rq_lock(p, &flags); | 5212 | rq = task_rq_lock(p, &flags); |
5287 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5213 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5288 | task_rq_unlock(rq, &flags); | 5214 | task_rq_unlock(rq, &flags); |
5289 | cpu_rq(cpu)->migration_thread = p; | 5215 | cpu_rq(cpu)->migration_thread = p; |
5290 | break; | 5216 | break; |
@@ -5315,9 +5241,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5315 | rq->migration_thread = NULL; | 5241 | rq->migration_thread = NULL; |
5316 | /* Idle task back to normal (off runqueue, low prio) */ | 5242 | /* Idle task back to normal (off runqueue, low prio) */ |
5317 | rq = task_rq_lock(rq->idle, &flags); | 5243 | rq = task_rq_lock(rq->idle, &flags); |
5318 | deactivate_task(rq->idle, rq); | 5244 | deactivate_task(rq, rq->idle, 0); |
5319 | rq->idle->static_prio = MAX_PRIO; | 5245 | rq->idle->static_prio = MAX_PRIO; |
5320 | __setscheduler(rq->idle, SCHED_NORMAL, 0); | 5246 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
5247 | rq->idle->sched_class = &idle_sched_class; | ||
5321 | migrate_dead_tasks(cpu); | 5248 | migrate_dead_tasks(cpu); |
5322 | task_rq_unlock(rq, &flags); | 5249 | task_rq_unlock(rq, &flags); |
5323 | migrate_nr_uninterruptible(rq); | 5250 | migrate_nr_uninterruptible(rq); |
@@ -5926,7 +5853,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
5926 | static int build_sched_domains(const cpumask_t *cpu_map) | 5853 | static int build_sched_domains(const cpumask_t *cpu_map) |
5927 | { | 5854 | { |
5928 | int i; | 5855 | int i; |
5929 | struct sched_domain *sd; | ||
5930 | #ifdef CONFIG_NUMA | 5856 | #ifdef CONFIG_NUMA |
5931 | struct sched_group **sched_group_nodes = NULL; | 5857 | struct sched_group **sched_group_nodes = NULL; |
5932 | int sd_allnodes = 0; | 5858 | int sd_allnodes = 0; |
@@ -5934,7 +5860,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
5934 | /* | 5860 | /* |
5935 | * Allocate the per-node list of sched groups | 5861 | * Allocate the per-node list of sched groups |
5936 | */ | 5862 | */ |
5937 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 5863 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, |
5938 | GFP_KERNEL); | 5864 | GFP_KERNEL); |
5939 | if (!sched_group_nodes) { | 5865 | if (!sched_group_nodes) { |
5940 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 5866 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
@@ -5953,8 +5879,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
5953 | cpus_and(nodemask, nodemask, *cpu_map); | 5879 | cpus_and(nodemask, nodemask, *cpu_map); |
5954 | 5880 | ||
5955 | #ifdef CONFIG_NUMA | 5881 | #ifdef CONFIG_NUMA |
5956 | if (cpus_weight(*cpu_map) | 5882 | if (cpus_weight(*cpu_map) > |
5957 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 5883 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
5958 | sd = &per_cpu(allnodes_domains, i); | 5884 | sd = &per_cpu(allnodes_domains, i); |
5959 | *sd = SD_ALLNODES_INIT; | 5885 | *sd = SD_ALLNODES_INIT; |
5960 | sd->span = *cpu_map; | 5886 | sd->span = *cpu_map; |
@@ -6013,7 +5939,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6013 | if (i != first_cpu(this_sibling_map)) | 5939 | if (i != first_cpu(this_sibling_map)) |
6014 | continue; | 5940 | continue; |
6015 | 5941 | ||
6016 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); | 5942 | init_sched_build_groups(this_sibling_map, cpu_map, |
5943 | &cpu_to_cpu_group); | ||
6017 | } | 5944 | } |
6018 | #endif | 5945 | #endif |
6019 | 5946 | ||
@@ -6024,11 +5951,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6024 | cpus_and(this_core_map, this_core_map, *cpu_map); | 5951 | cpus_and(this_core_map, this_core_map, *cpu_map); |
6025 | if (i != first_cpu(this_core_map)) | 5952 | if (i != first_cpu(this_core_map)) |
6026 | continue; | 5953 | continue; |
6027 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); | 5954 | init_sched_build_groups(this_core_map, cpu_map, |
5955 | &cpu_to_core_group); | ||
6028 | } | 5956 | } |
6029 | #endif | 5957 | #endif |
6030 | 5958 | ||
6031 | |||
6032 | /* Set up physical groups */ | 5959 | /* Set up physical groups */ |
6033 | for (i = 0; i < MAX_NUMNODES; i++) { | 5960 | for (i = 0; i < MAX_NUMNODES; i++) { |
6034 | cpumask_t nodemask = node_to_cpumask(i); | 5961 | cpumask_t nodemask = node_to_cpumask(i); |
@@ -6043,7 +5970,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6043 | #ifdef CONFIG_NUMA | 5970 | #ifdef CONFIG_NUMA |
6044 | /* Set up node groups */ | 5971 | /* Set up node groups */ |
6045 | if (sd_allnodes) | 5972 | if (sd_allnodes) |
6046 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); | 5973 | init_sched_build_groups(*cpu_map, cpu_map, |
5974 | &cpu_to_allnodes_group); | ||
6047 | 5975 | ||
6048 | for (i = 0; i < MAX_NUMNODES; i++) { | 5976 | for (i = 0; i < MAX_NUMNODES; i++) { |
6049 | /* Set up node groups */ | 5977 | /* Set up node groups */ |
@@ -6115,19 +6043,22 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6115 | /* Calculate CPU power for physical packages and nodes */ | 6043 | /* Calculate CPU power for physical packages and nodes */ |
6116 | #ifdef CONFIG_SCHED_SMT | 6044 | #ifdef CONFIG_SCHED_SMT |
6117 | for_each_cpu_mask(i, *cpu_map) { | 6045 | for_each_cpu_mask(i, *cpu_map) { |
6118 | sd = &per_cpu(cpu_domains, i); | 6046 | struct sched_domain *sd = &per_cpu(cpu_domains, i); |
6047 | |||
6119 | init_sched_groups_power(i, sd); | 6048 | init_sched_groups_power(i, sd); |
6120 | } | 6049 | } |
6121 | #endif | 6050 | #endif |
6122 | #ifdef CONFIG_SCHED_MC | 6051 | #ifdef CONFIG_SCHED_MC |
6123 | for_each_cpu_mask(i, *cpu_map) { | 6052 | for_each_cpu_mask(i, *cpu_map) { |
6124 | sd = &per_cpu(core_domains, i); | 6053 | struct sched_domain *sd = &per_cpu(core_domains, i); |
6054 | |||
6125 | init_sched_groups_power(i, sd); | 6055 | init_sched_groups_power(i, sd); |
6126 | } | 6056 | } |
6127 | #endif | 6057 | #endif |
6128 | 6058 | ||
6129 | for_each_cpu_mask(i, *cpu_map) { | 6059 | for_each_cpu_mask(i, *cpu_map) { |
6130 | sd = &per_cpu(phys_domains, i); | 6060 | struct sched_domain *sd = &per_cpu(phys_domains, i); |
6061 | |||
6131 | init_sched_groups_power(i, sd); | 6062 | init_sched_groups_power(i, sd); |
6132 | } | 6063 | } |
6133 | 6064 | ||
@@ -6361,10 +6292,12 @@ void __init sched_init_smp(void) | |||
6361 | /* Move init over to a non-isolated CPU */ | 6292 | /* Move init over to a non-isolated CPU */ |
6362 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6293 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6363 | BUG(); | 6294 | BUG(); |
6295 | sched_init_granularity(); | ||
6364 | } | 6296 | } |
6365 | #else | 6297 | #else |
6366 | void __init sched_init_smp(void) | 6298 | void __init sched_init_smp(void) |
6367 | { | 6299 | { |
6300 | sched_init_granularity(); | ||
6368 | } | 6301 | } |
6369 | #endif /* CONFIG_SMP */ | 6302 | #endif /* CONFIG_SMP */ |
6370 | 6303 | ||
@@ -6378,28 +6311,51 @@ int in_sched_functions(unsigned long addr) | |||
6378 | && addr < (unsigned long)__sched_text_end); | 6311 | && addr < (unsigned long)__sched_text_end); |
6379 | } | 6312 | } |
6380 | 6313 | ||
6314 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | ||
6315 | { | ||
6316 | cfs_rq->tasks_timeline = RB_ROOT; | ||
6317 | cfs_rq->fair_clock = 1; | ||
6318 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
6319 | cfs_rq->rq = rq; | ||
6320 | #endif | ||
6321 | } | ||
6322 | |||
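init_cfs_rq() above starts each CFS runqueue with an empty rbtree (tasks_timeline = RB_ROOT). The scheduling idea behind that tree, always picking the entity with the smallest fairness key, can be sketched standalone; a plain unbalanced BST is used here only to keep the sketch short, where the kernel uses an rbtree:

```c
#include <stddef.h>

struct entity {
	unsigned long long fair_key;
	struct entity *left, *right;
};

/* Insert an entity into the timeline, ordered by its fairness key. */
static void timeline_insert(struct entity **root, struct entity *se)
{
	while (*root) {
		if (se->fair_key < (*root)->fair_key)
			root = &(*root)->left;
		else
			root = &(*root)->right;
	}
	se->left = se->right = NULL;
	*root = se;
}

/* The next entity to run is the leftmost (smallest-key) one. */
static struct entity *timeline_leftmost(struct entity *root)
{
	if (!root)
		return NULL;
	while (root->left)
		root = root->left;
	return root;
}
```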
6381 | void __init sched_init(void) | 6323 | void __init sched_init(void) |
6382 | { | 6324 | { |
6383 | int i, j, k; | 6325 | u64 now = sched_clock(); |
6384 | int highest_cpu = 0; | 6326 | int highest_cpu = 0; |
6327 | int i, j; | ||
6328 | |||
6329 | /* | ||
6330 | * Link up the scheduling class hierarchy: | ||
6331 | */ | ||
6332 | rt_sched_class.next = &fair_sched_class; | ||
6333 | fair_sched_class.next = &idle_sched_class; | ||
6334 | idle_sched_class.next = NULL; | ||
6385 | 6335 | ||
6386 | for_each_possible_cpu(i) { | 6336 | for_each_possible_cpu(i) { |
6387 | struct prio_array *array; | 6337 | struct rt_prio_array *array; |
6388 | struct rq *rq; | 6338 | struct rq *rq; |
6389 | 6339 | ||
6390 | rq = cpu_rq(i); | 6340 | rq = cpu_rq(i); |
6391 | spin_lock_init(&rq->lock); | 6341 | spin_lock_init(&rq->lock); |
6392 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 6342 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
6393 | rq->nr_running = 0; | 6343 | rq->nr_running = 0; |
6394 | rq->active = rq->arrays; | 6344 | rq->clock = 1; |
6395 | rq->expired = rq->arrays + 1; | 6345 | init_cfs_rq(&rq->cfs, rq); |
6396 | rq->best_expired_prio = MAX_PRIO; | 6346 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6347 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
6348 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
6349 | #endif | ||
6350 | rq->ls.load_update_last = now; | ||
6351 | rq->ls.load_update_start = now; | ||
6397 | 6352 | ||
6353 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | ||
6354 | rq->cpu_load[j] = 0; | ||
6398 | #ifdef CONFIG_SMP | 6355 | #ifdef CONFIG_SMP |
6399 | rq->sd = NULL; | 6356 | rq->sd = NULL; |
6400 | for (j = 1; j < 3; j++) | ||
6401 | rq->cpu_load[j] = 0; | ||
6402 | rq->active_balance = 0; | 6357 | rq->active_balance = 0; |
6358 | rq->next_balance = jiffies; | ||
6403 | rq->push_cpu = 0; | 6359 | rq->push_cpu = 0; |
6404 | rq->cpu = i; | 6360 | rq->cpu = i; |
6405 | rq->migration_thread = NULL; | 6361 | rq->migration_thread = NULL; |
@@ -6407,16 +6363,14 @@ void __init sched_init(void) | |||
6407 | #endif | 6363 | #endif |
6408 | atomic_set(&rq->nr_iowait, 0); | 6364 | atomic_set(&rq->nr_iowait, 0); |
6409 | 6365 | ||
6410 | for (j = 0; j < 2; j++) { | 6366 | array = &rq->rt.active; |
6411 | array = rq->arrays + j; | 6367 | for (j = 0; j < MAX_RT_PRIO; j++) { |
6412 | for (k = 0; k < MAX_PRIO; k++) { | 6368 | INIT_LIST_HEAD(array->queue + j); |
6413 | INIT_LIST_HEAD(array->queue + k); | 6369 | __clear_bit(j, array->bitmap); |
6414 | __clear_bit(k, array->bitmap); | ||
6415 | } | ||
6416 | // delimiter for bitsearch | ||
6417 | __set_bit(MAX_PRIO, array->bitmap); | ||
6418 | } | 6370 | } |
6419 | highest_cpu = i; | 6371 | highest_cpu = i; |
6372 | /* delimiter for bitsearch: */ | ||
6373 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
6420 | } | 6374 | } |
6421 | 6375 | ||
6422 | set_load_weight(&init_task); | 6376 | set_load_weight(&init_task); |
@@ -6443,6 +6397,10 @@ void __init sched_init(void) | |||
6443 | * when this runqueue becomes "idle". | 6397 | * when this runqueue becomes "idle". |
6444 | */ | 6398 | */ |
6445 | init_idle(current, smp_processor_id()); | 6399 | init_idle(current, smp_processor_id()); |
6400 | /* | ||
6401 | * During early bootup we pretend to be a normal task: | ||
6402 | */ | ||
6403 | current->sched_class = &fair_sched_class; | ||
6446 | } | 6404 | } |
6447 | 6405 | ||
6448 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6406 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
@@ -6473,29 +6431,55 @@ EXPORT_SYMBOL(__might_sleep); | |||
6473 | #ifdef CONFIG_MAGIC_SYSRQ | 6431 | #ifdef CONFIG_MAGIC_SYSRQ |
6474 | void normalize_rt_tasks(void) | 6432 | void normalize_rt_tasks(void) |
6475 | { | 6433 | { |
6476 | struct prio_array *array; | ||
6477 | struct task_struct *g, *p; | 6434 | struct task_struct *g, *p; |
6478 | unsigned long flags; | 6435 | unsigned long flags; |
6479 | struct rq *rq; | 6436 | struct rq *rq; |
6437 | int on_rq; | ||
6480 | 6438 | ||
6481 | read_lock_irq(&tasklist_lock); | 6439 | read_lock_irq(&tasklist_lock); |
6482 | |||
6483 | do_each_thread(g, p) { | 6440 | do_each_thread(g, p) { |
6484 | if (!rt_task(p)) | 6441 | p->se.fair_key = 0; |
6442 | p->se.wait_runtime = 0; | ||
6443 | p->se.wait_start_fair = 0; | ||
6444 | p->se.wait_start = 0; | ||
6445 | p->se.exec_start = 0; | ||
6446 | p->se.sleep_start = 0; | ||
6447 | p->se.sleep_start_fair = 0; | ||
6448 | p->se.block_start = 0; | ||
6449 | task_rq(p)->cfs.fair_clock = 0; | ||
6450 | task_rq(p)->clock = 0; | ||
6451 | |||
6452 | if (!rt_task(p)) { | ||
6453 | /* | ||
6454 | * Renice negative nice level userspace | ||
6455 | * tasks back to 0: | ||
6456 | */ | ||
6457 | if (TASK_NICE(p) < 0 && p->mm) | ||
6458 | set_user_nice(p, 0); | ||
6485 | continue; | 6459 | continue; |
6460 | } | ||
6486 | 6461 | ||
6487 | spin_lock_irqsave(&p->pi_lock, flags); | 6462 | spin_lock_irqsave(&p->pi_lock, flags); |
6488 | rq = __task_rq_lock(p); | 6463 | rq = __task_rq_lock(p); |
6464 | #ifdef CONFIG_SMP | ||
6465 | /* | ||
6466 | * Do not touch the migration thread: | ||
6467 | */ | ||
6468 | if (p == rq->migration_thread) | ||
6469 | goto out_unlock; | ||
6470 | #endif | ||
6489 | 6471 | ||
6490 | array = p->array; | 6472 | on_rq = p->se.on_rq; |
6491 | if (array) | 6473 | if (on_rq) |
6492 | deactivate_task(p, task_rq(p)); | 6474 | deactivate_task(task_rq(p), p, 0); |
6493 | __setscheduler(p, SCHED_NORMAL, 0); | 6475 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
6494 | if (array) { | 6476 | if (on_rq) { |
6495 | __activate_task(p, task_rq(p)); | 6477 | activate_task(task_rq(p), p, 0); |
6496 | resched_task(rq->curr); | 6478 | resched_task(rq->curr); |
6497 | } | 6479 | } |
6498 | 6480 | #ifdef CONFIG_SMP | |
6481 | out_unlock: | ||
6482 | #endif | ||
6499 | __task_rq_unlock(rq); | 6483 | __task_rq_unlock(rq); |
6500 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6484 | spin_unlock_irqrestore(&p->pi_lock, flags); |
6501 | } while_each_thread(g, p); | 6485 | } while_each_thread(g, p); |