| -rw-r--r-- | kernel/sched.c | 1532 |
1 file changed, 758 insertions, 774 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f5a204b46655..01ba4b1848a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -391,6 +391,11 @@ struct rq { | |||
| 391 | static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; | 391 | static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; |
| 392 | static DEFINE_MUTEX(sched_hotcpu_mutex); | 392 | static DEFINE_MUTEX(sched_hotcpu_mutex); |
| 393 | 393 | ||
| 394 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | ||
| 395 | { | ||
| 396 | rq->curr->sched_class->check_preempt_curr(rq, p); | ||
| 397 | } | ||
| 398 | |||
| 394 | static inline int cpu_of(struct rq *rq) | 399 | static inline int cpu_of(struct rq *rq) |
| 395 | { | 400 | { |
| 396 | #ifdef CONFIG_SMP | 401 | #ifdef CONFIG_SMP |
| @@ -669,8 +674,6 @@ static inline void resched_task(struct task_struct *p) | |||
| 669 | } | 674 | } |
| 670 | #endif | 675 | #endif |
| 671 | 676 | ||
| 672 | #include "sched_stats.h" | ||
| 673 | |||
| 674 | static u64 div64_likely32(u64 divident, unsigned long divisor) | 677 | static u64 div64_likely32(u64 divident, unsigned long divisor) |
| 675 | { | 678 | { |
| 676 | #if BITS_PER_LONG == 32 | 679 | #if BITS_PER_LONG == 32 |
| @@ -788,120 +791,146 @@ static void update_curr_load(struct rq *rq, u64 now) | |||
| 788 | * this code will need modification | 791 | * this code will need modification |
| 789 | */ | 792 | */ |
| 790 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | 793 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE |
| 791 | #define LOAD_WEIGHT(lp) \ | 794 | #define load_weight(lp) \ |
| 792 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | 795 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) |
| 793 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | 796 | #define PRIO_TO_LOAD_WEIGHT(prio) \ |
| 794 | LOAD_WEIGHT(static_prio_timeslice(prio)) | 797 | load_weight(static_prio_timeslice(prio)) |
| 795 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | 798 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
| 796 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | 799 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) |
| 800 | |||
| 801 | #define WEIGHT_IDLEPRIO 2 | ||
| 802 | #define WMULT_IDLEPRIO (1 << 31) | ||
| 803 | |||
| 804 | /* | ||
| 805 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
| 806 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
| 807 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
| 808 | * that remained on nice 0. | ||
| 809 | * | ||
| 810 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
| 811 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
| 812 | * it's +10% CPU usage. | ||
| 813 | */ | ||
| 814 | static const int prio_to_weight[40] = { | ||
| 815 | /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, | ||
| 816 | /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, | ||
| 817 | /* 0 */ NICE_0_LOAD /* 1024 */, | ||
| 818 | /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, | ||
| 819 | /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, | ||
| 820 | }; | ||
| 821 | |||
| 822 | static const u32 prio_to_wmult[40] = { | ||
| 823 | 48356, 60446, 75558, 94446, 118058, 147573, | ||
| 824 | 184467, 230589, 288233, 360285, 450347, | ||
| 825 | 562979, 703746, 879575, 1099582, 1374389, | ||
| 826 | 1717986, 2147483, 2684354, 3355443, 4194304, | ||
| 827 | 5244160, 6557201, 8196502, 10250518, 12782640, | ||
| 828 | 16025997, 19976592, 24970740, 31350126, 39045157, | ||
| 829 | 49367440, 61356675, 76695844, 95443717, 119304647, | ||
| 830 | 148102320, 186737708, 238609294, 286331153, | ||
| 831 | }; | ||
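A quick, standalone sanity check of the two tables above (illustration only, not part of the patch): each prio_to_wmult entry is roughly 2^32 divided by the matching prio_to_weight entry, which lets the scheduler turn "divide by weight" into "multiply by inv_weight, shift right by 32", and two CPU-bound tasks one nice level apart split the CPU about 55%/45%, which is the "10% effect" described in the comment. NICE_0_LOAD is assumed to be 1024 here.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Values taken from the tables above; NICE_0_LOAD assumed to be 1024. */
	const int w_nice0 = 1024, w_nice1 = 819;

	/* inv_weight ~= 2^32 / weight: expect 4194304, the nice-0 wmult entry. */
	uint64_t inv_nice0 = (1ULL << 32) / w_nice0;
	printf("inv_weight(nice 0) = %llu\n", (unsigned long long)inv_nice0);

	/* Two CPU hogs, nice 0 vs nice 1, share the CPU in proportion to weight. */
	double share0 = (double)w_nice0 / (w_nice0 + w_nice1);
	printf("nice 0: %.1f%%  nice 1: %.1f%%\n",
	       100.0 * share0, 100.0 * (1.0 - share0));
	return 0;
}

Precomputing the prio_to_wmult[] column presumably lets the hot path avoid a 64-bit division at run time.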
| 797 | 832 | ||
| 798 | static inline void | 833 | static inline void |
| 799 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 834 | inc_load(struct rq *rq, const struct task_struct *p, u64 now) |
| 800 | { | 835 | { |
| 801 | rq->raw_weighted_load += p->load_weight; | 836 | update_curr_load(rq, now); |
| 837 | update_load_add(&rq->ls.load, p->se.load.weight); | ||
| 802 | } | 838 | } |
| 803 | 839 | ||
| 804 | static inline void | 840 | static inline void |
| 805 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 841 | dec_load(struct rq *rq, const struct task_struct *p, u64 now) |
| 806 | { | 842 | { |
| 807 | rq->raw_weighted_load -= p->load_weight; | 843 | update_curr_load(rq, now); |
| 844 | update_load_sub(&rq->ls.load, p->se.load.weight); | ||
| 808 | } | 845 | } |
| 809 | 846 | ||
| 810 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | 847 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
| 811 | { | 848 | { |
| 812 | rq->nr_running++; | 849 | rq->nr_running++; |
| 813 | inc_raw_weighted_load(rq, p); | 850 | inc_load(rq, p, now); |
| 814 | } | 851 | } |
| 815 | 852 | ||
| 816 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | 853 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
| 817 | { | 854 | { |
| 818 | rq->nr_running--; | 855 | rq->nr_running--; |
| 819 | dec_raw_weighted_load(rq, p); | 856 | dec_load(rq, p, now); |
| 820 | } | 857 | } |
| 821 | 858 | ||
| 859 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | ||
| 860 | |||
| 861 | /* | ||
| 862 | * runqueue iterator, to support SMP load-balancing between different | ||
| 863 | * scheduling classes, without having to expose their internal data | ||
| 864 | * structures to the load-balancing proper: | ||
| 865 | */ | ||
| 866 | struct rq_iterator { | ||
| 867 | void *arg; | ||
| 868 | struct task_struct *(*start)(void *); | ||
| 869 | struct task_struct *(*next)(void *); | ||
| 870 | }; | ||
| 871 | |||
| 872 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 873 | unsigned long max_nr_move, unsigned long max_load_move, | ||
| 874 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 875 | int *all_pinned, unsigned long *load_moved, | ||
| 876 | int this_best_prio, int best_prio, int best_prio_seen, | ||
| 877 | struct rq_iterator *iterator); | ||
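As a hedged sketch of how a scheduling class might feed this iterator (illustration only; every name below other than rq_iterator, task_struct and run_list is invented for the example), a class that keeps its runnable tasks on a plain linked list could expose them to balance_tasks() like this:

#include <linux/list.h>
#include <linux/sched.h>

/* Hypothetical per-class state handed to the iterator via ->arg. */
struct example_lb_state {
	struct list_head *head;		/* list of runnable tasks, via run_list */
	struct list_head *pos;
};

static struct task_struct *example_lb_next(void *arg)
{
	struct example_lb_state *st = arg;

	st->pos = st->pos->next;
	if (st->pos == st->head)
		return NULL;		/* walked the whole list */
	return list_entry(st->pos, struct task_struct, run_list);
}

static struct task_struct *example_lb_start(void *arg)
{
	struct example_lb_state *st = arg;

	st->pos = st->head;
	return example_lb_next(arg);
}

The class then wraps these two callbacks and its state pointer in a struct rq_iterator and hands that to balance_tasks(), so the balancing core never touches the class's queue layout directly.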
| 878 | |||
| 879 | #include "sched_stats.h" | ||
| 880 | #include "sched_rt.c" | ||
| 881 | #include "sched_fair.c" | ||
| 882 | #include "sched_idletask.c" | ||
| 883 | #ifdef CONFIG_SCHED_DEBUG | ||
| 884 | # include "sched_debug.c" | ||
| 885 | #endif | ||
| 886 | |||
| 887 | #define sched_class_highest (&rt_sched_class) | ||
| 888 | |||
| 822 | static void set_load_weight(struct task_struct *p) | 889 | static void set_load_weight(struct task_struct *p) |
| 823 | { | 890 | { |
| 891 | task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; | ||
| 892 | p->se.wait_runtime = 0; | ||
| 893 | |||
| 824 | if (task_has_rt_policy(p)) { | 894 | if (task_has_rt_policy(p)) { |
| 825 | #ifdef CONFIG_SMP | 895 | p->se.load.weight = prio_to_weight[0] * 2; |
| 826 | if (p == task_rq(p)->migration_thread) | 896 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
| 827 | /* | 897 | return; |
| 828 | * The migration thread does the actual balancing. | 898 | } |
| 829 | * Giving its load any weight will skew balancing | ||
| 830 | * adversely. | ||
| 831 | */ | ||
| 832 | p->load_weight = 0; | ||
| 833 | else | ||
| 834 | #endif | ||
| 835 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
| 836 | } else | ||
| 837 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
| 838 | } | ||
| 839 | 899 | ||
| 840 | /* | 900 | /* |
| 841 | * Adding/removing a task to/from a priority array: | 901 | * SCHED_IDLE tasks get minimal weight: |
| 842 | */ | 902 | */ |
| 843 | static void dequeue_task(struct task_struct *p, struct prio_array *array) | 903 | if (p->policy == SCHED_IDLE) { |
| 844 | { | 904 | p->se.load.weight = WEIGHT_IDLEPRIO; |
| 845 | array->nr_active--; | 905 | p->se.load.inv_weight = WMULT_IDLEPRIO; |
| 846 | list_del(&p->run_list); | 906 | return; |
| 847 | if (list_empty(array->queue + p->prio)) | 907 | } |
| 848 | __clear_bit(p->prio, array->bitmap); | ||
| 849 | } | ||
| 850 | 908 | ||
| 851 | static void enqueue_task(struct task_struct *p, struct prio_array *array) | 909 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; |
| 852 | { | 910 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
| 853 | sched_info_queued(p); | ||
| 854 | list_add_tail(&p->run_list, array->queue + p->prio); | ||
| 855 | __set_bit(p->prio, array->bitmap); | ||
| 856 | array->nr_active++; | ||
| 857 | p->array = array; | ||
| 858 | } | 911 | } |
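For reference, the indexing arithmetic in the last two lines of set_load_weight(): with the usual MAX_RT_PRIO of 100 and nice 0 mapping to static_prio 120 (these constants are assumptions of this sketch, not shown in the hunk), static_prio - MAX_RT_PRIO maps nice -20..+19 onto table indices 0..39, so a nice 0 task lands on index 20, the NICE_0_LOAD entry.

#include <stdio.h>

#define MAX_RT_PRIO		100		/* assumed kernel value */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice += 13) {
		int static_prio = NICE_TO_PRIO(nice);
		printf("nice %3d -> static_prio %3d -> table index %2d\n",
		       nice, static_prio, static_prio - MAX_RT_PRIO);
	}
	return 0;
}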
| 859 | 912 | ||
| 860 | /* | 913 | static void |
| 861 | * Put task to the end of the run list without the overhead of dequeue | 914 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) |
| 862 | * followed by enqueue. | ||
| 863 | */ | ||
| 864 | static void requeue_task(struct task_struct *p, struct prio_array *array) | ||
| 865 | { | 915 | { |
| 866 | list_move_tail(&p->run_list, array->queue + p->prio); | 916 | sched_info_queued(p); |
| 917 | p->sched_class->enqueue_task(rq, p, wakeup, now); | ||
| 918 | p->se.on_rq = 1; | ||
| 867 | } | 919 | } |
| 868 | 920 | ||
| 869 | static inline void | 921 | static void |
| 870 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | 922 | dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) |
| 871 | { | 923 | { |
| 872 | list_add(&p->run_list, array->queue + p->prio); | 924 | p->sched_class->dequeue_task(rq, p, sleep, now); |
| 873 | __set_bit(p->prio, array->bitmap); | 925 | p->se.on_rq = 0; |
| 874 | array->nr_active++; | ||
| 875 | p->array = array; | ||
| 876 | } | 926 | } |
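Taken together, the wrappers above and the hooks used elsewhere in this patch (check_preempt_curr(), task_new(), and the per-class load_balance() called from move_tasks() further down) imply a per-class method table along these lines. This is a sketch reconstructed from the call sites in this diff; the return type of load_balance() and any members not referenced here are assumptions.

#include <linux/sched.h>	/* task_struct, sched_domain, cpu_idle_type, u64 */

struct rq;			/* private to sched.c */

struct sched_class {
	struct sched_class *next;	/* classes form a priority-ordered chain */

	void (*enqueue_task)(struct rq *rq, struct task_struct *p,
			     int wakeup, u64 now);
	void (*dequeue_task)(struct rq *rq, struct task_struct *p,
			     int sleep, u64 now);
	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p);
	void (*task_new)(struct rq *rq, struct task_struct *p);

	/* Moves up to max_nr_move tasks / max_load_move weighted load and
	 * reports the weight actually moved via *load_moved. */
	int (*load_balance)(struct rq *this_rq, int this_cpu,
			    struct rq *busiest,
			    unsigned long max_nr_move,
			    unsigned long max_load_move,
			    struct sched_domain *sd, enum cpu_idle_type idle,
			    int *all_pinned, unsigned long *load_moved);
};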
| 877 | 927 | ||
| 878 | /* | 928 | /* |
| 879 | * __normal_prio - return the priority that is based on the static | 929 | * __normal_prio - return the priority that is based on the static prio |
| 880 | * priority but is modified by bonuses/penalties. | ||
| 881 | * | ||
| 882 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | ||
| 883 | * into the -5 ... 0 ... +5 bonus/penalty range. | ||
| 884 | * | ||
| 885 | * We use 25% of the full 0...39 priority range so that: | ||
| 886 | * | ||
| 887 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | ||
| 888 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | ||
| 889 | * | ||
| 890 | * Both properties are important to certain workloads. | ||
| 891 | */ | 930 | */ |
| 892 | |||
| 893 | static inline int __normal_prio(struct task_struct *p) | 931 | static inline int __normal_prio(struct task_struct *p) |
| 894 | { | 932 | { |
| 895 | int bonus, prio; | 933 | return p->static_prio; |
| 896 | |||
| 897 | bonus = 0; | ||
| 898 | |||
| 899 | prio = p->static_prio - bonus; | ||
| 900 | if (prio < MAX_RT_PRIO) | ||
| 901 | prio = MAX_RT_PRIO; | ||
| 902 | if (prio > MAX_PRIO-1) | ||
| 903 | prio = MAX_PRIO-1; | ||
| 904 | return prio; | ||
| 905 | } | 934 | } |
| 906 | 935 | ||
| 907 | /* | 936 | /* |
| @@ -943,84 +972,45 @@ static int effective_prio(struct task_struct *p) | |||
| 943 | } | 972 | } |
| 944 | 973 | ||
| 945 | /* | 974 | /* |
| 946 | * __activate_task - move a task to the runqueue. | 975 | * activate_task - move a task to the runqueue. |
| 947 | */ | 976 | */ |
| 948 | static void __activate_task(struct task_struct *p, struct rq *rq) | 977 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) |
| 949 | { | 978 | { |
| 950 | struct prio_array *target = rq->active; | 979 | u64 now = rq_clock(rq); |
| 951 | 980 | ||
| 952 | if (batch_task(p)) | 981 | if (p->state == TASK_UNINTERRUPTIBLE) |
| 953 | target = rq->expired; | 982 | rq->nr_uninterruptible--; |
| 954 | enqueue_task(p, target); | ||
| 955 | inc_nr_running(p, rq); | ||
| 956 | } | ||
| 957 | |||
| 958 | /* | ||
| 959 | * __activate_idle_task - move idle task to the _front_ of runqueue. | ||
| 960 | */ | ||
| 961 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) | ||
| 962 | { | ||
| 963 | enqueue_task_head(p, rq->active); | ||
| 964 | inc_nr_running(p, rq); | ||
| 965 | } | ||
| 966 | 983 | ||
| 967 | /* | 984 | enqueue_task(rq, p, wakeup, now); |
| 968 | * Recalculate p->normal_prio and p->prio after having slept, | 985 | inc_nr_running(p, rq, now); |
| 969 | * updating the sleep-average too: | ||
| 970 | */ | ||
| 971 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
| 972 | { | ||
| 973 | return effective_prio(p); | ||
| 974 | } | 986 | } |
| 975 | 987 | ||
| 976 | /* | 988 | /* |
| 977 | * activate_task - move a task to the runqueue and do priority recalculation | 989 | * activate_idle_task - move idle task to the _front_ of runqueue. |
| 978 | * | ||
| 979 | * Update all the scheduling statistics stuff. (sleep average | ||
| 980 | * calculation, priority modifiers, etc.) | ||
| 981 | */ | 990 | */ |
| 982 | static void activate_task(struct task_struct *p, struct rq *rq, int local) | 991 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) |
| 983 | { | 992 | { |
| 984 | unsigned long long now; | 993 | u64 now = rq_clock(rq); |
| 985 | |||
| 986 | if (rt_task(p)) | ||
| 987 | goto out; | ||
| 988 | |||
| 989 | now = sched_clock(); | ||
| 990 | #ifdef CONFIG_SMP | ||
| 991 | if (!local) { | ||
| 992 | /* Compensate for drifting sched_clock */ | ||
| 993 | struct rq *this_rq = this_rq(); | ||
| 994 | now = (now - this_rq->most_recent_timestamp) | ||
| 995 | + rq->most_recent_timestamp; | ||
| 996 | } | ||
| 997 | #endif | ||
| 998 | 994 | ||
| 999 | /* | 995 | if (p->state == TASK_UNINTERRUPTIBLE) |
| 1000 | * Sleep time is in units of nanosecs, so shift by 20 to get a | 996 | rq->nr_uninterruptible--; |
| 1001 | * milliseconds-range estimation of the amount of time that the task | ||
| 1002 | * spent sleeping: | ||
| 1003 | */ | ||
| 1004 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
| 1005 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
| 1006 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | ||
| 1007 | (now - p->timestamp) >> 20); | ||
| 1008 | } | ||
| 1009 | 997 | ||
| 1010 | p->prio = recalc_task_prio(p, now); | 998 | enqueue_task(rq, p, 0, now); |
| 1011 | p->timestamp = now; | 999 | inc_nr_running(p, rq, now); |
| 1012 | out: | ||
| 1013 | __activate_task(p, rq); | ||
| 1014 | } | 1000 | } |
| 1015 | 1001 | ||
| 1016 | /* | 1002 | /* |
| 1017 | * deactivate_task - remove a task from the runqueue. | 1003 | * deactivate_task - remove a task from the runqueue. |
| 1018 | */ | 1004 | */ |
| 1019 | static void deactivate_task(struct task_struct *p, struct rq *rq) | 1005 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
| 1020 | { | 1006 | { |
| 1021 | dec_nr_running(p, rq); | 1007 | u64 now = rq_clock(rq); |
| 1022 | dequeue_task(p, p->array); | 1008 | |
| 1023 | p->array = NULL; | 1009 | if (p->state == TASK_UNINTERRUPTIBLE) |
| 1010 | rq->nr_uninterruptible++; | ||
| 1011 | |||
| 1012 | dequeue_task(rq, p, sleep, now); | ||
| 1013 | dec_nr_running(p, rq, now); | ||
| 1024 | } | 1014 | } |
| 1025 | 1015 | ||
| 1026 | /** | 1016 | /** |
| @@ -1035,14 +1025,40 @@ inline int task_curr(const struct task_struct *p) | |||
| 1035 | /* Used instead of source_load when we know the type == 0 */ | 1025 | /* Used instead of source_load when we know the type == 0 */ |
| 1036 | unsigned long weighted_cpuload(const int cpu) | 1026 | unsigned long weighted_cpuload(const int cpu) |
| 1037 | { | 1027 | { |
| 1038 | return cpu_rq(cpu)->raw_weighted_load; | 1028 | return cpu_rq(cpu)->ls.load.weight; |
| 1029 | } | ||
| 1030 | |||
| 1031 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
| 1032 | { | ||
| 1033 | #ifdef CONFIG_SMP | ||
| 1034 | task_thread_info(p)->cpu = cpu; | ||
| 1035 | set_task_cfs_rq(p); | ||
| 1036 | #endif | ||
| 1039 | } | 1037 | } |
| 1040 | 1038 | ||
| 1041 | #ifdef CONFIG_SMP | 1039 | #ifdef CONFIG_SMP |
| 1042 | 1040 | ||
| 1043 | void set_task_cpu(struct task_struct *p, unsigned int cpu) | 1041 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
| 1044 | { | 1042 | { |
| 1045 | task_thread_info(p)->cpu = cpu; | 1043 | int old_cpu = task_cpu(p); |
| 1044 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | ||
| 1045 | u64 clock_offset, fair_clock_offset; | ||
| 1046 | |||
| 1047 | clock_offset = old_rq->clock - new_rq->clock; | ||
| 1048 | fair_clock_offset = old_rq->cfs.fair_clock - | ||
| 1049 | new_rq->cfs.fair_clock; | ||
| 1050 | if (p->se.wait_start) | ||
| 1051 | p->se.wait_start -= clock_offset; | ||
| 1052 | if (p->se.wait_start_fair) | ||
| 1053 | p->se.wait_start_fair -= fair_clock_offset; | ||
| 1054 | if (p->se.sleep_start) | ||
| 1055 | p->se.sleep_start -= clock_offset; | ||
| 1056 | if (p->se.block_start) | ||
| 1057 | p->se.block_start -= clock_offset; | ||
| 1058 | if (p->se.sleep_start_fair) | ||
| 1059 | p->se.sleep_start_fair -= fair_clock_offset; | ||
| 1060 | |||
| 1061 | __set_task_cpu(p, new_cpu); | ||
| 1046 | } | 1062 | } |
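Each runqueue keeps its own clock (and CFS fair_clock), and the two can drift apart, so a migrating task's saved timestamps are rebased by the difference between the clocks rather than copied verbatim. A standalone illustration of why the subtraction preserves the quantity that matters, the elapsed time (the numbers are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t old_rq_clock = 1000000, new_rq_clock = 400000;
	uint64_t wait_start_on_old = 970000;	/* started waiting 30000 ns ago */

	uint64_t clock_offset = old_rq_clock - new_rq_clock;
	uint64_t wait_start_on_new = wait_start_on_old - clock_offset;

	/* The elapsed wait is the same no matter which clock measures it. */
	printf("old view: %llu ns, new view: %llu ns\n",
	       (unsigned long long)(old_rq_clock - wait_start_on_old),
	       (unsigned long long)(new_rq_clock - wait_start_on_new));
	return 0;
}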
| 1047 | 1063 | ||
| 1048 | struct migration_req { | 1064 | struct migration_req { |
| @@ -1067,7 +1083,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
| 1067 | * If the task is not on a runqueue (and not running), then | 1083 | * If the task is not on a runqueue (and not running), then |
| 1068 | * it is sufficient to simply update the task's cpu field. | 1084 | * it is sufficient to simply update the task's cpu field. |
| 1069 | */ | 1085 | */ |
| 1070 | if (!p->array && !task_running(rq, p)) { | 1086 | if (!p->se.on_rq && !task_running(rq, p)) { |
| 1071 | set_task_cpu(p, dest_cpu); | 1087 | set_task_cpu(p, dest_cpu); |
| 1072 | return 0; | 1088 | return 0; |
| 1073 | } | 1089 | } |
| @@ -1092,9 +1108,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
| 1092 | void wait_task_inactive(struct task_struct *p) | 1108 | void wait_task_inactive(struct task_struct *p) |
| 1093 | { | 1109 | { |
| 1094 | unsigned long flags; | 1110 | unsigned long flags; |
| 1111 | int running, on_rq; | ||
| 1095 | struct rq *rq; | 1112 | struct rq *rq; |
| 1096 | struct prio_array *array; | ||
| 1097 | int running; | ||
| 1098 | 1113 | ||
| 1099 | repeat: | 1114 | repeat: |
| 1100 | /* | 1115 | /* |
| @@ -1126,7 +1141,7 @@ repeat: | |||
| 1126 | */ | 1141 | */ |
| 1127 | rq = task_rq_lock(p, &flags); | 1142 | rq = task_rq_lock(p, &flags); |
| 1128 | running = task_running(rq, p); | 1143 | running = task_running(rq, p); |
| 1129 | array = p->array; | 1144 | on_rq = p->se.on_rq; |
| 1130 | task_rq_unlock(rq, &flags); | 1145 | task_rq_unlock(rq, &flags); |
| 1131 | 1146 | ||
| 1132 | /* | 1147 | /* |
| @@ -1149,7 +1164,7 @@ repeat: | |||
| 1149 | * running right now), it's preempted, and we should | 1164 | * running right now), it's preempted, and we should |
| 1150 | * yield - it could be a while. | 1165 | * yield - it could be a while. |
| 1151 | */ | 1166 | */ |
| 1152 | if (unlikely(array)) { | 1167 | if (unlikely(on_rq)) { |
| 1153 | yield(); | 1168 | yield(); |
| 1154 | goto repeat; | 1169 | goto repeat; |
| 1155 | } | 1170 | } |
| @@ -1195,11 +1210,12 @@ void kick_process(struct task_struct *p) | |||
| 1195 | static inline unsigned long source_load(int cpu, int type) | 1210 | static inline unsigned long source_load(int cpu, int type) |
| 1196 | { | 1211 | { |
| 1197 | struct rq *rq = cpu_rq(cpu); | 1212 | struct rq *rq = cpu_rq(cpu); |
| 1213 | unsigned long total = weighted_cpuload(cpu); | ||
| 1198 | 1214 | ||
| 1199 | if (type == 0) | 1215 | if (type == 0) |
| 1200 | return rq->raw_weighted_load; | 1216 | return total; |
| 1201 | 1217 | ||
| 1202 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); | 1218 | return min(rq->cpu_load[type-1], total); |
| 1203 | } | 1219 | } |
| 1204 | 1220 | ||
| 1205 | /* | 1221 | /* |
| @@ -1209,11 +1225,12 @@ static inline unsigned long source_load(int cpu, int type) | |||
| 1209 | static inline unsigned long target_load(int cpu, int type) | 1225 | static inline unsigned long target_load(int cpu, int type) |
| 1210 | { | 1226 | { |
| 1211 | struct rq *rq = cpu_rq(cpu); | 1227 | struct rq *rq = cpu_rq(cpu); |
| 1228 | unsigned long total = weighted_cpuload(cpu); | ||
| 1212 | 1229 | ||
| 1213 | if (type == 0) | 1230 | if (type == 0) |
| 1214 | return rq->raw_weighted_load; | 1231 | return total; |
| 1215 | 1232 | ||
| 1216 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | 1233 | return max(rq->cpu_load[type-1], total); |
| 1217 | } | 1234 | } |
| 1218 | 1235 | ||
| 1219 | /* | 1236 | /* |
| @@ -1222,9 +1239,10 @@ static inline unsigned long target_load(int cpu, int type) | |||
| 1222 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1239 | static inline unsigned long cpu_avg_load_per_task(int cpu) |
| 1223 | { | 1240 | { |
| 1224 | struct rq *rq = cpu_rq(cpu); | 1241 | struct rq *rq = cpu_rq(cpu); |
| 1242 | unsigned long total = weighted_cpuload(cpu); | ||
| 1225 | unsigned long n = rq->nr_running; | 1243 | unsigned long n = rq->nr_running; |
| 1226 | 1244 | ||
| 1227 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | 1245 | return n ? total / n : SCHED_LOAD_SCALE; |
| 1228 | } | 1246 | } |
| 1229 | 1247 | ||
| 1230 | /* | 1248 | /* |
| @@ -1455,7 +1473,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1455 | if (!(old_state & state)) | 1473 | if (!(old_state & state)) |
| 1456 | goto out; | 1474 | goto out; |
| 1457 | 1475 | ||
| 1458 | if (p->array) | 1476 | if (p->se.on_rq) |
| 1459 | goto out_running; | 1477 | goto out_running; |
| 1460 | 1478 | ||
| 1461 | cpu = task_cpu(p); | 1479 | cpu = task_cpu(p); |
| @@ -1510,11 +1528,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1510 | * of the current CPU: | 1528 | * of the current CPU: |
| 1511 | */ | 1529 | */ |
| 1512 | if (sync) | 1530 | if (sync) |
| 1513 | tl -= current->load_weight; | 1531 | tl -= current->se.load.weight; |
| 1514 | 1532 | ||
| 1515 | if ((tl <= load && | 1533 | if ((tl <= load && |
| 1516 | tl + target_load(cpu, idx) <= tl_per_task) || | 1534 | tl + target_load(cpu, idx) <= tl_per_task) || |
| 1517 | 100*(tl + p->load_weight) <= imbalance*load) { | 1535 | 100*(tl + p->se.load.weight) <= imbalance*load) { |
| 1518 | /* | 1536 | /* |
| 1519 | * This domain has SD_WAKE_AFFINE and | 1537 | * This domain has SD_WAKE_AFFINE and |
| 1520 | * p is cache cold in this domain, and | 1538 | * p is cache cold in this domain, and |
| @@ -1548,7 +1566,7 @@ out_set_cpu: | |||
| 1548 | old_state = p->state; | 1566 | old_state = p->state; |
| 1549 | if (!(old_state & state)) | 1567 | if (!(old_state & state)) |
| 1550 | goto out; | 1568 | goto out; |
| 1551 | if (p->array) | 1569 | if (p->se.on_rq) |
| 1552 | goto out_running; | 1570 | goto out_running; |
| 1553 | 1571 | ||
| 1554 | this_cpu = smp_processor_id(); | 1572 | this_cpu = smp_processor_id(); |
| @@ -1557,10 +1575,7 @@ out_set_cpu: | |||
| 1557 | 1575 | ||
| 1558 | out_activate: | 1576 | out_activate: |
| 1559 | #endif /* CONFIG_SMP */ | 1577 | #endif /* CONFIG_SMP */ |
| 1560 | if (old_state == TASK_UNINTERRUPTIBLE) | 1578 | activate_task(rq, p, 1); |
| 1561 | rq->nr_uninterruptible--; | ||
| 1562 | |||
| 1563 | activate_task(p, rq, cpu == this_cpu); | ||
| 1564 | /* | 1579 | /* |
| 1565 | * Sync wakeups (i.e. those types of wakeups where the waker | 1580 | * Sync wakeups (i.e. those types of wakeups where the waker |
| 1566 | * has indicated that it will leave the CPU in short order) | 1581 | * has indicated that it will leave the CPU in short order) |
| @@ -1569,10 +1584,8 @@ out_activate: | |||
| 1569 | * the waker guarantees that the freshly woken up task is going | 1584 | * the waker guarantees that the freshly woken up task is going |
| 1570 | * to be considered on this CPU.) | 1585 | * to be considered on this CPU.) |
| 1571 | */ | 1586 | */ |
| 1572 | if (!sync || cpu != this_cpu) { | 1587 | if (!sync || cpu != this_cpu) |
| 1573 | if (TASK_PREEMPTS_CURR(p, rq)) | 1588 | check_preempt_curr(rq, p); |
| 1574 | resched_task(rq->curr); | ||
| 1575 | } | ||
| 1576 | success = 1; | 1589 | success = 1; |
| 1577 | 1590 | ||
| 1578 | out_running: | 1591 | out_running: |
| @@ -1595,19 +1608,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1595 | return try_to_wake_up(p, state, 0); | 1608 | return try_to_wake_up(p, state, 0); |
| 1596 | } | 1609 | } |
| 1597 | 1610 | ||
| 1598 | static void task_running_tick(struct rq *rq, struct task_struct *p); | ||
| 1599 | /* | 1611 | /* |
| 1600 | * Perform scheduler related setup for a newly forked process p. | 1612 | * Perform scheduler related setup for a newly forked process p. |
| 1601 | * p is forked by current. | 1613 | * p is forked by current. |
| 1602 | */ | 1614 | * |
| 1603 | void fastcall sched_fork(struct task_struct *p, int clone_flags) | 1615 | * __sched_fork() is basic setup used by init_idle() too: |
| 1604 | { | 1616 | */ |
| 1605 | int cpu = get_cpu(); | 1617 | static void __sched_fork(struct task_struct *p) |
| 1618 | { | ||
| 1619 | p->se.wait_start_fair = 0; | ||
| 1620 | p->se.wait_start = 0; | ||
| 1621 | p->se.exec_start = 0; | ||
| 1622 | p->se.sum_exec_runtime = 0; | ||
| 1623 | p->se.delta_exec = 0; | ||
| 1624 | p->se.delta_fair_run = 0; | ||
| 1625 | p->se.delta_fair_sleep = 0; | ||
| 1626 | p->se.wait_runtime = 0; | ||
| 1627 | p->se.sum_wait_runtime = 0; | ||
| 1628 | p->se.sum_sleep_runtime = 0; | ||
| 1629 | p->se.sleep_start = 0; | ||
| 1630 | p->se.sleep_start_fair = 0; | ||
| 1631 | p->se.block_start = 0; | ||
| 1632 | p->se.sleep_max = 0; | ||
| 1633 | p->se.block_max = 0; | ||
| 1634 | p->se.exec_max = 0; | ||
| 1635 | p->se.wait_max = 0; | ||
| 1636 | p->se.wait_runtime_overruns = 0; | ||
| 1637 | p->se.wait_runtime_underruns = 0; | ||
| 1606 | 1638 | ||
| 1607 | #ifdef CONFIG_SMP | 1639 | INIT_LIST_HEAD(&p->run_list); |
| 1608 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1640 | p->se.on_rq = 0; |
| 1609 | #endif | ||
| 1610 | set_task_cpu(p, cpu); | ||
| 1611 | 1641 | ||
| 1612 | /* | 1642 | /* |
| 1613 | * We mark the process as running here, but have not actually | 1643 | * We mark the process as running here, but have not actually |
| @@ -1616,16 +1646,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
| 1616 | * event cannot wake it up and insert it on the runqueue either. | 1646 | * event cannot wake it up and insert it on the runqueue either. |
| 1617 | */ | 1647 | */ |
| 1618 | p->state = TASK_RUNNING; | 1648 | p->state = TASK_RUNNING; |
| 1649 | } | ||
| 1650 | |||
| 1651 | /* | ||
| 1652 | * fork()/clone()-time setup: | ||
| 1653 | */ | ||
| 1654 | void sched_fork(struct task_struct *p, int clone_flags) | ||
| 1655 | { | ||
| 1656 | int cpu = get_cpu(); | ||
| 1657 | |||
| 1658 | __sched_fork(p); | ||
| 1659 | |||
| 1660 | #ifdef CONFIG_SMP | ||
| 1661 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
| 1662 | #endif | ||
| 1663 | __set_task_cpu(p, cpu); | ||
| 1619 | 1664 | ||
| 1620 | /* | 1665 | /* |
| 1621 | * Make sure we do not leak PI boosting priority to the child: | 1666 | * Make sure we do not leak PI boosting priority to the child: |
| 1622 | */ | 1667 | */ |
| 1623 | p->prio = current->normal_prio; | 1668 | p->prio = current->normal_prio; |
| 1624 | 1669 | ||
| 1625 | INIT_LIST_HEAD(&p->run_list); | ||
| 1626 | p->array = NULL; | ||
| 1627 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1670 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 1628 | if (unlikely(sched_info_on())) | 1671 | if (likely(sched_info_on())) |
| 1629 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1672 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
| 1630 | #endif | 1673 | #endif |
| 1631 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1674 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
| @@ -1635,34 +1678,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
| 1635 | /* Want to start with kernel preemption disabled. */ | 1678 | /* Want to start with kernel preemption disabled. */ |
| 1636 | task_thread_info(p)->preempt_count = 1; | 1679 | task_thread_info(p)->preempt_count = 1; |
| 1637 | #endif | 1680 | #endif |
| 1638 | /* | ||
| 1639 | * Share the timeslice between parent and child, thus the | ||
| 1640 | * total amount of pending timeslices in the system doesn't change, | ||
| 1641 | * resulting in more scheduling fairness. | ||
| 1642 | */ | ||
| 1643 | local_irq_disable(); | ||
| 1644 | p->time_slice = (current->time_slice + 1) >> 1; | ||
| 1645 | /* | ||
| 1646 | * The remainder of the first timeslice might be recovered by | ||
| 1647 | * the parent if the child exits early enough. | ||
| 1648 | */ | ||
| 1649 | p->first_time_slice = 1; | ||
| 1650 | current->time_slice >>= 1; | ||
| 1651 | p->timestamp = sched_clock(); | ||
| 1652 | if (unlikely(!current->time_slice)) { | ||
| 1653 | /* | ||
| 1654 | * This case is rare, it happens when the parent has only | ||
| 1655 | * a single jiffy left from its timeslice. Taking the | ||
| 1656 | * runqueue lock is not a problem. | ||
| 1657 | */ | ||
| 1658 | current->time_slice = 1; | ||
| 1659 | task_running_tick(cpu_rq(cpu), current); | ||
| 1660 | } | ||
| 1661 | local_irq_enable(); | ||
| 1662 | put_cpu(); | 1681 | put_cpu(); |
| 1663 | } | 1682 | } |
| 1664 | 1683 | ||
| 1665 | /* | 1684 | /* |
| 1685 | * After fork, child runs first. (default) If set to 0 then | ||
| 1686 | * parent will (try to) run first. | ||
| 1687 | */ | ||
| 1688 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | ||
| 1689 | |||
| 1690 | /* | ||
| 1666 | * wake_up_new_task - wake up a newly created task for the first time. | 1691 | * wake_up_new_task - wake up a newly created task for the first time. |
| 1667 | * | 1692 | * |
| 1668 | * This function will do some initial scheduler statistics housekeeping | 1693 | * This function will do some initial scheduler statistics housekeeping |
| @@ -1671,77 +1696,28 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
| 1671 | */ | 1696 | */ |
| 1672 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 1697 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
| 1673 | { | 1698 | { |
| 1674 | struct rq *rq, *this_rq; | ||
| 1675 | unsigned long flags; | 1699 | unsigned long flags; |
| 1676 | int this_cpu, cpu; | 1700 | struct rq *rq; |
| 1701 | int this_cpu; | ||
| 1677 | 1702 | ||
| 1678 | rq = task_rq_lock(p, &flags); | 1703 | rq = task_rq_lock(p, &flags); |
| 1679 | BUG_ON(p->state != TASK_RUNNING); | 1704 | BUG_ON(p->state != TASK_RUNNING); |
| 1680 | this_cpu = smp_processor_id(); | 1705 | this_cpu = smp_processor_id(); /* parent's CPU */ |
| 1681 | cpu = task_cpu(p); | ||
| 1682 | |||
| 1683 | /* | ||
| 1684 | * We decrease the sleep average of forking parents | ||
| 1685 | * and children as well, to keep max-interactive tasks | ||
| 1686 | * from forking tasks that are max-interactive. The parent | ||
| 1687 | * (current) is done further down, under its lock. | ||
| 1688 | */ | ||
| 1689 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | ||
| 1690 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | ||
| 1691 | 1706 | ||
| 1692 | p->prio = effective_prio(p); | 1707 | p->prio = effective_prio(p); |
| 1693 | 1708 | ||
| 1694 | if (likely(cpu == this_cpu)) { | 1709 | if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || |
| 1695 | if (!(clone_flags & CLONE_VM)) { | 1710 | task_cpu(p) != this_cpu || !current->se.on_rq) { |
| 1696 | /* | 1711 | activate_task(rq, p, 0); |
| 1697 | * The VM isn't cloned, so we're in a good position to | ||
| 1698 | * do child-runs-first in anticipation of an exec. This | ||
| 1699 | * usually avoids a lot of COW overhead. | ||
| 1700 | */ | ||
| 1701 | if (unlikely(!current->array)) | ||
| 1702 | __activate_task(p, rq); | ||
| 1703 | else { | ||
| 1704 | p->prio = current->prio; | ||
| 1705 | p->normal_prio = current->normal_prio; | ||
| 1706 | list_add_tail(&p->run_list, ¤t->run_list); | ||
| 1707 | p->array = current->array; | ||
| 1708 | p->array->nr_active++; | ||
| 1709 | inc_nr_running(p, rq); | ||
| 1710 | } | ||
| 1711 | set_need_resched(); | ||
| 1712 | } else | ||
| 1713 | /* Run child last */ | ||
| 1714 | __activate_task(p, rq); | ||
| 1715 | /* | ||
| 1716 | * We skip the following code due to cpu == this_cpu | ||
| 1717 | * | ||
| 1718 | * task_rq_unlock(rq, &flags); | ||
| 1719 | * this_rq = task_rq_lock(current, &flags); | ||
| 1720 | */ | ||
| 1721 | this_rq = rq; | ||
| 1722 | } else { | 1712 | } else { |
| 1723 | this_rq = cpu_rq(this_cpu); | ||
| 1724 | |||
| 1725 | /* | 1713 | /* |
| 1726 | * Not the local CPU - must adjust timestamp. This should | 1714 | * Let the scheduling class do new task startup |
| 1727 | * get optimised away in the !CONFIG_SMP case. | 1715 | * management (if any): |
| 1728 | */ | 1716 | */ |
| 1729 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) | 1717 | p->sched_class->task_new(rq, p); |
| 1730 | + rq->most_recent_timestamp; | ||
| 1731 | __activate_task(p, rq); | ||
| 1732 | if (TASK_PREEMPTS_CURR(p, rq)) | ||
| 1733 | resched_task(rq->curr); | ||
| 1734 | |||
| 1735 | /* | ||
| 1736 | * Parent and child are on different CPUs, now get the | ||
| 1737 | * parent runqueue to update the parent's ->sleep_avg: | ||
| 1738 | */ | ||
| 1739 | task_rq_unlock(rq, &flags); | ||
| 1740 | this_rq = task_rq_lock(current, &flags); | ||
| 1741 | } | 1718 | } |
| 1742 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | 1719 | check_preempt_curr(rq, p); |
| 1743 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | 1720 | task_rq_unlock(rq, &flags); |
| 1744 | task_rq_unlock(this_rq, &flags); | ||
| 1745 | } | 1721 | } |
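The condition above reads more easily inverted: the class's task_new() startup path (the place where "child runs first" can be implemented) is only taken when every one of the listed conditions holds; in all other cases the child is simply enqueued like a normal wakeup. A standalone restatement, illustration only:

#include <stdio.h>

/* Inverse of the if() condition in wake_up_new_task() above. */
static int use_task_new(int child_runs_first, int clone_vm,
			int same_cpu_as_parent, int parent_on_rq)
{
	return child_runs_first && !clone_vm &&
	       same_cpu_as_parent && parent_on_rq;
}

int main(void)
{
	printf("plain fork on parent's CPU: %d\n", use_task_new(1, 0, 1, 1));
	printf("thread (CLONE_VM) creation: %d\n", use_task_new(1, 1, 1, 1));
	return 0;
}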
| 1746 | 1722 | ||
| 1747 | /** | 1723 | /** |
| @@ -1833,13 +1809,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
| 1833 | * context_switch - switch to the new MM and the new | 1809 | * context_switch - switch to the new MM and the new |
| 1834 | * thread's register state. | 1810 | * thread's register state. |
| 1835 | */ | 1811 | */ |
| 1836 | static inline struct task_struct * | 1812 | static inline void |
| 1837 | context_switch(struct rq *rq, struct task_struct *prev, | 1813 | context_switch(struct rq *rq, struct task_struct *prev, |
| 1838 | struct task_struct *next) | 1814 | struct task_struct *next) |
| 1839 | { | 1815 | { |
| 1840 | struct mm_struct *mm = next->mm; | 1816 | struct mm_struct *mm, *oldmm; |
| 1841 | struct mm_struct *oldmm = prev->active_mm; | ||
| 1842 | 1817 | ||
| 1818 | prepare_task_switch(rq, next); | ||
| 1819 | mm = next->mm; | ||
| 1820 | oldmm = prev->active_mm; | ||
| 1843 | /* | 1821 | /* |
| 1844 | * For paravirt, this is coupled with an exit in switch_to to | 1822 | * For paravirt, this is coupled with an exit in switch_to to |
| 1845 | * combine the page table reload and the switch backend into | 1823 | * combine the page table reload and the switch backend into |
| @@ -1847,16 +1825,15 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 1847 | */ | 1825 | */ |
| 1848 | arch_enter_lazy_cpu_mode(); | 1826 | arch_enter_lazy_cpu_mode(); |
| 1849 | 1827 | ||
| 1850 | if (!mm) { | 1828 | if (unlikely(!mm)) { |
| 1851 | next->active_mm = oldmm; | 1829 | next->active_mm = oldmm; |
| 1852 | atomic_inc(&oldmm->mm_count); | 1830 | atomic_inc(&oldmm->mm_count); |
| 1853 | enter_lazy_tlb(oldmm, next); | 1831 | enter_lazy_tlb(oldmm, next); |
| 1854 | } else | 1832 | } else |
| 1855 | switch_mm(oldmm, mm, next); | 1833 | switch_mm(oldmm, mm, next); |
| 1856 | 1834 | ||
| 1857 | if (!prev->mm) { | 1835 | if (unlikely(!prev->mm)) { |
| 1858 | prev->active_mm = NULL; | 1836 | prev->active_mm = NULL; |
| 1859 | WARN_ON(rq->prev_mm); | ||
| 1860 | rq->prev_mm = oldmm; | 1837 | rq->prev_mm = oldmm; |
| 1861 | } | 1838 | } |
| 1862 | /* | 1839 | /* |
| @@ -1872,7 +1849,13 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 1872 | /* Here we just switch the register state and the stack. */ | 1849 | /* Here we just switch the register state and the stack. */ |
| 1873 | switch_to(prev, next, prev); | 1850 | switch_to(prev, next, prev); |
| 1874 | 1851 | ||
| 1875 | return prev; | 1852 | barrier(); |
| 1853 | /* | ||
| 1854 | * this_rq must be evaluated again because prev may have moved | ||
| 1855 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
| 1856 | * frame will be invalid. | ||
| 1857 | */ | ||
| 1858 | finish_task_switch(this_rq(), prev); | ||
| 1876 | } | 1859 | } |
| 1877 | 1860 | ||
| 1878 | /* | 1861 | /* |
| @@ -1945,17 +1928,65 @@ unsigned long nr_active(void) | |||
| 1945 | return running + uninterruptible; | 1928 | return running + uninterruptible; |
| 1946 | } | 1929 | } |
| 1947 | 1930 | ||
| 1948 | #ifdef CONFIG_SMP | ||
| 1949 | |||
| 1950 | /* | 1931 | /* |
| 1951 | * Is this task likely cache-hot: | 1932 | * Update rq->cpu_load[] statistics. This function is usually called every |
| 1933 | * scheduler tick (TICK_NSEC). | ||
| 1952 | */ | 1934 | */ |
| 1953 | static inline int | 1935 | static void update_cpu_load(struct rq *this_rq) |
| 1954 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | ||
| 1955 | { | 1936 | { |
| 1956 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | 1937 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; |
| 1938 | unsigned long total_load = this_rq->ls.load.weight; | ||
| 1939 | unsigned long this_load = total_load; | ||
| 1940 | struct load_stat *ls = &this_rq->ls; | ||
| 1941 | u64 now = __rq_clock(this_rq); | ||
| 1942 | int i, scale; | ||
| 1943 | |||
| 1944 | this_rq->nr_load_updates++; | ||
| 1945 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | ||
| 1946 | goto do_avg; | ||
| 1947 | |||
| 1948 | /* Update delta_fair/delta_exec fields first */ | ||
| 1949 | update_curr_load(this_rq, now); | ||
| 1950 | |||
| 1951 | fair_delta64 = ls->delta_fair + 1; | ||
| 1952 | ls->delta_fair = 0; | ||
| 1953 | |||
| 1954 | exec_delta64 = ls->delta_exec + 1; | ||
| 1955 | ls->delta_exec = 0; | ||
| 1956 | |||
| 1957 | sample_interval64 = now - ls->load_update_last; | ||
| 1958 | ls->load_update_last = now; | ||
| 1959 | |||
| 1960 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | ||
| 1961 | sample_interval64 = TICK_NSEC; | ||
| 1962 | |||
| 1963 | if (exec_delta64 > sample_interval64) | ||
| 1964 | exec_delta64 = sample_interval64; | ||
| 1965 | |||
| 1966 | idle_delta64 = sample_interval64 - exec_delta64; | ||
| 1967 | |||
| 1968 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | ||
| 1969 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | ||
| 1970 | |||
| 1971 | this_load = (unsigned long)tmp64; | ||
| 1972 | |||
| 1973 | do_avg: | ||
| 1974 | |||
| 1975 | /* Update our load: */ | ||
| 1976 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
| 1977 | unsigned long old_load, new_load; | ||
| 1978 | |||
| 1979 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
| 1980 | |||
| 1981 | old_load = this_rq->cpu_load[i]; | ||
| 1982 | new_load = this_load; | ||
| 1983 | |||
| 1984 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | ||
| 1985 | } | ||
| 1957 | } | 1986 | } |
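The closing loop above maintains a family of exponentially decaying load averages: entry i folds the new sample in with weight 1/2^i, so cpu_load[0] tracks the instantaneous load and higher indices react progressively more slowly. A standalone illustration of the recurrence (CPU_LOAD_IDX_MAX is assumed to be 5):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5	/* assumed to match the kernel's value */

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	unsigned long samples[8] = { 1024, 1024, 0, 0, 0, 0, 0, 0 };
	int s, i, scale;

	for (s = 0; s < 8; s++) {
		for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
			/* Same recurrence as the kernel loop above. */
			cpu_load[i] = (cpu_load[i] * (scale - 1) + samples[s]) >> i;
		}
		printf("after sample %d: [0]=%lu [1]=%lu [4]=%lu\n",
		       s, cpu_load[0], cpu_load[1], cpu_load[4]);
	}
	return 0;
}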
| 1958 | 1987 | ||
| 1988 | #ifdef CONFIG_SMP | ||
| 1989 | |||
| 1959 | /* | 1990 | /* |
| 1960 | * double_rq_lock - safely lock two runqueues | 1991 | * double_rq_lock - safely lock two runqueues |
| 1961 | * | 1992 | * |
| @@ -2072,23 +2103,17 @@ void sched_exec(void) | |||
| 2072 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2103 | * pull_task - move a task from a remote runqueue to the local runqueue. |
| 2073 | * Both runqueues must be locked. | 2104 | * Both runqueues must be locked. |
| 2074 | */ | 2105 | */ |
| 2075 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, | 2106 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
| 2076 | struct task_struct *p, struct rq *this_rq, | 2107 | struct rq *this_rq, int this_cpu) |
| 2077 | struct prio_array *this_array, int this_cpu) | ||
| 2078 | { | 2108 | { |
| 2079 | dequeue_task(p, src_array); | 2109 | deactivate_task(src_rq, p, 0); |
| 2080 | dec_nr_running(p, src_rq); | ||
| 2081 | set_task_cpu(p, this_cpu); | 2110 | set_task_cpu(p, this_cpu); |
| 2082 | inc_nr_running(p, this_rq); | 2111 | activate_task(this_rq, p, 0); |
| 2083 | enqueue_task(p, this_array); | ||
| 2084 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) | ||
| 2085 | + this_rq->most_recent_timestamp; | ||
| 2086 | /* | 2112 | /* |
| 2087 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2113 | * Note that idle threads have a prio of MAX_PRIO, for this test |
| 2088 | * to be always true for them. | 2114 | * to be always true for them. |
| 2089 | */ | 2115 | */ |
| 2090 | if (TASK_PREEMPTS_CURR(p, this_rq)) | 2116 | check_preempt_curr(this_rq, p); |
| 2091 | resched_task(this_rq->curr); | ||
| 2092 | } | 2117 | } |
| 2093 | 2118 | ||
| 2094 | /* | 2119 | /* |
| @@ -2113,132 +2138,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
| 2113 | return 0; | 2138 | return 0; |
| 2114 | 2139 | ||
| 2115 | /* | 2140 | /* |
| 2116 | * Aggressive migration if: | 2141 | * Aggressive migration if too many balance attempts have failed: |
| 2117 | * 1) task is cache cold, or | ||
| 2118 | * 2) too many balance attempts have failed. | ||
| 2119 | */ | 2142 | */ |
| 2120 | 2143 | if (sd->nr_balance_failed > sd->cache_nice_tries) | |
| 2121 | if (sd->nr_balance_failed > sd->cache_nice_tries) { | ||
| 2122 | #ifdef CONFIG_SCHEDSTATS | ||
| 2123 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
| 2124 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 2125 | #endif | ||
| 2126 | return 1; | 2144 | return 1; |
| 2127 | } | ||
| 2128 | 2145 | ||
| 2129 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
| 2130 | return 0; | ||
| 2131 | return 1; | 2146 | return 1; |
| 2132 | } | 2147 | } |
| 2133 | 2148 | ||
| 2134 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | 2149 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 2135 | |||
| 2136 | /* | ||
| 2137 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
| 2138 | * load from busiest to this_rq, as part of a balancing operation within | ||
| 2139 | * "domain". Returns the number of tasks moved. | ||
| 2140 | * | ||
| 2141 | * Called with both runqueues locked. | ||
| 2142 | */ | ||
| 2143 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 2144 | unsigned long max_nr_move, unsigned long max_load_move, | 2150 | unsigned long max_nr_move, unsigned long max_load_move, |
| 2145 | struct sched_domain *sd, enum cpu_idle_type idle, | 2151 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 2146 | int *all_pinned) | 2152 | int *all_pinned, unsigned long *load_moved, |
| 2153 | int this_best_prio, int best_prio, int best_prio_seen, | ||
| 2154 | struct rq_iterator *iterator) | ||
| 2147 | { | 2155 | { |
| 2148 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, | 2156 | int pulled = 0, pinned = 0, skip_for_load; |
| 2149 | best_prio_seen, skip_for_load; | 2157 | struct task_struct *p; |
| 2150 | struct prio_array *array, *dst_array; | 2158 | long rem_load_move = max_load_move; |
| 2151 | struct list_head *head, *curr; | ||
| 2152 | struct task_struct *tmp; | ||
| 2153 | long rem_load_move; | ||
| 2154 | 2159 | ||
| 2155 | if (max_nr_move == 0 || max_load_move == 0) | 2160 | if (max_nr_move == 0 || max_load_move == 0) |
| 2156 | goto out; | 2161 | goto out; |
| 2157 | 2162 | ||
| 2158 | rem_load_move = max_load_move; | ||
| 2159 | pinned = 1; | 2163 | pinned = 1; |
| 2160 | this_best_prio = rq_best_prio(this_rq); | ||
| 2161 | best_prio = rq_best_prio(busiest); | ||
| 2162 | /* | ||
| 2163 | * Enable handling of the case where there is more than one task | ||
| 2164 | * with the best priority. If the current running task is one | ||
| 2165 | * of those with prio==best_prio we know it won't be moved | ||
| 2166 | * and therefore it's safe to override the skip (based on load) of | ||
| 2167 | * any task we find with that prio. | ||
| 2168 | */ | ||
| 2169 | best_prio_seen = best_prio == busiest->curr->prio; | ||
| 2170 | 2164 | ||
| 2171 | /* | 2165 | /* |
| 2172 | * We first consider expired tasks. Those will likely not be | 2166 | * Start the load-balancing iterator: |
| 2173 | * executed in the near future, and they are most likely to | ||
| 2174 | * be cache-cold, thus switching CPUs has the least effect | ||
| 2175 | * on them. | ||
| 2176 | */ | 2167 | */ |
| 2177 | if (busiest->expired->nr_active) { | 2168 | p = iterator->start(iterator->arg); |
| 2178 | array = busiest->expired; | 2169 | next: |
| 2179 | dst_array = this_rq->expired; | 2170 | if (!p) |
| 2180 | } else { | ||
| 2181 | array = busiest->active; | ||
| 2182 | dst_array = this_rq->active; | ||
| 2183 | } | ||
| 2184 | |||
| 2185 | new_array: | ||
| 2186 | /* Start searching at priority 0: */ | ||
| 2187 | idx = 0; | ||
| 2188 | skip_bitmap: | ||
| 2189 | if (!idx) | ||
| 2190 | idx = sched_find_first_bit(array->bitmap); | ||
| 2191 | else | ||
| 2192 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | ||
| 2193 | if (idx >= MAX_PRIO) { | ||
| 2194 | if (array == busiest->expired && busiest->active->nr_active) { | ||
| 2195 | array = busiest->active; | ||
| 2196 | dst_array = this_rq->active; | ||
| 2197 | goto new_array; | ||
| 2198 | } | ||
| 2199 | goto out; | 2171 | goto out; |
| 2200 | } | ||
| 2201 | |||
| 2202 | head = array->queue + idx; | ||
| 2203 | curr = head->prev; | ||
| 2204 | skip_queue: | ||
| 2205 | tmp = list_entry(curr, struct task_struct, run_list); | ||
| 2206 | |||
| 2207 | curr = curr->prev; | ||
| 2208 | |||
| 2209 | /* | 2172 | /* |
| 2210 | * To help distribute high priority tasks across CPUs we don't | 2173 | * To help distribute high priority tasks across CPUs we don't |
| 2211 | * skip a task if it will be the highest priority task (i.e. smallest | 2174 | * skip a task if it will be the highest priority task (i.e. smallest |
| 2212 | * prio value) on its new queue regardless of its load weight | 2175 | * prio value) on its new queue regardless of its load weight |
| 2213 | */ | 2176 | */ |
| 2214 | skip_for_load = tmp->load_weight > rem_load_move; | 2177 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + |
| 2215 | if (skip_for_load && idx < this_best_prio) | 2178 | SCHED_LOAD_SCALE_FUZZ; |
| 2216 | skip_for_load = !best_prio_seen && idx == best_prio; | 2179 | if (skip_for_load && p->prio < this_best_prio) |
| 2180 | skip_for_load = !best_prio_seen && p->prio == best_prio; | ||
| 2217 | if (skip_for_load || | 2181 | if (skip_for_load || |
| 2218 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2182 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
| 2219 | 2183 | ||
| 2220 | best_prio_seen |= idx == best_prio; | 2184 | best_prio_seen |= p->prio == best_prio; |
| 2221 | if (curr != head) | 2185 | p = iterator->next(iterator->arg); |
| 2222 | goto skip_queue; | 2186 | goto next; |
| 2223 | idx++; | ||
| 2224 | goto skip_bitmap; | ||
| 2225 | } | 2187 | } |
| 2226 | 2188 | ||
| 2227 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2189 | pull_task(busiest, p, this_rq, this_cpu); |
| 2228 | pulled++; | 2190 | pulled++; |
| 2229 | rem_load_move -= tmp->load_weight; | 2191 | rem_load_move -= p->se.load.weight; |
| 2230 | 2192 | ||
| 2231 | /* | 2193 | /* |
| 2232 | * We only want to steal up to the prescribed number of tasks | 2194 | * We only want to steal up to the prescribed number of tasks |
| 2233 | * and the prescribed amount of weighted load. | 2195 | * and the prescribed amount of weighted load. |
| 2234 | */ | 2196 | */ |
| 2235 | if (pulled < max_nr_move && rem_load_move > 0) { | 2197 | if (pulled < max_nr_move && rem_load_move > 0) { |
| 2236 | if (idx < this_best_prio) | 2198 | if (p->prio < this_best_prio) |
| 2237 | this_best_prio = idx; | 2199 | this_best_prio = p->prio; |
| 2238 | if (curr != head) | 2200 | p = iterator->next(iterator->arg); |
| 2239 | goto skip_queue; | 2201 | goto next; |
| 2240 | idx++; | ||
| 2241 | goto skip_bitmap; | ||
| 2242 | } | 2202 | } |
| 2243 | out: | 2203 | out: |
| 2244 | /* | 2204 | /* |
| @@ -2250,18 +2210,48 @@ out: | |||
| 2250 | 2210 | ||
| 2251 | if (all_pinned) | 2211 | if (all_pinned) |
| 2252 | *all_pinned = pinned; | 2212 | *all_pinned = pinned; |
| 2213 | *load_moved = max_load_move - rem_load_move; | ||
| 2253 | return pulled; | 2214 | return pulled; |
| 2254 | } | 2215 | } |
| 2255 | 2216 | ||
| 2256 | /* | 2217 | /* |
| 2218 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | ||
| 2219 | * load from busiest to this_rq, as part of a balancing operation within | ||
| 2220 | * "domain". Returns the number of tasks moved. | ||
| 2221 | * | ||
| 2222 | * Called with both runqueues locked. | ||
| 2223 | */ | ||
| 2224 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 2225 | unsigned long max_nr_move, unsigned long max_load_move, | ||
| 2226 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 2227 | int *all_pinned) | ||
| 2228 | { | ||
| 2229 | struct sched_class *class = sched_class_highest; | ||
| 2230 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | ||
| 2231 | long rem_load_move = max_load_move; | ||
| 2232 | |||
| 2233 | do { | ||
| 2234 | nr_moved = class->load_balance(this_rq, this_cpu, busiest, | ||
| 2235 | max_nr_move, (unsigned long)rem_load_move, | ||
| 2236 | sd, idle, all_pinned, &load_moved); | ||
| 2237 | total_nr_moved += nr_moved; | ||
| 2238 | max_nr_move -= nr_moved; | ||
| 2239 | rem_load_move -= load_moved; | ||
| 2240 | class = class->next; | ||
| 2241 | } while (class && max_nr_move && rem_load_move > 0); | ||
| 2242 | |||
| 2243 | return total_nr_moved; | ||
| 2244 | } | ||
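move_tasks() now just walks the chain that starts at sched_class_highest (the RT class, per the #define earlier in this patch) and lets each class move its own tasks until the move and load budgets are spent. The wiring implied by that walk, given the sched_rt.c / sched_fair.c / sched_idletask.c includes above, is roughly the following sketch; the real initializers live in those files and the method bodies are omitted here.

#include <linux/sched.h>	/* struct sched_class is assumed to be declared here */

/* Sketch only: the chain the ->next walk in move_tasks() relies on. */
static struct sched_class idle_sched_class = {
	.next = NULL,			/* end of the chain */
	/* ... idle-task methods ... */
};

static struct sched_class fair_sched_class = {
	.next = &idle_sched_class,	/* CFS hands off to the idle class */
	/* ... CFS methods ... */
};

static struct sched_class rt_sched_class = {
	.next = &fair_sched_class,	/* sched_class_highest starts here */
	/* ... real-time methods ... */
};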
| 2245 | |||
| 2246 | /* | ||
| 2257 | * find_busiest_group finds and returns the busiest CPU group within the | 2247 | * find_busiest_group finds and returns the busiest CPU group within the |
| 2258 | * domain. It calculates and returns the amount of weighted load which | 2248 | * domain. It calculates and returns the amount of weighted load which |
| 2259 | * should be moved to restore balance via the imbalance parameter. | 2249 | * should be moved to restore balance via the imbalance parameter. |
| 2260 | */ | 2250 | */ |
| 2261 | static struct sched_group * | 2251 | static struct sched_group * |
| 2262 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2252 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 2263 | unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, | 2253 | unsigned long *imbalance, enum cpu_idle_type idle, |
| 2264 | cpumask_t *cpus, int *balance) | 2254 | int *sd_idle, cpumask_t *cpus, int *balance) |
| 2265 | { | 2255 | { |
| 2266 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2256 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 2267 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2257 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| @@ -2325,7 +2315,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2325 | 2315 | ||
| 2326 | avg_load += load; | 2316 | avg_load += load; |
| 2327 | sum_nr_running += rq->nr_running; | 2317 | sum_nr_running += rq->nr_running; |
| 2328 | sum_weighted_load += rq->raw_weighted_load; | 2318 | sum_weighted_load += weighted_cpuload(i); |
| 2329 | } | 2319 | } |
| 2330 | 2320 | ||
| 2331 | /* | 2321 | /* |
| @@ -2365,8 +2355,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2365 | * Busy processors will not participate in power savings | 2355 | * Busy processors will not participate in power savings |
| 2366 | * balance. | 2356 | * balance. |
| 2367 | */ | 2357 | */ |
| 2368 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2358 | if (idle == CPU_NOT_IDLE || |
| 2369 | goto group_next; | 2359 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
| 2360 | goto group_next; | ||
| 2370 | 2361 | ||
| 2371 | /* | 2362 | /* |
| 2372 | * If the local group is idle or completely loaded | 2363 | * If the local group is idle or completely loaded |
| @@ -2376,42 +2367,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2376 | !this_nr_running)) | 2367 | !this_nr_running)) |
| 2377 | power_savings_balance = 0; | 2368 | power_savings_balance = 0; |
| 2378 | 2369 | ||
| 2379 | /* | 2370 | /* |
| 2380 | * If a group is already running at full capacity or idle, | 2371 | * If a group is already running at full capacity or idle, |
| 2381 | * don't include that group in power savings calculations | 2372 | * don't include that group in power savings calculations |
| 2382 | */ | 2373 | */ |
| 2383 | if (!power_savings_balance || sum_nr_running >= group_capacity | 2374 | if (!power_savings_balance || sum_nr_running >= group_capacity |
| 2384 | || !sum_nr_running) | 2375 | || !sum_nr_running) |
| 2385 | goto group_next; | 2376 | goto group_next; |
| 2386 | 2377 | ||
| 2387 | /* | 2378 | /* |
| 2388 | * Calculate the group which has the least non-idle load. | 2379 | * Calculate the group which has the least non-idle load. |
| 2389 | * This is the group from where we need to pick up the load | 2380 | * This is the group from where we need to pick up the load |
| 2390 | * for saving power | 2381 | * for saving power |
| 2391 | */ | 2382 | */ |
| 2392 | if ((sum_nr_running < min_nr_running) || | 2383 | if ((sum_nr_running < min_nr_running) || |
| 2393 | (sum_nr_running == min_nr_running && | 2384 | (sum_nr_running == min_nr_running && |
| 2394 | first_cpu(group->cpumask) < | 2385 | first_cpu(group->cpumask) < |
| 2395 | first_cpu(group_min->cpumask))) { | 2386 | first_cpu(group_min->cpumask))) { |
| 2396 | group_min = group; | 2387 | group_min = group; |
| 2397 | min_nr_running = sum_nr_running; | 2388 | min_nr_running = sum_nr_running; |
| 2398 | min_load_per_task = sum_weighted_load / | 2389 | min_load_per_task = sum_weighted_load / |
| 2399 | sum_nr_running; | 2390 | sum_nr_running; |
| 2400 | } | 2391 | } |
| 2401 | 2392 | ||
| 2402 | /* | 2393 | /* |
| 2403 | * Calculate the group which is almost near its | 2394 | * Calculate the group which is almost near its |
| 2404 | * capacity but still has some space to pick up some load | 2395 | * capacity but still has some space to pick up some load |
| 2405 | * from other group and save more power | 2396 | * from other group and save more power |
| 2406 | */ | 2397 | */ |
| 2407 | if (sum_nr_running <= group_capacity - 1) { | 2398 | if (sum_nr_running <= group_capacity - 1) { |
| 2408 | if (sum_nr_running > leader_nr_running || | 2399 | if (sum_nr_running > leader_nr_running || |
| 2409 | (sum_nr_running == leader_nr_running && | 2400 | (sum_nr_running == leader_nr_running && |
| 2410 | first_cpu(group->cpumask) > | 2401 | first_cpu(group->cpumask) > |
| 2411 | first_cpu(group_leader->cpumask))) { | 2402 | first_cpu(group_leader->cpumask))) { |
| 2412 | group_leader = group; | 2403 | group_leader = group; |
| 2413 | leader_nr_running = sum_nr_running; | 2404 | leader_nr_running = sum_nr_running; |
| 2414 | } | 2405 | } |
| 2415 | } | 2406 | } |
| 2416 | group_next: | 2407 | group_next: |
| 2417 | #endif | 2408 | #endif |
| @@ -2466,7 +2457,7 @@ group_next: | |||
| 2466 | * a think about bumping its value to force at least one task to be | 2457 | * a think about bumping its value to force at least one task to be |
| 2467 | * moved | 2458 | * moved |
| 2468 | */ | 2459 | */ |
| 2469 | if (*imbalance < busiest_load_per_task) { | 2460 | if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { |
| 2470 | unsigned long tmp, pwr_now, pwr_move; | 2461 | unsigned long tmp, pwr_now, pwr_move; |
| 2471 | unsigned int imbn; | 2462 | unsigned int imbn; |
| 2472 | 2463 | ||
| @@ -2480,7 +2471,8 @@ small_imbalance: | |||
| 2480 | } else | 2471 | } else |
| 2481 | this_load_per_task = SCHED_LOAD_SCALE; | 2472 | this_load_per_task = SCHED_LOAD_SCALE; |
| 2482 | 2473 | ||
| 2483 | if (max_load - this_load >= busiest_load_per_task * imbn) { | 2474 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= |
| 2475 | busiest_load_per_task * imbn) { | ||
| 2484 | *imbalance = busiest_load_per_task; | 2476 | *imbalance = busiest_load_per_task; |
| 2485 | return busiest; | 2477 | return busiest; |
| 2486 | } | 2478 | } |
| @@ -2552,17 +2544,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 2552 | int i; | 2544 | int i; |
| 2553 | 2545 | ||
| 2554 | for_each_cpu_mask(i, group->cpumask) { | 2546 | for_each_cpu_mask(i, group->cpumask) { |
| 2547 | unsigned long wl; | ||
| 2555 | 2548 | ||
| 2556 | if (!cpu_isset(i, *cpus)) | 2549 | if (!cpu_isset(i, *cpus)) |
| 2557 | continue; | 2550 | continue; |
| 2558 | 2551 | ||
| 2559 | rq = cpu_rq(i); | 2552 | rq = cpu_rq(i); |
| 2553 | wl = weighted_cpuload(i); | ||
| 2560 | 2554 | ||
| 2561 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) | 2555 | if (rq->nr_running == 1 && wl > imbalance) |
| 2562 | continue; | 2556 | continue; |
| 2563 | 2557 | ||
| 2564 | if (rq->raw_weighted_load > max_load) { | 2558 | if (wl > max_load) { |
| 2565 | max_load = rq->raw_weighted_load; | 2559 | max_load = wl; |
| 2566 | busiest = rq; | 2560 | busiest = rq; |
| 2567 | } | 2561 | } |
| 2568 | } | 2562 | } |
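find_busiest_queue() now caches the per-CPU weighted load once in wl, in place of the old rq->raw_weighted_load reads, then keeps the maximum while skipping CPUs whose single task already exceeds the wanted imbalance. A small sketch of that scan, with plain arrays as stand-ins for the runqueues and the cpumask:

/*
 * Hedged sketch of the busiest-CPU scan: read each CPU's weighted load
 * once, skip CPUs whose lone task is already heavier than the imbalance,
 * and remember the maximum.
 */
static int find_busiest_cpu(const unsigned long *load,
                            const unsigned int *nr_running,
                            int ncpus, unsigned long imbalance)
{
        unsigned long max_load = 0;
        int i, busiest = -1;

        for (i = 0; i < ncpus; i++) {
                unsigned long wl = load[i];     /* weighted_cpuload(i) analogue */

                /* a lone task heavier than the imbalance cannot be moved usefully */
                if (nr_running[i] == 1 && wl > imbalance)
                        continue;

                if (wl > max_load) {
                        max_load = wl;
                        busiest = i;
                }
        }
        return busiest;         /* -1 when no suitable CPU was found */
}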
| @@ -2599,7 +2593,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 2599 | /* | 2593 | /* |
| 2600 | * When power savings policy is enabled for the parent domain, idle | 2594 | * When power savings policy is enabled for the parent domain, idle |
| 2601 | * sibling can pick up load irrespective of busy siblings. In this case, | 2595 | * sibling can pick up load irrespective of busy siblings. In this case, |
| 2602 | * let the state of idle sibling percolate up as IDLE, instead of | 2596 | * let the state of idle sibling percolate up as CPU_IDLE, instead of |
| 2603 | * portraying it as CPU_NOT_IDLE. | 2597 | * portraying it as CPU_NOT_IDLE. |
| 2604 | */ | 2598 | */ |
| 2605 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | 2599 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
| @@ -2822,8 +2816,8 @@ out_balanced: | |||
| 2822 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2816 | static void idle_balance(int this_cpu, struct rq *this_rq) |
| 2823 | { | 2817 | { |
| 2824 | struct sched_domain *sd; | 2818 | struct sched_domain *sd; |
| 2825 | int pulled_task = 0; | 2819 | int pulled_task = -1; |
| 2826 | unsigned long next_balance = jiffies + 60 * HZ; | 2820 | unsigned long next_balance = jiffies + HZ; |
| 2827 | 2821 | ||
| 2828 | for_each_domain(this_cpu, sd) { | 2822 | for_each_domain(this_cpu, sd) { |
| 2829 | unsigned long interval; | 2823 | unsigned long interval; |
| @@ -2842,12 +2836,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 2842 | if (pulled_task) | 2836 | if (pulled_task) |
| 2843 | break; | 2837 | break; |
| 2844 | } | 2838 | } |
| 2845 | if (!pulled_task) | 2839 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
| 2846 | /* | 2840 | /* |
| 2847 | * We are going idle. next_balance may be set based on | 2841 | * We are going idle. next_balance may be set based on |
| 2848 | * a busy processor. So reset next_balance. | 2842 | * a busy processor. So reset next_balance. |
| 2849 | */ | 2843 | */ |
| 2850 | this_rq->next_balance = next_balance; | 2844 | this_rq->next_balance = next_balance; |
| 2845 | } | ||
| 2851 | } | 2846 | } |
| 2852 | 2847 | ||
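idle_balance() now starts pulled_task at -1, shrinks the fallback window from 60*HZ to HZ, and rewrites rq->next_balance either when something was pulled or when the stored deadline is already stale according to time_after(). For reference, a tiny model of the wrap-safe comparison that time_after() performs on jiffies-style counters (a sketch, not the kernel macro):

#include <stdbool.h>
#include <stdint.h>

/*
 * Hedged sketch of a wrap-safe "a is later than b" check for a 32-bit
 * tick counter, modelled on the jiffies time_after() idea.
 */
static bool tick_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;    /* correct even across counter wrap-around */
}

The same trick is what keeps the "jiffies + HZ" deadline comparisons above safe when the tick counter wraps.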
| 2853 | /* | 2848 | /* |
| @@ -2900,32 +2895,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
| 2900 | spin_unlock(&target_rq->lock); | 2895 | spin_unlock(&target_rq->lock); |
| 2901 | } | 2896 | } |
| 2902 | 2897 | ||
| 2903 | static void update_load(struct rq *this_rq) | ||
| 2904 | { | ||
| 2905 | unsigned long this_load; | ||
| 2906 | unsigned int i, scale; | ||
| 2907 | |||
| 2908 | this_load = this_rq->raw_weighted_load; | ||
| 2909 | |||
| 2910 | /* Update our load: */ | ||
| 2911 | for (i = 0, scale = 1; i < 3; i++, scale += scale) { | ||
| 2912 | unsigned long old_load, new_load; | ||
| 2913 | |||
| 2914 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
| 2915 | |||
| 2916 | old_load = this_rq->cpu_load[i]; | ||
| 2917 | new_load = this_load; | ||
| 2918 | /* | ||
| 2919 | * Round up the averaging division if load is increasing. This | ||
| 2920 | * prevents us from getting stuck on 9 if the load is 10, for | ||
| 2921 | * example. | ||
| 2922 | */ | ||
| 2923 | if (new_load > old_load) | ||
| 2924 | new_load += scale-1; | ||
| 2925 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | ||
| 2926 | } | ||
| 2927 | } | ||
| 2928 | |||
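The update_load() removed here maintained the cpu_load[] history: entry i blends in the new sample with weight 1/2^i, rounding up when the load rises so the average cannot stall one step below an increasing load. That bookkeeping appears to move into update_cpu_load(), which the new scheduler_tick() further down calls; a standalone sketch of the averaging itself:

/*
 * Hedged sketch of the removed averaging: cpu_load[i] tracks the load
 * with a 1/2^i weight for new samples; rising loads are rounded up so
 * the average converges onto them instead of getting stuck one below.
 */
#define CPU_LOAD_IDX    3

static void decay_cpu_load(unsigned long *cpu_load, unsigned long this_load)
{
        unsigned int i, scale;

        for (i = 0, scale = 1; i < CPU_LOAD_IDX; i++, scale += scale) {
                unsigned long old_load = cpu_load[i];
                unsigned long new_load = this_load;

                if (new_load > old_load)
                        new_load += scale - 1;
                /* scale is 1 << i, so >> i divides by scale */
                cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
        }
}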
| 2929 | #ifdef CONFIG_NO_HZ | 2898 | #ifdef CONFIG_NO_HZ |
| 2930 | static struct { | 2899 | static struct { |
| 2931 | atomic_t load_balancer; | 2900 | atomic_t load_balancer; |
| @@ -3029,6 +2998,9 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3029 | interval = msecs_to_jiffies(interval); | 2998 | interval = msecs_to_jiffies(interval); |
| 3030 | if (unlikely(!interval)) | 2999 | if (unlikely(!interval)) |
| 3031 | interval = 1; | 3000 | interval = 1; |
| 3001 | if (interval > HZ*NR_CPUS/10) | ||
| 3002 | interval = HZ*NR_CPUS/10; | ||
| 3003 | |||
| 3032 | 3004 | ||
| 3033 | if (sd->flags & SD_SERIALIZE) { | 3005 | if (sd->flags & SD_SERIALIZE) { |
| 3034 | if (!spin_trylock(&balancing)) | 3006 | if (!spin_trylock(&balancing)) |
| @@ -3070,11 +3042,12 @@ out: | |||
| 3070 | */ | 3042 | */ |
| 3071 | static void run_rebalance_domains(struct softirq_action *h) | 3043 | static void run_rebalance_domains(struct softirq_action *h) |
| 3072 | { | 3044 | { |
| 3073 | int local_cpu = smp_processor_id(); | 3045 | int this_cpu = smp_processor_id(); |
| 3074 | struct rq *local_rq = cpu_rq(local_cpu); | 3046 | struct rq *this_rq = cpu_rq(this_cpu); |
| 3075 | enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; | 3047 | enum cpu_idle_type idle = this_rq->idle_at_tick ? |
| 3048 | CPU_IDLE : CPU_NOT_IDLE; | ||
| 3076 | 3049 | ||
| 3077 | rebalance_domains(local_cpu, idle); | 3050 | rebalance_domains(this_cpu, idle); |
| 3078 | 3051 | ||
| 3079 | #ifdef CONFIG_NO_HZ | 3052 | #ifdef CONFIG_NO_HZ |
| 3080 | /* | 3053 | /* |
| @@ -3082,13 +3055,13 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 3082 | * balancing on behalf of the other idle cpus whose ticks are | 3055 | * balancing on behalf of the other idle cpus whose ticks are |
| 3083 | * stopped. | 3056 | * stopped. |
| 3084 | */ | 3057 | */ |
| 3085 | if (local_rq->idle_at_tick && | 3058 | if (this_rq->idle_at_tick && |
| 3086 | atomic_read(&nohz.load_balancer) == local_cpu) { | 3059 | atomic_read(&nohz.load_balancer) == this_cpu) { |
| 3087 | cpumask_t cpus = nohz.cpu_mask; | 3060 | cpumask_t cpus = nohz.cpu_mask; |
| 3088 | struct rq *rq; | 3061 | struct rq *rq; |
| 3089 | int balance_cpu; | 3062 | int balance_cpu; |
| 3090 | 3063 | ||
| 3091 | cpu_clear(local_cpu, cpus); | 3064 | cpu_clear(this_cpu, cpus); |
| 3092 | for_each_cpu_mask(balance_cpu, cpus) { | 3065 | for_each_cpu_mask(balance_cpu, cpus) { |
| 3093 | /* | 3066 | /* |
| 3094 | * If this cpu gets work to do, stop the load balancing | 3067 | * If this cpu gets work to do, stop the load balancing |
| @@ -3098,11 +3071,11 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 3098 | if (need_resched()) | 3071 | if (need_resched()) |
| 3099 | break; | 3072 | break; |
| 3100 | 3073 | ||
| 3101 | rebalance_domains(balance_cpu, CPU_IDLE); | 3074 | rebalance_domains(balance_cpu, SCHED_IDLE); |
| 3102 | 3075 | ||
| 3103 | rq = cpu_rq(balance_cpu); | 3076 | rq = cpu_rq(balance_cpu); |
| 3104 | if (time_after(local_rq->next_balance, rq->next_balance)) | 3077 | if (time_after(this_rq->next_balance, rq->next_balance)) |
| 3105 | local_rq->next_balance = rq->next_balance; | 3078 | this_rq->next_balance = rq->next_balance; |
| 3106 | } | 3079 | } |
| 3107 | } | 3080 | } |
| 3108 | #endif | 3081 | #endif |
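Inside run_rebalance_domains(), the CPU that currently owns the nohz balancer role also rebalances the other tick-stopped CPUs, stops as soon as it has local work to run, and adopts the earliest next_balance deadline it encounters. A simplified model of that pass, with arrays indexed by CPU id standing in for cpumasks and runqueues and a plain '<' where the kernel uses the wrap-safe time_after():

/*
 * Hedged sketch of the nohz pass: the balancer CPU rebalances each other
 * idle CPU and keeps the earliest of their next_balance deadlines.
 */
static void balance_idle_cpus(int this_cpu, unsigned long *my_next_balance,
                              const int *idle_cpu_ids, int nr_idle,
                              const unsigned long *next_balance_of,
                              void (*rebalance)(int cpu),
                              int (*have_local_work)(void))
{
        int i;

        for (i = 0; i < nr_idle; i++) {
                int cpu = idle_cpu_ids[i];

                if (cpu == this_cpu)
                        continue;
                if (have_local_work())
                        break;          /* stop: this CPU has its own task to run */

                rebalance(cpu);
                /* adopt the earliest deadline among the CPUs we service */
                if (next_balance_of[cpu] < *my_next_balance)
                        *my_next_balance = next_balance_of[cpu];
        }
}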
| @@ -3115,9 +3088,8 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 3115 | * idle load balancing owner or decide to stop the periodic load balancing, | 3088 | * idle load balancing owner or decide to stop the periodic load balancing, |
| 3116 | * if the whole system is idle. | 3089 | * if the whole system is idle. |
| 3117 | */ | 3090 | */ |
| 3118 | static inline void trigger_load_balance(int cpu) | 3091 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
| 3119 | { | 3092 | { |
| 3120 | struct rq *rq = cpu_rq(cpu); | ||
| 3121 | #ifdef CONFIG_NO_HZ | 3093 | #ifdef CONFIG_NO_HZ |
| 3122 | /* | 3094 | /* |
| 3123 | * If we were in the nohz mode recently and busy at the current | 3095 | * If we were in the nohz mode recently and busy at the current |
| @@ -3169,13 +3141,29 @@ static inline void trigger_load_balance(int cpu) | |||
| 3169 | if (time_after_eq(jiffies, rq->next_balance)) | 3141 | if (time_after_eq(jiffies, rq->next_balance)) |
| 3170 | raise_softirq(SCHED_SOFTIRQ); | 3142 | raise_softirq(SCHED_SOFTIRQ); |
| 3171 | } | 3143 | } |
| 3172 | #else | 3144 | |
| 3145 | #else /* CONFIG_SMP */ | ||
| 3146 | |||
| 3173 | /* | 3147 | /* |
| 3174 | * on UP we do not need to balance between CPUs: | 3148 | * on UP we do not need to balance between CPUs: |
| 3175 | */ | 3149 | */ |
| 3176 | static inline void idle_balance(int cpu, struct rq *rq) | 3150 | static inline void idle_balance(int cpu, struct rq *rq) |
| 3177 | { | 3151 | { |
| 3178 | } | 3152 | } |
| 3153 | |||
| 3154 | /* Avoid "used but not defined" warning on UP */ | ||
| 3155 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
| 3156 | unsigned long max_nr_move, unsigned long max_load_move, | ||
| 3157 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
| 3158 | int *all_pinned, unsigned long *load_moved, | ||
| 3159 | int this_best_prio, int best_prio, int best_prio_seen, | ||
| 3160 | struct rq_iterator *iterator) | ||
| 3161 | { | ||
| 3162 | *load_moved = 0; | ||
| 3163 | |||
| 3164 | return 0; | ||
| 3165 | } | ||
| 3166 | |||
| 3179 | #endif | 3167 | #endif |
| 3180 | 3168 | ||
| 3181 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3169 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
| @@ -3277,81 +3265,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 3277 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3265 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
| 3278 | } | 3266 | } |
| 3279 | 3267 | ||
| 3280 | static void task_running_tick(struct rq *rq, struct task_struct *p) | ||
| 3281 | { | ||
| 3282 | if (p->array != rq->active) { | ||
| 3283 | /* Task has expired but was not scheduled yet */ | ||
| 3284 | set_tsk_need_resched(p); | ||
| 3285 | return; | ||
| 3286 | } | ||
| 3287 | spin_lock(&rq->lock); | ||
| 3288 | /* | ||
| 3289 | * The task was running during this tick - update the | ||
| 3290 | * time slice counter. Note: we do not update a thread's | ||
| 3291 | * priority until it either goes to sleep or uses up its | ||
| 3292 | * timeslice. This makes it possible for interactive tasks | ||
| 3293 | * to use up their timeslices at their highest priority levels. | ||
| 3294 | */ | ||
| 3295 | if (rt_task(p)) { | ||
| 3296 | /* | ||
| 3297 | * RR tasks need a special form of timeslice management. | ||
| 3298 | * FIFO tasks have no timeslices. | ||
| 3299 | */ | ||
| 3300 | if ((p->policy == SCHED_RR) && !--p->time_slice) { | ||
| 3301 | p->time_slice = task_timeslice(p); | ||
| 3302 | p->first_time_slice = 0; | ||
| 3303 | set_tsk_need_resched(p); | ||
| 3304 | |||
| 3305 | /* put it at the end of the queue: */ | ||
| 3306 | requeue_task(p, rq->active); | ||
| 3307 | } | ||
| 3308 | goto out_unlock; | ||
| 3309 | } | ||
| 3310 | if (!--p->time_slice) { | ||
| 3311 | dequeue_task(p, rq->active); | ||
| 3312 | set_tsk_need_resched(p); | ||
| 3313 | p->prio = effective_prio(p); | ||
| 3314 | p->time_slice = task_timeslice(p); | ||
| 3315 | p->first_time_slice = 0; | ||
| 3316 | |||
| 3317 | if (!rq->expired_timestamp) | ||
| 3318 | rq->expired_timestamp = jiffies; | ||
| 3319 | if (!TASK_INTERACTIVE(p)) { | ||
| 3320 | enqueue_task(p, rq->expired); | ||
| 3321 | if (p->static_prio < rq->best_expired_prio) | ||
| 3322 | rq->best_expired_prio = p->static_prio; | ||
| 3323 | } else | ||
| 3324 | enqueue_task(p, rq->active); | ||
| 3325 | } else { | ||
| 3326 | /* | ||
| 3327 | * Prevent a too long timeslice allowing a task to monopolize | ||
| 3328 | * the CPU. We do this by splitting up the timeslice into | ||
| 3329 | * smaller pieces. | ||
| 3330 | * | ||
| 3331 | * Note: this does not mean the task's timeslices expire or | ||
| 3332 | * get lost in any way, they just might be preempted by | ||
| 3333 | * another task of equal priority. (one with higher | ||
| 3334 | * priority would have preempted this task already.) We | ||
| 3335 | * requeue this task to the end of the list on this priority | ||
| 3336 | * level, which is in essence a round-robin of tasks with | ||
| 3337 | * equal priority. | ||
| 3338 | * | ||
| 3339 | * This only applies to tasks in the interactive | ||
| 3340 | * delta range with at least TIMESLICE_GRANULARITY to requeue. | ||
| 3341 | */ | ||
| 3342 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | ||
| 3343 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && | ||
| 3344 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | ||
| 3345 | (p->array == rq->active)) { | ||
| 3346 | |||
| 3347 | requeue_task(p, rq->active); | ||
| 3348 | set_tsk_need_resched(p); | ||
| 3349 | } | ||
| 3350 | } | ||
| 3351 | out_unlock: | ||
| 3352 | spin_unlock(&rq->lock); | ||
| 3353 | } | ||
| 3354 | |||
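The SCHED_RR branch of the removed task_running_tick() gives a good picture of what round-robin tick handling has to do once this logic presumably moves behind the per-class task_tick() hook used by the new scheduler_tick() below: burn one tick of the slice and, when it runs out, refill it and rotate the task to the tail of its priority level (SCHED_FIFO tasks have no timeslice at all). A hedged sketch of just that rule, with stand-in types:

/*
 * Hedged sketch of round-robin tick handling, in the spirit of the
 * SCHED_RR branch of the removed code: refill the slice on exhaustion
 * and requeue the task behind its priority-level peers.
 */
struct rr_task {
        unsigned int time_slice;
        unsigned int default_slice;
        int need_resched;
};

static void rr_task_tick(struct rr_task *p,
                         void (*requeue_tail)(struct rr_task *p))
{
        if (--p->time_slice)
                return;                 /* slice not used up yet */

        p->time_slice = p->default_slice;
        p->need_resched = 1;
        requeue_tail(p);                /* round-robin within the priority level */
}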
| 3355 | /* | 3268 | /* |
| 3356 | * This function gets called by the timer code, with HZ frequency. | 3269 | * This function gets called by the timer code, with HZ frequency. |
| 3357 | * We call it with interrupts disabled. | 3270 | * We call it with interrupts disabled. |
| @@ -3361,17 +3274,19 @@ out_unlock: | |||
| 3361 | */ | 3274 | */ |
| 3362 | void scheduler_tick(void) | 3275 | void scheduler_tick(void) |
| 3363 | { | 3276 | { |
| 3364 | struct task_struct *p = current; | ||
| 3365 | int cpu = smp_processor_id(); | 3277 | int cpu = smp_processor_id(); |
| 3366 | int idle_at_tick = idle_cpu(cpu); | ||
| 3367 | struct rq *rq = cpu_rq(cpu); | 3278 | struct rq *rq = cpu_rq(cpu); |
| 3279 | struct task_struct *curr = rq->curr; | ||
| 3280 | |||
| 3281 | spin_lock(&rq->lock); | ||
| 3282 | if (curr != rq->idle) /* FIXME: needed? */ | ||
| 3283 | curr->sched_class->task_tick(rq, curr); | ||
| 3284 | update_cpu_load(rq); | ||
| 3285 | spin_unlock(&rq->lock); | ||
| 3368 | 3286 | ||
| 3369 | if (!idle_at_tick) | ||
| 3370 | task_running_tick(rq, p); | ||
| 3371 | #ifdef CONFIG_SMP | 3287 | #ifdef CONFIG_SMP |
| 3372 | update_load(rq); | 3288 | rq->idle_at_tick = idle_cpu(cpu); |
| 3373 | rq->idle_at_tick = idle_at_tick; | 3289 | trigger_load_balance(rq, cpu); |
| 3374 | trigger_load_balance(cpu); | ||
| 3375 | #endif | 3290 | #endif |
| 3376 | } | 3291 | } |
| 3377 | 3292 | ||
| @@ -3414,140 +3329,128 @@ EXPORT_SYMBOL(sub_preempt_count); | |||
| 3414 | #endif | 3329 | #endif |
| 3415 | 3330 | ||
| 3416 | /* | 3331 | /* |
| 3417 | * schedule() is the main scheduler function. | 3332 | * Print scheduling while atomic bug: |
| 3418 | */ | 3333 | */ |
| 3419 | asmlinkage void __sched schedule(void) | 3334 | static noinline void __schedule_bug(struct task_struct *prev) |
| 3420 | { | 3335 | { |
| 3421 | struct task_struct *prev, *next; | 3336 | printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", |
| 3422 | struct prio_array *array; | 3337 | prev->comm, preempt_count(), prev->pid); |
| 3423 | struct list_head *queue; | 3338 | debug_show_held_locks(prev); |
| 3424 | unsigned long long now; | 3339 | if (irqs_disabled()) |
| 3425 | unsigned long run_time; | 3340 | print_irqtrace_events(prev); |
| 3426 | int cpu, idx; | 3341 | dump_stack(); |
| 3427 | long *switch_count; | 3342 | } |
| 3428 | struct rq *rq; | ||
| 3429 | 3343 | ||
| 3344 | /* | ||
| 3345 | * Various schedule()-time debugging checks and statistics: | ||
| 3346 | */ | ||
| 3347 | static inline void schedule_debug(struct task_struct *prev) | ||
| 3348 | { | ||
| 3430 | /* | 3349 | /* |
| 3431 | * Test if we are atomic. Since do_exit() needs to call into | 3350 | * Test if we are atomic. Since do_exit() needs to call into |
| 3432 | * schedule() atomically, we ignore that path for now. | 3351 | * schedule() atomically, we ignore that path for now. |
| 3433 | * Otherwise, whine if we are scheduling when we should not be. | 3352 | * Otherwise, whine if we are scheduling when we should not be. |
| 3434 | */ | 3353 | */ |
| 3435 | if (unlikely(in_atomic() && !current->exit_state)) { | 3354 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) |
| 3436 | printk(KERN_ERR "BUG: scheduling while atomic: " | 3355 | __schedule_bug(prev); |
| 3437 | "%s/0x%08x/%d\n", | 3356 | |
| 3438 | current->comm, preempt_count(), current->pid); | ||
| 3439 | debug_show_held_locks(current); | ||
| 3440 | if (irqs_disabled()) | ||
| 3441 | print_irqtrace_events(current); | ||
| 3442 | dump_stack(); | ||
| 3443 | } | ||
| 3444 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3357 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| 3445 | 3358 | ||
| 3446 | need_resched: | 3359 | schedstat_inc(this_rq(), sched_cnt); |
| 3447 | preempt_disable(); | 3360 | } |
| 3448 | prev = current; | 3361 | |
| 3449 | release_kernel_lock(prev); | 3362 | /* |
| 3450 | need_resched_nonpreemptible: | 3363 | * Pick up the highest-prio task: |
| 3451 | rq = this_rq(); | 3364 | */ |
| 3365 | static inline struct task_struct * | ||
| 3366 | pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) | ||
| 3367 | { | ||
| 3368 | struct sched_class *class; | ||
| 3369 | struct task_struct *p; | ||
| 3452 | 3370 | ||
| 3453 | /* | 3371 | /* |
| 3454 | * The idle thread is not allowed to schedule! | 3372 | * Optimization: we know that if all tasks are in |
| 3455 | * Remove this check after it has been exercised a bit. | 3373 | * the fair class we can call that function directly: |
| 3456 | */ | 3374 | */ |
| 3457 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { | 3375 | if (likely(rq->nr_running == rq->cfs.nr_running)) { |
| 3458 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | 3376 | p = fair_sched_class.pick_next_task(rq, now); |
| 3459 | dump_stack(); | 3377 | if (likely(p)) |
| 3378 | return p; | ||
| 3460 | } | 3379 | } |
| 3461 | 3380 | ||
| 3462 | schedstat_inc(rq, sched_cnt); | 3381 | class = sched_class_highest; |
| 3463 | now = sched_clock(); | 3382 | for ( ; ; ) { |
| 3464 | if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { | 3383 | p = class->pick_next_task(rq, now); |
| 3465 | run_time = now - prev->timestamp; | 3384 | if (p) |
| 3466 | if (unlikely((long long)(now - prev->timestamp) < 0)) | 3385 | return p; |
| 3467 | run_time = 0; | 3386 | /* |
| 3468 | } else | 3387 | * Will never be NULL as the idle class always |
| 3469 | run_time = NS_MAX_SLEEP_AVG; | 3388 | * returns a non-NULL p: |
| 3389 | */ | ||
| 3390 | class = class->next; | ||
| 3391 | } | ||
| 3392 | } | ||
| 3470 | 3393 | ||
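pick_next_task() asks the classes in priority order, with a fast path that calls the fair class directly when every runnable task belongs to it; because the idle class sits at the tail of the list and always offers a task, the walk is guaranteed to terminate. A standalone sketch of the iteration with stand-in types:

#include <stddef.h>

/*
 * Hedged sketch of the class walk in pick_next_task(): take the first
 * task offered by the highest-priority class that has one.
 */
struct rq;
struct task;

struct sched_class_iter {
        const struct sched_class_iter *next;    /* NULL-terminated, priority order */
        struct task *(*pick_next_task)(struct rq *rq);
};

static struct task *pick_next(const struct sched_class_iter *highest, struct rq *rq)
{
        const struct sched_class_iter *class;

        for (class = highest; class; class = class->next) {
                struct task *p = class->pick_next_task(rq);

                if (p)
                        return p;
        }
        return NULL;    /* not reached once an idle class terminates the list */
}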
| 3471 | /* | 3394 | /* |
| 3472 | * Tasks charged proportionately less run_time at high sleep_avg to | 3395 | * schedule() is the main scheduler function. |
| 3473 | * delay them losing their interactive status | 3396 | */ |
| 3474 | */ | 3397 | asmlinkage void __sched schedule(void) |
| 3475 | run_time /= (CURRENT_BONUS(prev) ? : 1); | 3398 | { |
| 3399 | struct task_struct *prev, *next; | ||
| 3400 | long *switch_count; | ||
| 3401 | struct rq *rq; | ||
| 3402 | u64 now; | ||
| 3403 | int cpu; | ||
| 3404 | |||
| 3405 | need_resched: | ||
| 3406 | preempt_disable(); | ||
| 3407 | cpu = smp_processor_id(); | ||
| 3408 | rq = cpu_rq(cpu); | ||
| 3409 | rcu_qsctr_inc(cpu); | ||
| 3410 | prev = rq->curr; | ||
| 3411 | switch_count = &prev->nivcsw; | ||
| 3412 | |||
| 3413 | release_kernel_lock(prev); | ||
| 3414 | need_resched_nonpreemptible: | ||
| 3415 | |||
| 3416 | schedule_debug(prev); | ||
| 3476 | 3417 | ||
| 3477 | spin_lock_irq(&rq->lock); | 3418 | spin_lock_irq(&rq->lock); |
| 3419 | clear_tsk_need_resched(prev); | ||
| 3478 | 3420 | ||
| 3479 | switch_count = &prev->nivcsw; | ||
| 3480 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3421 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 3481 | switch_count = &prev->nvcsw; | ||
| 3482 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3422 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
| 3483 | unlikely(signal_pending(prev)))) | 3423 | unlikely(signal_pending(prev)))) { |
| 3484 | prev->state = TASK_RUNNING; | 3424 | prev->state = TASK_RUNNING; |
| 3485 | else { | 3425 | } else { |
| 3486 | if (prev->state == TASK_UNINTERRUPTIBLE) | 3426 | deactivate_task(rq, prev, 1); |
| 3487 | rq->nr_uninterruptible++; | ||
| 3488 | deactivate_task(prev, rq); | ||
| 3489 | } | 3427 | } |
| 3428 | switch_count = &prev->nvcsw; | ||
| 3490 | } | 3429 | } |
| 3491 | 3430 | ||
| 3492 | cpu = smp_processor_id(); | 3431 | if (unlikely(!rq->nr_running)) |
| 3493 | if (unlikely(!rq->nr_running)) { | ||
| 3494 | idle_balance(cpu, rq); | 3432 | idle_balance(cpu, rq); |
| 3495 | if (!rq->nr_running) { | ||
| 3496 | next = rq->idle; | ||
| 3497 | rq->expired_timestamp = 0; | ||
| 3498 | goto switch_tasks; | ||
| 3499 | } | ||
| 3500 | } | ||
| 3501 | 3433 | ||
| 3502 | array = rq->active; | 3434 | now = __rq_clock(rq); |
| 3503 | if (unlikely(!array->nr_active)) { | 3435 | prev->sched_class->put_prev_task(rq, prev, now); |
| 3504 | /* | 3436 | next = pick_next_task(rq, prev, now); |
| 3505 | * Switch the active and expired arrays. | ||
| 3506 | */ | ||
| 3507 | schedstat_inc(rq, sched_switch); | ||
| 3508 | rq->active = rq->expired; | ||
| 3509 | rq->expired = array; | ||
| 3510 | array = rq->active; | ||
| 3511 | rq->expired_timestamp = 0; | ||
| 3512 | rq->best_expired_prio = MAX_PRIO; | ||
| 3513 | } | ||
| 3514 | |||
| 3515 | idx = sched_find_first_bit(array->bitmap); | ||
| 3516 | queue = array->queue + idx; | ||
| 3517 | next = list_entry(queue->next, struct task_struct, run_list); | ||
| 3518 | |||
| 3519 | switch_tasks: | ||
| 3520 | if (next == rq->idle) | ||
| 3521 | schedstat_inc(rq, sched_goidle); | ||
| 3522 | prefetch(next); | ||
| 3523 | prefetch_stack(next); | ||
| 3524 | clear_tsk_need_resched(prev); | ||
| 3525 | rcu_qsctr_inc(task_cpu(prev)); | ||
| 3526 | |||
| 3527 | prev->timestamp = prev->last_ran = now; | ||
| 3528 | 3437 | ||
| 3529 | sched_info_switch(prev, next); | 3438 | sched_info_switch(prev, next); |
| 3439 | |||
| 3530 | if (likely(prev != next)) { | 3440 | if (likely(prev != next)) { |
| 3531 | next->timestamp = next->last_ran = now; | ||
| 3532 | rq->nr_switches++; | 3441 | rq->nr_switches++; |
| 3533 | rq->curr = next; | 3442 | rq->curr = next; |
| 3534 | ++*switch_count; | 3443 | ++*switch_count; |
| 3535 | 3444 | ||
| 3536 | prepare_task_switch(rq, next); | 3445 | context_switch(rq, prev, next); /* unlocks the rq */ |
| 3537 | prev = context_switch(rq, prev, next); | ||
| 3538 | barrier(); | ||
| 3539 | /* | ||
| 3540 | * this_rq must be evaluated again because prev may have moved | ||
| 3541 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
| 3542 | * frame will be invalid. | ||
| 3543 | */ | ||
| 3544 | finish_task_switch(this_rq(), prev); | ||
| 3545 | } else | 3446 | } else |
| 3546 | spin_unlock_irq(&rq->lock); | 3447 | spin_unlock_irq(&rq->lock); |
| 3547 | 3448 | ||
| 3548 | prev = current; | 3449 | if (unlikely(reacquire_kernel_lock(current) < 0)) { |
| 3549 | if (unlikely(reacquire_kernel_lock(prev) < 0)) | 3450 | cpu = smp_processor_id(); |
| 3451 | rq = cpu_rq(cpu); | ||
| 3550 | goto need_resched_nonpreemptible; | 3452 | goto need_resched_nonpreemptible; |
| 3453 | } | ||
| 3551 | preempt_enable_no_resched(); | 3454 | preempt_enable_no_resched(); |
| 3552 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3455 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
| 3553 | goto need_resched; | 3456 | goto need_resched; |
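In the rewritten schedule(), the debug checks live in schedule_debug(), the outgoing task is handed back through put_prev_task() before pick_next_task() chooses a successor, and context_switch() now drops the runqueue lock itself. The branch that decides whether the outgoing task leaves the runqueue is worth spelling out on its own; a sketch of that rule:

#include <stdbool.h>

/*
 * Hedged sketch of the deactivation rule in schedule(): a task that set a
 * non-running state is dequeued, unless the switch is a kernel preemption
 * or an interruptible sleeper already has a signal pending.
 */
enum sk_state { SK_RUNNING, SK_INTERRUPTIBLE, SK_UNINTERRUPTIBLE };

static bool should_deactivate(enum sk_state state, bool preempt_active,
                              bool signal_pending)
{
        if (state == SK_RUNNING || preempt_active)
                return false;   /* involuntary switch: task stays queued */
        if (state == SK_INTERRUPTIBLE && signal_pending)
                return false;   /* made runnable again to handle the signal */
        return true;            /* voluntary sleep: deactivate_task() */
}

In the code above the same decision also selects which context-switch counter is bumped: voluntary switches go to nvcsw, involuntary ones to nivcsw.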
| @@ -3959,29 +3862,30 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
| 3959 | */ | 3862 | */ |
| 3960 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3863 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 3961 | { | 3864 | { |
| 3962 | struct prio_array *array; | ||
| 3963 | unsigned long flags; | 3865 | unsigned long flags; |
| 3866 | int oldprio, on_rq; | ||
| 3964 | struct rq *rq; | 3867 | struct rq *rq; |
| 3965 | int oldprio; | 3868 | u64 now; |
| 3966 | 3869 | ||
| 3967 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3870 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
| 3968 | 3871 | ||
| 3969 | rq = task_rq_lock(p, &flags); | 3872 | rq = task_rq_lock(p, &flags); |
| 3873 | now = rq_clock(rq); | ||
| 3970 | 3874 | ||
| 3971 | oldprio = p->prio; | 3875 | oldprio = p->prio; |
| 3972 | array = p->array; | 3876 | on_rq = p->se.on_rq; |
| 3973 | if (array) | 3877 | if (on_rq) |
| 3974 | dequeue_task(p, array); | 3878 | dequeue_task(rq, p, 0, now); |
| 3879 | |||
| 3880 | if (rt_prio(prio)) | ||
| 3881 | p->sched_class = &rt_sched_class; | ||
| 3882 | else | ||
| 3883 | p->sched_class = &fair_sched_class; | ||
| 3884 | |||
| 3975 | p->prio = prio; | 3885 | p->prio = prio; |
| 3976 | 3886 | ||
| 3977 | if (array) { | 3887 | if (on_rq) { |
| 3978 | /* | 3888 | enqueue_task(rq, p, 0, now); |
| 3979 | * If changing to an RT priority then queue it | ||
| 3980 | * in the active array! | ||
| 3981 | */ | ||
| 3982 | if (rt_task(p)) | ||
| 3983 | array = rq->active; | ||
| 3984 | enqueue_task(p, array); | ||
| 3985 | /* | 3889 | /* |
| 3986 | * Reschedule if we are currently running on this runqueue and | 3890 | * Reschedule if we are currently running on this runqueue and |
| 3987 | * our priority decreased, or if we are not currently running on | 3891 | * our priority decreased, or if we are not currently running on |
| @@ -3990,8 +3894,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3990 | if (task_running(rq, p)) { | 3894 | if (task_running(rq, p)) { |
| 3991 | if (p->prio > oldprio) | 3895 | if (p->prio > oldprio) |
| 3992 | resched_task(rq->curr); | 3896 | resched_task(rq->curr); |
| 3993 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 3897 | } else { |
| 3994 | resched_task(rq->curr); | 3898 | check_preempt_curr(rq, p); |
| 3899 | } | ||
| 3995 | } | 3900 | } |
| 3996 | task_rq_unlock(rq, &flags); | 3901 | task_rq_unlock(rq, &flags); |
| 3997 | } | 3902 | } |
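rt_mutex_setprio() is now driven by p->se.on_rq: dequeue if the task is queued, pick the scheduling class that matches the new priority, re-enqueue, and then either resched the running task or let check_preempt_curr() decide. set_user_nice() below follows the same shape around the static priority. A compact sketch of the shared pattern with stand-in hooks:

/*
 * Hedged sketch of the dequeue/change/enqueue pattern shared by
 * rt_mutex_setprio() and set_user_nice(): only queued tasks need the
 * requeue, and only requeued tasks can trigger a preemption check.
 */
struct rq;
struct prio_task {
        int prio;
        int on_rq;
};

struct queue_ops {
        void (*dequeue)(struct rq *rq, struct prio_task *p);
        void (*enqueue)(struct rq *rq, struct prio_task *p);
        void (*check_preempt)(struct rq *rq, struct prio_task *p);
};

static void change_task_prio(struct rq *rq, struct prio_task *p, int prio,
                             const struct queue_ops *ops)
{
        int queued = p->on_rq;

        if (queued)
                ops->dequeue(rq, p);

        p->prio = prio;         /* a class switch would happen here as well */

        if (queued) {
                ops->enqueue(rq, p);
                ops->check_preempt(rq, p);
        }
}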
| @@ -4000,10 +3905,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4000 | 3905 | ||
| 4001 | void set_user_nice(struct task_struct *p, long nice) | 3906 | void set_user_nice(struct task_struct *p, long nice) |
| 4002 | { | 3907 | { |
| 4003 | struct prio_array *array; | 3908 | int old_prio, delta, on_rq; |
| 4004 | int old_prio, delta; | ||
| 4005 | unsigned long flags; | 3909 | unsigned long flags; |
| 4006 | struct rq *rq; | 3910 | struct rq *rq; |
| 3911 | u64 now; | ||
| 4007 | 3912 | ||
| 4008 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3913 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
| 4009 | return; | 3914 | return; |
| @@ -4012,20 +3917,21 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4012 | * the task might be in the middle of scheduling on another CPU. | 3917 | * the task might be in the middle of scheduling on another CPU. |
| 4013 | */ | 3918 | */ |
| 4014 | rq = task_rq_lock(p, &flags); | 3919 | rq = task_rq_lock(p, &flags); |
| 3920 | now = rq_clock(rq); | ||
| 4015 | /* | 3921 | /* |
| 4016 | * The RT priorities are set via sched_setscheduler(), but we still | 3922 | * The RT priorities are set via sched_setscheduler(), but we still |
| 4017 | * allow the 'normal' nice value to be set - but as expected | 3923 | * allow the 'normal' nice value to be set - but as expected |
| 4018 | * it wont have any effect on scheduling until the task is | 3924 | * it wont have any effect on scheduling until the task is |
| 4019 | * not SCHED_NORMAL/SCHED_BATCH: | 3925 | * SCHED_FIFO/SCHED_RR: |
| 4020 | */ | 3926 | */ |
| 4021 | if (task_has_rt_policy(p)) { | 3927 | if (task_has_rt_policy(p)) { |
| 4022 | p->static_prio = NICE_TO_PRIO(nice); | 3928 | p->static_prio = NICE_TO_PRIO(nice); |
| 4023 | goto out_unlock; | 3929 | goto out_unlock; |
| 4024 | } | 3930 | } |
| 4025 | array = p->array; | 3931 | on_rq = p->se.on_rq; |
| 4026 | if (array) { | 3932 | if (on_rq) { |
| 4027 | dequeue_task(p, array); | 3933 | dequeue_task(rq, p, 0, now); |
| 4028 | dec_raw_weighted_load(rq, p); | 3934 | dec_load(rq, p, now); |
| 4029 | } | 3935 | } |
| 4030 | 3936 | ||
| 4031 | p->static_prio = NICE_TO_PRIO(nice); | 3937 | p->static_prio = NICE_TO_PRIO(nice); |
| @@ -4034,9 +3940,9 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4034 | p->prio = effective_prio(p); | 3940 | p->prio = effective_prio(p); |
| 4035 | delta = p->prio - old_prio; | 3941 | delta = p->prio - old_prio; |
| 4036 | 3942 | ||
| 4037 | if (array) { | 3943 | if (on_rq) { |
| 4038 | enqueue_task(p, array); | 3944 | enqueue_task(rq, p, 0, now); |
| 4039 | inc_raw_weighted_load(rq, p); | 3945 | inc_load(rq, p, now); |
| 4040 | /* | 3946 | /* |
| 4041 | * If the task increased its priority or is running and | 3947 | * If the task increased its priority or is running and |
| 4042 | * lowered its priority, then reschedule its CPU: | 3948 | * lowered its priority, then reschedule its CPU: |
| @@ -4156,11 +4062,24 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) | |||
| 4156 | } | 4062 | } |
| 4157 | 4063 | ||
| 4158 | /* Actually do priority change: must hold rq lock. */ | 4064 | /* Actually do priority change: must hold rq lock. */ |
| 4159 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4065 | static void |
| 4066 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | ||
| 4160 | { | 4067 | { |
| 4161 | BUG_ON(p->array); | 4068 | BUG_ON(p->se.on_rq); |
| 4162 | 4069 | ||
| 4163 | p->policy = policy; | 4070 | p->policy = policy; |
| 4071 | switch (p->policy) { | ||
| 4072 | case SCHED_NORMAL: | ||
| 4073 | case SCHED_BATCH: | ||
| 4074 | case SCHED_IDLE: | ||
| 4075 | p->sched_class = &fair_sched_class; | ||
| 4076 | break; | ||
| 4077 | case SCHED_FIFO: | ||
| 4078 | case SCHED_RR: | ||
| 4079 | p->sched_class = &rt_sched_class; | ||
| 4080 | break; | ||
| 4081 | } | ||
| 4082 | |||
| 4164 | p->rt_priority = prio; | 4083 | p->rt_priority = prio; |
| 4165 | p->normal_prio = normal_prio(p); | 4084 | p->normal_prio = normal_prio(p); |
| 4166 | /* we are holding p->pi_lock already */ | 4085 | /* we are holding p->pi_lock already */ |
| @@ -4179,8 +4098,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
| 4179 | int sched_setscheduler(struct task_struct *p, int policy, | 4098 | int sched_setscheduler(struct task_struct *p, int policy, |
| 4180 | struct sched_param *param) | 4099 | struct sched_param *param) |
| 4181 | { | 4100 | { |
| 4182 | int retval, oldprio, oldpolicy = -1; | 4101 | int retval, oldprio, oldpolicy = -1, on_rq; |
| 4183 | struct prio_array *array; | ||
| 4184 | unsigned long flags; | 4102 | unsigned long flags; |
| 4185 | struct rq *rq; | 4103 | struct rq *rq; |
| 4186 | 4104 | ||
| @@ -4191,12 +4109,13 @@ recheck: | |||
| 4191 | if (policy < 0) | 4109 | if (policy < 0) |
| 4192 | policy = oldpolicy = p->policy; | 4110 | policy = oldpolicy = p->policy; |
| 4193 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 4111 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
| 4194 | policy != SCHED_NORMAL && policy != SCHED_BATCH) | 4112 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
| 4113 | policy != SCHED_IDLE) | ||
| 4195 | return -EINVAL; | 4114 | return -EINVAL; |
| 4196 | /* | 4115 | /* |
| 4197 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 4116 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
| 4198 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and | 4117 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
| 4199 | * SCHED_BATCH is 0. | 4118 | * SCHED_BATCH and SCHED_IDLE is 0. |
| 4200 | */ | 4119 | */ |
| 4201 | if (param->sched_priority < 0 || | 4120 | if (param->sched_priority < 0 || |
| 4202 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4121 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
| @@ -4211,7 +4130,6 @@ recheck: | |||
| 4211 | if (!capable(CAP_SYS_NICE)) { | 4130 | if (!capable(CAP_SYS_NICE)) { |
| 4212 | if (rt_policy(policy)) { | 4131 | if (rt_policy(policy)) { |
| 4213 | unsigned long rlim_rtprio; | 4132 | unsigned long rlim_rtprio; |
| 4214 | unsigned long flags; | ||
| 4215 | 4133 | ||
| 4216 | if (!lock_task_sighand(p, &flags)) | 4134 | if (!lock_task_sighand(p, &flags)) |
| 4217 | return -ESRCH; | 4135 | return -ESRCH; |
| @@ -4227,6 +4145,12 @@ recheck: | |||
| 4227 | param->sched_priority > rlim_rtprio) | 4145 | param->sched_priority > rlim_rtprio) |
| 4228 | return -EPERM; | 4146 | return -EPERM; |
| 4229 | } | 4147 | } |
| 4148 | /* | ||
| 4149 | * Like positive nice levels, dont allow tasks to | ||
| 4150 | * move out of SCHED_IDLE either: | ||
| 4151 | */ | ||
| 4152 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | ||
| 4153 | return -EPERM; | ||
| 4230 | 4154 | ||
| 4231 | /* can't change other user's priorities */ | 4155 | /* can't change other user's priorities */ |
| 4232 | if ((current->euid != p->euid) && | 4156 | if ((current->euid != p->euid) && |
| @@ -4254,13 +4178,13 @@ recheck: | |||
| 4254 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4178 | spin_unlock_irqrestore(&p->pi_lock, flags); |
| 4255 | goto recheck; | 4179 | goto recheck; |
| 4256 | } | 4180 | } |
| 4257 | array = p->array; | 4181 | on_rq = p->se.on_rq; |
| 4258 | if (array) | 4182 | if (on_rq) |
| 4259 | deactivate_task(p, rq); | 4183 | deactivate_task(rq, p, 0); |
| 4260 | oldprio = p->prio; | 4184 | oldprio = p->prio; |
| 4261 | __setscheduler(p, policy, param->sched_priority); | 4185 | __setscheduler(rq, p, policy, param->sched_priority); |
| 4262 | if (array) { | 4186 | if (on_rq) { |
| 4263 | __activate_task(p, rq); | 4187 | activate_task(rq, p, 0); |
| 4264 | /* | 4188 | /* |
| 4265 | * Reschedule if we are currently running on this runqueue and | 4189 | * Reschedule if we are currently running on this runqueue and |
| 4266 | * our priority decreased, or if we are not currently running on | 4190 | * our priority decreased, or if we are not currently running on |
| @@ -4269,8 +4193,9 @@ recheck: | |||
| 4269 | if (task_running(rq, p)) { | 4193 | if (task_running(rq, p)) { |
| 4270 | if (p->prio > oldprio) | 4194 | if (p->prio > oldprio) |
| 4271 | resched_task(rq->curr); | 4195 | resched_task(rq->curr); |
| 4272 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4196 | } else { |
| 4273 | resched_task(rq->curr); | 4197 | check_preempt_curr(rq, p); |
| 4198 | } | ||
| 4274 | } | 4199 | } |
| 4275 | __task_rq_unlock(rq); | 4200 | __task_rq_unlock(rq); |
| 4276 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4201 | spin_unlock_irqrestore(&p->pi_lock, flags); |
| @@ -4542,41 +4467,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
| 4542 | /** | 4467 | /** |
| 4543 | * sys_sched_yield - yield the current processor to other threads. | 4468 | * sys_sched_yield - yield the current processor to other threads. |
| 4544 | * | 4469 | * |
| 4545 | * This function yields the current CPU by moving the calling thread | 4470 | * This function yields the current CPU to other tasks. If there are no |
| 4546 | * to the expired array. If there are no other threads running on this | 4471 | * other threads running on this CPU then this function will return. |
| 4547 | * CPU then this function will return. | ||
| 4548 | */ | 4472 | */ |
| 4549 | asmlinkage long sys_sched_yield(void) | 4473 | asmlinkage long sys_sched_yield(void) |
| 4550 | { | 4474 | { |
| 4551 | struct rq *rq = this_rq_lock(); | 4475 | struct rq *rq = this_rq_lock(); |
| 4552 | struct prio_array *array = current->array, *target = rq->expired; | ||
| 4553 | 4476 | ||
| 4554 | schedstat_inc(rq, yld_cnt); | 4477 | schedstat_inc(rq, yld_cnt); |
| 4555 | /* | 4478 | if (unlikely(rq->nr_running == 1)) |
| 4556 | * We implement yielding by moving the task into the expired | ||
| 4557 | * queue. | ||
| 4558 | * | ||
| 4559 | * (special rule: RT tasks will just roundrobin in the active | ||
| 4560 | * array.) | ||
| 4561 | */ | ||
| 4562 | if (rt_task(current)) | ||
| 4563 | target = rq->active; | ||
| 4564 | |||
| 4565 | if (array->nr_active == 1) { | ||
| 4566 | schedstat_inc(rq, yld_act_empty); | 4479 | schedstat_inc(rq, yld_act_empty); |
| 4567 | if (!rq->expired->nr_active) | 4480 | else |
| 4568 | schedstat_inc(rq, yld_both_empty); | 4481 | current->sched_class->yield_task(rq, current); |
| 4569 | } else if (!rq->expired->nr_active) | ||
| 4570 | schedstat_inc(rq, yld_exp_empty); | ||
| 4571 | |||
| 4572 | if (array != target) { | ||
| 4573 | dequeue_task(current, array); | ||
| 4574 | enqueue_task(current, target); | ||
| 4575 | } else | ||
| 4576 | /* | ||
| 4577 | * requeue_task is cheaper so perform that if possible. | ||
| 4578 | */ | ||
| 4579 | requeue_task(current, array); | ||
| 4580 | 4482 | ||
| 4581 | /* | 4483 | /* |
| 4582 | * Since we are going to call schedule() anyway, there's | 4484 | * Since we are going to call schedule() anyway, there's |
| @@ -4727,6 +4629,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) | |||
| 4727 | break; | 4629 | break; |
| 4728 | case SCHED_NORMAL: | 4630 | case SCHED_NORMAL: |
| 4729 | case SCHED_BATCH: | 4631 | case SCHED_BATCH: |
| 4632 | case SCHED_IDLE: | ||
| 4730 | ret = 0; | 4633 | ret = 0; |
| 4731 | break; | 4634 | break; |
| 4732 | } | 4635 | } |
| @@ -4751,6 +4654,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
| 4751 | break; | 4654 | break; |
| 4752 | case SCHED_NORMAL: | 4655 | case SCHED_NORMAL: |
| 4753 | case SCHED_BATCH: | 4656 | case SCHED_BATCH: |
| 4657 | case SCHED_IDLE: | ||
| 4754 | ret = 0; | 4658 | ret = 0; |
| 4755 | } | 4659 | } |
| 4756 | return ret; | 4660 | return ret; |
| @@ -4785,7 +4689,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
| 4785 | goto out_unlock; | 4689 | goto out_unlock; |
| 4786 | 4690 | ||
| 4787 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4691 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
| 4788 | 0 : task_timeslice(p), &t); | 4692 | 0 : static_prio_timeslice(p->static_prio), &t); |
| 4789 | read_unlock(&tasklist_lock); | 4693 | read_unlock(&tasklist_lock); |
| 4790 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4694 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
| 4791 | out_nounlock: | 4695 | out_nounlock: |
| @@ -4860,6 +4764,9 @@ void show_state_filter(unsigned long state_filter) | |||
| 4860 | 4764 | ||
| 4861 | touch_all_softlockup_watchdogs(); | 4765 | touch_all_softlockup_watchdogs(); |
| 4862 | 4766 | ||
| 4767 | #ifdef CONFIG_SCHED_DEBUG | ||
| 4768 | sysrq_sched_debug_show(); | ||
| 4769 | #endif | ||
| 4863 | read_unlock(&tasklist_lock); | 4770 | read_unlock(&tasklist_lock); |
| 4864 | /* | 4771 | /* |
| 4865 | * Only show locks if all tasks are dumped: | 4772 | * Only show locks if all tasks are dumped: |
| @@ -4870,7 +4777,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4870 | 4777 | ||
| 4871 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | 4778 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) |
| 4872 | { | 4779 | { |
| 4873 | /* nothing yet */ | 4780 | idle->sched_class = &idle_sched_class; |
| 4874 | } | 4781 | } |
| 4875 | 4782 | ||
| 4876 | /** | 4783 | /** |
| @@ -4886,12 +4793,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 4886 | struct rq *rq = cpu_rq(cpu); | 4793 | struct rq *rq = cpu_rq(cpu); |
| 4887 | unsigned long flags; | 4794 | unsigned long flags; |
| 4888 | 4795 | ||
| 4889 | idle->timestamp = sched_clock(); | 4796 | __sched_fork(idle); |
| 4890 | idle->array = NULL; | 4797 | idle->se.exec_start = sched_clock(); |
| 4798 | |||
| 4891 | idle->prio = idle->normal_prio = MAX_PRIO; | 4799 | idle->prio = idle->normal_prio = MAX_PRIO; |
| 4892 | idle->state = TASK_RUNNING; | ||
| 4893 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4800 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
| 4894 | set_task_cpu(idle, cpu); | 4801 | __set_task_cpu(idle, cpu); |
| 4895 | 4802 | ||
| 4896 | spin_lock_irqsave(&rq->lock, flags); | 4803 | spin_lock_irqsave(&rq->lock, flags); |
| 4897 | rq->curr = rq->idle = idle; | 4804 | rq->curr = rq->idle = idle; |
| @@ -4906,6 +4813,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 4906 | #else | 4813 | #else |
| 4907 | task_thread_info(idle)->preempt_count = 0; | 4814 | task_thread_info(idle)->preempt_count = 0; |
| 4908 | #endif | 4815 | #endif |
| 4816 | /* | ||
| 4817 | * The idle tasks have their own, simple scheduling class: | ||
| 4818 | */ | ||
| 4819 | idle->sched_class = &idle_sched_class; | ||
| 4909 | } | 4820 | } |
| 4910 | 4821 | ||
| 4911 | /* | 4822 | /* |
| @@ -4917,6 +4828,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 4917 | */ | 4828 | */ |
| 4918 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4829 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
| 4919 | 4830 | ||
| 4831 | /* | ||
| 4832 | * Increase the granularity value when there are more CPUs, | ||
| 4833 | * because with more CPUs the 'effective latency' as visible | ||
| 4834 | * to users decreases. But the relationship is not linear, | ||
| 4835 | * so pick a second-best guess by going with the log2 of the | ||
| 4836 | * number of CPUs. | ||
| 4837 | * | ||
| 4838 | * This idea comes from the SD scheduler of Con Kolivas: | ||
| 4839 | */ | ||
| 4840 | static inline void sched_init_granularity(void) | ||
| 4841 | { | ||
| 4842 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
| 4843 | const unsigned long gran_limit = 10000000; | ||
| 4844 | |||
| 4845 | sysctl_sched_granularity *= factor; | ||
| 4846 | if (sysctl_sched_granularity > gran_limit) | ||
| 4847 | sysctl_sched_granularity = gran_limit; | ||
| 4848 | |||
| 4849 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; | ||
| 4850 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; | ||
| 4851 | } | ||
| 4852 | |||
| 4920 | #ifdef CONFIG_SMP | 4853 | #ifdef CONFIG_SMP |
| 4921 | /* | 4854 | /* |
| 4922 | * This is how migration works: | 4855 | * This is how migration works: |
| @@ -4990,7 +4923,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
| 4990 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4923 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
| 4991 | { | 4924 | { |
| 4992 | struct rq *rq_dest, *rq_src; | 4925 | struct rq *rq_dest, *rq_src; |
| 4993 | int ret = 0; | 4926 | int ret = 0, on_rq; |
| 4994 | 4927 | ||
| 4995 | if (unlikely(cpu_is_offline(dest_cpu))) | 4928 | if (unlikely(cpu_is_offline(dest_cpu))) |
| 4996 | return ret; | 4929 | return ret; |
| @@ -5006,20 +4939,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 5006 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 4939 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
| 5007 | goto out; | 4940 | goto out; |
| 5008 | 4941 | ||
| 4942 | on_rq = p->se.on_rq; | ||
| 4943 | if (on_rq) | ||
| 4944 | deactivate_task(rq_src, p, 0); | ||
| 5009 | set_task_cpu(p, dest_cpu); | 4945 | set_task_cpu(p, dest_cpu); |
| 5010 | if (p->array) { | 4946 | if (on_rq) { |
| 5011 | /* | 4947 | activate_task(rq_dest, p, 0); |
| 5012 | * Sync timestamp with rq_dest's before activating. | 4948 | check_preempt_curr(rq_dest, p); |
| 5013 | * The same thing could be achieved by doing this step | ||
| 5014 | * afterwards, and pretending it was a local activate. | ||
| 5015 | * This way is cleaner and logically correct. | ||
| 5016 | */ | ||
| 5017 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp | ||
| 5018 | + rq_dest->most_recent_timestamp; | ||
| 5019 | deactivate_task(p, rq_src); | ||
| 5020 | __activate_task(p, rq_dest); | ||
| 5021 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | ||
| 5022 | resched_task(rq_dest->curr); | ||
| 5023 | } | 4949 | } |
| 5024 | ret = 1; | 4950 | ret = 1; |
| 5025 | out: | 4951 | out: |
| @@ -5171,7 +5097,8 @@ static void migrate_live_tasks(int src_cpu) | |||
| 5171 | write_unlock_irq(&tasklist_lock); | 5097 | write_unlock_irq(&tasklist_lock); |
| 5172 | } | 5098 | } |
| 5173 | 5099 | ||
| 5174 | /* Schedules idle task to be the next runnable task on current CPU. | 5100 | /* |
| 5101 | * Schedules idle task to be the next runnable task on current CPU. | ||
| 5175 | * It does so by boosting its priority to highest possible and adding it to | 5102 | * It does so by boosting its priority to highest possible and adding it to |
| 5176 | * the _front_ of the runqueue. Used by CPU offline code. | 5103 | * the _front_ of the runqueue. Used by CPU offline code. |
| 5177 | */ | 5104 | */ |
| @@ -5191,10 +5118,10 @@ void sched_idle_next(void) | |||
| 5191 | */ | 5118 | */ |
| 5192 | spin_lock_irqsave(&rq->lock, flags); | 5119 | spin_lock_irqsave(&rq->lock, flags); |
| 5193 | 5120 | ||
| 5194 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5121 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
| 5195 | 5122 | ||
| 5196 | /* Add idle task to the _front_ of its priority queue: */ | 5123 | /* Add idle task to the _front_ of its priority queue: */ |
| 5197 | __activate_idle_task(p, rq); | 5124 | activate_idle_task(p, rq); |
| 5198 | 5125 | ||
| 5199 | spin_unlock_irqrestore(&rq->lock, flags); | 5126 | spin_unlock_irqrestore(&rq->lock, flags); |
| 5200 | } | 5127 | } |
| @@ -5244,16 +5171,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
| 5244 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5171 | static void migrate_dead_tasks(unsigned int dead_cpu) |
| 5245 | { | 5172 | { |
| 5246 | struct rq *rq = cpu_rq(dead_cpu); | 5173 | struct rq *rq = cpu_rq(dead_cpu); |
| 5247 | unsigned int arr, i; | 5174 | struct task_struct *next; |
| 5248 | |||
| 5249 | for (arr = 0; arr < 2; arr++) { | ||
| 5250 | for (i = 0; i < MAX_PRIO; i++) { | ||
| 5251 | struct list_head *list = &rq->arrays[arr].queue[i]; | ||
| 5252 | 5175 | ||
| 5253 | while (!list_empty(list)) | 5176 | for ( ; ; ) { |
| 5254 | migrate_dead(dead_cpu, list_entry(list->next, | 5177 | if (!rq->nr_running) |
| 5255 | struct task_struct, run_list)); | 5178 | break; |
| 5256 | } | 5179 | next = pick_next_task(rq, rq->curr, rq_clock(rq)); |
| 5180 | if (!next) | ||
| 5181 | break; | ||
| 5182 | migrate_dead(dead_cpu, next); | ||
| 5257 | } | 5183 | } |
| 5258 | } | 5184 | } |
| 5259 | #endif /* CONFIG_HOTPLUG_CPU */ | 5185 | #endif /* CONFIG_HOTPLUG_CPU */ |
| @@ -5277,14 +5203,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5277 | 5203 | ||
| 5278 | case CPU_UP_PREPARE: | 5204 | case CPU_UP_PREPARE: |
| 5279 | case CPU_UP_PREPARE_FROZEN: | 5205 | case CPU_UP_PREPARE_FROZEN: |
| 5280 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); | 5206 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); |
| 5281 | if (IS_ERR(p)) | 5207 | if (IS_ERR(p)) |
| 5282 | return NOTIFY_BAD; | 5208 | return NOTIFY_BAD; |
| 5283 | p->flags |= PF_NOFREEZE; | 5209 | p->flags |= PF_NOFREEZE; |
| 5284 | kthread_bind(p, cpu); | 5210 | kthread_bind(p, cpu); |
| 5285 | /* Must be high prio: stop_machine expects to yield to it. */ | 5211 | /* Must be high prio: stop_machine expects to yield to it. */ |
| 5286 | rq = task_rq_lock(p, &flags); | 5212 | rq = task_rq_lock(p, &flags); |
| 5287 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5213 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
| 5288 | task_rq_unlock(rq, &flags); | 5214 | task_rq_unlock(rq, &flags); |
| 5289 | cpu_rq(cpu)->migration_thread = p; | 5215 | cpu_rq(cpu)->migration_thread = p; |
| 5290 | break; | 5216 | break; |
| @@ -5315,9 +5241,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5315 | rq->migration_thread = NULL; | 5241 | rq->migration_thread = NULL; |
| 5316 | /* Idle task back to normal (off runqueue, low prio) */ | 5242 | /* Idle task back to normal (off runqueue, low prio) */ |
| 5317 | rq = task_rq_lock(rq->idle, &flags); | 5243 | rq = task_rq_lock(rq->idle, &flags); |
| 5318 | deactivate_task(rq->idle, rq); | 5244 | deactivate_task(rq, rq->idle, 0); |
| 5319 | rq->idle->static_prio = MAX_PRIO; | 5245 | rq->idle->static_prio = MAX_PRIO; |
| 5320 | __setscheduler(rq->idle, SCHED_NORMAL, 0); | 5246 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
| 5247 | rq->idle->sched_class = &idle_sched_class; | ||
| 5321 | migrate_dead_tasks(cpu); | 5248 | migrate_dead_tasks(cpu); |
| 5322 | task_rq_unlock(rq, &flags); | 5249 | task_rq_unlock(rq, &flags); |
| 5323 | migrate_nr_uninterruptible(rq); | 5250 | migrate_nr_uninterruptible(rq); |
| @@ -5926,7 +5853,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 5926 | static int build_sched_domains(const cpumask_t *cpu_map) | 5853 | static int build_sched_domains(const cpumask_t *cpu_map) |
| 5927 | { | 5854 | { |
| 5928 | int i; | 5855 | int i; |
| 5929 | struct sched_domain *sd; | ||
| 5930 | #ifdef CONFIG_NUMA | 5856 | #ifdef CONFIG_NUMA |
| 5931 | struct sched_group **sched_group_nodes = NULL; | 5857 | struct sched_group **sched_group_nodes = NULL; |
| 5932 | int sd_allnodes = 0; | 5858 | int sd_allnodes = 0; |
| @@ -5934,7 +5860,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 5934 | /* | 5860 | /* |
| 5935 | * Allocate the per-node list of sched groups | 5861 | * Allocate the per-node list of sched groups |
| 5936 | */ | 5862 | */ |
| 5937 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 5863 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, |
| 5938 | GFP_KERNEL); | 5864 | GFP_KERNEL); |
| 5939 | if (!sched_group_nodes) { | 5865 | if (!sched_group_nodes) { |
| 5940 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 5866 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| @@ -5953,8 +5879,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 5953 | cpus_and(nodemask, nodemask, *cpu_map); | 5879 | cpus_and(nodemask, nodemask, *cpu_map); |
| 5954 | 5880 | ||
| 5955 | #ifdef CONFIG_NUMA | 5881 | #ifdef CONFIG_NUMA |
| 5956 | if (cpus_weight(*cpu_map) | 5882 | if (cpus_weight(*cpu_map) > |
| 5957 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 5883 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
| 5958 | sd = &per_cpu(allnodes_domains, i); | 5884 | sd = &per_cpu(allnodes_domains, i); |
| 5959 | *sd = SD_ALLNODES_INIT; | 5885 | *sd = SD_ALLNODES_INIT; |
| 5960 | sd->span = *cpu_map; | 5886 | sd->span = *cpu_map; |
| @@ -6013,7 +5939,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6013 | if (i != first_cpu(this_sibling_map)) | 5939 | if (i != first_cpu(this_sibling_map)) |
| 6014 | continue; | 5940 | continue; |
| 6015 | 5941 | ||
| 6016 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); | 5942 | init_sched_build_groups(this_sibling_map, cpu_map, |
| 5943 | &cpu_to_cpu_group); | ||
| 6017 | } | 5944 | } |
| 6018 | #endif | 5945 | #endif |
| 6019 | 5946 | ||
| @@ -6024,11 +5951,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6024 | cpus_and(this_core_map, this_core_map, *cpu_map); | 5951 | cpus_and(this_core_map, this_core_map, *cpu_map); |
| 6025 | if (i != first_cpu(this_core_map)) | 5952 | if (i != first_cpu(this_core_map)) |
| 6026 | continue; | 5953 | continue; |
| 6027 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); | 5954 | init_sched_build_groups(this_core_map, cpu_map, |
| 5955 | &cpu_to_core_group); | ||
| 6028 | } | 5956 | } |
| 6029 | #endif | 5957 | #endif |
| 6030 | 5958 | ||
| 6031 | |||
| 6032 | /* Set up physical groups */ | 5959 | /* Set up physical groups */ |
| 6033 | for (i = 0; i < MAX_NUMNODES; i++) { | 5960 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6034 | cpumask_t nodemask = node_to_cpumask(i); | 5961 | cpumask_t nodemask = node_to_cpumask(i); |
| @@ -6043,7 +5970,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6043 | #ifdef CONFIG_NUMA | 5970 | #ifdef CONFIG_NUMA |
| 6044 | /* Set up node groups */ | 5971 | /* Set up node groups */ |
| 6045 | if (sd_allnodes) | 5972 | if (sd_allnodes) |
| 6046 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); | 5973 | init_sched_build_groups(*cpu_map, cpu_map, |
| 5974 | &cpu_to_allnodes_group); | ||
| 6047 | 5975 | ||
| 6048 | for (i = 0; i < MAX_NUMNODES; i++) { | 5976 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6049 | /* Set up node groups */ | 5977 | /* Set up node groups */ |
| @@ -6115,19 +6043,22 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6115 | /* Calculate CPU power for physical packages and nodes */ | 6043 | /* Calculate CPU power for physical packages and nodes */ |
| 6116 | #ifdef CONFIG_SCHED_SMT | 6044 | #ifdef CONFIG_SCHED_SMT |
| 6117 | for_each_cpu_mask(i, *cpu_map) { | 6045 | for_each_cpu_mask(i, *cpu_map) { |
| 6118 | sd = &per_cpu(cpu_domains, i); | 6046 | struct sched_domain *sd = &per_cpu(cpu_domains, i); |
| 6047 | |||
| 6119 | init_sched_groups_power(i, sd); | 6048 | init_sched_groups_power(i, sd); |
| 6120 | } | 6049 | } |
| 6121 | #endif | 6050 | #endif |
| 6122 | #ifdef CONFIG_SCHED_MC | 6051 | #ifdef CONFIG_SCHED_MC |
| 6123 | for_each_cpu_mask(i, *cpu_map) { | 6052 | for_each_cpu_mask(i, *cpu_map) { |
| 6124 | sd = &per_cpu(core_domains, i); | 6053 | struct sched_domain *sd = &per_cpu(core_domains, i); |
| 6054 | |||
| 6125 | init_sched_groups_power(i, sd); | 6055 | init_sched_groups_power(i, sd); |
| 6126 | } | 6056 | } |
| 6127 | #endif | 6057 | #endif |
| 6128 | 6058 | ||
| 6129 | for_each_cpu_mask(i, *cpu_map) { | 6059 | for_each_cpu_mask(i, *cpu_map) { |
| 6130 | sd = &per_cpu(phys_domains, i); | 6060 | struct sched_domain *sd = &per_cpu(phys_domains, i); |
| 6061 | |||
| 6131 | init_sched_groups_power(i, sd); | 6062 | init_sched_groups_power(i, sd); |
| 6132 | } | 6063 | } |
| 6133 | 6064 | ||
| @@ -6361,10 +6292,12 @@ void __init sched_init_smp(void) | |||
| 6361 | /* Move init over to a non-isolated CPU */ | 6292 | /* Move init over to a non-isolated CPU */ |
| 6362 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6293 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
| 6363 | BUG(); | 6294 | BUG(); |
| 6295 | sched_init_granularity(); | ||
| 6364 | } | 6296 | } |
| 6365 | #else | 6297 | #else |
| 6366 | void __init sched_init_smp(void) | 6298 | void __init sched_init_smp(void) |
| 6367 | { | 6299 | { |
| 6300 | sched_init_granularity(); | ||
| 6368 | } | 6301 | } |
| 6369 | #endif /* CONFIG_SMP */ | 6302 | #endif /* CONFIG_SMP */ |
| 6370 | 6303 | ||
| @@ -6378,28 +6311,51 @@ int in_sched_functions(unsigned long addr) | |||
| 6378 | && addr < (unsigned long)__sched_text_end); | 6311 | && addr < (unsigned long)__sched_text_end); |
| 6379 | } | 6312 | } |
| 6380 | 6313 | ||
| 6314 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | ||
| 6315 | { | ||
| 6316 | cfs_rq->tasks_timeline = RB_ROOT; | ||
| 6317 | cfs_rq->fair_clock = 1; | ||
| 6318 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 6319 | cfs_rq->rq = rq; | ||
| 6320 | #endif | ||
| 6321 | } | ||
| 6322 | |||
| 6381 | void __init sched_init(void) | 6323 | void __init sched_init(void) |
| 6382 | { | 6324 | { |
| 6383 | int i, j, k; | 6325 | u64 now = sched_clock(); |
| 6384 | int highest_cpu = 0; | 6326 | int highest_cpu = 0; |
| 6327 | int i, j; | ||
| 6328 | |||
| 6329 | /* | ||
| 6330 | * Link up the scheduling class hierarchy: | ||
| 6331 | */ | ||
| 6332 | rt_sched_class.next = &fair_sched_class; | ||
| 6333 | fair_sched_class.next = &idle_sched_class; | ||
| 6334 | idle_sched_class.next = NULL; | ||
| 6385 | 6335 | ||
| 6386 | for_each_possible_cpu(i) { | 6336 | for_each_possible_cpu(i) { |
| 6387 | struct prio_array *array; | 6337 | struct rt_prio_array *array; |
| 6388 | struct rq *rq; | 6338 | struct rq *rq; |
| 6389 | 6339 | ||
| 6390 | rq = cpu_rq(i); | 6340 | rq = cpu_rq(i); |
| 6391 | spin_lock_init(&rq->lock); | 6341 | spin_lock_init(&rq->lock); |
| 6392 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 6342 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
| 6393 | rq->nr_running = 0; | 6343 | rq->nr_running = 0; |
| 6394 | rq->active = rq->arrays; | 6344 | rq->clock = 1; |
| 6395 | rq->expired = rq->arrays + 1; | 6345 | init_cfs_rq(&rq->cfs, rq); |
| 6396 | rq->best_expired_prio = MAX_PRIO; | 6346 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6347 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
| 6348 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
| 6349 | #endif | ||
| 6350 | rq->ls.load_update_last = now; | ||
| 6351 | rq->ls.load_update_start = now; | ||
| 6397 | 6352 | ||
| 6353 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | ||
| 6354 | rq->cpu_load[j] = 0; | ||
| 6398 | #ifdef CONFIG_SMP | 6355 | #ifdef CONFIG_SMP |
| 6399 | rq->sd = NULL; | 6356 | rq->sd = NULL; |
| 6400 | for (j = 1; j < 3; j++) | ||
| 6401 | rq->cpu_load[j] = 0; | ||
| 6402 | rq->active_balance = 0; | 6357 | rq->active_balance = 0; |
| 6358 | rq->next_balance = jiffies; | ||
| 6403 | rq->push_cpu = 0; | 6359 | rq->push_cpu = 0; |
| 6404 | rq->cpu = i; | 6360 | rq->cpu = i; |
| 6405 | rq->migration_thread = NULL; | 6361 | rq->migration_thread = NULL; |
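Besides the new init_cfs_rq() helper (an empty rbtree root plus fair_clock starting at 1), the notable part of this hunk is the explicit linking of the scheduling classes into a priority-ordered chain: rt_sched_class -> fair_sched_class -> idle_sched_class. A minimal user-space sketch of such a chain follows; the pick() hook and class names here are invented for illustration and are not the kernel's actual sched_class operations:

```c
/*
 * Sketch of a class hierarchy linked via ->next pointers: the scheduler
 * walks the chain from the highest-priority class down until one of
 * them has something to run.
 */
#include <stdio.h>
#include <stddef.h>

struct sched_class {
	const char *name;
	struct sched_class *next;
	int (*pick)(void);		/* returns 1 if this class has work */
};

static int rt_pick(void)   { return 0; }	/* no RT tasks queued */
static int fair_pick(void) { return 1; }	/* CFS has a runnable task */
static int idle_pick(void) { return 1; }	/* idle always "runs" */

static struct sched_class rt_class   = { "rt",   NULL, rt_pick   };
static struct sched_class fair_class = { "fair", NULL, fair_pick };
static struct sched_class idle_class = { "idle", NULL, idle_pick };

int main(void)
{
	struct sched_class *class;

	/* link up the hierarchy, mirroring rt -> fair -> idle */
	rt_class.next = &fair_class;
	fair_class.next = &idle_class;
	idle_class.next = NULL;

	for (class = &rt_class; class; class = class->next) {
		if (class->pick()) {
			printf("picked from %s class\n", class->name);
			break;
		}
	}
	return 0;
}
```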
| @@ -6407,16 +6363,14 @@ void __init sched_init(void) | |||
| 6407 | #endif | 6363 | #endif |
| 6408 | atomic_set(&rq->nr_iowait, 0); | 6364 | atomic_set(&rq->nr_iowait, 0); |
| 6409 | 6365 | ||
| 6410 | for (j = 0; j < 2; j++) { | 6366 | array = &rq->rt.active; |
| 6411 | array = rq->arrays + j; | 6367 | for (j = 0; j < MAX_RT_PRIO; j++) { |
| 6412 | for (k = 0; k < MAX_PRIO; k++) { | 6368 | INIT_LIST_HEAD(array->queue + j); |
| 6413 | INIT_LIST_HEAD(array->queue + k); | 6369 | __clear_bit(j, array->bitmap); |
| 6414 | __clear_bit(k, array->bitmap); | ||
| 6415 | } | ||
| 6416 | // delimiter for bitsearch | ||
| 6417 | __set_bit(MAX_PRIO, array->bitmap); | ||
| 6418 | } | 6370 | } |
| 6419 | highest_cpu = i; | 6371 | highest_cpu = i; |
| 6372 | /* delimiter for bitsearch: */ | ||
| 6373 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
| 6420 | } | 6374 | } |
| 6421 | 6375 | ||
| 6422 | set_load_weight(&init_task); | 6376 | set_load_weight(&init_task); |
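The per-runqueue setup now initialises a single rt_prio_array instead of the old pair of prio arrays: one list head per RT priority, a cleared bitmap, and a permanently set bit at MAX_RT_PRIO acting as a delimiter so a find-first-bit scan always terminates. A simplified user-space sketch of that layout; the bitmap helpers and the MAX_RT_PRIO value below are stand-ins for the kernel's:

```c
/*
 * One slot per RT priority plus a sentinel bit one past the end, so a
 * first-set-bit scan stops at the delimiter when nothing is queued.
 */
#include <stdio.h>
#include <string.h>

#define MAX_RT_PRIO	100
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define BITMAP_LONGS	((MAX_RT_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct rt_prio_array {
	unsigned long bitmap[BITMAP_LONGS];
	int queued[MAX_RT_PRIO];	/* stand-in for the per-priority list heads */
};

static void set_prio_bit(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int first_set_bit(const unsigned long *map, int size)
{
	int i;

	for (i = 0; i < size; i++)
		if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return size;
}

int main(void)
{
	struct rt_prio_array array;

	memset(&array, 0, sizeof(array));
	/* delimiter for bitsearch, as in the hunk above */
	set_prio_bit(MAX_RT_PRIO, array.bitmap);

	if (first_set_bit(array.bitmap, MAX_RT_PRIO + 1) >= MAX_RT_PRIO)
		printf("no RT task queued (scan stopped at the delimiter)\n");
	return 0;
}
```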
| @@ -6443,6 +6397,10 @@ void __init sched_init(void) | |||
| 6443 | * when this runqueue becomes "idle". | 6397 | * when this runqueue becomes "idle". |
| 6444 | */ | 6398 | */ |
| 6445 | init_idle(current, smp_processor_id()); | 6399 | init_idle(current, smp_processor_id()); |
| 6400 | /* | ||
| 6401 | * During early bootup we pretend to be a normal task: | ||
| 6402 | */ | ||
| 6403 | current->sched_class = &fair_sched_class; | ||
| 6446 | } | 6404 | } |
| 6447 | 6405 | ||
| 6448 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6406 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| @@ -6473,29 +6431,55 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 6473 | #ifdef CONFIG_MAGIC_SYSRQ | 6431 | #ifdef CONFIG_MAGIC_SYSRQ |
| 6474 | void normalize_rt_tasks(void) | 6432 | void normalize_rt_tasks(void) |
| 6475 | { | 6433 | { |
| 6476 | struct prio_array *array; | ||
| 6477 | struct task_struct *g, *p; | 6434 | struct task_struct *g, *p; |
| 6478 | unsigned long flags; | 6435 | unsigned long flags; |
| 6479 | struct rq *rq; | 6436 | struct rq *rq; |
| 6437 | int on_rq; | ||
| 6480 | 6438 | ||
| 6481 | read_lock_irq(&tasklist_lock); | 6439 | read_lock_irq(&tasklist_lock); |
| 6482 | |||
| 6483 | do_each_thread(g, p) { | 6440 | do_each_thread(g, p) { |
| 6484 | if (!rt_task(p)) | 6441 | p->se.fair_key = 0; |
| 6442 | p->se.wait_runtime = 0; | ||
| 6443 | p->se.wait_start_fair = 0; | ||
| 6444 | p->se.wait_start = 0; | ||
| 6445 | p->se.exec_start = 0; | ||
| 6446 | p->se.sleep_start = 0; | ||
| 6447 | p->se.sleep_start_fair = 0; | ||
| 6448 | p->se.block_start = 0; | ||
| 6449 | task_rq(p)->cfs.fair_clock = 0; | ||
| 6450 | task_rq(p)->clock = 0; | ||
| 6451 | |||
| 6452 | if (!rt_task(p)) { | ||
| 6453 | /* | ||
| 6454 | * Renice negative nice level userspace | ||
| 6455 | * tasks back to 0: | ||
| 6456 | */ | ||
| 6457 | if (TASK_NICE(p) < 0 && p->mm) | ||
| 6458 | set_user_nice(p, 0); | ||
| 6485 | continue; | 6459 | continue; |
| 6460 | } | ||
| 6486 | 6461 | ||
| 6487 | spin_lock_irqsave(&p->pi_lock, flags); | 6462 | spin_lock_irqsave(&p->pi_lock, flags); |
| 6488 | rq = __task_rq_lock(p); | 6463 | rq = __task_rq_lock(p); |
| 6464 | #ifdef CONFIG_SMP | ||
| 6465 | /* | ||
| 6466 | * Do not touch the migration thread: | ||
| 6467 | */ | ||
| 6468 | if (p == rq->migration_thread) | ||
| 6469 | goto out_unlock; | ||
| 6470 | #endif | ||
| 6489 | 6471 | ||
| 6490 | array = p->array; | 6472 | on_rq = p->se.on_rq; |
| 6491 | if (array) | 6473 | if (on_rq) |
| 6492 | deactivate_task(p, task_rq(p)); | 6474 | deactivate_task(task_rq(p), p, 0); |
| 6493 | __setscheduler(p, SCHED_NORMAL, 0); | 6475 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
| 6494 | if (array) { | 6476 | if (on_rq) { |
| 6495 | __activate_task(p, task_rq(p)); | 6477 | activate_task(task_rq(p), p, 0); |
| 6496 | resched_task(rq->curr); | 6478 | resched_task(rq->curr); |
| 6497 | } | 6479 | } |
| 6498 | 6480 | #ifdef CONFIG_SMP | |
| 6481 | out_unlock: | ||
| 6482 | #endif | ||
| 6499 | __task_rq_unlock(rq); | 6483 | __task_rq_unlock(rq); |
| 6500 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6484 | spin_unlock_irqrestore(&p->pi_lock, flags); |
| 6501 | } while_each_thread(g, p); | 6485 | } while_each_thread(g, p); |
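normalize_rt_tasks() now follows the dequeue/modify/re-enqueue pattern: if the task is on a runqueue it is deactivated, __setscheduler() switches it to SCHED_NORMAL, and it is activated again with a resched of the current task, while negative-nice userspace tasks are simply reniced to 0 and the migration thread is skipped. A user-space sketch of that core pattern, with an illustrative task struct and queue helpers rather than the kernel's APIs:

```c
/*
 * Dequeue/modify/re-enqueue sketch: a queued task must come off the
 * run list before its scheduling attributes change and go back on
 * afterwards, so it is re-inserted under the new policy.
 */
#include <stdio.h>

struct task {
	const char *name;
	int policy;		/* 0 == "normal", 1 == "realtime" (stand-ins) */
	int on_rq;		/* currently enqueued? */
};

static void dequeue(struct task *p) { p->on_rq = 0; }
static void enqueue(struct task *p) { p->on_rq = 1; }

static void normalize(struct task *p)
{
	int on_rq = p->on_rq;

	if (on_rq)
		dequeue(p);
	p->policy = 0;			/* back to the normal class */
	if (on_rq)
		enqueue(p);		/* re-insert under the new policy */
}

int main(void)
{
	struct task p = { "rt-worker", 1, 1 };

	normalize(&p);
	printf("%s: policy=%d on_rq=%d\n", p.name, p.policy, p.on_rq);
	return 0;
}
```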
