Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1532
1 file changed, 758 insertions, 774 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f5a204b46655..01ba4b1848a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -391,6 +391,11 @@ struct rq {
391static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 391static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
392static DEFINE_MUTEX(sched_hotcpu_mutex); 392static DEFINE_MUTEX(sched_hotcpu_mutex);
393 393
394static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
395{
396 rq->curr->sched_class->check_preempt_curr(rq, p);
397}
398
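/*
 * A reduced, stand-alone sketch (not from the patch) of the dispatch that
 * check_preempt_curr() above performs: the class of the *currently running*
 * task decides whether a newly runnable task may preempt it.  All names
 * below are made up for the illustration; the real struct sched_class has
 * more hooks than the single one mirrored here.
 */
#include <stdio.h>

struct toy_task;

struct toy_class {
	void (*check_preempt_curr)(struct toy_task *curr, struct toy_task *p);
};

struct toy_task {
	const struct toy_class *class;
	const char *name;
};

/* a FIFO-like policy: the running task is simply never preempted */
static void toy_fifo_check_preempt(struct toy_task *curr, struct toy_task *p)
{
	printf("%s keeps the CPU, %s waits\n", curr->name, p->name);
}

static const struct toy_class toy_fifo = {
	.check_preempt_curr = toy_fifo_check_preempt,
};

int main(void)
{
	struct toy_task curr = { &toy_fifo, "curr" };
	struct toy_task woken = { &toy_fifo, "woken" };

	curr.class->check_preempt_curr(&curr, &woken);	/* same shape as above */
	return 0;
}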
394static inline int cpu_of(struct rq *rq) 399static inline int cpu_of(struct rq *rq)
395{ 400{
396#ifdef CONFIG_SMP 401#ifdef CONFIG_SMP
@@ -669,8 +674,6 @@ static inline void resched_task(struct task_struct *p)
669} 674}
670#endif 675#endif
671 676
672#include "sched_stats.h"
673
674static u64 div64_likely32(u64 divident, unsigned long divisor) 677static u64 div64_likely32(u64 divident, unsigned long divisor)
675{ 678{
676#if BITS_PER_LONG == 32 679#if BITS_PER_LONG == 32
@@ -788,120 +791,146 @@ static void update_curr_load(struct rq *rq, u64 now)
788 * this code will need modification 791 * this code will need modification
789 */ 792 */
790#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
791#define LOAD_WEIGHT(lp) \ 794#define LOAD_WEIGHT(lp) \
792 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
793#define PRIO_TO_LOAD_WEIGHT(prio) \ 796#define PRIO_TO_LOAD_WEIGHT(prio) \
794	LOAD_WEIGHT(static_prio_timeslice(prio)) 797	LOAD_WEIGHT(static_prio_timeslice(prio))
795#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 798#define RTPRIO_TO_LOAD_WEIGHT(rp) \
796	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 799	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
800
801#define WEIGHT_IDLEPRIO 2
802#define WMULT_IDLEPRIO (1 << 31)
803
804/*
805 * Nice levels are multiplicative, with a gentle 10% change for every
806 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * that remained on nice 0.
809 *
810 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 * it's +10% CPU usage.
813 */
814static const int prio_to_weight[40] = {
815/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817/* 0 */ NICE_0_LOAD /* 1024 */,
818/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
819/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
820};
821
822static const u32 prio_to_wmult[40] = {
823 48356, 60446, 75558, 94446, 118058, 147573,
824 184467, 230589, 288233, 360285, 450347,
825 562979, 703746, 879575, 1099582, 1374389,
826	1717986, 2147483, 2684354, 3355443, 4194304,
827	5244160, 6557201, 8196502, 10250518, 12782640,
828 16025997, 19976592, 24970740, 31350126, 39045157,
829 49367440, 61356675, 76695844, 95443717, 119304647,
830 148102320, 186737708, 238609294, 286331153,
831};
797 832
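/*
 * A stand-alone, user-space illustration (not from the patch) of the two
 * tables above: the prio_to_wmult[] entries are 2^32 / prio_to_weight[]
 * (precomputed reciprocals, so later weight divisions can be done as a
 * multiply-and-shift), and adjacent nice levels differ by roughly a factor
 * of 1.25 in weight, which is what produces the "~10%" CPU-time step
 * described in the comment.  The weights below are copied from
 * prio_to_weight[] for nice -1..3.
 */
#include <stdio.h>

int main(void)
{
	static const int weight[] = { 1280, 1024, 819, 655, 524 };
	int i;

	for (i = 0; i < 5; i++)
		printf("weight %4d -> inv_weight %u\n",
		       weight[i], (unsigned)((1ULL << 32) / weight[i]));

	/* two CPU hogs, one at nice 0 and one at nice 1: */
	printf("nice 0 gets %.1f%%, nice 1 gets %.1f%% of the CPU\n",
	       100.0 * 1024 / (1024 + 819), 100.0 * 819 / (1024 + 819));
	return 0;
}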
798static inline void 833static inline void
799inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 834inc_load(struct rq *rq, const struct task_struct *p, u64 now)
800{ 835{
801 rq->raw_weighted_load += p->load_weight; 836 update_curr_load(rq, now);
837 update_load_add(&rq->ls.load, p->se.load.weight);
802} 838}
803 839
804static inline void 840static inline void
805dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 841dec_load(struct rq *rq, const struct task_struct *p, u64 now)
806{ 842{
807 rq->raw_weighted_load -= p->load_weight; 843 update_curr_load(rq, now);
844 update_load_sub(&rq->ls.load, p->se.load.weight);
808} 845}
809 846
810static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 847static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
811{ 848{
812 rq->nr_running++; 849 rq->nr_running++;
813 inc_raw_weighted_load(rq, p); 850 inc_load(rq, p, now);
814} 851}
815 852
816static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 853static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
817{ 854{
818 rq->nr_running--; 855 rq->nr_running--;
819 dec_raw_weighted_load(rq, p); 856 dec_load(rq, p, now);
820} 857}
821 858
859static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
860
861/*
862 * runqueue iterator, to support SMP load-balancing between different
863 * scheduling classes, without having to expose their internal data
864 * structures to the load-balancing proper:
865 */
866struct rq_iterator {
867 void *arg;
868 struct task_struct *(*start)(void *);
869 struct task_struct *(*next)(void *);
870};
871
872static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
873 unsigned long max_nr_move, unsigned long max_load_move,
874 struct sched_domain *sd, enum cpu_idle_type idle,
875 int *all_pinned, unsigned long *load_moved,
876 int this_best_prio, int best_prio, int best_prio_seen,
877 struct rq_iterator *iterator);
878
879#include "sched_stats.h"
880#include "sched_rt.c"
881#include "sched_fair.c"
882#include "sched_idletask.c"
883#ifdef CONFIG_SCHED_DEBUG
884# include "sched_debug.c"
885#endif
886
887#define sched_class_highest (&rt_sched_class)
888
822static void set_load_weight(struct task_struct *p) 889static void set_load_weight(struct task_struct *p)
823{ 890{
891 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
892 p->se.wait_runtime = 0;
893
824 if (task_has_rt_policy(p)) { 894 if (task_has_rt_policy(p)) {
825#ifdef CONFIG_SMP 895 p->se.load.weight = prio_to_weight[0] * 2;
826 if (p == task_rq(p)->migration_thread) 896 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
827 /* 897 return;
828 * The migration thread does the actual balancing. 898 }
829 * Giving its load any weight will skew balancing
830 * adversely.
831 */
832 p->load_weight = 0;
833 else
834#endif
835 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
836 } else
837 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
838}
839 899
840/* 900 /*
841 * Adding/removing a task to/from a priority array: 901 * SCHED_IDLE tasks get minimal weight:
842 */ 902 */
843static void dequeue_task(struct task_struct *p, struct prio_array *array) 903 if (p->policy == SCHED_IDLE) {
844{ 904 p->se.load.weight = WEIGHT_IDLEPRIO;
845 array->nr_active--; 905 p->se.load.inv_weight = WMULT_IDLEPRIO;
846 list_del(&p->run_list); 906 return;
847 if (list_empty(array->queue + p->prio)) 907 }
848 __clear_bit(p->prio, array->bitmap);
849}
850 908
851static void enqueue_task(struct task_struct *p, struct prio_array *array) 909 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
852{ 910 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
853 sched_info_queued(p);
854 list_add_tail(&p->run_list, array->queue + p->prio);
855 __set_bit(p->prio, array->bitmap);
856 array->nr_active++;
857 p->array = array;
858} 911}
859 912
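/*
 * A small illustration (not from the patch) of the index used above: with
 * the stock constants (MAX_RT_PRIO == 100 and nice 0 mapping to a static
 * priority of 120), "static_prio - MAX_RT_PRIO" is just "20 + nice".  A
 * nice-0 task therefore lands on prio_to_weight[20] == NICE_0_LOAD (1024)
 * and a nice-19 task on prio_to_weight[39] == 15.
 */
static inline int nice_to_weight_idx(int nice)	/* helper made up for the sketch */
{
	return 20 + nice;	/* == (MAX_RT_PRIO + 20 + nice) - MAX_RT_PRIO */
}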
860/* 913static void
861 * Put task to the end of the run list without the overhead of dequeue 914enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
862 * followed by enqueue.
863 */
864static void requeue_task(struct task_struct *p, struct prio_array *array)
865{ 915{
866 list_move_tail(&p->run_list, array->queue + p->prio); 916 sched_info_queued(p);
917 p->sched_class->enqueue_task(rq, p, wakeup, now);
918 p->se.on_rq = 1;
867} 919}
868 920
869static inline void 921static void
870enqueue_task_head(struct task_struct *p, struct prio_array *array) 922dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
871{ 923{
872 list_add(&p->run_list, array->queue + p->prio); 924 p->sched_class->dequeue_task(rq, p, sleep, now);
873 __set_bit(p->prio, array->bitmap); 925 p->se.on_rq = 0;
874 array->nr_active++;
875 p->array = array;
876} 926}
877 927
878/* 928/*
879 * __normal_prio - return the priority that is based on the static 929 * __normal_prio - return the priority that is based on the static prio
880 * priority but is modified by bonuses/penalties.
881 *
882 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
883 * into the -5 ... 0 ... +5 bonus/penalty range.
884 *
885 * We use 25% of the full 0...39 priority range so that:
886 *
887 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
888 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
889 *
890 * Both properties are important to certain workloads.
891 */ 930 */
892
893static inline int __normal_prio(struct task_struct *p) 931static inline int __normal_prio(struct task_struct *p)
894{ 932{
895 int bonus, prio; 933 return p->static_prio;
896
897 bonus = 0;
898
899 prio = p->static_prio - bonus;
900 if (prio < MAX_RT_PRIO)
901 prio = MAX_RT_PRIO;
902 if (prio > MAX_PRIO-1)
903 prio = MAX_PRIO-1;
904 return prio;
905} 934}
906 935
907/* 936/*
@@ -943,84 +972,45 @@ static int effective_prio(struct task_struct *p)
943} 972}
944 973
945/* 974/*
946 * __activate_task - move a task to the runqueue. 975 * activate_task - move a task to the runqueue.
947 */ 976 */
948static void __activate_task(struct task_struct *p, struct rq *rq) 977static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
949{ 978{
950 struct prio_array *target = rq->active; 979 u64 now = rq_clock(rq);
951 980
952 if (batch_task(p)) 981 if (p->state == TASK_UNINTERRUPTIBLE)
953 target = rq->expired; 982 rq->nr_uninterruptible--;
954 enqueue_task(p, target);
955 inc_nr_running(p, rq);
956}
957
958/*
959 * __activate_idle_task - move idle task to the _front_ of runqueue.
960 */
961static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
962{
963 enqueue_task_head(p, rq->active);
964 inc_nr_running(p, rq);
965}
966 983
967/* 984 enqueue_task(rq, p, wakeup, now);
968 * Recalculate p->normal_prio and p->prio after having slept, 985 inc_nr_running(p, rq, now);
969 * updating the sleep-average too:
970 */
971static int recalc_task_prio(struct task_struct *p, unsigned long long now)
972{
973 return effective_prio(p);
974} 986}
975 987
976/* 988/*
977 * activate_task - move a task to the runqueue and do priority recalculation 989 * activate_idle_task - move idle task to the _front_ of runqueue.
978 *
979 * Update all the scheduling statistics stuff. (sleep average
980 * calculation, priority modifiers, etc.)
981 */ 990 */
982static void activate_task(struct task_struct *p, struct rq *rq, int local) 991static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
983{ 992{
984 unsigned long long now; 993 u64 now = rq_clock(rq);
985
986 if (rt_task(p))
987 goto out;
988
989 now = sched_clock();
990#ifdef CONFIG_SMP
991 if (!local) {
992 /* Compensate for drifting sched_clock */
993 struct rq *this_rq = this_rq();
994 now = (now - this_rq->most_recent_timestamp)
995 + rq->most_recent_timestamp;
996 }
997#endif
998 994
999 /* 995 if (p->state == TASK_UNINTERRUPTIBLE)
1000 * Sleep time is in units of nanosecs, so shift by 20 to get a 996 rq->nr_uninterruptible--;
1001 * milliseconds-range estimation of the amount of time that the task
1002 * spent sleeping:
1003 */
1004 if (unlikely(prof_on == SLEEP_PROFILING)) {
1005 if (p->state == TASK_UNINTERRUPTIBLE)
1006 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1007 (now - p->timestamp) >> 20);
1008 }
1009 997
1010 p->prio = recalc_task_prio(p, now); 998 enqueue_task(rq, p, 0, now);
1011 p->timestamp = now; 999 inc_nr_running(p, rq, now);
1012out:
1013 __activate_task(p, rq);
1014} 1000}
1015 1001
1016/* 1002/*
1017 * deactivate_task - remove a task from the runqueue. 1003 * deactivate_task - remove a task from the runqueue.
1018 */ 1004 */
1019static void deactivate_task(struct task_struct *p, struct rq *rq) 1005static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1020{ 1006{
1021 dec_nr_running(p, rq); 1007 u64 now = rq_clock(rq);
1022 dequeue_task(p, p->array); 1008
1023 p->array = NULL; 1009 if (p->state == TASK_UNINTERRUPTIBLE)
1010 rq->nr_uninterruptible++;
1011
1012 dequeue_task(rq, p, sleep, now);
1013 dec_nr_running(p, rq, now);
1024} 1014}
1025 1015
1026/** 1016/**
@@ -1035,14 +1025,40 @@ inline int task_curr(const struct task_struct *p)
1035/* Used instead of source_load when we know the type == 0 */ 1025/* Used instead of source_load when we know the type == 0 */
1036unsigned long weighted_cpuload(const int cpu) 1026unsigned long weighted_cpuload(const int cpu)
1037{ 1027{
1038 return cpu_rq(cpu)->raw_weighted_load; 1028 return cpu_rq(cpu)->ls.load.weight;
1029}
1030
1031static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1032{
1033#ifdef CONFIG_SMP
1034 task_thread_info(p)->cpu = cpu;
1035 set_task_cfs_rq(p);
1036#endif
1039} 1037}
1040 1038
1041#ifdef CONFIG_SMP 1039#ifdef CONFIG_SMP
1042 1040
1043void set_task_cpu(struct task_struct *p, unsigned int cpu) 1041void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1044{ 1042{
1045 task_thread_info(p)->cpu = cpu; 1043 int old_cpu = task_cpu(p);
1044 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1045 u64 clock_offset, fair_clock_offset;
1046
1047 clock_offset = old_rq->clock - new_rq->clock;
1048 fair_clock_offset = old_rq->cfs.fair_clock -
1049 new_rq->cfs.fair_clock;
1050 if (p->se.wait_start)
1051 p->se.wait_start -= clock_offset;
1052 if (p->se.wait_start_fair)
1053 p->se.wait_start_fair -= fair_clock_offset;
1054 if (p->se.sleep_start)
1055 p->se.sleep_start -= clock_offset;
1056 if (p->se.block_start)
1057 p->se.block_start -= clock_offset;
1058 if (p->se.sleep_start_fair)
1059 p->se.sleep_start_fair -= fair_clock_offset;
1060
1061 __set_task_cpu(p, new_cpu);
1046} 1062}
1047 1063
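/*
 * A stand-alone illustration (not from the patch) of the clock-offset
 * arithmetic above: timestamps taken on the old runqueue's clock are shifted
 * by (old_clock - new_clock) so that differences against the new runqueue's
 * clock keep their meaning.  'rebase_stamp' is a made-up helper using plain
 * unsigned long long in place of u64.
 */
static inline unsigned long long
rebase_stamp(unsigned long long stamp,
	     unsigned long long old_clock, unsigned long long new_clock)
{
	return stamp - (old_clock - new_clock);
}

/*
 * Example: old_clock = 1000, new_clock = 400, a wait_start stamped at 950
 * becomes rebase_stamp(950, 1000, 400) == 350; the wait seen from the new
 * clock, 400 - 350, is still the 50 units it was on the old one.
 */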
1048struct migration_req { 1064struct migration_req {
@@ -1067,7 +1083,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1067 * If the task is not on a runqueue (and not running), then 1083 * If the task is not on a runqueue (and not running), then
1068 * it is sufficient to simply update the task's cpu field. 1084 * it is sufficient to simply update the task's cpu field.
1069 */ 1085 */
1070 if (!p->array && !task_running(rq, p)) { 1086 if (!p->se.on_rq && !task_running(rq, p)) {
1071 set_task_cpu(p, dest_cpu); 1087 set_task_cpu(p, dest_cpu);
1072 return 0; 1088 return 0;
1073 } 1089 }
@@ -1092,9 +1108,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1092void wait_task_inactive(struct task_struct *p) 1108void wait_task_inactive(struct task_struct *p)
1093{ 1109{
1094 unsigned long flags; 1110 unsigned long flags;
1111 int running, on_rq;
1095 struct rq *rq; 1112 struct rq *rq;
1096 struct prio_array *array;
1097 int running;
1098 1113
1099repeat: 1114repeat:
1100 /* 1115 /*
@@ -1126,7 +1141,7 @@ repeat:
1126 */ 1141 */
1127 rq = task_rq_lock(p, &flags); 1142 rq = task_rq_lock(p, &flags);
1128 running = task_running(rq, p); 1143 running = task_running(rq, p);
1129 array = p->array; 1144 on_rq = p->se.on_rq;
1130 task_rq_unlock(rq, &flags); 1145 task_rq_unlock(rq, &flags);
1131 1146
1132 /* 1147 /*
@@ -1149,7 +1164,7 @@ repeat:
1149 * running right now), it's preempted, and we should 1164 * running right now), it's preempted, and we should
1150 * yield - it could be a while. 1165 * yield - it could be a while.
1151 */ 1166 */
1152 if (unlikely(array)) { 1167 if (unlikely(on_rq)) {
1153 yield(); 1168 yield();
1154 goto repeat; 1169 goto repeat;
1155 } 1170 }
@@ -1195,11 +1210,12 @@ void kick_process(struct task_struct *p)
1195static inline unsigned long source_load(int cpu, int type) 1210static inline unsigned long source_load(int cpu, int type)
1196{ 1211{
1197 struct rq *rq = cpu_rq(cpu); 1212 struct rq *rq = cpu_rq(cpu);
1213 unsigned long total = weighted_cpuload(cpu);
1198 1214
1199 if (type == 0) 1215 if (type == 0)
1200 return rq->raw_weighted_load; 1216 return total;
1201 1217
1202 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1218 return min(rq->cpu_load[type-1], total);
1203} 1219}
1204 1220
1205/* 1221/*
@@ -1209,11 +1225,12 @@ static inline unsigned long source_load(int cpu, int type)
1209static inline unsigned long target_load(int cpu, int type) 1225static inline unsigned long target_load(int cpu, int type)
1210{ 1226{
1211 struct rq *rq = cpu_rq(cpu); 1227 struct rq *rq = cpu_rq(cpu);
1228 unsigned long total = weighted_cpuload(cpu);
1212 1229
1213 if (type == 0) 1230 if (type == 0)
1214 return rq->raw_weighted_load; 1231 return total;
1215 1232
1216 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1233 return max(rq->cpu_load[type-1], total);
1217} 1234}
1218 1235
1219/* 1236/*
@@ -1222,9 +1239,10 @@ static inline unsigned long target_load(int cpu, int type)
1222static inline unsigned long cpu_avg_load_per_task(int cpu) 1239static inline unsigned long cpu_avg_load_per_task(int cpu)
1223{ 1240{
1224 struct rq *rq = cpu_rq(cpu); 1241 struct rq *rq = cpu_rq(cpu);
1242 unsigned long total = weighted_cpuload(cpu);
1225 unsigned long n = rq->nr_running; 1243 unsigned long n = rq->nr_running;
1226 1244
1227 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1245 return n ? total / n : SCHED_LOAD_SCALE;
1228} 1246}
1229 1247
1230/* 1248/*
@@ -1455,7 +1473,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1455 if (!(old_state & state)) 1473 if (!(old_state & state))
1456 goto out; 1474 goto out;
1457 1475
1458 if (p->array) 1476 if (p->se.on_rq)
1459 goto out_running; 1477 goto out_running;
1460 1478
1461 cpu = task_cpu(p); 1479 cpu = task_cpu(p);
@@ -1510,11 +1528,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1510 * of the current CPU: 1528 * of the current CPU:
1511 */ 1529 */
1512 if (sync) 1530 if (sync)
1513 tl -= current->load_weight; 1531 tl -= current->se.load.weight;
1514 1532
1515 if ((tl <= load && 1533 if ((tl <= load &&
1516 tl + target_load(cpu, idx) <= tl_per_task) || 1534 tl + target_load(cpu, idx) <= tl_per_task) ||
1517 100*(tl + p->load_weight) <= imbalance*load) { 1535 100*(tl + p->se.load.weight) <= imbalance*load) {
1518 /* 1536 /*
1519 * This domain has SD_WAKE_AFFINE and 1537 * This domain has SD_WAKE_AFFINE and
1520 * p is cache cold in this domain, and 1538 * p is cache cold in this domain, and
@@ -1548,7 +1566,7 @@ out_set_cpu:
1548 old_state = p->state; 1566 old_state = p->state;
1549 if (!(old_state & state)) 1567 if (!(old_state & state))
1550 goto out; 1568 goto out;
1551 if (p->array) 1569 if (p->se.on_rq)
1552 goto out_running; 1570 goto out_running;
1553 1571
1554 this_cpu = smp_processor_id(); 1572 this_cpu = smp_processor_id();
@@ -1557,10 +1575,7 @@ out_set_cpu:
1557 1575
1558out_activate: 1576out_activate:
1559#endif /* CONFIG_SMP */ 1577#endif /* CONFIG_SMP */
1560 if (old_state == TASK_UNINTERRUPTIBLE) 1578 activate_task(rq, p, 1);
1561 rq->nr_uninterruptible--;
1562
1563 activate_task(p, rq, cpu == this_cpu);
1564 /* 1579 /*
1565 * Sync wakeups (i.e. those types of wakeups where the waker 1580 * Sync wakeups (i.e. those types of wakeups where the waker
1566 * has indicated that it will leave the CPU in short order) 1581 * has indicated that it will leave the CPU in short order)
@@ -1569,10 +1584,8 @@ out_activate:
1569 * the waker guarantees that the freshly woken up task is going 1584 * the waker guarantees that the freshly woken up task is going
1570 * to be considered on this CPU.) 1585 * to be considered on this CPU.)
1571 */ 1586 */
1572 if (!sync || cpu != this_cpu) { 1587 if (!sync || cpu != this_cpu)
1573 if (TASK_PREEMPTS_CURR(p, rq)) 1588 check_preempt_curr(rq, p);
1574 resched_task(rq->curr);
1575 }
1576 success = 1; 1589 success = 1;
1577 1590
1578out_running: 1591out_running:
@@ -1595,19 +1608,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1595 return try_to_wake_up(p, state, 0); 1608 return try_to_wake_up(p, state, 0);
1596} 1609}
1597 1610
1598static void task_running_tick(struct rq *rq, struct task_struct *p);
1599/* 1611/*
1600 * Perform scheduler related setup for a newly forked process p. 1612 * Perform scheduler related setup for a newly forked process p.
1601 * p is forked by current. 1613 * p is forked by current.
1602 */ 1614 *
1603void fastcall sched_fork(struct task_struct *p, int clone_flags) 1615 * __sched_fork() is basic setup used by init_idle() too:
1604{ 1616 */
1605 int cpu = get_cpu(); 1617static void __sched_fork(struct task_struct *p)
1618{
1619 p->se.wait_start_fair = 0;
1620 p->se.wait_start = 0;
1621 p->se.exec_start = 0;
1622 p->se.sum_exec_runtime = 0;
1623 p->se.delta_exec = 0;
1624 p->se.delta_fair_run = 0;
1625 p->se.delta_fair_sleep = 0;
1626 p->se.wait_runtime = 0;
1627 p->se.sum_wait_runtime = 0;
1628 p->se.sum_sleep_runtime = 0;
1629 p->se.sleep_start = 0;
1630 p->se.sleep_start_fair = 0;
1631 p->se.block_start = 0;
1632 p->se.sleep_max = 0;
1633 p->se.block_max = 0;
1634 p->se.exec_max = 0;
1635 p->se.wait_max = 0;
1636 p->se.wait_runtime_overruns = 0;
1637 p->se.wait_runtime_underruns = 0;
1606 1638
1607#ifdef CONFIG_SMP 1639 INIT_LIST_HEAD(&p->run_list);
1608 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1640 p->se.on_rq = 0;
1609#endif
1610 set_task_cpu(p, cpu);
1611 1641
1612 /* 1642 /*
1613 * We mark the process as running here, but have not actually 1643 * We mark the process as running here, but have not actually
@@ -1616,16 +1646,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1616 * event cannot wake it up and insert it on the runqueue either. 1646 * event cannot wake it up and insert it on the runqueue either.
1617 */ 1647 */
1618 p->state = TASK_RUNNING; 1648 p->state = TASK_RUNNING;
1649}
1650
1651/*
1652 * fork()/clone()-time setup:
1653 */
1654void sched_fork(struct task_struct *p, int clone_flags)
1655{
1656 int cpu = get_cpu();
1657
1658 __sched_fork(p);
1659
1660#ifdef CONFIG_SMP
1661 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1662#endif
1663 __set_task_cpu(p, cpu);
1619 1664
1620 /* 1665 /*
1621 * Make sure we do not leak PI boosting priority to the child: 1666 * Make sure we do not leak PI boosting priority to the child:
1622 */ 1667 */
1623 p->prio = current->normal_prio; 1668 p->prio = current->normal_prio;
1624 1669
1625 INIT_LIST_HEAD(&p->run_list);
1626 p->array = NULL;
1627#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1670#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1628 if (unlikely(sched_info_on())) 1671 if (likely(sched_info_on()))
1629 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1672 memset(&p->sched_info, 0, sizeof(p->sched_info));
1630#endif 1673#endif
1631#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1674#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1635,34 +1678,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1635 /* Want to start with kernel preemption disabled. */ 1678 /* Want to start with kernel preemption disabled. */
1636 task_thread_info(p)->preempt_count = 1; 1679 task_thread_info(p)->preempt_count = 1;
1637#endif 1680#endif
1638 /*
1639 * Share the timeslice between parent and child, thus the
1640 * total amount of pending timeslices in the system doesn't change,
1641 * resulting in more scheduling fairness.
1642 */
1643 local_irq_disable();
1644 p->time_slice = (current->time_slice + 1) >> 1;
1645 /*
1646 * The remainder of the first timeslice might be recovered by
1647 * the parent if the child exits early enough.
1648 */
1649 p->first_time_slice = 1;
1650 current->time_slice >>= 1;
1651 p->timestamp = sched_clock();
1652 if (unlikely(!current->time_slice)) {
1653 /*
1654 * This case is rare, it happens when the parent has only
1655 * a single jiffy left from its timeslice. Taking the
1656 * runqueue lock is not a problem.
1657 */
1658 current->time_slice = 1;
1659 task_running_tick(cpu_rq(cpu), current);
1660 }
1661 local_irq_enable();
1662 put_cpu(); 1681 put_cpu();
1663} 1682}
1664 1683
1665/* 1684/*
1685 * After fork, child runs first. (default) If set to 0 then
1686 * parent will (try to) run first.
1687 */
1688unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1689
1690/*
1666 * wake_up_new_task - wake up a newly created task for the first time. 1691 * wake_up_new_task - wake up a newly created task for the first time.
1667 * 1692 *
1668 * This function will do some initial scheduler statistics housekeeping 1693 * This function will do some initial scheduler statistics housekeeping
@@ -1671,77 +1696,28 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1671 */ 1696 */
1672void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1697void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1673{ 1698{
1674 struct rq *rq, *this_rq;
1675 unsigned long flags; 1699 unsigned long flags;
1676 int this_cpu, cpu; 1700 struct rq *rq;
1701 int this_cpu;
1677 1702
1678 rq = task_rq_lock(p, &flags); 1703 rq = task_rq_lock(p, &flags);
1679 BUG_ON(p->state != TASK_RUNNING); 1704 BUG_ON(p->state != TASK_RUNNING);
1680 this_cpu = smp_processor_id(); 1705 this_cpu = smp_processor_id(); /* parent's CPU */
1681 cpu = task_cpu(p);
1682
1683 /*
1684 * We decrease the sleep average of forking parents
1685 * and children as well, to keep max-interactive tasks
1686 * from forking tasks that are max-interactive. The parent
1687 * (current) is done further down, under its lock.
1688 */
1689 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1690 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1691 1706
1692 p->prio = effective_prio(p); 1707 p->prio = effective_prio(p);
1693 1708
1694 if (likely(cpu == this_cpu)) { 1709 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1695 if (!(clone_flags & CLONE_VM)) { 1710 task_cpu(p) != this_cpu || !current->se.on_rq) {
1696 /* 1711 activate_task(rq, p, 0);
1697 * The VM isn't cloned, so we're in a good position to
1698 * do child-runs-first in anticipation of an exec. This
1699 * usually avoids a lot of COW overhead.
1700 */
1701 if (unlikely(!current->array))
1702 __activate_task(p, rq);
1703 else {
1704 p->prio = current->prio;
1705 p->normal_prio = current->normal_prio;
1706 list_add_tail(&p->run_list, &current->run_list);
1707 p->array = current->array;
1708 p->array->nr_active++;
1709 inc_nr_running(p, rq);
1710 }
1711 set_need_resched();
1712 } else
1713 /* Run child last */
1714 __activate_task(p, rq);
1715 /*
1716 * We skip the following code due to cpu == this_cpu
1717 *
1718 * task_rq_unlock(rq, &flags);
1719 * this_rq = task_rq_lock(current, &flags);
1720 */
1721 this_rq = rq;
1722 } else { 1712 } else {
1723 this_rq = cpu_rq(this_cpu);
1724
1725 /* 1713 /*
1726 * Not the local CPU - must adjust timestamp. This should 1714 * Let the scheduling class do new task startup
1727 * get optimised away in the !CONFIG_SMP case. 1715 * management (if any):
1728 */ 1716 */
1729 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1717 p->sched_class->task_new(rq, p);
1730 + rq->most_recent_timestamp;
1731 __activate_task(p, rq);
1732 if (TASK_PREEMPTS_CURR(p, rq))
1733 resched_task(rq->curr);
1734
1735 /*
1736 * Parent and child are on different CPUs, now get the
1737 * parent runqueue to update the parent's ->sleep_avg:
1738 */
1739 task_rq_unlock(rq, &flags);
1740 this_rq = task_rq_lock(current, &flags);
1741 } 1718 }
1742 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 1719 check_preempt_curr(rq, p);
1743 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1720 task_rq_unlock(rq, &flags);
1744 task_rq_unlock(this_rq, &flags);
1745} 1721}
1746 1722
1747/** 1723/**
@@ -1833,13 +1809,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1833 * context_switch - switch to the new MM and the new 1809 * context_switch - switch to the new MM and the new
1834 * thread's register state. 1810 * thread's register state.
1835 */ 1811 */
1836static inline struct task_struct * 1812static inline void
1837context_switch(struct rq *rq, struct task_struct *prev, 1813context_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1814 struct task_struct *next)
1839{ 1815{
1840 struct mm_struct *mm = next->mm; 1816 struct mm_struct *mm, *oldmm;
1841 struct mm_struct *oldmm = prev->active_mm;
1842 1817
1818 prepare_task_switch(rq, next);
1819 mm = next->mm;
1820 oldmm = prev->active_mm;
1843 /* 1821 /*
1844 * For paravirt, this is coupled with an exit in switch_to to 1822 * For paravirt, this is coupled with an exit in switch_to to
1845 * combine the page table reload and the switch backend into 1823 * combine the page table reload and the switch backend into
@@ -1847,16 +1825,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1847 */ 1825 */
1848 arch_enter_lazy_cpu_mode(); 1826 arch_enter_lazy_cpu_mode();
1849 1827
1850 if (!mm) { 1828 if (unlikely(!mm)) {
1851 next->active_mm = oldmm; 1829 next->active_mm = oldmm;
1852 atomic_inc(&oldmm->mm_count); 1830 atomic_inc(&oldmm->mm_count);
1853 enter_lazy_tlb(oldmm, next); 1831 enter_lazy_tlb(oldmm, next);
1854 } else 1832 } else
1855 switch_mm(oldmm, mm, next); 1833 switch_mm(oldmm, mm, next);
1856 1834
1857 if (!prev->mm) { 1835 if (unlikely(!prev->mm)) {
1858 prev->active_mm = NULL; 1836 prev->active_mm = NULL;
1859 WARN_ON(rq->prev_mm);
1860 rq->prev_mm = oldmm; 1837 rq->prev_mm = oldmm;
1861 } 1838 }
1862 /* 1839 /*
@@ -1872,7 +1849,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1872 /* Here we just switch the register state and the stack. */ 1849 /* Here we just switch the register state and the stack. */
1873 switch_to(prev, next, prev); 1850 switch_to(prev, next, prev);
1874 1851
1875 return prev; 1852 barrier();
1853 /*
1854 * this_rq must be evaluated again because prev may have moved
1855 * CPUs since it called schedule(), thus the 'rq' on its stack
1856 * frame will be invalid.
1857 */
1858 finish_task_switch(this_rq(), prev);
1876} 1859}
1877 1860
1878/* 1861/*
@@ -1945,17 +1928,65 @@ unsigned long nr_active(void)
1945 return running + uninterruptible; 1928 return running + uninterruptible;
1946} 1929}
1947 1930
1948#ifdef CONFIG_SMP
1949
1950/* 1931/*
1951 * Is this task likely cache-hot: 1932 * Update rq->cpu_load[] statistics. This function is usually called every
1933 * scheduler tick (TICK_NSEC).
1952 */ 1934 */
1953static inline int 1935static void update_cpu_load(struct rq *this_rq)
1954task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
1955{ 1936{
1956 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1937 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1938 unsigned long total_load = this_rq->ls.load.weight;
1939 unsigned long this_load = total_load;
1940 struct load_stat *ls = &this_rq->ls;
1941 u64 now = __rq_clock(this_rq);
1942 int i, scale;
1943
1944 this_rq->nr_load_updates++;
1945 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1946 goto do_avg;
1947
1948 /* Update delta_fair/delta_exec fields first */
1949 update_curr_load(this_rq, now);
1950
1951 fair_delta64 = ls->delta_fair + 1;
1952 ls->delta_fair = 0;
1953
1954 exec_delta64 = ls->delta_exec + 1;
1955 ls->delta_exec = 0;
1956
1957 sample_interval64 = now - ls->load_update_last;
1958 ls->load_update_last = now;
1959
1960 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1961 sample_interval64 = TICK_NSEC;
1962
1963 if (exec_delta64 > sample_interval64)
1964 exec_delta64 = sample_interval64;
1965
1966 idle_delta64 = sample_interval64 - exec_delta64;
1967
1968 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1969 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1970
1971 this_load = (unsigned long)tmp64;
1972
1973do_avg:
1974
1975 /* Update our load: */
1976 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1977 unsigned long old_load, new_load;
1978
1979 /* scale is effectively 1 << i now, and >> i divides by scale */
1980
1981 old_load = this_rq->cpu_load[i];
1982 new_load = this_load;
1983
1984 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1985 }
1957} 1986}
1958 1987
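/*
 * A stand-alone, user-space sketch (not from the patch) of the per-index
 * update above, cpu_load[i] = (old * (2^i - 1) + new) >> i: it is an
 * exponential moving average whose memory grows with i, so index 0 tracks
 * the instantaneous load and higher indices react more slowly.  Five
 * entries are used here for illustration only (not necessarily equal to
 * CPU_LOAD_IDX_MAX); the input load is held constant to show convergence.
 */
#include <stdio.h>

int main(void)
{
	unsigned long cpu_load[5] = { 0, 0, 0, 0, 0 };
	unsigned long this_load = 2048;		/* made-up constant load */
	int tick, i;

	for (tick = 0; tick < 8; tick++) {
		unsigned long scale = 1;

		for (i = 0; i < 5; i++, scale += scale)
			cpu_load[i] = (cpu_load[i] * (scale - 1) + this_load) >> i;

		printf("tick %d: %lu %lu %lu %lu %lu\n", tick,
		       cpu_load[0], cpu_load[1], cpu_load[2],
		       cpu_load[3], cpu_load[4]);
	}
	return 0;
}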
1988#ifdef CONFIG_SMP
1989
1959/* 1990/*
1960 * double_rq_lock - safely lock two runqueues 1991 * double_rq_lock - safely lock two runqueues
1961 * 1992 *
@@ -2072,23 +2103,17 @@ void sched_exec(void)
2072 * pull_task - move a task from a remote runqueue to the local runqueue. 2103 * pull_task - move a task from a remote runqueue to the local runqueue.
2073 * Both runqueues must be locked. 2104 * Both runqueues must be locked.
2074 */ 2105 */
2075static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2106static void pull_task(struct rq *src_rq, struct task_struct *p,
2076 struct task_struct *p, struct rq *this_rq, 2107 struct rq *this_rq, int this_cpu)
2077 struct prio_array *this_array, int this_cpu)
2078{ 2108{
2079 dequeue_task(p, src_array); 2109 deactivate_task(src_rq, p, 0);
2080 dec_nr_running(p, src_rq);
2081 set_task_cpu(p, this_cpu); 2110 set_task_cpu(p, this_cpu);
2082 inc_nr_running(p, this_rq); 2111 activate_task(this_rq, p, 0);
2083 enqueue_task(p, this_array);
2084 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2085 + this_rq->most_recent_timestamp;
2086 /* 2112 /*
2087 * Note that idle threads have a prio of MAX_PRIO, for this test 2113 * Note that idle threads have a prio of MAX_PRIO, for this test
2088 * to be always true for them. 2114 * to be always true for them.
2089 */ 2115 */
2090 if (TASK_PREEMPTS_CURR(p, this_rq)) 2116 check_preempt_curr(this_rq, p);
2091 resched_task(this_rq->curr);
2092} 2117}
2093 2118
2094/* 2119/*
@@ -2113,132 +2138,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2113 return 0; 2138 return 0;
2114 2139
2115 /* 2140 /*
2116 * Aggressive migration if: 2141 * Aggressive migration if too many balance attempts have failed:
2117 * 1) task is cache cold, or
2118 * 2) too many balance attempts have failed.
2119 */ 2142 */
2120 2143 if (sd->nr_balance_failed > sd->cache_nice_tries)
2121 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2122#ifdef CONFIG_SCHEDSTATS
2123 if (task_hot(p, rq->most_recent_timestamp, sd))
2124 schedstat_inc(sd, lb_hot_gained[idle]);
2125#endif
2126 return 1; 2144 return 1;
2127 }
2128 2145
2129 if (task_hot(p, rq->most_recent_timestamp, sd))
2130 return 0;
2131 return 1; 2146 return 1;
2132} 2147}
2133 2148
2134#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2149static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2135
2136/*
2137 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2138 * load from busiest to this_rq, as part of a balancing operation within
2139 * "domain". Returns the number of tasks moved.
2140 *
2141 * Called with both runqueues locked.
2142 */
2143static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2144 unsigned long max_nr_move, unsigned long max_load_move, 2150 unsigned long max_nr_move, unsigned long max_load_move,
2145 struct sched_domain *sd, enum cpu_idle_type idle, 2151 struct sched_domain *sd, enum cpu_idle_type idle,
2146 int *all_pinned) 2152 int *all_pinned, unsigned long *load_moved,
2153 int this_best_prio, int best_prio, int best_prio_seen,
2154 struct rq_iterator *iterator)
2147{ 2155{
2148 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2156 int pulled = 0, pinned = 0, skip_for_load;
2149 best_prio_seen, skip_for_load; 2157 struct task_struct *p;
2150 struct prio_array *array, *dst_array; 2158 long rem_load_move = max_load_move;
2151 struct list_head *head, *curr;
2152 struct task_struct *tmp;
2153 long rem_load_move;
2154 2159
2155 if (max_nr_move == 0 || max_load_move == 0) 2160 if (max_nr_move == 0 || max_load_move == 0)
2156 goto out; 2161 goto out;
2157 2162
2158 rem_load_move = max_load_move;
2159 pinned = 1; 2163 pinned = 1;
2160 this_best_prio = rq_best_prio(this_rq);
2161 best_prio = rq_best_prio(busiest);
2162 /*
2163 * Enable handling of the case where there is more than one task
2164 * with the best priority. If the current running task is one
2165 * of those with prio==best_prio we know it won't be moved
2166 * and therefore it's safe to override the skip (based on load) of
2167 * any task we find with that prio.
2168 */
2169 best_prio_seen = best_prio == busiest->curr->prio;
2170 2164
2171 /* 2165 /*
2172 * We first consider expired tasks. Those will likely not be 2166 * Start the load-balancing iterator:
2173 * executed in the near future, and they are most likely to
2174 * be cache-cold, thus switching CPUs has the least effect
2175 * on them.
2176 */ 2167 */
2177 if (busiest->expired->nr_active) { 2168 p = iterator->start(iterator->arg);
2178 array = busiest->expired; 2169next:
2179 dst_array = this_rq->expired; 2170 if (!p)
2180 } else {
2181 array = busiest->active;
2182 dst_array = this_rq->active;
2183 }
2184
2185new_array:
2186 /* Start searching at priority 0: */
2187 idx = 0;
2188skip_bitmap:
2189 if (!idx)
2190 idx = sched_find_first_bit(array->bitmap);
2191 else
2192 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2193 if (idx >= MAX_PRIO) {
2194 if (array == busiest->expired && busiest->active->nr_active) {
2195 array = busiest->active;
2196 dst_array = this_rq->active;
2197 goto new_array;
2198 }
2199 goto out; 2171 goto out;
2200 }
2201
2202 head = array->queue + idx;
2203 curr = head->prev;
2204skip_queue:
2205 tmp = list_entry(curr, struct task_struct, run_list);
2206
2207 curr = curr->prev;
2208
2209 /* 2172 /*
2210 * To help distribute high priority tasks accross CPUs we don't 2173 * To help distribute high priority tasks accross CPUs we don't
2211 * skip a task if it will be the highest priority task (i.e. smallest 2174 * skip a task if it will be the highest priority task (i.e. smallest
2212 * prio value) on its new queue regardless of its load weight 2175 * prio value) on its new queue regardless of its load weight
2213 */ 2176 */
2214 skip_for_load = tmp->load_weight > rem_load_move; 2177 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2215 if (skip_for_load && idx < this_best_prio) 2178 SCHED_LOAD_SCALE_FUZZ;
2216 skip_for_load = !best_prio_seen && idx == best_prio; 2179 if (skip_for_load && p->prio < this_best_prio)
2180 skip_for_load = !best_prio_seen && p->prio == best_prio;
2217 if (skip_for_load || 2181 if (skip_for_load ||
2218 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2182 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2219 2183
2220 best_prio_seen |= idx == best_prio; 2184 best_prio_seen |= p->prio == best_prio;
2221 if (curr != head) 2185 p = iterator->next(iterator->arg);
2222 goto skip_queue; 2186 goto next;
2223 idx++;
2224 goto skip_bitmap;
2225 } 2187 }
2226 2188
2227 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2189 pull_task(busiest, p, this_rq, this_cpu);
2228 pulled++; 2190 pulled++;
2229 rem_load_move -= tmp->load_weight; 2191 rem_load_move -= p->se.load.weight;
2230 2192
2231 /* 2193 /*
2232 * We only want to steal up to the prescribed number of tasks 2194 * We only want to steal up to the prescribed number of tasks
2233 * and the prescribed amount of weighted load. 2195 * and the prescribed amount of weighted load.
2234 */ 2196 */
2235 if (pulled < max_nr_move && rem_load_move > 0) { 2197 if (pulled < max_nr_move && rem_load_move > 0) {
2236 if (idx < this_best_prio) 2198 if (p->prio < this_best_prio)
2237 this_best_prio = idx; 2199 this_best_prio = p->prio;
2238 if (curr != head) 2200 p = iterator->next(iterator->arg);
2239 goto skip_queue; 2201 goto next;
2240 idx++;
2241 goto skip_bitmap;
2242 } 2202 }
2243out: 2203out:
2244 /* 2204 /*
@@ -2250,18 +2210,48 @@ out:
2250 2210
2251 if (all_pinned) 2211 if (all_pinned)
2252 *all_pinned = pinned; 2212 *all_pinned = pinned;
2213 *load_moved = max_load_move - rem_load_move;
2253 return pulled; 2214 return pulled;
2254} 2215}
2255 2216
2256/* 2217/*
2218 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2219 * load from busiest to this_rq, as part of a balancing operation within
2220 * "domain". Returns the number of tasks moved.
2221 *
2222 * Called with both runqueues locked.
2223 */
2224static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2225 unsigned long max_nr_move, unsigned long max_load_move,
2226 struct sched_domain *sd, enum cpu_idle_type idle,
2227 int *all_pinned)
2228{
2229 struct sched_class *class = sched_class_highest;
2230 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2231 long rem_load_move = max_load_move;
2232
2233 do {
2234 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2235 max_nr_move, (unsigned long)rem_load_move,
2236 sd, idle, all_pinned, &load_moved);
2237 total_nr_moved += nr_moved;
2238 max_nr_move -= nr_moved;
2239 rem_load_move -= load_moved;
2240 class = class->next;
2241 } while (class && max_nr_move && rem_load_move > 0);
2242
2243 return total_nr_moved;
2244}
2245
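/*
 * A stand-alone sketch (not from the patch) of the walk move_tasks() does
 * above: start at sched_class_highest (the RT class, per the earlier
 * #define) and follow ->next towards lower-priority classes until either
 * the chain ends or the move budget is used up.  The toy class and its
 * 'pull_some' callback are made up; each class is assumed never to pull
 * more than the budget it is handed.
 */
struct toy_sched_class {
	const struct toy_sched_class *next;
	unsigned long (*pull_some)(unsigned long budget);
};

static unsigned long toy_move_tasks(const struct toy_sched_class *highest,
				    unsigned long budget)
{
	const struct toy_sched_class *class = highest;
	unsigned long moved = 0;

	while (class && budget) {
		unsigned long got = class->pull_some(budget);

		moved += got;
		budget -= got;
		class = class->next;
	}
	return moved;
}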
2246/*
2257 * find_busiest_group finds and returns the busiest CPU group within the 2247 * find_busiest_group finds and returns the busiest CPU group within the
2258 * domain. It calculates and returns the amount of weighted load which 2248 * domain. It calculates and returns the amount of weighted load which
2259 * should be moved to restore balance via the imbalance parameter. 2249 * should be moved to restore balance via the imbalance parameter.
2260 */ 2250 */
2261static struct sched_group * 2251static struct sched_group *
2262find_busiest_group(struct sched_domain *sd, int this_cpu, 2252find_busiest_group(struct sched_domain *sd, int this_cpu,
2263 unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, 2253 unsigned long *imbalance, enum cpu_idle_type idle,
2264 cpumask_t *cpus, int *balance) 2254 int *sd_idle, cpumask_t *cpus, int *balance)
2265{ 2255{
2266 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2256 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2267 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2257 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2325,7 +2315,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2325 2315
2326 avg_load += load; 2316 avg_load += load;
2327 sum_nr_running += rq->nr_running; 2317 sum_nr_running += rq->nr_running;
2328 sum_weighted_load += rq->raw_weighted_load; 2318 sum_weighted_load += weighted_cpuload(i);
2329 } 2319 }
2330 2320
2331 /* 2321 /*
@@ -2365,8 +2355,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2365 * Busy processors will not participate in power savings 2355 * Busy processors will not participate in power savings
2366 * balance. 2356 * balance.
2367 */ 2357 */
2368 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2358 if (idle == CPU_NOT_IDLE ||
2369 goto group_next; 2359 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2360 goto group_next;
2370 2361
2371 /* 2362 /*
2372 * If the local group is idle or completely loaded 2363 * If the local group is idle or completely loaded
@@ -2376,42 +2367,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2376 !this_nr_running)) 2367 !this_nr_running))
2377 power_savings_balance = 0; 2368 power_savings_balance = 0;
2378 2369
2379 /* 2370 /*
2380 * If a group is already running at full capacity or idle, 2371 * If a group is already running at full capacity or idle,
2381 * don't include that group in power savings calculations 2372 * don't include that group in power savings calculations
2382 */ 2373 */
2383 if (!power_savings_balance || sum_nr_running >= group_capacity 2374 if (!power_savings_balance || sum_nr_running >= group_capacity
2384 || !sum_nr_running) 2375 || !sum_nr_running)
2385 goto group_next; 2376 goto group_next;
2386 2377
2387 /* 2378 /*
2388 * Calculate the group which has the least non-idle load. 2379 * Calculate the group which has the least non-idle load.
2389 * This is the group from where we need to pick up the load 2380 * This is the group from where we need to pick up the load
2390 * for saving power 2381 * for saving power
2391 */ 2382 */
2392 if ((sum_nr_running < min_nr_running) || 2383 if ((sum_nr_running < min_nr_running) ||
2393 (sum_nr_running == min_nr_running && 2384 (sum_nr_running == min_nr_running &&
2394 first_cpu(group->cpumask) < 2385 first_cpu(group->cpumask) <
2395 first_cpu(group_min->cpumask))) { 2386 first_cpu(group_min->cpumask))) {
2396 group_min = group; 2387 group_min = group;
2397 min_nr_running = sum_nr_running; 2388 min_nr_running = sum_nr_running;
2398 min_load_per_task = sum_weighted_load / 2389 min_load_per_task = sum_weighted_load /
2399 sum_nr_running; 2390 sum_nr_running;
2400 } 2391 }
2401 2392
2402 /* 2393 /*
2403 * Calculate the group which is almost near its 2394 * Calculate the group which is almost near its
2404 * capacity but still has some space to pick up some load 2395 * capacity but still has some space to pick up some load
2405 * from other group and save more power 2396 * from other group and save more power
2406 */ 2397 */
2407 if (sum_nr_running <= group_capacity - 1) { 2398 if (sum_nr_running <= group_capacity - 1) {
2408 if (sum_nr_running > leader_nr_running || 2399 if (sum_nr_running > leader_nr_running ||
2409 (sum_nr_running == leader_nr_running && 2400 (sum_nr_running == leader_nr_running &&
2410 first_cpu(group->cpumask) > 2401 first_cpu(group->cpumask) >
2411 first_cpu(group_leader->cpumask))) { 2402 first_cpu(group_leader->cpumask))) {
2412 group_leader = group; 2403 group_leader = group;
2413 leader_nr_running = sum_nr_running; 2404 leader_nr_running = sum_nr_running;
2414 } 2405 }
2415 } 2406 }
2416group_next: 2407group_next:
2417#endif 2408#endif
@@ -2466,7 +2457,7 @@ group_next:
2466 * a think about bumping its value to force at least one task to be 2457 * a think about bumping its value to force at least one task to be
2467 * moved 2458 * moved
2468 */ 2459 */
2469 if (*imbalance < busiest_load_per_task) { 2460 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2470 unsigned long tmp, pwr_now, pwr_move; 2461 unsigned long tmp, pwr_now, pwr_move;
2471 unsigned int imbn; 2462 unsigned int imbn;
2472 2463
@@ -2480,7 +2471,8 @@ small_imbalance:
2480 } else 2471 } else
2481 this_load_per_task = SCHED_LOAD_SCALE; 2472 this_load_per_task = SCHED_LOAD_SCALE;
2482 2473
2483 if (max_load - this_load >= busiest_load_per_task * imbn) { 2474 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2475 busiest_load_per_task * imbn) {
2484 *imbalance = busiest_load_per_task; 2476 *imbalance = busiest_load_per_task;
2485 return busiest; 2477 return busiest;
2486 } 2478 }
@@ -2552,17 +2544,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2552 int i; 2544 int i;
2553 2545
2554 for_each_cpu_mask(i, group->cpumask) { 2546 for_each_cpu_mask(i, group->cpumask) {
2547 unsigned long wl;
2555 2548
2556 if (!cpu_isset(i, *cpus)) 2549 if (!cpu_isset(i, *cpus))
2557 continue; 2550 continue;
2558 2551
2559 rq = cpu_rq(i); 2552 rq = cpu_rq(i);
2553 wl = weighted_cpuload(i);
2560 2554
2561 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2555 if (rq->nr_running == 1 && wl > imbalance)
2562 continue; 2556 continue;
2563 2557
2564 if (rq->raw_weighted_load > max_load) { 2558 if (wl > max_load) {
2565 max_load = rq->raw_weighted_load; 2559 max_load = wl;
2566 busiest = rq; 2560 busiest = rq;
2567 } 2561 }
2568 } 2562 }
@@ -2599,7 +2593,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2599 /* 2593 /*
2600 * When power savings policy is enabled for the parent domain, idle 2594 * When power savings policy is enabled for the parent domain, idle
2601 * sibling can pick up load irrespective of busy siblings. In this case, 2595 * sibling can pick up load irrespective of busy siblings. In this case,
2602 * let the state of idle sibling percolate up as IDLE, instead of 2596 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2603 * portraying it as CPU_NOT_IDLE. 2597 * portraying it as CPU_NOT_IDLE.
2604 */ 2598 */
2605 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2599 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
@@ -2822,8 +2816,8 @@ out_balanced:
2822static void idle_balance(int this_cpu, struct rq *this_rq) 2816static void idle_balance(int this_cpu, struct rq *this_rq)
2823{ 2817{
2824 struct sched_domain *sd; 2818 struct sched_domain *sd;
2825 int pulled_task = 0; 2819 int pulled_task = -1;
2826 unsigned long next_balance = jiffies + 60 * HZ; 2820 unsigned long next_balance = jiffies + HZ;
2827 2821
2828 for_each_domain(this_cpu, sd) { 2822 for_each_domain(this_cpu, sd) {
2829 unsigned long interval; 2823 unsigned long interval;
@@ -2842,12 +2836,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
2842 if (pulled_task) 2836 if (pulled_task)
2843 break; 2837 break;
2844 } 2838 }
2845 if (!pulled_task) 2839 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2846 /* 2840 /*
2847 * We are going idle. next_balance may be set based on 2841 * We are going idle. next_balance may be set based on
2848 * a busy processor. So reset next_balance. 2842 * a busy processor. So reset next_balance.
2849 */ 2843 */
2850 this_rq->next_balance = next_balance; 2844 this_rq->next_balance = next_balance;
2845 }
2851} 2846}
2852 2847
2853/* 2848/*
@@ -2900,32 +2895,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2900 spin_unlock(&target_rq->lock); 2895 spin_unlock(&target_rq->lock);
2901} 2896}
2902 2897
2903static void update_load(struct rq *this_rq)
2904{
2905 unsigned long this_load;
2906 unsigned int i, scale;
2907
2908 this_load = this_rq->raw_weighted_load;
2909
2910 /* Update our load: */
2911 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
2912 unsigned long old_load, new_load;
2913
2914 /* scale is effectively 1 << i now, and >> i divides by scale */
2915
2916 old_load = this_rq->cpu_load[i];
2917 new_load = this_load;
2918 /*
2919 * Round up the averaging division if load is increasing. This
2920 * prevents us from getting stuck on 9 if the load is 10, for
2921 * example.
2922 */
2923 if (new_load > old_load)
2924 new_load += scale-1;
2925 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2926 }
2927}
2928
2929#ifdef CONFIG_NO_HZ 2898#ifdef CONFIG_NO_HZ
2930static struct { 2899static struct {
2931 atomic_t load_balancer; 2900 atomic_t load_balancer;
@@ -3029,6 +2998,9 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3029 interval = msecs_to_jiffies(interval); 2998 interval = msecs_to_jiffies(interval);
3030 if (unlikely(!interval)) 2999 if (unlikely(!interval))
3031 interval = 1; 3000 interval = 1;
3001 if (interval > HZ*NR_CPUS/10)
3002 interval = HZ*NR_CPUS/10;
3003
3032 3004
3033 if (sd->flags & SD_SERIALIZE) { 3005 if (sd->flags & SD_SERIALIZE) {
3034 if (!spin_trylock(&balancing)) 3006 if (!spin_trylock(&balancing))
@@ -3070,11 +3042,12 @@ out:
3070 */ 3042 */
3071static void run_rebalance_domains(struct softirq_action *h) 3043static void run_rebalance_domains(struct softirq_action *h)
3072{ 3044{
3073 int local_cpu = smp_processor_id(); 3045 int this_cpu = smp_processor_id();
3074 struct rq *local_rq = cpu_rq(local_cpu); 3046 struct rq *this_rq = cpu_rq(this_cpu);
3075 enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; 3047 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3048 CPU_IDLE : CPU_NOT_IDLE;
3076 3049
3077 rebalance_domains(local_cpu, idle); 3050 rebalance_domains(this_cpu, idle);
3078 3051
3079#ifdef CONFIG_NO_HZ 3052#ifdef CONFIG_NO_HZ
3080 /* 3053 /*
@@ -3082,13 +3055,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3082 * balancing on behalf of the other idle cpus whose ticks are 3055 * balancing on behalf of the other idle cpus whose ticks are
3083 * stopped. 3056 * stopped.
3084 */ 3057 */
3085 if (local_rq->idle_at_tick && 3058 if (this_rq->idle_at_tick &&
3086 atomic_read(&nohz.load_balancer) == local_cpu) { 3059 atomic_read(&nohz.load_balancer) == this_cpu) {
3087 cpumask_t cpus = nohz.cpu_mask; 3060 cpumask_t cpus = nohz.cpu_mask;
3088 struct rq *rq; 3061 struct rq *rq;
3089 int balance_cpu; 3062 int balance_cpu;
3090 3063
3091 cpu_clear(local_cpu, cpus); 3064 cpu_clear(this_cpu, cpus);
3092 for_each_cpu_mask(balance_cpu, cpus) { 3065 for_each_cpu_mask(balance_cpu, cpus) {
3093 /* 3066 /*
3094 * If this cpu gets work to do, stop the load balancing 3067 * If this cpu gets work to do, stop the load balancing
@@ -3098,11 +3071,11 @@ static void run_rebalance_domains(struct softirq_action *h)
3098 if (need_resched()) 3071 if (need_resched())
3099 break; 3072 break;
3100 3073
3101 rebalance_domains(balance_cpu, CPU_IDLE); 3074 rebalance_domains(balance_cpu, SCHED_IDLE);
3102 3075
3103 rq = cpu_rq(balance_cpu); 3076 rq = cpu_rq(balance_cpu);
3104 if (time_after(local_rq->next_balance, rq->next_balance)) 3077 if (time_after(this_rq->next_balance, rq->next_balance))
3105 local_rq->next_balance = rq->next_balance; 3078 this_rq->next_balance = rq->next_balance;
3106 } 3079 }
3107 } 3080 }
3108#endif 3081#endif
@@ -3115,9 +3088,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3115 * idle load balancing owner or decide to stop the periodic load balancing, 3088 * idle load balancing owner or decide to stop the periodic load balancing,
3116 * if the whole system is idle. 3089 * if the whole system is idle.
3117 */ 3090 */
3118static inline void trigger_load_balance(int cpu) 3091static inline void trigger_load_balance(struct rq *rq, int cpu)
3119{ 3092{
3120 struct rq *rq = cpu_rq(cpu);
3121#ifdef CONFIG_NO_HZ 3093#ifdef CONFIG_NO_HZ
3122 /* 3094 /*
3123 * If we were in the nohz mode recently and busy at the current 3095 * If we were in the nohz mode recently and busy at the current
@@ -3169,13 +3141,29 @@ static inline void trigger_load_balance(int cpu)
3169 if (time_after_eq(jiffies, rq->next_balance)) 3141 if (time_after_eq(jiffies, rq->next_balance))
3170 raise_softirq(SCHED_SOFTIRQ); 3142 raise_softirq(SCHED_SOFTIRQ);
3171} 3143}
3172#else 3144
3145#else /* CONFIG_SMP */
3146
3173/* 3147/*
3174 * on UP we do not need to balance between CPUs: 3148 * on UP we do not need to balance between CPUs:
3175 */ 3149 */
3176static inline void idle_balance(int cpu, struct rq *rq) 3150static inline void idle_balance(int cpu, struct rq *rq)
3177{ 3151{
3178} 3152}
3153
3154/* Avoid "used but not defined" warning on UP */
3155static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3156 unsigned long max_nr_move, unsigned long max_load_move,
3157 struct sched_domain *sd, enum cpu_idle_type idle,
3158 int *all_pinned, unsigned long *load_moved,
3159 int this_best_prio, int best_prio, int best_prio_seen,
3160 struct rq_iterator *iterator)
3161{
3162 *load_moved = 0;
3163
3164 return 0;
3165}
3166
3179#endif 3167#endif
3180 3168
3181DEFINE_PER_CPU(struct kernel_stat, kstat); 3169DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3277,81 +3265,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3277 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3265 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3278} 3266}
3279 3267
3280static void task_running_tick(struct rq *rq, struct task_struct *p)
3281{
3282 if (p->array != rq->active) {
3283 /* Task has expired but was not scheduled yet */
3284 set_tsk_need_resched(p);
3285 return;
3286 }
3287 spin_lock(&rq->lock);
3288 /*
3289 * The task was running during this tick - update the
3290 * time slice counter. Note: we do not update a thread's
3291 * priority until it either goes to sleep or uses up its
3292 * timeslice. This makes it possible for interactive tasks
3293 * to use up their timeslices at their highest priority levels.
3294 */
3295 if (rt_task(p)) {
3296 /*
3297 * RR tasks need a special form of timeslice management.
3298 * FIFO tasks have no timeslices.
3299 */
3300 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3301 p->time_slice = task_timeslice(p);
3302 p->first_time_slice = 0;
3303 set_tsk_need_resched(p);
3304
3305 /* put it at the end of the queue: */
3306 requeue_task(p, rq->active);
3307 }
3308 goto out_unlock;
3309 }
3310 if (!--p->time_slice) {
3311 dequeue_task(p, rq->active);
3312 set_tsk_need_resched(p);
3313 p->prio = effective_prio(p);
3314 p->time_slice = task_timeslice(p);
3315 p->first_time_slice = 0;
3316
3317 if (!rq->expired_timestamp)
3318 rq->expired_timestamp = jiffies;
3319 if (!TASK_INTERACTIVE(p)) {
3320 enqueue_task(p, rq->expired);
3321 if (p->static_prio < rq->best_expired_prio)
3322 rq->best_expired_prio = p->static_prio;
3323 } else
3324 enqueue_task(p, rq->active);
3325 } else {
3326 /*
3327 * Prevent a too long timeslice allowing a task to monopolize
3328 * the CPU. We do this by splitting up the timeslice into
3329 * smaller pieces.
3330 *
3331 * Note: this does not mean the task's timeslices expire or
3332 * get lost in any way, they just might be preempted by
3333 * another task of equal priority. (one with higher
3334 * priority would have preempted this task already.) We
3335 * requeue this task to the end of the list on this priority
3336 * level, which is in essence a round-robin of tasks with
3337 * equal priority.
3338 *
3339 * This only applies to tasks in the interactive
3340 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3341 */
3342 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3343 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3344 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3345 (p->array == rq->active)) {
3346
3347 requeue_task(p, rq->active);
3348 set_tsk_need_resched(p);
3349 }
3350 }
3351out_unlock:
3352 spin_unlock(&rq->lock);
3353}
3354
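For readers comparing the two designs: the block removed above is the last remnant of the O(1) scheduler's per-tick timeslice accounting. Boiled down to its essentials (an illustrative sketch with local names only, not kernel code), the RR part amounted to: decrement the task's slice each tick, and when it hits zero, refill it from task_timeslice() and requeue the task at the tail of its priority list.

#include <stdio.h>

struct toy_task {
	int time_slice;		/* ticks left in the current slice */
	int default_slice;	/* refill value, cf. task_timeslice(p) above */
};

/* Returns 1 when the task has used up its slice and should be requeued. */
static int toy_tick(struct toy_task *t)
{
	if (--t->time_slice > 0)
		return 0;
	t->time_slice = t->default_slice;	/* refill */
	return 1;				/* requeue at tail + resched */
}

int main(void)
{
	struct toy_task t = { .time_slice = 3, .default_slice = 3 };

	for (int tick = 1; tick <= 4; tick++) {
		int requeue = toy_tick(&t);
		printf("tick %d: requeue=%d, left=%d\n", tick, requeue, t.time_slice);
	}
	return 0;
}

The replacement code a few hunks below drops all of this in favour of a per-class task_tick() hook.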
3355/* 3268/*
3356 * This function gets called by the timer code, with HZ frequency. 3269 * This function gets called by the timer code, with HZ frequency.
3357 * We call it with interrupts disabled. 3270 * We call it with interrupts disabled.
@@ -3361,17 +3274,19 @@ out_unlock:
3361 */ 3274 */
3362void scheduler_tick(void) 3275void scheduler_tick(void)
3363{ 3276{
3364 struct task_struct *p = current;
3365 int cpu = smp_processor_id(); 3277 int cpu = smp_processor_id();
3366 int idle_at_tick = idle_cpu(cpu);
3367 struct rq *rq = cpu_rq(cpu); 3278 struct rq *rq = cpu_rq(cpu);
3279 struct task_struct *curr = rq->curr;
3280
3281 spin_lock(&rq->lock);
3282 if (curr != rq->idle) /* FIXME: needed? */
3283 curr->sched_class->task_tick(rq, curr);
3284 update_cpu_load(rq);
3285 spin_unlock(&rq->lock);
3368 3286
3369 if (!idle_at_tick)
3370 task_running_tick(rq, p);
3371#ifdef CONFIG_SMP 3287#ifdef CONFIG_SMP
3372 update_load(rq); 3288 rq->idle_at_tick = idle_cpu(cpu);
3373 rq->idle_at_tick = idle_at_tick; 3289 trigger_load_balance(rq, cpu);
3374 trigger_load_balance(cpu);
3375#endif 3290#endif
3376} 3291}
3377 3292
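The rewritten tick path above no longer knows about timeslices or arrays; it takes the runqueue lock, dispatches to curr->sched_class->task_tick(), and updates the CPU load. The hook table itself is defined elsewhere in the patch; a reduced sketch is shown here, with the field names and signatures inferred only from the calls visible in this diff (task_tick, put_prev_task, pick_next_task, yield_task) -- the real struct has more members.

/* Sketch only: a reduced view of the per-class dispatch table. */
typedef unsigned long long u64;
struct rq;
struct task_struct;

struct sched_class_sketch {
	struct sched_class_sketch *next;	/* highest-prio class -> ... -> idle */

	void (*task_tick)(struct rq *rq, struct task_struct *curr);
	void (*put_prev_task)(struct rq *rq, struct task_struct *prev, u64 now);
	struct task_struct *(*pick_next_task)(struct rq *rq, u64 now);
	void (*yield_task)(struct rq *rq, struct task_struct *p);
};

pick_next_task() below walks the ->next chain from the highest-priority class downwards, which is why the idle class at the end of the chain can guarantee a non-NULL pick.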
@@ -3414,140 +3329,128 @@ EXPORT_SYMBOL(sub_preempt_count);
3414#endif 3329#endif
3415 3330
3416/* 3331/*
3417 * schedule() is the main scheduler function. 3332 * Print scheduling while atomic bug:
3418 */ 3333 */
3419asmlinkage void __sched schedule(void) 3334static noinline void __schedule_bug(struct task_struct *prev)
3420{ 3335{
3421 struct task_struct *prev, *next; 3336 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3422 struct prio_array *array; 3337 prev->comm, preempt_count(), prev->pid);
3423 struct list_head *queue; 3338 debug_show_held_locks(prev);
3424 unsigned long long now; 3339 if (irqs_disabled())
3425 unsigned long run_time; 3340 print_irqtrace_events(prev);
3426 int cpu, idx; 3341 dump_stack();
3427 long *switch_count; 3342}
3428 struct rq *rq;
3429 3343
3344/*
3345 * Various schedule()-time debugging checks and statistics:
3346 */
3347static inline void schedule_debug(struct task_struct *prev)
3348{
3430 /* 3349 /*
3431 * Test if we are atomic. Since do_exit() needs to call into 3350 * Test if we are atomic. Since do_exit() needs to call into
3432 * schedule() atomically, we ignore that path for now. 3351 * schedule() atomically, we ignore that path for now.
3433 * Otherwise, whine if we are scheduling when we should not be. 3352 * Otherwise, whine if we are scheduling when we should not be.
3434 */ 3353 */
3435 if (unlikely(in_atomic() && !current->exit_state)) { 3354 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3436 printk(KERN_ERR "BUG: scheduling while atomic: " 3355 __schedule_bug(prev);
3437 "%s/0x%08x/%d\n", 3356
3438 current->comm, preempt_count(), current->pid);
3439 debug_show_held_locks(current);
3440 if (irqs_disabled())
3441 print_irqtrace_events(current);
3442 dump_stack();
3443 }
3444 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3357 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3445 3358
3446need_resched: 3359 schedstat_inc(this_rq(), sched_cnt);
3447 preempt_disable(); 3360}
3448 prev = current; 3361
3449 release_kernel_lock(prev); 3362/*
3450need_resched_nonpreemptible: 3363 * Pick up the highest-prio task:
3451 rq = this_rq(); 3364 */
3365static inline struct task_struct *
3366pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3367{
3368 struct sched_class *class;
3369 struct task_struct *p;
3452 3370
3453 /* 3371 /*
3454 * The idle thread is not allowed to schedule! 3372 * Optimization: we know that if all tasks are in
3455 * Remove this check after it has been exercised a bit. 3373 * the fair class we can call that function directly:
3456 */ 3374 */
3457 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { 3375 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3458 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 3376 p = fair_sched_class.pick_next_task(rq, now);
3459 dump_stack(); 3377 if (likely(p))
3378 return p;
3460 } 3379 }
3461 3380
3462 schedstat_inc(rq, sched_cnt); 3381 class = sched_class_highest;
3463 now = sched_clock(); 3382 for ( ; ; ) {
3464 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3383 p = class->pick_next_task(rq, now);
3465 run_time = now - prev->timestamp; 3384 if (p)
3466 if (unlikely((long long)(now - prev->timestamp) < 0)) 3385 return p;
3467 run_time = 0; 3386 /*
3468 } else 3387 * Will never be NULL as the idle class always
3469 run_time = NS_MAX_SLEEP_AVG; 3388 * returns a non-NULL p:
3389 */
3390 class = class->next;
3391 }
3392}
3470 3393
3471 /* 3394/*
3472 * Tasks charged proportionately less run_time at high sleep_avg to 3395 * schedule() is the main scheduler function.
3473 * delay them losing their interactive status 3396 */
3474 */ 3397asmlinkage void __sched schedule(void)
3475 run_time /= (CURRENT_BONUS(prev) ? : 1); 3398{
3399 struct task_struct *prev, *next;
3400 long *switch_count;
3401 struct rq *rq;
3402 u64 now;
3403 int cpu;
3404
3405need_resched:
3406 preempt_disable();
3407 cpu = smp_processor_id();
3408 rq = cpu_rq(cpu);
3409 rcu_qsctr_inc(cpu);
3410 prev = rq->curr;
3411 switch_count = &prev->nivcsw;
3412
3413 release_kernel_lock(prev);
3414need_resched_nonpreemptible:
3415
3416 schedule_debug(prev);
3476 3417
3477 spin_lock_irq(&rq->lock); 3418 spin_lock_irq(&rq->lock);
3419 clear_tsk_need_resched(prev);
3478 3420
3479 switch_count = &prev->nivcsw;
3480 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3421 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3481 switch_count = &prev->nvcsw;
3482 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3422 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3483 unlikely(signal_pending(prev)))) 3423 unlikely(signal_pending(prev)))) {
3484 prev->state = TASK_RUNNING; 3424 prev->state = TASK_RUNNING;
3485 else { 3425 } else {
3486 if (prev->state == TASK_UNINTERRUPTIBLE) 3426 deactivate_task(rq, prev, 1);
3487 rq->nr_uninterruptible++;
3488 deactivate_task(prev, rq);
3489 } 3427 }
3428 switch_count = &prev->nvcsw;
3490 } 3429 }
3491 3430
3492 cpu = smp_processor_id(); 3431 if (unlikely(!rq->nr_running))
3493 if (unlikely(!rq->nr_running)) {
3494 idle_balance(cpu, rq); 3432 idle_balance(cpu, rq);
3495 if (!rq->nr_running) {
3496 next = rq->idle;
3497 rq->expired_timestamp = 0;
3498 goto switch_tasks;
3499 }
3500 }
3501 3433
3502 array = rq->active; 3434 now = __rq_clock(rq);
3503 if (unlikely(!array->nr_active)) { 3435 prev->sched_class->put_prev_task(rq, prev, now);
3504 /* 3436 next = pick_next_task(rq, prev, now);
3505 * Switch the active and expired arrays.
3506 */
3507 schedstat_inc(rq, sched_switch);
3508 rq->active = rq->expired;
3509 rq->expired = array;
3510 array = rq->active;
3511 rq->expired_timestamp = 0;
3512 rq->best_expired_prio = MAX_PRIO;
3513 }
3514
3515 idx = sched_find_first_bit(array->bitmap);
3516 queue = array->queue + idx;
3517 next = list_entry(queue->next, struct task_struct, run_list);
3518
3519switch_tasks:
3520 if (next == rq->idle)
3521 schedstat_inc(rq, sched_goidle);
3522 prefetch(next);
3523 prefetch_stack(next);
3524 clear_tsk_need_resched(prev);
3525 rcu_qsctr_inc(task_cpu(prev));
3526
3527 prev->timestamp = prev->last_ran = now;
3528 3437
3529 sched_info_switch(prev, next); 3438 sched_info_switch(prev, next);
3439
3530 if (likely(prev != next)) { 3440 if (likely(prev != next)) {
3531 next->timestamp = next->last_ran = now;
3532 rq->nr_switches++; 3441 rq->nr_switches++;
3533 rq->curr = next; 3442 rq->curr = next;
3534 ++*switch_count; 3443 ++*switch_count;
3535 3444
3536 prepare_task_switch(rq, next); 3445 context_switch(rq, prev, next); /* unlocks the rq */
3537 prev = context_switch(rq, prev, next);
3538 barrier();
3539 /*
3540 * this_rq must be evaluated again because prev may have moved
3541 * CPUs since it called schedule(), thus the 'rq' on its stack
3542 * frame will be invalid.
3543 */
3544 finish_task_switch(this_rq(), prev);
3545 } else 3446 } else
3546 spin_unlock_irq(&rq->lock); 3447 spin_unlock_irq(&rq->lock);
3547 3448
3548 prev = current; 3449 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3549 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3450 cpu = smp_processor_id();
3451 rq = cpu_rq(cpu);
3550 goto need_resched_nonpreemptible; 3452 goto need_resched_nonpreemptible;
3453 }
3551 preempt_enable_no_resched(); 3454 preempt_enable_no_resched();
3552 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3455 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3553 goto need_resched; 3456 goto need_resched;
@@ -3959,29 +3862,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
3959 */ 3862 */
3960void rt_mutex_setprio(struct task_struct *p, int prio) 3863void rt_mutex_setprio(struct task_struct *p, int prio)
3961{ 3864{
3962 struct prio_array *array;
3963 unsigned long flags; 3865 unsigned long flags;
3866 int oldprio, on_rq;
3964 struct rq *rq; 3867 struct rq *rq;
3965 int oldprio; 3868 u64 now;
3966 3869
3967 BUG_ON(prio < 0 || prio > MAX_PRIO); 3870 BUG_ON(prio < 0 || prio > MAX_PRIO);
3968 3871
3969 rq = task_rq_lock(p, &flags); 3872 rq = task_rq_lock(p, &flags);
3873 now = rq_clock(rq);
3970 3874
3971 oldprio = p->prio; 3875 oldprio = p->prio;
3972 array = p->array; 3876 on_rq = p->se.on_rq;
3973 if (array) 3877 if (on_rq)
3974 dequeue_task(p, array); 3878 dequeue_task(rq, p, 0, now);
3879
3880 if (rt_prio(prio))
3881 p->sched_class = &rt_sched_class;
3882 else
3883 p->sched_class = &fair_sched_class;
3884
3975 p->prio = prio; 3885 p->prio = prio;
3976 3886
3977 if (array) { 3887 if (on_rq) {
3978 /* 3888 enqueue_task(rq, p, 0, now);
3979 * If changing to an RT priority then queue it
3980 * in the active array!
3981 */
3982 if (rt_task(p))
3983 array = rq->active;
3984 enqueue_task(p, array);
3985 /* 3889 /*
3986 * Reschedule if we are currently running on this runqueue and 3890 * Reschedule if we are currently running on this runqueue and
3987 * our priority decreased, or if we are not currently running on 3891 * our priority decreased, or if we are not currently running on
@@ -3990,8 +3894,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3990 if (task_running(rq, p)) { 3894 if (task_running(rq, p)) {
3991 if (p->prio > oldprio) 3895 if (p->prio > oldprio)
3992 resched_task(rq->curr); 3896 resched_task(rq->curr);
3993 } else if (TASK_PREEMPTS_CURR(p, rq)) 3897 } else {
3994 resched_task(rq->curr); 3898 check_preempt_curr(rq, p);
3899 }
3995 } 3900 }
3996 task_rq_unlock(rq, &flags); 3901 task_rq_unlock(rq, &flags);
3997} 3902}
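rt_mutex_setprio() is the kernel half of priority inheritance: it boosts (and later restores) the priority of a lock owner, now by switching the task's sched_class instead of moving it between priority arrays. From userspace this machinery is typically reached through priority-inheritance futexes, for example a pthread mutex created with the PI protocol. A minimal sketch (build with -pthread; the feature macro is only there for older libc header guards):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t lock;
	int err;

	pthread_mutexattr_init(&attr);
	err = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	if (err) {
		fprintf(stderr, "PI mutexes unsupported: %s\n", strerror(err));
		return 1;
	}
	pthread_mutex_init(&lock, &attr);

	pthread_mutex_lock(&lock);
	/* While held, a blocked higher-priority thread would boost this task
	 * via the futex/rt-mutex path that ends in rt_mutex_setprio(). */
	pthread_mutex_unlock(&lock);

	pthread_mutex_destroy(&lock);
	pthread_mutexattr_destroy(&attr);
	return 0;
}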
@@ -4000,10 +3905,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4000 3905
4001void set_user_nice(struct task_struct *p, long nice) 3906void set_user_nice(struct task_struct *p, long nice)
4002{ 3907{
4003 struct prio_array *array; 3908 int old_prio, delta, on_rq;
4004 int old_prio, delta;
4005 unsigned long flags; 3909 unsigned long flags;
4006 struct rq *rq; 3910 struct rq *rq;
3911 u64 now;
4007 3912
4008 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3913 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4009 return; 3914 return;
@@ -4012,20 +3917,21 @@ void set_user_nice(struct task_struct *p, long nice)
4012 * the task might be in the middle of scheduling on another CPU. 3917 * the task might be in the middle of scheduling on another CPU.
4013 */ 3918 */
4014 rq = task_rq_lock(p, &flags); 3919 rq = task_rq_lock(p, &flags);
3920 now = rq_clock(rq);
4015 /* 3921 /*
4016 * The RT priorities are set via sched_setscheduler(), but we still 3922 * The RT priorities are set via sched_setscheduler(), but we still
4017 * allow the 'normal' nice value to be set - but as expected 3923 * allow the 'normal' nice value to be set - but as expected
4018 * it wont have any effect on scheduling until the task is 3924 * it wont have any effect on scheduling until the task is
4019 * not SCHED_NORMAL/SCHED_BATCH: 3925 * SCHED_FIFO/SCHED_RR:
4020 */ 3926 */
4021 if (task_has_rt_policy(p)) { 3927 if (task_has_rt_policy(p)) {
4022 p->static_prio = NICE_TO_PRIO(nice); 3928 p->static_prio = NICE_TO_PRIO(nice);
4023 goto out_unlock; 3929 goto out_unlock;
4024 } 3930 }
4025 array = p->array; 3931 on_rq = p->se.on_rq;
4026 if (array) { 3932 if (on_rq) {
4027 dequeue_task(p, array); 3933 dequeue_task(rq, p, 0, now);
4028 dec_raw_weighted_load(rq, p); 3934 dec_load(rq, p, now);
4029 } 3935 }
4030 3936
4031 p->static_prio = NICE_TO_PRIO(nice); 3937 p->static_prio = NICE_TO_PRIO(nice);
@@ -4034,9 +3940,9 @@ void set_user_nice(struct task_struct *p, long nice)
4034 p->prio = effective_prio(p); 3940 p->prio = effective_prio(p);
4035 delta = p->prio - old_prio; 3941 delta = p->prio - old_prio;
4036 3942
4037 if (array) { 3943 if (on_rq) {
4038 enqueue_task(p, array); 3944 enqueue_task(rq, p, 0, now);
4039 inc_raw_weighted_load(rq, p); 3945 inc_load(rq, p, now);
4040 /* 3946 /*
4041 * If the task increased its priority or is running and 3947 * If the task increased its priority or is running and
4042 * lowered its priority, then reschedule its CPU: 3948 * lowered its priority, then reschedule its CPU:
@@ -4156,11 +4062,24 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4156} 4062}
4157 4063
4158/* Actually do priority change: must hold rq lock. */ 4064/* Actually do priority change: must hold rq lock. */
4159static void __setscheduler(struct task_struct *p, int policy, int prio) 4065static void
4066__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4160{ 4067{
4161 BUG_ON(p->array); 4068 BUG_ON(p->se.on_rq);
4162 4069
4163 p->policy = policy; 4070 p->policy = policy;
4071 switch (p->policy) {
4072 case SCHED_NORMAL:
4073 case SCHED_BATCH:
4074 case SCHED_IDLE:
4075 p->sched_class = &fair_sched_class;
4076 break;
4077 case SCHED_FIFO:
4078 case SCHED_RR:
4079 p->sched_class = &rt_sched_class;
4080 break;
4081 }
4082
4164 p->rt_priority = prio; 4083 p->rt_priority = prio;
4165 p->normal_prio = normal_prio(p); 4084 p->normal_prio = normal_prio(p);
4166 /* we are holding p->pi_lock already */ 4085 /* we are holding p->pi_lock already */
@@ -4179,8 +4098,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4179int sched_setscheduler(struct task_struct *p, int policy, 4098int sched_setscheduler(struct task_struct *p, int policy,
4180 struct sched_param *param) 4099 struct sched_param *param)
4181{ 4100{
4182 int retval, oldprio, oldpolicy = -1; 4101 int retval, oldprio, oldpolicy = -1, on_rq;
4183 struct prio_array *array;
4184 unsigned long flags; 4102 unsigned long flags;
4185 struct rq *rq; 4103 struct rq *rq;
4186 4104
@@ -4191,12 +4109,13 @@ recheck:
4191 if (policy < 0) 4109 if (policy < 0)
4192 policy = oldpolicy = p->policy; 4110 policy = oldpolicy = p->policy;
4193 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4111 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4194 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4112 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4113 policy != SCHED_IDLE)
4195 return -EINVAL; 4114 return -EINVAL;
4196 /* 4115 /*
4197 * Valid priorities for SCHED_FIFO and SCHED_RR are 4116 * Valid priorities for SCHED_FIFO and SCHED_RR are
4198 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4117 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4199 * SCHED_BATCH is 0. 4118 * SCHED_BATCH and SCHED_IDLE is 0.
4200 */ 4119 */
4201 if (param->sched_priority < 0 || 4120 if (param->sched_priority < 0 ||
4202 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4121 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
@@ -4211,7 +4130,6 @@ recheck:
4211 if (!capable(CAP_SYS_NICE)) { 4130 if (!capable(CAP_SYS_NICE)) {
4212 if (rt_policy(policy)) { 4131 if (rt_policy(policy)) {
4213 unsigned long rlim_rtprio; 4132 unsigned long rlim_rtprio;
4214 unsigned long flags;
4215 4133
4216 if (!lock_task_sighand(p, &flags)) 4134 if (!lock_task_sighand(p, &flags))
4217 return -ESRCH; 4135 return -ESRCH;
@@ -4227,6 +4145,12 @@ recheck:
4227 param->sched_priority > rlim_rtprio) 4145 param->sched_priority > rlim_rtprio)
4228 return -EPERM; 4146 return -EPERM;
4229 } 4147 }
4148 /*
4149 * Like positive nice levels, dont allow tasks to
4150 * move out of SCHED_IDLE either:
4151 */
4152 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4153 return -EPERM;
4230 4154
4231 /* can't change other user's priorities */ 4155 /* can't change other user's priorities */
4232 if ((current->euid != p->euid) && 4156 if ((current->euid != p->euid) &&
@@ -4254,13 +4178,13 @@ recheck:
4254 spin_unlock_irqrestore(&p->pi_lock, flags); 4178 spin_unlock_irqrestore(&p->pi_lock, flags);
4255 goto recheck; 4179 goto recheck;
4256 } 4180 }
4257 array = p->array; 4181 on_rq = p->se.on_rq;
4258 if (array) 4182 if (on_rq)
4259 deactivate_task(p, rq); 4183 deactivate_task(rq, p, 0);
4260 oldprio = p->prio; 4184 oldprio = p->prio;
4261 __setscheduler(p, policy, param->sched_priority); 4185 __setscheduler(rq, p, policy, param->sched_priority);
4262 if (array) { 4186 if (on_rq) {
4263 __activate_task(p, rq); 4187 activate_task(rq, p, 0);
4264 /* 4188 /*
4265 * Reschedule if we are currently running on this runqueue and 4189 * Reschedule if we are currently running on this runqueue and
4266 * our priority decreased, or if we are not currently running on 4190 * our priority decreased, or if we are not currently running on
@@ -4269,8 +4193,9 @@ recheck:
4269 if (task_running(rq, p)) { 4193 if (task_running(rq, p)) {
4270 if (p->prio > oldprio) 4194 if (p->prio > oldprio)
4271 resched_task(rq->curr); 4195 resched_task(rq->curr);
4272 } else if (TASK_PREEMPTS_CURR(p, rq)) 4196 } else {
4273 resched_task(rq->curr); 4197 check_preempt_curr(rq, p);
4198 }
4274 } 4199 }
4275 __task_rq_unlock(rq); 4200 __task_rq_unlock(rq);
4276 spin_unlock_irqrestore(&p->pi_lock, flags); 4201 spin_unlock_irqrestore(&p->pi_lock, flags);
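The policy switch above wires up the new SCHED_IDLE class and, as the earlier hunk shows, forbids unprivileged tasks from escaping it again. From userspace the policy is requested through the ordinary sched_setscheduler() call; a minimal sketch follows. SCHED_IDLE may not be in older libc headers, so its value (5 in the kernel header of this era) is defined locally here as an assumption.

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_IDLE
#define SCHED_IDLE 5	/* assumption: matches the kernel's sched.h of this era */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };	/* must be 0 for SCHED_IDLE */

	if (sched_setscheduler(0, SCHED_IDLE, &sp) == -1) {
		perror("sched_setscheduler(SCHED_IDLE)");
		return 1;
	}
	printf("now running under policy %d\n", sched_getscheduler(0));
	return 0;
}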
@@ -4542,41 +4467,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4542/** 4467/**
4543 * sys_sched_yield - yield the current processor to other threads. 4468 * sys_sched_yield - yield the current processor to other threads.
4544 * 4469 *
4545 * This function yields the current CPU by moving the calling thread 4470 * This function yields the current CPU to other tasks. If there are no
4546 * to the expired array. If there are no other threads running on this 4471 * other threads running on this CPU then this function will return.
4547 * CPU then this function will return.
4548 */ 4472 */
4549asmlinkage long sys_sched_yield(void) 4473asmlinkage long sys_sched_yield(void)
4550{ 4474{
4551 struct rq *rq = this_rq_lock(); 4475 struct rq *rq = this_rq_lock();
4552 struct prio_array *array = current->array, *target = rq->expired;
4553 4476
4554 schedstat_inc(rq, yld_cnt); 4477 schedstat_inc(rq, yld_cnt);
4555 /* 4478 if (unlikely(rq->nr_running == 1))
4556 * We implement yielding by moving the task into the expired
4557 * queue.
4558 *
4559 * (special rule: RT tasks will just roundrobin in the active
4560 * array.)
4561 */
4562 if (rt_task(current))
4563 target = rq->active;
4564
4565 if (array->nr_active == 1) {
4566 schedstat_inc(rq, yld_act_empty); 4479 schedstat_inc(rq, yld_act_empty);
4567 if (!rq->expired->nr_active) 4480 else
4568 schedstat_inc(rq, yld_both_empty); 4481 current->sched_class->yield_task(rq, current);
4569 } else if (!rq->expired->nr_active)
4570 schedstat_inc(rq, yld_exp_empty);
4571
4572 if (array != target) {
4573 dequeue_task(current, array);
4574 enqueue_task(current, target);
4575 } else
4576 /*
4577 * requeue_task is cheaper so perform that if possible.
4578 */
4579 requeue_task(current, array);
4580 4482
4581 /* 4483 /*
4582 * Since we are going to call schedule() anyway, there's 4484 * Since we are going to call schedule() anyway, there's
@@ -4727,6 +4629,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4727 break; 4629 break;
4728 case SCHED_NORMAL: 4630 case SCHED_NORMAL:
4729 case SCHED_BATCH: 4631 case SCHED_BATCH:
4632 case SCHED_IDLE:
4730 ret = 0; 4633 ret = 0;
4731 break; 4634 break;
4732 } 4635 }
@@ -4751,6 +4654,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4751 break; 4654 break;
4752 case SCHED_NORMAL: 4655 case SCHED_NORMAL:
4753 case SCHED_BATCH: 4656 case SCHED_BATCH:
4657 case SCHED_IDLE:
4754 ret = 0; 4658 ret = 0;
4755 } 4659 }
4756 return ret; 4660 return ret;
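Both priority-range syscalls now report 0 for SCHED_IDLE, just as they already did for SCHED_NORMAL (SCHED_OTHER in userspace) and SCHED_BATCH. A quick way to inspect the ranges from userspace:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO : %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO), sched_get_priority_max(SCHED_FIFO));
	printf("SCHED_OTHER: %d..%d\n",
	       sched_get_priority_min(SCHED_OTHER), sched_get_priority_max(SCHED_OTHER));
	return 0;
}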
@@ -4785,7 +4689,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4785 goto out_unlock; 4689 goto out_unlock;
4786 4690
4787 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4691 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4788 0 : task_timeslice(p), &t); 4692 0 : static_prio_timeslice(p->static_prio), &t);
4789 read_unlock(&tasklist_lock); 4693 read_unlock(&tasklist_lock);
4790 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4694 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4791out_nounlock: 4695out_nounlock:
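The reported round-robin interval is now derived from static_prio_timeslice() alone, since the per-task ->time_slice bookkeeping is gone. It is still read the usual way:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("RR interval: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}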
@@ -4860,6 +4764,9 @@ void show_state_filter(unsigned long state_filter)
4860 4764
4861 touch_all_softlockup_watchdogs(); 4765 touch_all_softlockup_watchdogs();
4862 4766
4767#ifdef CONFIG_SCHED_DEBUG
4768 sysrq_sched_debug_show();
4769#endif
4863 read_unlock(&tasklist_lock); 4770 read_unlock(&tasklist_lock);
4864 /* 4771 /*
4865 * Only show locks if all tasks are dumped: 4772 * Only show locks if all tasks are dumped:
@@ -4870,7 +4777,7 @@ void show_state_filter(unsigned long state_filter)
4870 4777
4871void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4778void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4872{ 4779{
4873 /* nothing yet */ 4780 idle->sched_class = &idle_sched_class;
4874} 4781}
4875 4782
4876/** 4783/**
@@ -4886,12 +4793,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4886 struct rq *rq = cpu_rq(cpu); 4793 struct rq *rq = cpu_rq(cpu);
4887 unsigned long flags; 4794 unsigned long flags;
4888 4795
4889 idle->timestamp = sched_clock(); 4796 __sched_fork(idle);
4890 idle->array = NULL; 4797 idle->se.exec_start = sched_clock();
4798
4891 idle->prio = idle->normal_prio = MAX_PRIO; 4799 idle->prio = idle->normal_prio = MAX_PRIO;
4892 idle->state = TASK_RUNNING;
4893 idle->cpus_allowed = cpumask_of_cpu(cpu); 4800 idle->cpus_allowed = cpumask_of_cpu(cpu);
4894 set_task_cpu(idle, cpu); 4801 __set_task_cpu(idle, cpu);
4895 4802
4896 spin_lock_irqsave(&rq->lock, flags); 4803 spin_lock_irqsave(&rq->lock, flags);
4897 rq->curr = rq->idle = idle; 4804 rq->curr = rq->idle = idle;
@@ -4906,6 +4813,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4906#else 4813#else
4907 task_thread_info(idle)->preempt_count = 0; 4814 task_thread_info(idle)->preempt_count = 0;
4908#endif 4815#endif
4816 /*
4817 * The idle tasks have their own, simple scheduling class:
4818 */
4819 idle->sched_class = &idle_sched_class;
4909} 4820}
4910 4821
4911/* 4822/*
@@ -4917,6 +4828,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4917 */ 4828 */
4918cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4829cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4919 4830
4831/*
4832 * Increase the granularity value when there are more CPUs,
4833 * because with more CPUs the 'effective latency' as visible
4834 * to users decreases. But the relationship is not linear,
4835 * so pick a second-best guess by going with the log2 of the
4836 * number of CPUs.
4837 *
4838 * This idea comes from the SD scheduler of Con Kolivas:
4839 */
4840static inline void sched_init_granularity(void)
4841{
4842 unsigned int factor = 1 + ilog2(num_online_cpus());
4843 const unsigned long gran_limit = 10000000;
4844
4845 sysctl_sched_granularity *= factor;
4846 if (sysctl_sched_granularity > gran_limit)
4847 sysctl_sched_granularity = gran_limit;
4848
4849 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4850 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4851}
4852
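As a concrete example of the scaling above: with 8 online CPUs the factor is 1 + ilog2(8) = 4, so a base granularity is multiplied by 4 and then clamped to the 10 ms limit; the runtime limit and wakeup granularity follow as 4x and 1/2 of the result. The base value of sysctl_sched_granularity is set elsewhere in the patch, so the 2 ms used in this standalone sketch is only an assumed illustration, and ilog2_u() is a local stand-in for the kernel's ilog2().

#include <stdio.h>

/* Local stand-in for ilog2() on a positive argument. */
static unsigned int ilog2_u(unsigned long v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long gran = 2000000UL;		/* assumed base, nanoseconds */
	const unsigned long gran_limit = 10000000UL;

	for (unsigned int cpus = 1; cpus <= 64; cpus *= 2) {
		unsigned long g = gran * (1 + ilog2_u(cpus));

		if (g > gran_limit)
			g = gran_limit;
		printf("%2u CPUs: granularity %lu ns, runtime limit %lu ns, "
		       "wakeup granularity %lu ns\n", cpus, g, 4 * g, g / 2);
	}
	return 0;
}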
4920#ifdef CONFIG_SMP 4853#ifdef CONFIG_SMP
4921/* 4854/*
4922 * This is how migration works: 4855 * This is how migration works:
@@ -4990,7 +4923,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4990static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4923static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4991{ 4924{
4992 struct rq *rq_dest, *rq_src; 4925 struct rq *rq_dest, *rq_src;
4993 int ret = 0; 4926 int ret = 0, on_rq;
4994 4927
4995 if (unlikely(cpu_is_offline(dest_cpu))) 4928 if (unlikely(cpu_is_offline(dest_cpu)))
4996 return ret; 4929 return ret;
@@ -5006,20 +4939,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5006 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4939 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5007 goto out; 4940 goto out;
5008 4941
4942 on_rq = p->se.on_rq;
4943 if (on_rq)
4944 deactivate_task(rq_src, p, 0);
5009 set_task_cpu(p, dest_cpu); 4945 set_task_cpu(p, dest_cpu);
5010 if (p->array) { 4946 if (on_rq) {
5011 /* 4947 activate_task(rq_dest, p, 0);
5012 * Sync timestamp with rq_dest's before activating. 4948 check_preempt_curr(rq_dest, p);
5013 * The same thing could be achieved by doing this step
5014 * afterwards, and pretending it was a local activate.
5015 * This way is cleaner and logically correct.
5016 */
5017 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5018 + rq_dest->most_recent_timestamp;
5019 deactivate_task(p, rq_src);
5020 __activate_task(p, rq_dest);
5021 if (TASK_PREEMPTS_CURR(p, rq_dest))
5022 resched_task(rq_dest->curr);
5023 } 4949 }
5024 ret = 1; 4950 ret = 1;
5025out: 4951out:
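__migrate_task() now moves a queued task with the generic deactivate/activate pair and lets check_preempt_curr() decide about preemption on the destination CPU, instead of hand-adjusting timestamps and calling resched_task() directly. The usual userspace trigger for this path is an affinity change (Linux-specific API, needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* pin the calling process to CPU 0 */

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("now restricted to CPU 0\n");
	return 0;
}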
@@ -5171,7 +5097,8 @@ static void migrate_live_tasks(int src_cpu)
5171 write_unlock_irq(&tasklist_lock); 5097 write_unlock_irq(&tasklist_lock);
5172} 5098}
5173 5099
5174/* Schedules idle task to be the next runnable task on current CPU. 5100/*
5101 * Schedules idle task to be the next runnable task on current CPU.
5175 * It does so by boosting its priority to highest possible and adding it to 5102 * It does so by boosting its priority to highest possible and adding it to
5176 * the _front_ of the runqueue. Used by CPU offline code. 5103 * the _front_ of the runqueue. Used by CPU offline code.
5177 */ 5104 */
@@ -5191,10 +5118,10 @@ void sched_idle_next(void)
5191 */ 5118 */
5192 spin_lock_irqsave(&rq->lock, flags); 5119 spin_lock_irqsave(&rq->lock, flags);
5193 5120
5194 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5121 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5195 5122
5196 /* Add idle task to the _front_ of its priority queue: */ 5123 /* Add idle task to the _front_ of its priority queue: */
5197 __activate_idle_task(p, rq); 5124 activate_idle_task(p, rq);
5198 5125
5199 spin_unlock_irqrestore(&rq->lock, flags); 5126 spin_unlock_irqrestore(&rq->lock, flags);
5200} 5127}
@@ -5244,16 +5171,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5244static void migrate_dead_tasks(unsigned int dead_cpu) 5171static void migrate_dead_tasks(unsigned int dead_cpu)
5245{ 5172{
5246 struct rq *rq = cpu_rq(dead_cpu); 5173 struct rq *rq = cpu_rq(dead_cpu);
5247 unsigned int arr, i; 5174 struct task_struct *next;
5248
5249 for (arr = 0; arr < 2; arr++) {
5250 for (i = 0; i < MAX_PRIO; i++) {
5251 struct list_head *list = &rq->arrays[arr].queue[i];
5252 5175
5253 while (!list_empty(list)) 5176 for ( ; ; ) {
5254 migrate_dead(dead_cpu, list_entry(list->next, 5177 if (!rq->nr_running)
5255 struct task_struct, run_list)); 5178 break;
5256 } 5179 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5180 if (!next)
5181 break;
5182 migrate_dead(dead_cpu, next);
5257 } 5183 }
5258} 5184}
5259#endif /* CONFIG_HOTPLUG_CPU */ 5185#endif /* CONFIG_HOTPLUG_CPU */
@@ -5277,14 +5203,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5277 5203
5278 case CPU_UP_PREPARE: 5204 case CPU_UP_PREPARE:
5279 case CPU_UP_PREPARE_FROZEN: 5205 case CPU_UP_PREPARE_FROZEN:
5280 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5206 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5281 if (IS_ERR(p)) 5207 if (IS_ERR(p))
5282 return NOTIFY_BAD; 5208 return NOTIFY_BAD;
5283 p->flags |= PF_NOFREEZE; 5209 p->flags |= PF_NOFREEZE;
5284 kthread_bind(p, cpu); 5210 kthread_bind(p, cpu);
5285 /* Must be high prio: stop_machine expects to yield to it. */ 5211 /* Must be high prio: stop_machine expects to yield to it. */
5286 rq = task_rq_lock(p, &flags); 5212 rq = task_rq_lock(p, &flags);
5287 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5213 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5288 task_rq_unlock(rq, &flags); 5214 task_rq_unlock(rq, &flags);
5289 cpu_rq(cpu)->migration_thread = p; 5215 cpu_rq(cpu)->migration_thread = p;
5290 break; 5216 break;
@@ -5315,9 +5241,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5315 rq->migration_thread = NULL; 5241 rq->migration_thread = NULL;
5316 /* Idle task back to normal (off runqueue, low prio) */ 5242 /* Idle task back to normal (off runqueue, low prio) */
5317 rq = task_rq_lock(rq->idle, &flags); 5243 rq = task_rq_lock(rq->idle, &flags);
5318 deactivate_task(rq->idle, rq); 5244 deactivate_task(rq, rq->idle, 0);
5319 rq->idle->static_prio = MAX_PRIO; 5245 rq->idle->static_prio = MAX_PRIO;
5320 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5246 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5247 rq->idle->sched_class = &idle_sched_class;
5321 migrate_dead_tasks(cpu); 5248 migrate_dead_tasks(cpu);
5322 task_rq_unlock(rq, &flags); 5249 task_rq_unlock(rq, &flags);
5323 migrate_nr_uninterruptible(rq); 5250 migrate_nr_uninterruptible(rq);
@@ -5926,7 +5853,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5926static int build_sched_domains(const cpumask_t *cpu_map) 5853static int build_sched_domains(const cpumask_t *cpu_map)
5927{ 5854{
5928 int i; 5855 int i;
5929 struct sched_domain *sd;
5930#ifdef CONFIG_NUMA 5856#ifdef CONFIG_NUMA
5931 struct sched_group **sched_group_nodes = NULL; 5857 struct sched_group **sched_group_nodes = NULL;
5932 int sd_allnodes = 0; 5858 int sd_allnodes = 0;
@@ -5934,7 +5860,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
5934 /* 5860 /*
5935 * Allocate the per-node list of sched groups 5861 * Allocate the per-node list of sched groups
5936 */ 5862 */
5937 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5863 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5938 GFP_KERNEL); 5864 GFP_KERNEL);
5939 if (!sched_group_nodes) { 5865 if (!sched_group_nodes) {
5940 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5866 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -5953,8 +5879,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
5953 cpus_and(nodemask, nodemask, *cpu_map); 5879 cpus_and(nodemask, nodemask, *cpu_map);
5954 5880
5955#ifdef CONFIG_NUMA 5881#ifdef CONFIG_NUMA
5956 if (cpus_weight(*cpu_map) 5882 if (cpus_weight(*cpu_map) >
5957 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5883 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5958 sd = &per_cpu(allnodes_domains, i); 5884 sd = &per_cpu(allnodes_domains, i);
5959 *sd = SD_ALLNODES_INIT; 5885 *sd = SD_ALLNODES_INIT;
5960 sd->span = *cpu_map; 5886 sd->span = *cpu_map;
@@ -6013,7 +5939,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6013 if (i != first_cpu(this_sibling_map)) 5939 if (i != first_cpu(this_sibling_map))
6014 continue; 5940 continue;
6015 5941
6016 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5942 init_sched_build_groups(this_sibling_map, cpu_map,
5943 &cpu_to_cpu_group);
6017 } 5944 }
6018#endif 5945#endif
6019 5946
@@ -6024,11 +5951,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6024 cpus_and(this_core_map, this_core_map, *cpu_map); 5951 cpus_and(this_core_map, this_core_map, *cpu_map);
6025 if (i != first_cpu(this_core_map)) 5952 if (i != first_cpu(this_core_map))
6026 continue; 5953 continue;
6027 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5954 init_sched_build_groups(this_core_map, cpu_map,
5955 &cpu_to_core_group);
6028 } 5956 }
6029#endif 5957#endif
6030 5958
6031
6032 /* Set up physical groups */ 5959 /* Set up physical groups */
6033 for (i = 0; i < MAX_NUMNODES; i++) { 5960 for (i = 0; i < MAX_NUMNODES; i++) {
6034 cpumask_t nodemask = node_to_cpumask(i); 5961 cpumask_t nodemask = node_to_cpumask(i);
@@ -6043,7 +5970,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6043#ifdef CONFIG_NUMA 5970#ifdef CONFIG_NUMA
6044 /* Set up node groups */ 5971 /* Set up node groups */
6045 if (sd_allnodes) 5972 if (sd_allnodes)
6046 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5973 init_sched_build_groups(*cpu_map, cpu_map,
5974 &cpu_to_allnodes_group);
6047 5975
6048 for (i = 0; i < MAX_NUMNODES; i++) { 5976 for (i = 0; i < MAX_NUMNODES; i++) {
6049 /* Set up node groups */ 5977 /* Set up node groups */
@@ -6115,19 +6043,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6115 /* Calculate CPU power for physical packages and nodes */ 6043 /* Calculate CPU power for physical packages and nodes */
6116#ifdef CONFIG_SCHED_SMT 6044#ifdef CONFIG_SCHED_SMT
6117 for_each_cpu_mask(i, *cpu_map) { 6045 for_each_cpu_mask(i, *cpu_map) {
6118 sd = &per_cpu(cpu_domains, i); 6046 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6047
6119 init_sched_groups_power(i, sd); 6048 init_sched_groups_power(i, sd);
6120 } 6049 }
6121#endif 6050#endif
6122#ifdef CONFIG_SCHED_MC 6051#ifdef CONFIG_SCHED_MC
6123 for_each_cpu_mask(i, *cpu_map) { 6052 for_each_cpu_mask(i, *cpu_map) {
6124 sd = &per_cpu(core_domains, i); 6053 struct sched_domain *sd = &per_cpu(core_domains, i);
6054
6125 init_sched_groups_power(i, sd); 6055 init_sched_groups_power(i, sd);
6126 } 6056 }
6127#endif 6057#endif
6128 6058
6129 for_each_cpu_mask(i, *cpu_map) { 6059 for_each_cpu_mask(i, *cpu_map) {
6130 sd = &per_cpu(phys_domains, i); 6060 struct sched_domain *sd = &per_cpu(phys_domains, i);
6061
6131 init_sched_groups_power(i, sd); 6062 init_sched_groups_power(i, sd);
6132 } 6063 }
6133 6064
@@ -6361,10 +6292,12 @@ void __init sched_init_smp(void)
6361 /* Move init over to a non-isolated CPU */ 6292 /* Move init over to a non-isolated CPU */
6362 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6293 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6363 BUG(); 6294 BUG();
6295 sched_init_granularity();
6364} 6296}
6365#else 6297#else
6366void __init sched_init_smp(void) 6298void __init sched_init_smp(void)
6367{ 6299{
6300 sched_init_granularity();
6368} 6301}
6369#endif /* CONFIG_SMP */ 6302#endif /* CONFIG_SMP */
6370 6303
@@ -6378,28 +6311,51 @@ int in_sched_functions(unsigned long addr)
6378 && addr < (unsigned long)__sched_text_end); 6311 && addr < (unsigned long)__sched_text_end);
6379} 6312}
6380 6313
6314static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6315{
6316 cfs_rq->tasks_timeline = RB_ROOT;
6317 cfs_rq->fair_clock = 1;
6318#ifdef CONFIG_FAIR_GROUP_SCHED
6319 cfs_rq->rq = rq;
6320#endif
6321}
6322
6381void __init sched_init(void) 6323void __init sched_init(void)
6382{ 6324{
6383 int i, j, k; 6325 u64 now = sched_clock();
6384 int highest_cpu = 0; 6326 int highest_cpu = 0;
6327 int i, j;
6328
6329 /*
6330 * Link up the scheduling class hierarchy:
6331 */
6332 rt_sched_class.next = &fair_sched_class;
6333 fair_sched_class.next = &idle_sched_class;
6334 idle_sched_class.next = NULL;
6385 6335
6386 for_each_possible_cpu(i) { 6336 for_each_possible_cpu(i) {
6387 struct prio_array *array; 6337 struct rt_prio_array *array;
6388 struct rq *rq; 6338 struct rq *rq;
6389 6339
6390 rq = cpu_rq(i); 6340 rq = cpu_rq(i);
6391 spin_lock_init(&rq->lock); 6341 spin_lock_init(&rq->lock);
6392 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6342 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6393 rq->nr_running = 0; 6343 rq->nr_running = 0;
6394 rq->active = rq->arrays; 6344 rq->clock = 1;
6395 rq->expired = rq->arrays + 1; 6345 init_cfs_rq(&rq->cfs, rq);
6396 rq->best_expired_prio = MAX_PRIO; 6346#ifdef CONFIG_FAIR_GROUP_SCHED
6347 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6348 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6349#endif
6350 rq->ls.load_update_last = now;
6351 rq->ls.load_update_start = now;
6397 6352
6353 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6354 rq->cpu_load[j] = 0;
6398#ifdef CONFIG_SMP 6355#ifdef CONFIG_SMP
6399 rq->sd = NULL; 6356 rq->sd = NULL;
6400 for (j = 1; j < 3; j++)
6401 rq->cpu_load[j] = 0;
6402 rq->active_balance = 0; 6357 rq->active_balance = 0;
6358 rq->next_balance = jiffies;
6403 rq->push_cpu = 0; 6359 rq->push_cpu = 0;
6404 rq->cpu = i; 6360 rq->cpu = i;
6405 rq->migration_thread = NULL; 6361 rq->migration_thread = NULL;
@@ -6407,16 +6363,14 @@ void __init sched_init(void)
6407#endif 6363#endif
6408 atomic_set(&rq->nr_iowait, 0); 6364 atomic_set(&rq->nr_iowait, 0);
6409 6365
6410 for (j = 0; j < 2; j++) { 6366 array = &rq->rt.active;
6411 array = rq->arrays + j; 6367 for (j = 0; j < MAX_RT_PRIO; j++) {
6412 for (k = 0; k < MAX_PRIO; k++) { 6368 INIT_LIST_HEAD(array->queue + j);
6413 INIT_LIST_HEAD(array->queue + k); 6369 __clear_bit(j, array->bitmap);
6414 __clear_bit(k, array->bitmap);
6415 }
6416 // delimiter for bitsearch
6417 __set_bit(MAX_PRIO, array->bitmap);
6418 } 6370 }
6419 highest_cpu = i; 6371 highest_cpu = i;
6372 /* delimiter for bitsearch: */
6373 __set_bit(MAX_RT_PRIO, array->bitmap);
6420 } 6374 }
6421 6375
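The RT runqueue keeps the O(1)-style bitmap, and the loop above still sets one extra bit at MAX_RT_PRIO as the "delimiter for bitsearch". The point of the sentinel is that a first-set-bit scan never needs a special empty-queue case: an empty queue simply yields the sentinel index. A self-contained illustration (local names only, not kernel code):

#include <stdio.h>
#include <string.h>

#define NPRIO 100	/* stand-in for MAX_RT_PRIO */

static unsigned long long bitmap[(NPRIO + 1 + 63) / 64];

static int first_set_bit(void)
{
	for (int i = 0; i <= NPRIO; i++)
		if (bitmap[i / 64] & (1ULL << (i % 64)))
			return i;
	return -1;	/* never reached once the sentinel is set */
}

int main(void)
{
	memset(bitmap, 0, sizeof(bitmap));
	bitmap[NPRIO / 64] |= 1ULL << (NPRIO % 64);	/* delimiter bit */
	printf("empty queue   -> %d (== NPRIO, i.e. nothing runnable)\n", first_set_bit());

	bitmap[42 / 64] |= 1ULL << (42 % 64);		/* enqueue at prio 42 */
	printf("after enqueue -> %d\n", first_set_bit());
	return 0;
}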
6422 set_load_weight(&init_task); 6376 set_load_weight(&init_task);
@@ -6443,6 +6397,10 @@ void __init sched_init(void)
6443 * when this runqueue becomes "idle". 6397 * when this runqueue becomes "idle".
6444 */ 6398 */
6445 init_idle(current, smp_processor_id()); 6399 init_idle(current, smp_processor_id());
6400 /*
6401 * During early bootup we pretend to be a normal task:
6402 */
6403 current->sched_class = &fair_sched_class;
6446} 6404}
6447 6405
6448#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6406#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6473,29 +6431,55 @@ EXPORT_SYMBOL(__might_sleep);
6473#ifdef CONFIG_MAGIC_SYSRQ 6431#ifdef CONFIG_MAGIC_SYSRQ
6474void normalize_rt_tasks(void) 6432void normalize_rt_tasks(void)
6475{ 6433{
6476 struct prio_array *array;
6477 struct task_struct *g, *p; 6434 struct task_struct *g, *p;
6478 unsigned long flags; 6435 unsigned long flags;
6479 struct rq *rq; 6436 struct rq *rq;
6437 int on_rq;
6480 6438
6481 read_lock_irq(&tasklist_lock); 6439 read_lock_irq(&tasklist_lock);
6482
6483 do_each_thread(g, p) { 6440 do_each_thread(g, p) {
6484 if (!rt_task(p)) 6441 p->se.fair_key = 0;
6442 p->se.wait_runtime = 0;
6443 p->se.wait_start_fair = 0;
6444 p->se.wait_start = 0;
6445 p->se.exec_start = 0;
6446 p->se.sleep_start = 0;
6447 p->se.sleep_start_fair = 0;
6448 p->se.block_start = 0;
6449 task_rq(p)->cfs.fair_clock = 0;
6450 task_rq(p)->clock = 0;
6451
6452 if (!rt_task(p)) {
6453 /*
6454 * Renice negative nice level userspace
6455 * tasks back to 0:
6456 */
6457 if (TASK_NICE(p) < 0 && p->mm)
6458 set_user_nice(p, 0);
6485 continue; 6459 continue;
6460 }
6486 6461
6487 spin_lock_irqsave(&p->pi_lock, flags); 6462 spin_lock_irqsave(&p->pi_lock, flags);
6488 rq = __task_rq_lock(p); 6463 rq = __task_rq_lock(p);
6464#ifdef CONFIG_SMP
6465 /*
6466 * Do not touch the migration thread:
6467 */
6468 if (p == rq->migration_thread)
6469 goto out_unlock;
6470#endif
6489 6471
6490 array = p->array; 6472 on_rq = p->se.on_rq;
6491 if (array) 6473 if (on_rq)
6492 deactivate_task(p, task_rq(p)); 6474 deactivate_task(task_rq(p), p, 0);
6493 __setscheduler(p, SCHED_NORMAL, 0); 6475 __setscheduler(rq, p, SCHED_NORMAL, 0);
6494 if (array) { 6476 if (on_rq) {
6495 __activate_task(p, task_rq(p)); 6477 activate_task(task_rq(p), p, 0);
6496 resched_task(rq->curr); 6478 resched_task(rq->curr);
6497 } 6479 }
6498 6480#ifdef CONFIG_SMP
6481 out_unlock:
6482#endif
6499 __task_rq_unlock(rq); 6483 __task_rq_unlock(rq);
6500 spin_unlock_irqrestore(&p->pi_lock, flags); 6484 spin_unlock_irqrestore(&p->pi_lock, flags);
6501 } while_each_thread(g, p); 6485 } while_each_thread(g, p);