 include/linux/sched.h |   8
 kernel/sched.c        | 284
 2 files changed, 225 insertions(+), 67 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 122a25c1b997..74a1e39e0d3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -123,6 +123,7 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern unsigned long weighted_cpuload(const int cpu);
 
 
 /*
@@ -558,9 +559,9 @@ enum idle_type
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
-#ifdef CONFIG_SMP
 #define SCHED_LOAD_SCALE	128UL	/* increase resolution of load */
 
+#ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		4	/* Balance on exec */
@@ -713,9 +714,12 @@ struct task_struct {
 
 	int lock_depth;		/* BKL lock depth */
 
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#ifdef CONFIG_SMP
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	int oncpu;
 #endif
+#endif
+	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio;
 	struct list_head run_list;
 	prio_array_t *array;
diff --git a/kernel/sched.c b/kernel/sched.c
index 678335a8b390..1847a4456a2d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
  */
 
 #define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 
-static unsigned int task_timeslice(task_t *p)
+static unsigned int static_prio_timeslice(int static_prio)
 {
-	if (p->static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+	if (static_prio < NICE_TO_PRIO(0))
+		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 	else
-		return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
+
+static inline unsigned int task_timeslice(task_t *p)
+{
+	return static_prio_timeslice(p->static_prio);
+}
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
@@ -207,6 +213,7 @@ struct runqueue {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned long nr_running;
+	unsigned long raw_weighted_load;
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[3];
 #endif
@@ -662,6 +669,68 @@ static int effective_prio(task_t *p)
 }
 
 /*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+	LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+
+static void set_load_weight(task_t *p)
+{
+	if (rt_task(p)) {
+#ifdef CONFIG_SMP
+		if (p == task_rq(p)->migration_thread)
+			/*
+			 * The migration thread does the actual balancing.
+			 * Giving its load any weight will skew balancing
+			 * adversely.
+			 */
+			p->load_weight = 0;
+		else
+#endif
+			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+	} else
+		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load -= p->load_weight;
+}
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_raw_weighted_load(rq, p);
+}
+
+/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
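The hunk above maps each task's static priority to a load_weight through its nice-0-relative time slice, so a nice 0 task contributes exactly SCHED_LOAD_SCALE to its run queue. The standalone sketch below only mirrors that arithmetic for illustration, assuming HZ=1000 (so DEF_TIMESLICE=100ms and MIN_TIMESLICE=5ms), MAX_RT_PRIO=100, MAX_PRIO=140 and SCHED_LOAD_SCALE=128; it is not kernel code.

/* toy_smpnice_weights.c - standalone sketch of the weighting arithmetic */
#include <stdio.h>

#define HZ			1000
#define MIN_TIMESLICE		(5 * HZ / 1000 ? 5 * HZ / 1000 : 1)
#define DEF_TIMESLICE		(100 * HZ / 1000)
#define MAX_RT_PRIO		100
#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO		40
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define SCHED_LOAD_SCALE	128UL

/* max() spelled out so the sketch has no kernel dependencies */
#define SCALE_PRIO(x, prio) \
	((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) > MIN_TIMESLICE ? \
	 (x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) : MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

#define TIME_SLICE_NICE_ZERO	DEF_TIMESLICE
#define LOAD_WEIGHT(lp)		(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(p)	LOAD_WEIGHT(static_prio_timeslice(p))

int main(void)
{
	int nice;

	/* A nice 0 task weighs exactly SCHED_LOAD_SCALE (128); negative
	 * nice weighs more, positive nice less, in proportion to the
	 * time slice the task would be handed. */
	for (nice = -20; nice <= 19; nice += 13)
		printf("nice %3d -> timeslice %3ums, load_weight %4lu\n",
		       nice, static_prio_timeslice(NICE_TO_PRIO(nice)),
		       PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(nice)));
	return 0;
}

With those constants it prints load weights of 1024, 691, 89 and 6 for nice -20, -7, 6 and 19 respectively.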
@@ -671,7 +740,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 	if (batch_task(p))
 		target = rq->expired;
 	enqueue_task(p, target);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -680,7 +749,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -804,7 +873,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -859,6 +928,12 @@ inline int task_curr(const task_t *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->raw_weighted_load;
+}
+
 #ifdef CONFIG_SMP
 typedef struct {
 	struct list_head list;
@@ -948,7 +1023,8 @@ void kick_process(task_t *p)
 }
 
 /*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
@@ -956,24 +1032,36 @@ void kick_process(task_t *p)
 static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 	if (type == 0)
-		return load_now;
+		return rq->raw_weighted_load;
 
-	return min(rq->cpu_load[type-1], load_now);
+	return min(rq->cpu_load[type-1], rq->raw_weighted_load);
 }
 
 /*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
  */
 static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 	if (type == 0)
-		return load_now;
+		return rq->raw_weighted_load;
 
-	return max(rq->cpu_load[type-1], load_now);
+	return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long n = rq->nr_running;
+
+	return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
 }
 
 /*
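With this change source_load() and target_load() return the instantaneous raw_weighted_load for type 0 and otherwise clamp it against the decayed cpu_load[] history from below and above respectively, while cpu_avg_load_per_task() divides the weighted load by the task count. A toy user-space model of that clamping, with a made-up struct standing in for runqueue_t (the names here are illustrative, not the kernel's):

/* toy_load_guess.c - low/high load guesses against a decaying history */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

struct toy_rq {
	unsigned long nr_running;
	unsigned long raw_weighted_load;	/* instantaneous weighted load */
	unsigned long cpu_load[3];		/* decayed history, short to long term */
};

/* Low guess: bias migration sources downwards, so we balance conservatively. */
static unsigned long toy_source_load(const struct toy_rq *rq, int type)
{
	if (type == 0)
		return rq->raw_weighted_load;
	return rq->cpu_load[type - 1] < rq->raw_weighted_load ?
	       rq->cpu_load[type - 1] : rq->raw_weighted_load;
}

/* High guess: bias migration targets upwards. */
static unsigned long toy_target_load(const struct toy_rq *rq, int type)
{
	if (type == 0)
		return rq->raw_weighted_load;
	return rq->cpu_load[type - 1] > rq->raw_weighted_load ?
	       rq->cpu_load[type - 1] : rq->raw_weighted_load;
}

static unsigned long toy_avg_load_per_task(const struct toy_rq *rq)
{
	return rq->nr_running ? rq->raw_weighted_load / rq->nr_running
			      : SCHED_LOAD_SCALE;
}

int main(void)
{
	/* Two nice 0 tasks right now (2 * 128), but a busier recent history. */
	struct toy_rq rq = {
		.nr_running = 2,
		.raw_weighted_load = 256,
		.cpu_load = { 384, 320, 300 },
	};

	printf("source_load(type 1) = %lu\n", toy_source_load(&rq, 1));	/* 256 */
	printf("target_load(type 1) = %lu\n", toy_target_load(&rq, 1));	/* 384 */
	printf("avg load per task   = %lu\n", toy_avg_load_per_task(&rq));	/* 128 */
	return 0;
}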
@@ -1046,7 +1134,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	cpus_and(tmp, group->cpumask, p->cpus_allowed);
 
 	for_each_cpu_mask(i, tmp) {
-		load = source_load(i, 0);
+		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
 			min_load = load;
@@ -1226,17 +1314,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 
 		if (this_sd->flags & SD_WAKE_AFFINE) {
 			unsigned long tl = this_load;
+			unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
 			/*
 			 * If sync wakeup then subtract the (maximum possible)
 			 * effect of the currently running task from the load
 			 * of the current CPU:
 			 */
 			if (sync)
-				tl -= SCHED_LOAD_SCALE;
+				tl -= current->load_weight;
 
 			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
-				100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+				tl + target_load(cpu, idx) <= tl_per_task) ||
+				100*(tl + p->load_weight) <= imbalance*load) {
 				/*
 				 * This domain has SD_WAKE_AFFINE and
 				 * p is cache cold in this domain, and
@@ -1435,7 +1525,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 			list_add_tail(&p->run_list, &current->run_list);
 			p->array = current->array;
 			p->array->nr_active++;
-			rq->nr_running++;
+			inc_nr_running(p, rq);
 		}
 		set_need_resched();
 	} else
@@ -1802,9 +1892,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -1852,24 +1942,27 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 }
 
 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
  *
  * Called with both runqueues locked.
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-		      unsigned long max_nr_move, struct sched_domain *sd,
-		      enum idle_type idle, int *all_pinned)
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum idle_type idle,
+		      int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
 	int idx, pulled = 0, pinned = 0;
+	long rem_load_move;
 	task_t *tmp;
 
-	if (max_nr_move == 0)
+	if (max_nr_move == 0 || max_load_move == 0)
 		goto out;
 
+	rem_load_move = max_load_move;
 	pinned = 1;
 
 	/*
@@ -1910,7 +2003,8 @@ skip_queue:
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+	if (tmp->load_weight > rem_load_move ||
+	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1924,9 +2018,13 @@ skip_queue:
 
 	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;
+	rem_load_move -= tmp->load_weight;
 
-	/* We only want to steal up to the prescribed number of tasks. */
-	if (pulled < max_nr_move) {
+	/*
+	 * We only want to steal up to the prescribed number of tasks
+	 * and the prescribed amount of weighted load.
+	 */
+	if (pulled < max_nr_move && rem_load_move > 0) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
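The two hunks above make move_tasks() honour a weighted-load budget as well as a task-count cap: candidates heavier than the remaining budget are skipped, and the loop stops once either limit is reached. A stripped-down sketch of that dual limit over a plain array instead of the kernel's priority arrays (all names here are illustrative, not the kernel's):

/* toy_move_tasks.c - stop on either the count cap or the weighted-load cap */
#include <stdio.h>

struct toy_task {
	int id;
	unsigned long load_weight;
};

static int toy_move_tasks(struct toy_task *src, int nr_src,
			  unsigned long max_nr_move, unsigned long max_load_move)
{
	long rem_load_move = max_load_move;
	unsigned long pulled = 0;
	int i;

	if (max_nr_move == 0 || max_load_move == 0)
		return 0;

	for (i = nr_src - 1; i >= 0; i--) {
		/* Skip tasks that would overshoot the remaining weighted quota. */
		if (src[i].load_weight > (unsigned long)rem_load_move)
			continue;

		printf("pulling task %d (weight %lu)\n",
		       src[i].id, src[i].load_weight);
		pulled++;
		rem_load_move -= src[i].load_weight;

		if (pulled >= max_nr_move || rem_load_move <= 0)
			break;
	}
	return pulled;
}

int main(void)
{
	/* A nice -5 task (weight 640) and two nice 0 tasks (128 each). */
	struct toy_task rq[] = { { 1, 640 }, { 2, 128 }, { 3, 128 } };

	printf("moved %d task(s)\n", toy_move_tasks(rq, 3, 2, 200));
	return 0;
}

With a weighted budget of 200 only one nice 0 task is pulled; the nice -5 task is skipped because its weight of 640 would overshoot the budget.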
@@ -1947,7 +2045,7 @@ out:
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
+ * domain. It calculates and returns the amount of weighted load which should be
  * moved to restore balance via the imbalance parameter.
  */
 static struct sched_group *
@@ -1957,9 +2055,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
+	unsigned long busiest_load_per_task, busiest_nr_running;
+	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	busiest_load_per_task = busiest_nr_running = 0;
+	this_load_per_task = this_nr_running = 0;
 	if (idle == NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == NEWLY_IDLE)
@@ -1971,13 +2073,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		unsigned long load;
 		int local_group;
 		int i;
+		unsigned long sum_nr_running, sum_weighted_load;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
+		sum_weighted_load = sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
+			runqueue_t *rq = cpu_rq(i);
+
 			if (*sd_idle && !idle_cpu(i))
 				*sd_idle = 0;
 
@@ -1988,6 +2093,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				load = source_load(i, load_idx);
 
 			avg_load += load;
+			sum_nr_running += rq->nr_running;
+			sum_weighted_load += rq->raw_weighted_load;
 		}
 
 		total_load += avg_load;
@@ -1999,14 +2106,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
-		} else if (avg_load > max_load) {
+			this_nr_running = sum_nr_running;
+			this_load_per_task = sum_weighted_load;
+		} else if (avg_load > max_load &&
+			   sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
 			max_load = avg_load;
 			busiest = group;
+			busiest_nr_running = sum_nr_running;
+			busiest_load_per_task = sum_weighted_load;
 		}
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 		goto out_balanced;
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2015,6 +2127,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	    100*max_load <= sd->imbalance_pct*this_load)
 		goto out_balanced;
 
+	busiest_load_per_task /= busiest_nr_running;
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
@@ -2026,21 +2139,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * by pulling tasks to us. Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
+	if (max_load <= busiest_load_per_task)
+		goto out_balanced;
+
+	/*
+	 * In the presence of smp nice balancing, certain scenarios can have
+	 * max load less than avg load (as we skip the groups at or below
+	 * its cpu_power, while calculating max_load..)
+	 */
+	if (max_load < avg_load) {
+		*imbalance = 0;
+		goto small_imbalance;
+	}
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * busiest->cpu_power,
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
-	if (*imbalance < SCHED_LOAD_SCALE) {
-		unsigned long pwr_now = 0, pwr_move = 0;
+	/*
+	 * if *imbalance is less than the average load per runnable task
+	 * there is no guarantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < busiest_load_per_task) {
+		unsigned long pwr_now, pwr_move;
 		unsigned long tmp;
+		unsigned int imbn;
+
+small_imbalance:
+		pwr_move = pwr_now = 0;
+		imbn = 2;
+		if (this_nr_running) {
+			this_load_per_task /= this_nr_running;
+			if (busiest_load_per_task > this_load_per_task)
+				imbn = 1;
+		} else
+			this_load_per_task = SCHED_LOAD_SCALE;
 
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = 1;
+		if (max_load - this_load >= busiest_load_per_task * imbn) {
+			*imbalance = busiest_load_per_task;
 			return busiest;
 		}
 
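For a feel of how the weighted quantities interact, the sketch below walks the arithmetic above with toy numbers, assuming two single-CPU groups whose cpu_power equals SCHED_LOAD_SCALE; it only mimics the calculation in find_busiest_group() and is not kernel code.

/* toy_imbalance.c - worked example of the weighted imbalance math */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Busiest group: one nice -5 task (weight 640) plus one nice 0
	 * task (weight 128); this group: a single nice 0 task. Both
	 * groups are single CPUs with cpu_power == SCHED_LOAD_SCALE. */
	unsigned long busiest_load = 768, this_load = 128;
	unsigned long busiest_nr_running = 2;
	unsigned long cpu_power = SCHED_LOAD_SCALE;
	unsigned long avg_load = SCHED_LOAD_SCALE * (busiest_load + this_load)
					/ (2 * cpu_power);
	unsigned long busiest_load_per_task = busiest_load / busiest_nr_running;
	unsigned long max_pull, imbalance;

	/* Don't pull so much that the busiest group would drop below the
	 * average, nor more than it can spare beyond one average task. */
	max_pull = min_ul(busiest_load - avg_load,
			  busiest_load - busiest_load_per_task);

	imbalance = min_ul(max_pull * cpu_power,
			   (avg_load - this_load) * cpu_power) / SCHED_LOAD_SCALE;

	printf("avg_load=%lu per_task=%lu max_pull=%lu imbalance=%lu\n",
	       avg_load, busiest_load_per_task, max_pull, imbalance);

	/* Here imbalance (320) is below the busiest group's per-task
	 * average (384), so the small-imbalance path above would bump it
	 * to busiest_load_per_task; move_tasks() can then migrate the
	 * nice 0 task while the heavier nice -5 task stays put. */
	return 0;
}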
@@ -2050,35 +2192,34 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * moving them.
 		 */
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now += busiest->cpu_power *
+			min(busiest_load_per_task, max_load);
+		pwr_now += this->cpu_power *
+			min(this_load_per_task, this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
 		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-						max_load - tmp);
+			pwr_move += busiest->cpu_power *
+				min(busiest_load_per_task, max_load - tmp);
 
 		/* Amount of load we'd add */
 		if (max_load*busiest->cpu_power <
-				SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+				busiest_load_per_task*SCHED_LOAD_SCALE)
 			tmp = max_load*busiest->cpu_power/this->cpu_power;
 		else
-			tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+			tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
 		if (pwr_move <= pwr_now)
 			goto out_balanced;
 
-		*imbalance = 1;
-		return busiest;
+		*imbalance = busiest_load_per_task;
 	}
 
-	/* Get rid of the scaling factor, rounding down as we divide */
-	*imbalance = *imbalance / SCHED_LOAD_SCALE;
 	return busiest;
 
 out_balanced:
@@ -2091,18 +2232,21 @@ out_balanced:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static runqueue_t *find_busiest_queue(struct sched_group *group,
-	enum idle_type idle)
+	enum idle_type idle, unsigned long imbalance)
 {
-	unsigned long load, max_load = 0;
-	runqueue_t *busiest = NULL;
+	unsigned long max_load = 0;
+	runqueue_t *busiest = NULL, *rqi;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i, 0);
+		rqi = cpu_rq(i);
+
+		if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
+			continue;
 
-		if (load > max_load) {
-			max_load = load;
-			busiest = cpu_rq(i);
+		if (rqi->raw_weighted_load > max_load) {
+			max_load = rqi->raw_weighted_load;
+			busiest = rqi;
 		}
 	}
 
@@ -2115,6 +2259,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
  */
 #define MAX_PINNED_INTERVAL	512
 
+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2142,7 +2287,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle);
+	busiest = find_busiest_queue(group, idle, imbalance);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2162,6 +2307,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		 */
 		double_rq_lock(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, idle, &all_pinned);
 		double_rq_unlock(this_rq, busiest);
 
@@ -2265,7 +2411,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE);
+	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2280,6 +2426,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	/* Attempt to move tasks */
 	double_lock_balance(this_rq, busiest);
 	nr_moved = move_tasks(this_rq, this_cpu, busiest,
+				minus_1_or_zero(busiest->nr_running),
 				imbalance, sd, NEWLY_IDLE, NULL);
 	spin_unlock(&busiest->lock);
 
@@ -2361,7 +2508,8 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 
 	schedstat_inc(sd, alb_cnt);
 
-	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+			RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
 		schedstat_inc(sd, alb_pushed);
 	else
 		schedstat_inc(sd, alb_failed);
@@ -2389,7 +2537,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	struct sched_domain *sd;
 	int i;
 
-	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+	this_load = this_rq->raw_weighted_load;
 	/* Update our load */
 	for (i = 0; i < 3; i++) {
 		unsigned long new_load = this_load;
@@ -3441,17 +3589,21 @@ void set_user_nice(task_t *p, long nice)
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array)
+	if (array) {
 		dequeue_task(p, array);
+		dec_raw_weighted_load(rq, p);
+	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
+	set_load_weight(p);
 	p->prio += delta;
 
 	if (array) {
 		enqueue_task(p, array);
+		inc_raw_weighted_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
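The set_user_nice() hunk above keeps rq->raw_weighted_load consistent by subtracting the old weight while the task is dequeued and adding the new weight back after set_load_weight(). A minimal sketch of that bookkeeping pattern with toy types (hypothetical stand-ins, not the kernel's):

/* toy_reweight.c - keep a queue's weighted total consistent across renice */
#include <stdio.h>

struct toy_rq {
	unsigned long raw_weighted_load;
};

struct toy_task {
	int nice;
	unsigned long load_weight;
	int queued;
};

/* Hypothetical stand-in for PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(nice)),
 * reusing the timeslice-scaled rule from the patch with HZ=1000. */
static unsigned long toy_weight_for_nice(int nice)
{
	int prio = 120 + nice;
	unsigned int slice;

	if (prio < 120)
		slice = 400 * (140 - prio) / 20;
	else
		slice = 100 * (140 - prio) / 20 > 5 ? 100 * (140 - prio) / 20 : 5;
	return slice * 128UL / 100;
}

static void toy_set_user_nice(struct toy_rq *rq, struct toy_task *p, int nice)
{
	if (p->queued)
		rq->raw_weighted_load -= p->load_weight;	/* dequeue side */

	p->nice = nice;
	p->load_weight = toy_weight_for_nice(nice);		/* set_load_weight() */

	if (p->queued)
		rq->raw_weighted_load += p->load_weight;	/* enqueue side */
}

int main(void)
{
	struct toy_rq rq = { .raw_weighted_load = 128 };
	struct toy_task p = { .nice = 0, .load_weight = 128, .queued = 1 };

	toy_set_user_nice(&rq, &p, -5);
	printf("new weight %lu, rq total %lu\n",
	       p.load_weight, rq.raw_weighted_load);	/* 640 640 */
	return 0;
}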
@@ -3587,6 +3739,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 		if (policy == SCHED_BATCH)
 			p->sleep_avg = 0;
 	}
+	set_load_weight(p);
 }
 
 /**
@@ -6106,6 +6259,7 @@ void __init sched_init(void)
 		}
 	}
 
+	set_load_weight(&init_task);
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */