author    Peter Zijlstra <a.p.zijlstra@chello.nl>  2008-02-25 11:34:02 -0500
committer Ingo Molnar <mingo@elte.hu>              2008-03-04 11:54:06 -0500
commit    62fb185130e4d420f71a30ff59d8b16b74ef5d2b (patch)
tree      474c0824a5bf90950b0a430a11a52b358c9e1f31 /kernel/sched_fair.c
parent    976dde010e513a9c7c3117a32b7b015f84b37430 (diff)
sched: revert load_balance_monitor() changes
The following commits cause a number of regressions:

  commit 58e2d4ca581167c2a079f4ee02be2f0bc52e8729
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduling, change how cpu load is calculated

  commit 6b2d7700266b9402e12824e11e0099ae6a4a6a79
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups

Namely:
 - very frequent wakeups on SMP, reported by PowerTop users.
 - cacheline trashing on (large) SMP
 - some latencies larger than 500ms

While there is a mergeable patch to fix the latter, the former issues
are not fixable in a manner suitable for .25 (we're at -rc3 now).

Hence we revert them and try again in v2.6.26.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 115
1 file changed, 35 insertions(+), 80 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c8e6492c5925..3df4d46994ca 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -727,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
 }
 
-#define GROUP_IMBALANCE_PCT     20
-
 #else   /* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -819,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
         struct cfs_rq *cfs_rq;
-        struct sched_entity *se = &p->se,
-                            *topse = NULL;      /* Highest schedulable entity */
-        int incload = 1;
+        struct sched_entity *se = &p->se;
 
         for_each_sched_entity(se) {
-                topse = se;
-                if (se->on_rq) {
-                        incload = 0;
+                if (se->on_rq)
                         break;
-                }
                 cfs_rq = cfs_rq_of(se);
                 enqueue_entity(cfs_rq, se, wakeup);
                 wakeup = 1;
         }
-        /* Increment cpu load if we just enqueued the first task of a group on
-         * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
-         * at the highest grouping level.
-         */
-        if (incload)
-                inc_cpu_load(rq, topse->load.weight);
 
         hrtick_start_fair(rq, rq->curr);
 }
@@ -851,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 {
         struct cfs_rq *cfs_rq;
-        struct sched_entity *se = &p->se,
-                            *topse = NULL;      /* Highest schedulable entity */
-        int decload = 1;
+        struct sched_entity *se = &p->se;
 
         for_each_sched_entity(se) {
-                topse = se;
                 cfs_rq = cfs_rq_of(se);
                 dequeue_entity(cfs_rq, se, sleep);
                 /* Don't dequeue parent if it has other entities besides us */
-                if (cfs_rq->load.weight) {
-                        if (parent_entity(se))
-                                decload = 0;
+                if (cfs_rq->load.weight)
                         break;
-                }
                 sleep = 1;
         }
-        /* Decrement cpu load if we just dequeued the last task of a group on
-         * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
-         * at the highest grouping level.
-         */
-        if (decload)
-                dec_cpu_load(rq, topse->load.weight);
 
         hrtick_start_fair(rq, rq->curr);
 }
@@ -1186,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+        struct sched_entity *curr;
+        struct task_struct *p;
+
+        if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+                return MAX_PRIO;
+
+        curr = cfs_rq->curr;
+        if (!curr)
+                curr = __pick_next_entity(cfs_rq);
+
+        p = task_of(curr);
+
+        return p->prio;
+}
+#endif
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -1195,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
         struct rq_iterator cfs_rq_iterator;
-        unsigned long load_moved;
 
         cfs_rq_iterator.start = load_balance_start_fair;
         cfs_rq_iterator.next = load_balance_next_fair;
 
         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-                struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
-                unsigned long maxload, task_load, group_weight;
-                unsigned long thisload, per_task_load;
-                struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
-
-                task_load = busy_cfs_rq->load.weight;
-                group_weight = se->load.weight;
+                struct cfs_rq *this_cfs_rq;
+                long imbalance;
+                unsigned long maxload;
 
-                /*
-                 * 'group_weight' is contributed by tasks of total weight
-                 * 'task_load'. To move 'rem_load_move' worth of weight only,
-                 * we need to move a maximum task load of:
-                 *
-                 *      maxload = (remload / group_weight) * task_load;
-                 */
-                maxload = (rem_load_move * task_load) / group_weight;
+                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-                if (!maxload || !task_load)
+                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+                if (imbalance <= 0)
                         continue;
 
-                per_task_load = task_load / busy_cfs_rq->nr_running;
-                /*
-                 * balance_tasks will try to forcibly move atleast one task if
-                 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
-                 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
-                 */
-                if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
-                        continue;
+                /* Don't pull more than imbalance/2 */
+                imbalance /= 2;
+                maxload = min(rem_load_move, imbalance);
 
-                /* Disable priority-based load balance */
-                *this_best_prio = 0;
-                thisload = this_cfs_rq->load.weight;
+                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
 #else
 # define maxload rem_load_move
 #endif
@@ -1242,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  * load_balance_[start|next]_fair iterators
                  */
                 cfs_rq_iterator.arg = busy_cfs_rq;
-                load_moved = balance_tasks(this_rq, this_cpu, busiest,
+                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
                                 maxload, sd, idle, all_pinned,
                                 this_best_prio,
                                 &cfs_rq_iterator);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-                /*
-                 * load_moved holds the task load that was moved. The
-                 * effective (group) weight moved would be:
-                 *      load_moved_eff = load_moved/task_load * group_weight;
-                 */
-                load_moved = (group_weight * load_moved) / task_load;
-
-                /* Adjust shares on both cpus to reflect load_moved */
-                group_weight -= load_moved;
-                set_se_shares(se, group_weight);
-
-                se = busy_cfs_rq->tg->se[this_cpu];
-                if (!thisload)
-                        group_weight = load_moved;
-                else
-                        group_weight = se->load.weight + load_moved;
-                set_se_shares(se, group_weight);
-#endif
-
-                rem_load_move -= load_moved;
-
                 if (rem_load_move <= 0)
                         break;
         }
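
For reference, the group-scheduling path this revert restores caps how much weight load_balance_fair() pulls from a busy group: at most half of the load imbalance between the busiest CPU's cfs_rq and the local one, and never more than the load the caller still wants moved. A minimal user-space sketch of that arithmetic follows; the max_pull() helper and the example weights are hypothetical stand-ins for the real cfs_rq fields, not kernel code.

#include <stdio.h>

/* Hypothetical stand-in for the restored calculation in load_balance_fair():
 * busy_weight and this_weight play the role of busy_cfs_rq->load.weight and
 * this_cfs_rq->load.weight; rem_load_move is the load still left to migrate. */
static unsigned long max_pull(long busy_weight, long this_weight,
                              unsigned long rem_load_move)
{
        long imbalance = busy_weight - this_weight;

        /* Don't pull if the local cfs_rq already carries more load */
        if (imbalance <= 0)
                return 0;

        /* Don't pull more than imbalance/2, nor more than is still wanted */
        imbalance /= 2;
        if (rem_load_move < (unsigned long)imbalance)
                return rem_load_move;
        return (unsigned long)imbalance;
}

int main(void)
{
        /* Example: busiest group weight 3072, local weight 1024,
         * 1500 units of load left to move -> pull at most 1024. */
        printf("maxload = %lu\n", max_pull(3072, 1024, 1500));
        return 0;
}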