author     Linus Torvalds <torvalds@linux-foundation.org>  2008-10-28 12:46:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-10-28 12:46:20 -0400
commit     8ca6215502462f564d7bcae2d8dcc825aa95d743 (patch)
tree       1534f8ad77640ab6f6d9471679b6e4c2d11e739c
parent     f8245e91a5121acc435e509aa56cd04d445a74c7 (diff)
parent     4078e359c4688541a0093fde0dff35dc7190c4f5 (diff)
Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: fix documentation reference for sched_min_granularity_ns
  sched: virtual time buddy preemption
  sched: re-instate vruntime based wakeup preemption
  sched: weaken sync hint
  sched: more accurate min_vruntime accounting
  sched: fix a find_busiest_group buglet
  sched: add CONFIG_SMP consistency
-rw-r--r--  Documentation/scheduler/sched-design-CFS.txt |   2
-rw-r--r--  include/linux/sched.h                        |  12
-rw-r--r--  kernel/sched.c                               |   3
-rw-r--r--  kernel/sched_fair.c                          | 169
-rw-r--r--  kernel/sched_idletask.c                      |   5
-rw-r--r--  kernel/sched_rt.c                            |   5
6 files changed, 139 insertions(+), 57 deletions(-)
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 9d8eb553884c..eb471c7a905e 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -92,7 +92,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the
 way the previous scheduler had, and has no heuristics whatsoever. There is
 only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
 
-   /proc/sys/kernel/sched_granularity_ns
+   /proc/sys/kernel/sched_min_granularity_ns
 
 which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
 "server" (i.e., good batching) workloads. It defaults to a setting suitable
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8478f334d732..b483f39a7112 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -936,7 +936,6 @@ struct sched_class {
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
-	int (*select_task_rq)(struct task_struct *p, int sync);
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 
@@ -944,6 +943,8 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
+	int (*select_task_rq)(struct task_struct *p, int sync);
+
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
 			struct rq *busiest, unsigned long max_load_move,
 			struct sched_domain *sd, enum cpu_idle_type idle,
@@ -955,16 +956,17 @@ struct sched_class {
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
 	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
-#endif
 
-	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_new) (struct rq *rq, struct task_struct *p);
 	void (*set_cpus_allowed)(struct task_struct *p,
 				 const cpumask_t *newmask);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
+#endif
+
+	void (*set_curr_task) (struct rq *rq);
+	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+	void (*task_new) (struct rq *rq, struct task_struct *p);
 
 	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
 			       int running);
diff --git a/kernel/sched.c b/kernel/sched.c
index 6625c3c4b10d..e8819bc6f462 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -386,7 +386,6 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
-	u64 pair_start;
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
@@ -3344,7 +3343,7 @@ small_imbalance:
 	} else
 		this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (max_load - this_load + 2*busiest_load_per_task >=
+	if (max_load - this_load + busiest_load_per_task >=
 				busiest_load_per_task * imbn) {
 		*imbalance = busiest_load_per_task;
 		return busiest;
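
A toy calculation, outside the kernel, of why the doubled term was a buglet: with the extra busiest_load_per_task the old test could claim that pulling a single task closes the gap even when the gap is smaller than the task itself. All load values below are invented:

#include <stdio.h>

int main(void)
{
        unsigned long max_load = 1024, this_load = 896;  /* gap of 128 */
        unsigned long busiest_load_per_task = 512, imbn = 2;

        int old_test = max_load - this_load + 2 * busiest_load_per_task >=
                       busiest_load_per_task * imbn;
        int new_test = max_load - this_load + busiest_load_per_task >=
                       busiest_load_per_task * imbn;

        /* prints "old test: 1, new test: 0" for these numbers */
        printf("old test: %d, new test: %d\n", old_test, new_test);
        return 0;
}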
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9573c33688b8..ce514afd78ff 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -143,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+	int depth = 0;
+
+	for_each_sched_entity(se)
+		depth++;
+
+	return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+	int se_depth, pse_depth;
+
+	/*
+	 * preemption test can be made between sibling entities who are in the
+	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+	 * both tasks until we find their ancestors who are siblings of common
+	 * parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	se_depth = depth_se(*se);
+	pse_depth = depth_se(*pse);
+
+	while (se_depth > pse_depth) {
+		se_depth--;
+		*se = parent_entity(*se);
+	}
+
+	while (pse_depth > se_depth) {
+		pse_depth--;
+		*pse = parent_entity(*pse);
+	}
+
+	while (!is_same_group(*se, *pse)) {
+		*se = parent_entity(*se);
+		*pse = parent_entity(*pse);
+	}
+}
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
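
For illustration only (not part of the patch): a standalone sketch of the depth-matching walk above, with a toy parent-pointer hierarchy standing in for real sched entities. The entity names, the two-level tree, and the parent comparison used in place of is_same_group() are all invented:

#include <stdio.h>
#include <stddef.h>

struct ent { const char *name; struct ent *parent; };

static int depth(struct ent *e)
{
        int d = 0;
        for (; e; e = e->parent)
                d++;
        return d;
}

static void find_matching(struct ent **se, struct ent **pse)
{
        int sd = depth(*se), pd = depth(*pse);

        /* first equalize depths, then climb together until siblings */
        while (sd > pd) { sd--; *se = (*se)->parent; }
        while (pd > sd) { pd--; *pse = (*pse)->parent; }
        while ((*se)->parent != (*pse)->parent) {
                *se = (*se)->parent;
                *pse = (*pse)->parent;
        }
}

int main(void)
{
        struct ent root = { "root", NULL };
        struct ent grp  = { "grp", &root };   /* a task group */
        struct ent a    = { "a", &grp };      /* task inside the group */
        struct ent b    = { "b", &root };     /* task at top level */
        struct ent *se = &a, *pse = &b;

        find_matching(&se, &pse);
        printf("compare %s vs %s\n", se->name, pse->name); /* grp vs b */
        return 0;
}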
@@ -193,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return NULL;
 }
 
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 
@@ -223,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return se->vruntime - cfs_rq->min_vruntime;
 }
 
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	u64 vruntime = cfs_rq->min_vruntime;
+
+	if (cfs_rq->curr)
+		vruntime = cfs_rq->curr->vruntime;
+
+	if (cfs_rq->rb_leftmost) {
+		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+						   struct sched_entity,
+						   run_node);
+
+		if (vruntime == cfs_rq->min_vruntime)
+			vruntime = se->vruntime;
+		else
+			vruntime = min_vruntime(vruntime, se->vruntime);
+	}
+
+	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
 /*
  * Enqueue an entity into the rb-tree:
  */
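
update_min_vruntime() only ever moves min_vruntime forward (the final max_vruntime() guarantees monotonicity), and the min_vruntime()/max_vruntime() helpers it leans on compare through a signed delta so the ordering survives u64 wraparound. A self-contained restatement of that comparison trick, with invented clock values:

#include <stdio.h>
#include <stdint.h>

static uint64_t max_vruntime(uint64_t min_vr, uint64_t vr)
{
        int64_t delta = (int64_t)(vr - min_vr);
        return delta > 0 ? vr : min_vr; /* pick the later virtual time */
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;    /* clock about to wrap */
        uint64_t wrapped = 10;                  /* clock after wrapping */

        /* a plain '>' would pick near_wrap; the signed delta picks 10 */
        printf("%llu\n", (unsigned long long)max_vruntime(near_wrap, wrapped));
        return 0;
}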
@@ -256,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost) {
+	if (leftmost)
 		cfs_rq->rb_leftmost = &se->run_node;
-		/*
-		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-		 * value tracking the leftmost vruntime in the tree.
-		 */
-		cfs_rq->min_vruntime =
-			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
-	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -274,18 +336,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->rb_leftmost == &se->run_node) {
 		struct rb_node *next_node;
-		struct sched_entity *next;
 
 		next_node = rb_next(&se->run_node);
 		cfs_rq->rb_leftmost = next_node;
-
-		if (next_node) {
-			next = rb_entry(next_node,
-					struct sched_entity, run_node);
-			cfs_rq->min_vruntime =
-				max_vruntime(cfs_rq->min_vruntime,
-						next->vruntime);
-		}
 	}
 
 	if (cfs_rq->next == se)
@@ -424,6 +477,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
+	update_min_vruntime(cfs_rq);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -613,13 +667,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-	u64 vruntime;
-
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(cfs_rq->min_vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = cfs_rq->min_vruntime;
+	u64 vruntime = cfs_rq->min_vruntime;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -696,6 +744,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
+	update_min_vruntime(cfs_rq);
 }
 
 /*
@@ -742,16 +791,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct rq *rq = rq_of(cfs_rq);
-	u64 pair_slice = rq->clock - cfs_rq->pair_start;
-
-	if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) {
-		cfs_rq->pair_start = rq->clock;
+	if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
 		return se;
-	}
 
 	return cfs_rq->next;
 }
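
The buddy rule above, restated outside the kernel: the cached "next" buddy keeps the CPU only while the leftmost entity would not win a virtual-time preemption test against it, replacing the old wall-clock pair_slice window. A minimal, self-contained sketch; the vruntime and granularity numbers are invented:

#include <stdio.h>
#include <stdint.h>

struct ent { const char *name; int64_t vruntime; };

/* simplified wakeup_preempt_entity(): 1 means "se preempts curr" */
static int preempts(const struct ent *curr, const struct ent *se,
                    int64_t gran)
{
        int64_t vdiff = curr->vruntime - se->vruntime;
        return vdiff <= 0 ? -1 : (vdiff > gran ? 1 : 0);
}

static const struct ent *pick(const struct ent *next,
                              const struct ent *leftmost, int64_t gran)
{
        if (!next || preempts(next, leftmost, gran) == 1)
                return leftmost;
        return next;            /* buddy is still within its lead */
}

int main(void)
{
        struct ent left = { "leftmost", 100 };
        struct ent buddy = { "buddy", 102 };    /* 2 ahead, gran is 4 */

        printf("picked: %s\n", pick(&buddy, &left, 4)->name); /* buddy */
        buddy.vruntime = 110;                   /* now 10 ahead */
        printf("picked: %s\n", pick(&buddy, &left, 4)->name); /* leftmost */
        return 0;
}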
@@ -1122,10 +1169,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
-	if (!sync && sched_feat(SYNC_WAKEUPS) &&
-	    curr->se.avg_overlap < sysctl_sched_migration_cost &&
-	    p->se.avg_overlap < sysctl_sched_migration_cost)
-		sync = 1;
+	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+			p->se.avg_overlap > sysctl_sched_migration_cost))
+		sync = 0;
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
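
The weakened hint reads more clearly in isolation: wake_affine() no longer promotes wakeups to sync on its own; it only demotes a caller-supplied hint when either task's average overlap exceeds the migration cost. A toy run with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long migration_cost = 500000;  /* 0.5 ms, made up */
        unsigned long curr_overlap = 800000, p_overlap = 100000;
        int sync = 1;                           /* caller passed a hint */

        if (sync && (curr_overlap > migration_cost ||
                     p_overlap > migration_cost))
                sync = 0;                       /* overlap too long: demote */

        printf("sync after weakening: %d\n", sync);     /* 0 */
        return 0;
}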
@@ -1244,13 +1290,42 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * More easily preempt - nice tasks, while not making it harder for
 	 * + nice tasks.
 	 */
-	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
+	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
+		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
 
 /*
+ * Should 'se' preempt 'curr'.
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+	s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+	if (vdiff <= 0)
+		return -1;
+
+	gran = wakeup_gran(curr);
+	if (vdiff > gran)
+		return 1;
+
+	return 0;
+}
+
+/*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
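
The w(c, sN) table in the comment above, evaluated with concrete numbers; gran stands in for wakeup_gran() and the vruntime values are arbitrary:

#include <stdio.h>
#include <stdint.h>

static int wakeup_preempt(int64_t curr_vr, int64_t se_vr, int64_t gran)
{
        int64_t vdiff = curr_vr - se_vr;

        if (vdiff <= 0)
                return -1;      /* s1: ahead of c, no preemption */
        if (vdiff > gran)
                return 1;       /* s3: far behind c, preempt */
        return 0;               /* s2: inside the granularity band */
}

int main(void)
{
        int64_t c = 100, gran = 10;

        printf("w(c, s1) = %d\n", wakeup_preempt(c, 120, gran)); /* -1 */
        printf("w(c, s2) = %d\n", wakeup_preempt(c, 95, gran));  /*  0 */
        printf("w(c, s3) = %d\n", wakeup_preempt(c, 80, gran));  /*  1 */
        return 0;
}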
@@ -1258,7 +1333,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	s64 delta_exec;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -1296,9 +1370,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 		return;
 	}
 
-	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-	if (delta_exec > wakeup_gran(pse))
-		resched_task(curr);
+	find_matching_se(&se, &pse);
+
+	while (se) {
+		BUG_ON(!pse);
+
+		if (wakeup_preempt_entity(se, pse) == 1) {
+			resched_task(curr);
+			break;
+		}
+
+		se = parent_entity(se);
+		pse = parent_entity(pse);
+	}
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1594,9 +1678,6 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_fair,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
@@ -1604,6 +1685,8 @@ static const struct sched_class fair_sched_class = {
 	.put_prev_task		= put_prev_task_fair,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_fair,
+
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
 #endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index dec4ccabe2f5..8a21a2e28c13 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -105,9 +105,6 @@ static const struct sched_class idle_sched_class = {
 
 	/* dequeue is not valid, we print a debug message there: */
 	.dequeue_task		= dequeue_task_idle,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_idle,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_idle,
 
@@ -115,6 +112,8 @@ static const struct sched_class idle_sched_class = {
 	.put_prev_task		= put_prev_task_idle,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_idle,
+
 	.load_balance		= load_balance_idle,
 	.move_one_task		= move_one_task_idle,
 #endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b446dc87494f..d9ba9d5f99d6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1504,9 +1504,6 @@ static const struct sched_class rt_sched_class = {
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_rt,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_rt,
 
@@ -1514,6 +1511,8 @@ static const struct sched_class rt_sched_class = {
 	.put_prev_task		= put_prev_task_rt,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_rt,
+
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
 	.set_cpus_allowed	= set_cpus_allowed_rt,
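
The CONFIG_SMP consistency change repeated across the three classes above has one structural effect: select_task_rq now lives inside the existing #ifdef CONFIG_SMP block, so a uniprocessor build carries no dead callback pointer per class. A compile-time sketch (the macro flip models SMP vs UP; the struct and its fields are illustrative, not the kernel's):

#include <stdio.h>

#define MY_CONFIG_SMP 1                 /* set to 0 to model a UP build */

struct task;                            /* opaque stand-in */

struct sched_class_sketch {
        void (*enqueue_task)(struct task *p);
        void (*check_preempt_curr)(struct task *p, int sync);
#if MY_CONFIG_SMP
        int (*select_task_rq)(struct task *p, int sync); /* SMP only */
#endif
};

int main(void)
{
        /* one fewer function pointer per class when MY_CONFIG_SMP is 0 */
        printf("ops table size: %zu bytes\n",
               sizeof(struct sched_class_sketch));
        return 0;
}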