aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRik van Riel <riel@redhat.com>2011-02-01 09:51:03 -0500
committerIngo Molnar <mingo@elte.hu>2011-02-03 08:20:33 -0500
commitac53db596cc08ecb8040cfb6f71ae40c6f2041c4 (patch)
treec263b558772213530532026af6fa9e34a8f88375
parent2c13c919d9e9a3db9896143a501f83dcbbe1ced4 (diff)
sched: Use a buddy to implement yield_task_fair()
Use the buddy mechanism to implement yield_task_fair. This allows us to skip onto the next highest priority se at every level in the CFS tree, unless doing so would introduce gross unfairness in CPU time distribution. We order the buddy selection in pick_next_entity to check yield first, then last, then next. We need next to be able to override yield, because it is possible for the "next" and "yield" task to be different processen in the same sub-tree of the CFS tree. When they are, we need to go into that sub-tree regardless of the "yield" hint, and pick the correct entity once we get to the right level. Signed-off-by: Rik van Riel <riel@redhat.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--include/linux/sched.h2
-rw-r--r--kernel/sched.c2
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c148
-rw-r--r--kernel/sysctl.c7
5 files changed, 90 insertions, 71 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0542774914d4..4e9fad271c30 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1942,8 +1942,6 @@ int sched_rt_handler(struct ctl_table *table, int write,
1942 void __user *buffer, size_t *lenp, 1942 void __user *buffer, size_t *lenp,
1943 loff_t *ppos); 1943 loff_t *ppos);
1944 1944
1945extern unsigned int sysctl_sched_compat_yield;
1946
1947#ifdef CONFIG_SCHED_AUTOGROUP 1945#ifdef CONFIG_SCHED_AUTOGROUP
1948extern unsigned int sysctl_sched_autogroup_enabled; 1946extern unsigned int sysctl_sched_autogroup_enabled;
1949 1947
diff --git a/kernel/sched.c b/kernel/sched.c
index 477e1bcc63f9..ae5e1a19b9d6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
324 * 'curr' points to currently running entity on this cfs_rq. 324 * 'curr' points to currently running entity on this cfs_rq.
325 * It is set to NULL otherwise (i.e when none are currently running). 325 * It is set to NULL otherwise (i.e when none are currently running).
326 */ 326 */
327 struct sched_entity *curr, *next, *last; 327 struct sched_entity *curr, *next, *last, *skip;
328 328
329 unsigned int nr_spread_over; 329 unsigned int nr_spread_over;
330 330
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179 179
180 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
182 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
184 if (last) 184 if (last)
185 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a785e08202cf..c0fbeb992833 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 69unsigned int sysctl_sched_child_runs_first __read_mostly;
70 70
71/* 71/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 72 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 73 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 74 *
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 411 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 412}
421 413
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 414static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 415{
424 struct rb_node *left = cfs_rq->rb_leftmost; 416 struct rb_node *left = cfs_rq->rb_leftmost;
425 417
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 421 return rb_entry(left, struct sched_entity, run_node);
430} 422}
431 423
424static struct sched_entity *__pick_next_entity(struct sched_entity *se)
425{
426 struct rb_node *next = rb_next(&se->run_node);
427
428 if (!next)
429 return NULL;
430
431 return rb_entry(next, struct sched_entity, run_node);
432}
433
434#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 435static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 436{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 437 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 446 * Scheduling class statistics methods:
444 */ 447 */
445 448
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 449int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 450 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 451 loff_t *ppos)
@@ -1017,6 +1019,17 @@ static void __clear_buddies_next(struct sched_entity *se)
1017 } 1019 }
1018} 1020}
1019 1021
1022static void __clear_buddies_skip(struct sched_entity *se)
1023{
1024 for_each_sched_entity(se) {
1025 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1026 if (cfs_rq->skip == se)
1027 cfs_rq->skip = NULL;
1028 else
1029 break;
1030 }
1031}
1032
1020static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1033static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1021{ 1034{
1022 if (cfs_rq->last == se) 1035 if (cfs_rq->last == se)
@@ -1024,6 +1037,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1024 1037
1025 if (cfs_rq->next == se) 1038 if (cfs_rq->next == se)
1026 __clear_buddies_next(se); 1039 __clear_buddies_next(se);
1040
1041 if (cfs_rq->skip == se)
1042 __clear_buddies_skip(se);
1027} 1043}
1028 1044
1029static void 1045static void
@@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1099 return; 1115 return;
1100 1116
1101 if (cfs_rq->nr_running > 1) { 1117 if (cfs_rq->nr_running > 1) {
1102 struct sched_entity *se = __pick_next_entity(cfs_rq); 1118 struct sched_entity *se = __pick_first_entity(cfs_rq);
1103 s64 delta = curr->vruntime - se->vruntime; 1119 s64 delta = curr->vruntime - se->vruntime;
1104 1120
1105 if (delta < 0) 1121 if (delta < 0)
@@ -1143,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1143static int 1159static int
1144wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1160wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1145 1161
1162/*
1163 * Pick the next process, keeping these things in mind, in this order:
1164 * 1) keep things fair between processes/task groups
1165 * 2) pick the "next" process, since someone really wants that to run
1166 * 3) pick the "last" process, for cache locality
1167 * 4) do not run the "skip" process, if something else is available
1168 */
1146static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1169static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1147{ 1170{
1148 struct sched_entity *se = __pick_next_entity(cfs_rq); 1171 struct sched_entity *se = __pick_first_entity(cfs_rq);
1149 struct sched_entity *left = se; 1172 struct sched_entity *left = se;
1150 1173
1151 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1174 /*
1152 se = cfs_rq->next; 1175 * Avoid running the skip buddy, if running something else can
1176 * be done without getting too unfair.
1177 */
1178 if (cfs_rq->skip == se) {
1179 struct sched_entity *second = __pick_next_entity(se);
1180 if (second && wakeup_preempt_entity(second, left) < 1)
1181 se = second;
1182 }
1153 1183
1154 /* 1184 /*
1155 * Prefer last buddy, try to return the CPU to a preempted task. 1185 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1157,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1157 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1187 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1158 se = cfs_rq->last; 1188 se = cfs_rq->last;
1159 1189
1190 /*
1191 * Someone really wants this to run. If it's not unfair, run it.
1192 */
1193 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1194 se = cfs_rq->next;
1195
1160 clear_buddies(cfs_rq, se); 1196 clear_buddies(cfs_rq, se);
1161 1197
1162 return se; 1198 return se;
@@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1333 hrtick_update(rq); 1369 hrtick_update(rq);
1334} 1370}
1335 1371
1336/*
1337 * sched_yield() support is very simple - we dequeue and enqueue.
1338 *
1339 * If compat_yield is turned on then we requeue to the end of the tree.
1340 */
1341static void yield_task_fair(struct rq *rq)
1342{
1343 struct task_struct *curr = rq->curr;
1344 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1345 struct sched_entity *rightmost, *se = &curr->se;
1346
1347 /*
1348 * Are we the only task in the tree?
1349 */
1350 if (unlikely(rq->nr_running == 1))
1351 return;
1352
1353 clear_buddies(cfs_rq, se);
1354
1355 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1356 update_rq_clock(rq);
1357 /*
1358 * Update run-time statistics of the 'current'.
1359 */
1360 update_curr(cfs_rq);
1361
1362 return;
1363 }
1364 /*
1365 * Find the rightmost entry in the rbtree:
1366 */
1367 rightmost = __pick_last_entity(cfs_rq);
1368 /*
1369 * Already in the rightmost position?
1370 */
1371 if (unlikely(!rightmost || entity_before(rightmost, se)))
1372 return;
1373
1374 /*
1375 * Minimally necessary key value to be last in the tree:
1376 * Upon rescheduling, sched_class::put_prev_task() will place
1377 * 'current' within the tree based on its new key value.
1378 */
1379 se->vruntime = rightmost->vruntime + 1;
1380}
1381
1382#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1383 1373
1384static void task_waking_fair(struct rq *rq, struct task_struct *p) 1374static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
1849 } 1839 }
1850} 1840}
1851 1841
1842static void set_skip_buddy(struct sched_entity *se)
1843{
1844 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1845 for_each_sched_entity(se)
1846 cfs_rq_of(se)->skip = se;
1847 }
1848}
1849
1852/* 1850/*
1853 * Preempt the current task with a newly woken task if needed: 1851 * Preempt the current task with a newly woken task if needed:
1854 */ 1852 */
@@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1947 } 1945 }
1948} 1946}
1949 1947
1948/*
1949 * sched_yield() is very simple
1950 *
1951 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1952 */
1953static void yield_task_fair(struct rq *rq)
1954{
1955 struct task_struct *curr = rq->curr;
1956 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1957 struct sched_entity *se = &curr->se;
1958
1959 /*
1960 * Are we the only task in the tree?
1961 */
1962 if (unlikely(rq->nr_running == 1))
1963 return;
1964
1965 clear_buddies(cfs_rq, se);
1966
1967 if (curr->policy != SCHED_BATCH) {
1968 update_rq_clock(rq);
1969 /*
1970 * Update run-time statistics of the 'current'.
1971 */
1972 update_curr(cfs_rq);
1973 }
1974
1975 set_skip_buddy(se);
1976}
1977
1950#ifdef CONFIG_SMP 1978#ifdef CONFIG_SMP
1951/************************************************** 1979/**************************************************
1952 * Fair scheduling class load-balancing methods: 1980 * Fair scheduling class load-balancing methods:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bc86bb32e126..cbfda7e2416e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -360,13 +360,6 @@ static struct ctl_table kern_table[] = {
360 .mode = 0644, 360 .mode = 0644,
361 .proc_handler = sched_rt_handler, 361 .proc_handler = sched_rt_handler,
362 }, 362 },
363 {
364 .procname = "sched_compat_yield",
365 .data = &sysctl_sched_compat_yield,
366 .maxlen = sizeof(unsigned int),
367 .mode = 0644,
368 .proc_handler = proc_dointvec,
369 },
370#ifdef CONFIG_SCHED_AUTOGROUP 363#ifdef CONFIG_SCHED_AUTOGROUP
371 { 364 {
372 .procname = "sched_autogroup_enabled", 365 .procname = "sched_autogroup_enabled",