author     Gregory Haskins <ghaskins@novell.com>  2008-12-29 09:39:53 -0500
committer  Gregory Haskins <ghaskins@novell.com>  2008-12-29 09:39:53 -0500
commit     917b627d4d981dc614519d7b34ea31a976b14e12
tree       edb1744bd3f943ee79ee4f6c995c48a28421504c
parent     4075134e40804821f90866d7de56802e4dcecb1e
sched: create "pushable_tasks" list to limit pushing to one attempt
The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined whether there are any lower-priority CPUs that should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which wait for a periodic rebalancing operation to occur before spreading out load.

When a particular RQ has more than one active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters active balancing mode, where it will try to push the task away, or persuade a different CPU to pull it over. The system stays in this state until it falls back to at most one queued RT task per RQ.

However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if a task was previously found unpushable (due to affinity, etc.). What's more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems:

1) We can have the same tasks analyzed over and over again during each push, which extends the fast path in the scheduler for no gain. Consider an RQ that has dozens of tasks bound to a core: each one of those tasks will be encountered and skipped on every push operation while they are queued.

2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared or a pull operation succeeds. The net result is a potential latency source for mid-priority tasks.

This patch aims to rectify these two conditions by introducing a new priority-sorted list: "pushable_tasks". A task is added to the list each time it is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push.

This works because a task only needs to have a push attempted once. After an initial failure to push, the other CPUs will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable task: now every task will have a push attempted (when appropriate).

This reduces latency both by shortening the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system.

[ rostedt: added a couple more BUG_ONs ]

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
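As a rough illustration of the data-structure semantics described above, here is a minimal, self-contained userspace sketch (not the kernel code itself): it models a priority-sorted pushable list with enqueue/dequeue helpers analogous to the enqueue_pushable_task()/dequeue_pushable_task() helpers this patch adds. The toy_task type and the singly linked implementation are assumptions for illustration only; the kernel uses struct plist_node / struct plist_head protected by rq->lock.

/* Illustrative userspace sketch only, not kernel code. */
#include <stdio.h>

struct toy_task {
        int prio;                       /* lower value == higher priority */
        struct toy_task *next;          /* link within the pushable list */
};

/* Insert in ascending prio order, so the head is always the best push candidate. */
static void enqueue_pushable(struct toy_task **head, struct toy_task *p)
{
        struct toy_task **pos = head;

        while (*pos && (*pos)->prio <= p->prio)
                pos = &(*pos)->next;
        p->next = *pos;
        *pos = p;
}

/* Remove a task, e.g. after a failed push attempt or when it becomes current. */
static void dequeue_pushable(struct toy_task **head, struct toy_task *p)
{
        struct toy_task **pos = head;

        while (*pos && *pos != p)
                pos = &(*pos)->next;
        if (*pos)
                *pos = p->next;
}

int main(void)
{
        struct toy_task a = { .prio = 30 }, b = { .prio = 10 }, c = { .prio = 20 };
        struct toy_task *pushable = NULL;

        enqueue_pushable(&pushable, &a);
        enqueue_pushable(&pushable, &b);
        enqueue_pushable(&pushable, &c);

        /* The head is the highest-priority pushable task (prio 10). */
        printf("next pushable: prio %d\n", pushable->prio);

        /* A failed push removes the task so it is never re-examined. */
        dequeue_pushable(&pushable, &b);
        printf("next pushable: prio %d\n", pushable->prio);  /* now prio 20 */
        return 0;
}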
-rw-r--r--  include/linux/init_task.h |   1
-rw-r--r--  include/linux/sched.h     |   1
-rw-r--r--  kernel/sched.c            |   4
-rw-r--r--  kernel/sched_rt.c         | 119
4 files changed, 107 insertions(+), 18 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 23fd8909b9e5..6851225f44a7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -140,6 +140,7 @@ extern struct group_info init_groups;
 	.nr_cpus_allowed = NR_CPUS,				\
 	},							\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),		\
+	.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
 	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),		\
 	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),	\
 	.real_parent	= &tsk,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 836a86c32a65..440cabb2d432 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1179,6 +1179,7 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
+	struct plist_node pushable_tasks;
 
 	struct mm_struct *mm, *active_mm;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3acbad8991a2..24ab80c28765 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -471,6 +471,7 @@ struct rt_rq {
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
 	int overloaded;
+	struct plist_head pushable_tasks;
 #endif
 	int rt_throttled;
 	u64 rt_time;
@@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
 	put_cpu();
 }
 
@@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
+	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
 
 	rt_rq->rt_time = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b0b6ea4ed674..fe9da6084c87 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq)
 		rq->rt.overloaded = 0;
 	}
 }
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+	plist_node_init(&p->pushable_tasks, p->prio);
+	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+#else
+
+#define enqueue_pushable_task(rq, p) do { } while (0)
+#define dequeue_pushable_task(rq, p) do { } while (0)
+
 #endif /* CONFIG_SMP */
 
 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
@@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 
 	enqueue_rt_entity(rt_se);
 
+	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+		enqueue_pushable_task(rq, p);
+
 	inc_cpu_load(rq, p->se.load.weight);
 }
 
@@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
 
+	dequeue_pushable_task(rq, p);
+
 	dec_cpu_load(rq, p->se.load.weight);
 }
 
@@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	return next;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
@@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 
 	p = rt_task_of(rt_se);
 	p->se.exec_start = rq->clock;
+
+	return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+	struct task_struct *p = _pick_next_task_rt(rq);
+
+	/* The running task is never eligible for pushing */
+	if (p)
+		dequeue_pushable_task(rq, p);
+
 	return p;
 }
 
@@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
 	p->se.exec_start = 0;
+
+	/*
+	 * The previous task needs to be made eligible for pushing
+	 * if it is still active
+	 */
+	if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+		enqueue_pushable_task(rq, p);
 }
 
 #ifdef CONFIG_SMP
@@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 	return lowest_rq;
 }
 
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+	struct task_struct *p;
+
+	if (!has_pushable_tasks(rq))
+		return NULL;
+
+	p = plist_first_entry(&rq->rt.pushable_tasks,
+			      struct task_struct, pushable_tasks);
+
+	BUG_ON(rq->cpu != task_cpu(p));
+	BUG_ON(task_current(rq, p));
+	BUG_ON(p->rt.nr_cpus_allowed <= 1);
+
+	BUG_ON(!p->se.on_rq);
+	BUG_ON(!rt_task(p));
+
+	return p;
+}
+
 /*
  * If the current CPU has more than one RT task, see if the non
  * running task can migrate over to a CPU that is running a task
@@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
-	int ret = 0;
 	int paranoid = RT_MAX_TRIES;
 
 	if (!rq->rt.overloaded)
 		return 0;
 
-	next_task = pick_next_highest_task_rt(rq, -1);
+	next_task = pick_next_pushable_task(rq);
 	if (!next_task)
 		return 0;
 
@@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq)
 		 * so it is possible that next_task has changed.
 		 * If it has, then try again.
 		 */
-		task = pick_next_highest_task_rt(rq, -1);
+		task = pick_next_pushable_task(rq);
 		if (unlikely(task != next_task) && task && paranoid--) {
 			put_task_struct(next_task);
 			next_task = task;
 			goto retry;
 		}
+
+		/*
+		 * Once we have failed to push this task, we will not
+		 * try again, since the other cpus will pull from us
+		 * when they are ready
+		 */
+		dequeue_pushable_task(rq, next_task);
+
 		goto out;
 	}
 
@@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq)
 
 	double_unlock_balance(rq, lowest_rq);
 
-	ret = 1;
 out:
 	put_task_struct(next_task);
 
-	return ret;
+	return 1;
 }
 
-/*
- * TODO: Currently we just use the second highest prio task on
- * the queue, and stop when it can't migrate (or there's
- * no more RT tasks). There may be a case where a lower
- * priority RT task has a different affinity than the
- * higher RT task. In this case the lower RT task could
- * possibly be able to migrate where as the higher priority
- * RT task could not. We currently ignore this issue.
- * Enhancements are welcome!
- */
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
@@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
  */
 static int needs_post_schedule_rt(struct rq *rq)
 {
-	return rq->rt.overloaded ? 1 : 0;
+	return has_pushable_tasks(rq);
 }
 
 static void post_schedule_rt(struct rq *rq)
@@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    rq->rt.overloaded &&
+	    has_pushable_tasks(rq) &&
 	    p->rt.nr_cpus_allowed > 1)
 		push_rt_tasks(rq);
 }
@@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
 		struct rq *rq = task_rq(p);
 
+		if (!task_current(rq, p)) {
+			/*
+			 * Make sure we dequeue this task from the pushable list
+			 * before going further.  It will either remain off of
+			 * the list because we are no longer pushable, or it
+			 * will be requeued.
+			 */
+			if (p->rt.nr_cpus_allowed > 1)
+				dequeue_pushable_task(rq, p);
+
+			/*
+			 * Requeue if our weight is changing and still > 1
+			 */
+			if (weight > 1)
+				enqueue_pushable_task(rq, p);
+
+		}
+
 		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
 			rq->rt.rt_nr_migratory++;
 		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq)
 	struct task_struct *p = rq->curr;
 
 	p->se.exec_start = rq->clock;
+
+	/* The running task is never eligible for pushing */
+	dequeue_pushable_task(rq, p);
 }
 
 static const struct sched_class rt_sched_class = {