path: root/kernel
author    Bjoern Brandenburg <bbb@mpi-sws.org>    2015-08-09 07:18:48 -0400
committer Bjoern Brandenburg <bbb@mpi-sws.org>    2017-05-26 17:12:28 -0400
commit    3baa55c19ffb567aa48568fa69dd17ad6f70d31d (patch)
tree      7e79fd398705929f2db40ba239895cc60762f61f /kernel
parent    cbe61859a233702ed8e6723b3b133d1f2ae1ae2c (diff)
Add LITMUS^RT core implementation
This patch adds the core of LITMUS^RT:

  - library functionality (heaps, rt_domain, prioritization, etc.)
  - budget enforcement logic
  - job management
  - system call backends
  - virtual devices (control page, etc.)
  - scheduler plugin API (and dummy plugin)

This code compiles, but is not yet integrated with the rest of Linux.

Squashed changes:

LITMUS^RT Core: add get_current_budget() system call
    Allow userspace to figure out the used-up and remaining budget of a task.

Adds deadline field to control page and updates it when setting up jobs for release.

Adds control page deadline offset

ftdev: respect O_NONBLOCK flag in ftdev_read()
    Don't block if userspace wants to go on doing something else.

Export job release time and job sequence number in ctrl page

Add alternate complete_job() default implementation
    Let jobs sleep like regular Linux tasks by suspending and waking them
    with a one-shot timer. Plugins can opt into using this implementation
    instead of the classic complete_job() implementation (or custom
    implementations). (A sketch of this idea follows this change log.)

Fix RCU locking in sys_get_rt_task_param()
    sys_get_rt_task_param() is rarely used and apparently attracted some
    bitrot.

Free before setting NULL to prevent memory leak

Add hrtimer_start_on() support
    This patch replaces the previous implementation of hrtimer_start_on()
    by now using smp_call_function_single_async() to arm hrtimers on
    remote CPUs.

Expose LITMUS^RT system calls via control page ioctl()
    Rationale: make LITMUS^RT ops available in a way that does not create
    merge conflicts each time we rebase LITMUS^RT on top of a new kernel
    version. This also helps with portability to different architectures,
    as we no longer need to patch each architecture's syscall table.

Pick non-zero syscall ID start range
    To avoid interfering with Linux's magic reserved IOCTL numbers.

Don't preempt before time check in sleep_until_next_release()
    Avoid preempting jobs that are about to go to sleep soon anyway.

LITMUS^RT proc: fix wrong memset()

TRACE(): add TRACE_WARN_ON() helper
    Useful to replace BUG_ON() and WARN_ON() with a non-fatal
    TRACE()-based equivalent.

Add void* plugin_state pointer to task_struct

LITMUS^RT: split task admission into two functions

Plugin interface: add fork_task() callback

LITMUS^RT: Enable plugins to permit RT tasks to fork

one-shot complete_job(): set completed flag
    This could race with a SIGSTOP or some other forced suspension, but
    we'll let plugins handle this, should they actually care.

FP: add list-based ready queue

LITMUS^RT core: add should_wait_for_stack() callback
    Allow plugins to give up when waiting for a stack to become available.

LITMUS^RT core: add next_became_invalid() callback

LITMUS^RT core: add post-migration validation callback

LITMUS^RT core: be more careful when pull-migrating tasks
    Close more race windows and give plugins a chance to validate tasks
    after they have been migrated.

Add KConfig options for timer latency warnings

Add reservation creation API to plugin interface & syscalls

LITMUS^RT syscall: expose sys_reservation_create() via ioctl()

Add reservation configuration types to rt_param.h

Add basic generic reservation-based scheduling infrastructure

Switch to aligned quanta by default.
    For first-time users, aligned quanta is likely what's expected.

LITMUS^RT core: keep track of time of last suspension
    This information is needed to insert ST_COMPLETION records for
    sporadic tasks.

add fields for clock_nanosleep() support
    Need to communicate the intended wake-up time to the plugin wake-up
    handler.

LITMUS^RT core: add generic handler for sporadic job arrivals
    In particular, check if a job arrival is triggered from a
    clock_nanosleep() call.

add litmus->task_change_params() callback to plugin interface
    Will be used by adaptive C-EDF.

Call litmus->task_change_params() from sys_set_rt_task_param()

Move trace point definition to litmus/litmus.c
    If !CONFIG_SCHED_TASK_TRACE, but CONFIG_SCHED_LITMUS_TRACEPOINT, then
    we still need to define the tracepoint structures. This patch should
    be integrated with the earlier sched_task_trace.c patches during one
    of the next major rebasing efforts.

LITMUS^RT scheduling class: mark enqueued task as present

Remove unistd_*.h

rebase fix: update to new hrtimer API
    The new API is actually nicer and cleaner.

rebase fix: call lockdep_unpin_lock(&rq->lock, cookie)
    The LITMUS^RT scheduling class should also do the LOCKDEP dance.

LITMUS^RT core: break out non-preemptive flag defs
    Not every file including litmus.h needs to know this.

LITMUS^RT core: don't include debug_trace.h in litmus.h
    Including debug_trace.h introduces the TRACE() macro, which causes
    symbol clashes in some (rather obscure) drivers.

LITMUS^RT core: add litmus_preemption_in_progress flags
    Used to communicate that a preemption is in progress. Set by the
    scheduler; read by the plugins.

LITMUS^RT core: revise is_current_running() macro
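Editor's note on the "alternate complete_job() default implementation" entry above: it describes suspending a job like a regular Linux task and waking it with a one-shot timer. A minimal sketch of that idea, assuming only the stock hrtimer API, could look roughly as follows. All demo_* names are invented for illustration and are not the symbols added by this patch; lt_t is LITMUS^RT's nanosecond time type as used elsewhere in this file.

/* Hypothetical sketch, not the patch's code: suspend the current job until
 * its next release and wake it with a one-shot hrtimer.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <litmus/litmus.h>

struct demo_release_timer {
	struct hrtimer timer;
	struct task_struct *task;
};

static enum hrtimer_restart demo_on_release(struct hrtimer *timer)
{
	struct demo_release_timer *rt =
		container_of(timer, struct demo_release_timer, timer);

	wake_up_process(rt->task);	/* job becomes runnable again */
	return HRTIMER_NORESTART;	/* one-shot: do not re-arm */
}

static void demo_sleep_until_release(lt_t release /* absolute time, ns */)
{
	struct demo_release_timer rt = { .task = current };

	hrtimer_init(&rt.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	rt.timer.function = demo_on_release;

	set_current_state(TASK_UNINTERRUPTIBLE);
	hrtimer_start(&rt.timer, ns_to_ktime(release), HRTIMER_MODE_ABS);
	schedule();			/* suspend like a regular Linux task */
	hrtimer_cancel(&rt.timer);	/* harmless if it already fired */
}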
Diffstat (limited to 'kernel')
-rw-r--r--    kernel/sched/litmus.c    385
1 file changed, 385 insertions, 0 deletions
diff --git a/kernel/sched/litmus.c b/kernel/sched/litmus.c
new file mode 100644
index 000000000000..80e6928d9c55
--- /dev/null
+++ b/kernel/sched/litmus.c
@@ -0,0 +1,385 @@
/* This file is included from kernel/sched.c */

#include "sched.h"

#include <litmus/trace.h>
#include <litmus/sched_trace.h>

#include <litmus/litmus.h>
#include <litmus/budget.h>
#include <litmus/sched_plugin.h>
#include <litmus/preempt.h>
#include <litmus/np.h>

static void update_time_litmus(struct rq *rq, struct task_struct *p)
{
	u64 delta = rq->clock - p->se.exec_start;
	if (unlikely((s64)delta < 0))
		delta = 0;
	/* per job counter */
	p->rt_param.job_params.exec_time += delta;
	/* task counter */
	p->se.sum_exec_runtime += delta;
	if (delta) {
		TRACE_TASK(p, "charged %llu exec time (total:%llu, rem:%llu)\n",
			   delta, p->rt_param.job_params.exec_time, budget_remaining(p));
	}
	/* sched_clock() */
	p->se.exec_start = rq->clock;
	cpuacct_charge(p, delta);
}

static void double_rq_lock(struct rq *rq1, struct rq *rq2);
static void double_rq_unlock(struct rq *rq1, struct rq *rq2);

static struct task_struct *
litmus_schedule(struct rq *rq, struct task_struct *prev)
{
	struct task_struct *next;

#ifdef CONFIG_SMP
	struct rq* other_rq;
	long was_running;
	int from_where;
	lt_t _maybe_deadlock = 0;
#endif

	/* let the plugin schedule */
	next = litmus->schedule(prev);

	sched_state_plugin_check();

#ifdef CONFIG_SMP
	/* check if a global plugin pulled a task from a different RQ */
	if (next && task_rq(next) != rq) {
		/* we need to migrate the task */
		other_rq = task_rq(next);
		from_where = other_rq->cpu;
		TRACE_TASK(next, "migrate from %d\n", from_where);

		/* while we drop the lock, the prev task could change its
		 * state
		 */
		BUG_ON(prev != current);
		was_running = is_current_running();

		/* Don't race with a concurrent switch. This could deadlock in
		 * the case of cross or circular migrations. It's the job of
		 * the plugin to make sure that doesn't happen.
		 */
		TRACE_TASK(next, "stack_in_use=%d\n",
			   next->rt_param.stack_in_use);
		if (next->rt_param.stack_in_use != NO_CPU) {
			TRACE_TASK(next, "waiting to deschedule\n");
			_maybe_deadlock = litmus_clock();
		}

		raw_spin_unlock(&rq->lock);

		while (next->rt_param.stack_in_use != NO_CPU) {
			cpu_relax();
			mb();
			if (next->rt_param.stack_in_use == NO_CPU)
				TRACE_TASK(next,"descheduled. Proceeding.\n");

			if (!litmus->should_wait_for_stack(next)) {
				/* plugin aborted the wait */
				TRACE_TASK(next,
					   "plugin gave up waiting for stack\n");
				next = NULL;
				/* Make sure plugin is given a chance to
				 * reconsider. */
				litmus_reschedule_local();
				/* give up */
				raw_spin_lock(&rq->lock);
				goto out;
			}

			if (from_where != task_rq(next)->cpu) {
				/* The plugin should not give us something
				 * that other cores are trying to pull, too */
				TRACE_TASK(next, "next invalid: task keeps "
					   "shifting around!? "
					   "(%d->%d)\n",
					   from_where,
					   task_rq(next)->cpu);

				/* bail out */
				raw_spin_lock(&rq->lock);
				litmus->next_became_invalid(next);
				litmus_reschedule_local();
				next = NULL;
				goto out;
			}

			if (lt_before(_maybe_deadlock + 1000000000L,
				      litmus_clock())) {
				/* We've been spinning for 1s.
				 * Something can't be right!
				 * Let's abandon the task and bail out; at least
				 * we will have debug info instead of a hard
				 * deadlock.
				 */
#ifdef CONFIG_BUG_ON_MIGRATION_DEADLOCK
				BUG();
#else
				TRACE_TASK(next,"stack too long in use. "
					   "Deadlock?\n");
				next = NULL;

				/* bail out */
				raw_spin_lock(&rq->lock);
				goto out;
#endif
			}
		}
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
		if (next->on_cpu)
			TRACE_TASK(next, "waiting for !oncpu");
		while (next->on_cpu) {
			cpu_relax();
			mb();
		}
#endif
		double_rq_lock(rq, other_rq);
		if (other_rq == task_rq(next) &&
		    next->rt_param.stack_in_use == NO_CPU) {
			/* ok, we can grab it */
			set_task_cpu(next, rq->cpu);
			/* release the other CPU's runqueue, but keep ours */
			raw_spin_unlock(&other_rq->lock);
		} else {
			/* Either it moved or the stack was claimed; both is
			 * bad and forces us to abort the migration. */
			TRACE_TASK(next, "next invalid: no longer available\n");
			raw_spin_unlock(&other_rq->lock);
			litmus->next_became_invalid(next);
			next = NULL;
			goto out;
		}

		if (!litmus->post_migration_validate(next)) {
			TRACE_TASK(next, "plugin deems task now invalid\n");
			litmus_reschedule_local();
			next = NULL;
		}
	}
#endif

	/* check if the task became invalid while we dropped the lock */
	if (next && (!is_realtime(next) || !tsk_rt(next)->present)) {
		TRACE_TASK(next,
			"BAD: next (no longer?) valid\n");
		litmus->next_became_invalid(next);
		litmus_reschedule_local();
		next = NULL;
	}

	if (next) {
#ifdef CONFIG_SMP
		next->rt_param.stack_in_use = rq->cpu;
#else
		next->rt_param.stack_in_use = 0;
#endif
		update_rq_clock(rq);
		next->se.exec_start = rq->clock;
	}

out:
	update_enforcement_timer(next);
	return next;
}

static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
				int flags)
{
	tsk_rt(p)->present = 1;
	if (flags & ENQUEUE_WAKEUP) {
		sched_trace_task_resume(p);
		/* LITMUS^RT plugins need to update the state
		 * _before_ making it available in global structures.
		 * Linux gets away with being lazy about the task state
		 * update. We can't do that, hence we update the task
		 * state already here.
		 *
		 * WARNING: this needs to be re-evaluated when porting
		 *          to newer kernel versions.
		 */
		p->state = TASK_RUNNING;
		litmus->task_wake_up(p);

		rq->litmus.nr_running++;
	} else {
		TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
		p->se.exec_start = rq->clock;
	}
}

static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
				int flags)
{
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHED_TASK_TRACE
		tsk_rt(p)->job_params.last_suspension = litmus_clock();
#endif
		litmus->task_block(p);
		tsk_rt(p)->present = 0;
		sched_trace_task_block(p);

		rq->litmus.nr_running--;
	} else
		TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
}

static void yield_task_litmus(struct rq *rq)
{
	TS_SYSCALL_IN_START;
	TS_SYSCALL_IN_END;

	BUG_ON(rq->curr != current);
	/* sched_yield() is called to trigger delayed preemptions.
	 * Thus, mark the current task as needing to be rescheduled.
	 * This will cause the scheduler plugin to be invoked, which can
	 * then determine if a preemption is still required.
	 */
	clear_exit_np(current);
	litmus_reschedule_local();

	TS_SYSCALL_OUT_START;
}

/* Plugins are responsible for this.
 */
static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
{
}

static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
{
}

/* pick_next_task_litmus() - litmus_schedule() function
 *
 * return the next task to be scheduled
 */
static struct task_struct *pick_next_task_litmus(struct rq *rq,
		struct task_struct *prev, struct pin_cookie cookie)
{
	struct task_struct *next;

	if (is_realtime(prev))
		update_time_litmus(rq, prev);

	lockdep_unpin_lock(&rq->lock, cookie);
	TS_PLUGIN_SCHED_START;
	next = litmus_schedule(rq, prev);
	TS_PLUGIN_SCHED_END;
	lockdep_repin_lock(&rq->lock, cookie);

	/* This is a bit backwards: the other classes call put_prev_task()
	 * _after_ they've determined that the class has some queued tasks.
	 * We can't determine this easily because each plugin manages its own
	 * ready queues, and because in the case of globally shared queues,
	 * we really don't know whether we'll have something ready even if
	 * we test here. So we do it in reverse: first ask the plugin to
	 * provide a task, and if we find one, call put_prev_task() on the
	 * previously scheduled task.
	 */
	if (next)
		put_prev_task(rq, prev);

	return next;
}

static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
{
	if (is_realtime(p) && !queued) {
		update_time_litmus(rq, p);
		/* budget check for QUANTUM_ENFORCEMENT tasks */
		if (budget_enforced(p) && budget_exhausted(p)) {
			litmus_reschedule_local();
		}
	}
}

static void switched_to_litmus(struct rq *rq, struct task_struct *p)
{
}

static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
				int oldprio)
{
}

unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p)
{
	/* return infinity */
	return 0;
}

/* This is called when a task becomes a real-time task, either due to a SCHED_*
 * class transition or due to PI mutex inheritance. We don't handle Linux PI
 * mutex inheritance yet (and probably never will). Use the LITMUS^RT-provided
 * synchronization primitives instead.
 */
static void set_curr_task_litmus(struct rq *rq)
{
	rq->curr->se.exec_start = rq->clock;
}


#ifdef CONFIG_SMP
/* execve tries to rebalance the task in this scheduling domain.
 * We don't care about the scheduling domain; this can get called from
 * exec, fork, and wakeup.
 */
static int
select_task_rq_litmus(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	/* preemption is already disabled.
	 * We don't want to change cpu here
	 */
	return task_cpu(p);
}
#endif

static void update_curr_litmus(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	if (!is_realtime(p))
		return;

	update_time_litmus(rq, p);
}

const struct sched_class litmus_sched_class = {
	/* Since commit 34f971f6 the stop/migrate worker threads have a class
	 * of their own, which is the highest-priority class. We don't support
	 * cpu-hotplug or cpu throttling; this allows LITMUS^RT to use up to
	 * 1.0 CPU capacity.
	 */
	.next			= &dl_sched_class,
	.enqueue_task		= enqueue_task_litmus,
	.dequeue_task		= dequeue_task_litmus,
	.yield_task		= yield_task_litmus,

	.check_preempt_curr	= check_preempt_curr_litmus,

	.pick_next_task		= pick_next_task_litmus,
	.put_prev_task		= put_prev_task_litmus,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_litmus,
#endif

	.set_curr_task		= set_curr_task_litmus,
	.task_tick		= task_tick_litmus,

	.get_rr_interval	= get_rr_interval_litmus,

	.prio_changed		= prio_changed_litmus,
	.switched_to		= switched_to_litmus,

	.update_curr		= update_curr_litmus,
};
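
Editor's note: for orientation, the plugin-side contract that litmus_schedule() above relies on can be sketched as follows. The callback names mirror the litmus-> calls in this file; the signatures and bodies are assumptions for illustration only, since struct sched_plugin lives in litmus/sched_plugin.h, which is not part of this diff.

/* Hypothetical sketch only -- not code added by this patch. */
static struct task_struct *demo_schedule(struct task_struct *prev)
{
	/* Return the job to run next, or NULL to idle. The task may still
	 * be queued on another CPU's runqueue; litmus_schedule() then
	 * performs the pull migration shown above. */
	return NULL;
}

static bool demo_should_wait_for_stack(struct task_struct *t)
{
	/* Returning false aborts the busy-wait for t's stack and triggers
	 * a local reschedule ("plugin gave up waiting for stack"). */
	return true;
}

static void demo_next_became_invalid(struct task_struct *t)
{
	/* The chosen task vanished or kept migrating while the runqueue
	 * lock was dropped; requeue it so it is not lost. */
}

static bool demo_post_migration_validate(struct task_struct *t)
{
	/* Last chance to reject t after it has been pulled to this CPU. */
	return true;
}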