author     Bjoern Brandenburg <bbb@mpi-sws.org>    2015-08-09 07:18:56 -0400
committer  Bjoern Brandenburg <bbb@mpi-sws.org>    2017-05-26 17:12:42 -0400
commit     55b4b8689a88d6cb457ecfaabbccc09d5f7c121a (patch)
tree       e4f28b33dd7a0cb2f18bd0a6dd4aeb2af3504e65 /litmus
parent     d3976b0260a5241d7f5461d1a12b51b53c833a91 (diff)
Add PD^2 scheduler plugin
PFAIR: fix wrong memset()

PFAIR: don't arm timer for tardy tasks

If the system is overloaded, tasks may be tardy. In that case, do not
arm timers. Rather, add tardy tasks back into the ready queue right away.

PFAIR: use sched_trace_last_suspension_as_completion()

PFAIR: use inferred_sporadic_job_release_at()

rebase fix: use new hrtimer API in PFAIR plugin
Diffstat (limited to 'litmus')
-rw-r--r--    litmus/Kconfig          13
-rw-r--r--    litmus/Makefile          1
-rw-r--r--    litmus/sched_pfair.c  1230
3 files changed, 1244 insertions, 0 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 5916db3973ca..9fc402e79cfd 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,19 @@ config PLUGIN_CEDF
 	  On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
 	  makes little sense since there aren't any shared caches.
 
+config PLUGIN_PFAIR
+	bool "PFAIR"
+	default y
+	help
+	  Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
+	  The PFAIR plugin requires high resolution timers (for staggered
+	  quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
+	  even if a processor is idle, as quanta could be missed otherwise).
+	  Further, the PFAIR plugin uses the system tick and thus requires
+	  HZ=1000 to achieve reasonable granularity.
+
+	  If unsure, say Yes.
+
 config RELEASE_MASTER
 	bool "Release-master Support"
 	depends on SMP
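For reference, the requirements spelled out in the help text above translate into a kernel configuration roughly like the following fragment (a sketch only: CONFIG_PLUGIN_PFAIR is introduced by this patch, while the timer and HZ symbols are standard kernel options whose exact selection may differ between trees):

    CONFIG_PLUGIN_PFAIR=y
    CONFIG_HIGH_RES_TIMERS=y
    CONFIG_HZ_PERIODIC=y
    CONFIG_HZ_1000=y
    CONFIG_HZ=1000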
diff --git a/litmus/Makefile b/litmus/Makefile
index 86b865d26563..c969ce59db67 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -24,6 +24,7 @@ obj-y = sched_plugin.o litmus.o \
 	   sched_pfp.o
 
 obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
+obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
 
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..f75cc8c7e441
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1230 @@
1/*
2 * kernel/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24#include <litmus/trace.h>
25
26#include <litmus/bheap.h>
27
28/* to configure the cluster size */
29#include <litmus/litmus_proc.h>
30
31#include <litmus/clustered.h>
32
33static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
34
35struct subtask {
36 /* measured in quanta relative to job release */
37 quanta_t release;
38 quanta_t deadline;
39 quanta_t overlap; /* called "b bit" by PD^2 */
40 quanta_t group_deadline;
41};
42
43struct pfair_param {
44 quanta_t quanta; /* number of subtasks */
45 quanta_t cur; /* index of current subtask */
46
47 quanta_t release; /* in quanta */
48 quanta_t period; /* in quanta */
49
50 quanta_t last_quantum; /* when scheduled last */
51 int last_cpu; /* where scheduled last */
52
53 unsigned int needs_requeue:1;
54
55 struct pfair_cluster* cluster; /* where this task is scheduled */
56
57 struct subtask subtasks[0]; /* allocate together with pfair_param */
58};
59
60#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
61
62struct pfair_state {
63 struct cluster_cpu topology;
64
65 struct hrtimer quantum_timer;
66
67 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
68 * the time */
69 volatile quanta_t local_tick; /* What tick is the local CPU currently
70 * executing? Updated only by the local
71 * CPU. In QEMU, this may lag behind the
72 * current tick. In a real system, with
73 * proper timers and aligned quanta,
74 * that should only be the case for a
75 * very short time after the time
76 * advanced. With staggered quanta, it
77 * will lag for the duration of the
78 * offset.
79 */
80
81 struct task_struct* linked; /* the task that should be executing */
82 struct task_struct* local; /* the local copy of linked */
83 struct task_struct* scheduled; /* what is actually scheduled */
84
85 struct list_head out_of_budget; /* list of tasks that exhausted their allocation */
86
87 lt_t offset; /* stagger offset */
88 unsigned int missed_updates;
89 unsigned int missed_quanta;
90};
91
92struct pfair_cluster {
93 struct scheduling_cluster topology;
94
95 /* The "global" time in this cluster. */
96 quanta_t pfair_time; /* the "official" PFAIR clock */
97
98 /* The ready queue for this cluster. */
99 rt_domain_t pfair;
100
101 /* The set of jobs that should have their release enacted at the next
102 * quantum boundary.
103 */
104 struct bheap release_queue;
105 raw_spinlock_t release_lock;
106};
107
108static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
109{
110 return container_of(state->topology.cluster, struct pfair_cluster, topology);
111}
112
113static inline int cpu_id(struct pfair_state* state)
114{
115 return state->topology.id;
116}
117
118static inline struct pfair_state* from_cluster_list(struct list_head* pos)
119{
120 return list_entry(pos, struct pfair_state, topology.cluster_list);
121}
122
123static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
124{
125 return container_of(rt, struct pfair_cluster, pfair);
126}
127
128static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
129{
130 /* The ready_lock is used to serialize all scheduling events. */
131 return &cluster->pfair.ready_lock;
132}
133
134static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
135{
136 return cluster_lock(cpu_cluster(state));
137}
138
139DEFINE_PER_CPU(struct pfair_state, pfair_state);
141struct pfair_state **pstate; /* shortcut */
141
142static struct pfair_cluster* pfair_clusters;
143static int num_pfair_clusters;
144
145/* Enable for lots of trace info.
146 * #define PFAIR_DEBUG
147 */
148
149#ifdef PFAIR_DEBUG
150#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
151#define PTRACE(f, args...) TRACE(f, ## args)
152#else
153#define PTRACE_TASK(t, f, args...)
154#define PTRACE(f, args...)
155#endif
156
157/* gcc will inline all of these accessor functions... */
158static struct subtask* cur_subtask(struct task_struct* t)
159{
160 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
161}
162
163static quanta_t cur_deadline(struct task_struct* t)
164{
165 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
166}
167
168static quanta_t cur_release(struct task_struct* t)
169{
170 /* This is early releasing: only the release of the first subtask
171 * counts. */
172 return tsk_pfair(t)->release;
173}
174
175static quanta_t cur_overlap(struct task_struct* t)
176{
177 return cur_subtask(t)->overlap;
178}
179
180static quanta_t cur_group_deadline(struct task_struct* t)
181{
182 quanta_t gdl = cur_subtask(t)->group_deadline;
183 if (gdl)
184 return gdl + tsk_pfair(t)->release;
185 else
186 return gdl;
187}
188
189
190static int pfair_higher_prio(struct task_struct* first,
191 struct task_struct* second)
192{
193 return /* first task must exist */
194 first && (
195 /* Does the second task exist and is it a real-time task? If
196 * not, the first task (which is a RT task) has higher
197 * priority.
198 */
199 !second || !is_realtime(second) ||
200
201 /* Is the (subtask) deadline of the first task earlier?
202 * Then it has higher priority.
203 */
204 time_before(cur_deadline(first), cur_deadline(second)) ||
205
206 /* Do we have a deadline tie?
207 * Then break by B-bit.
208 */
209 (cur_deadline(first) == cur_deadline(second) &&
210 (cur_overlap(first) > cur_overlap(second) ||
211
212 /* Do we have a B-bit tie?
213 * Then break by group deadline.
214 */
215 (cur_overlap(first) == cur_overlap(second) &&
216 (time_after(cur_group_deadline(first),
217 cur_group_deadline(second)) ||
218
219 /* Do we have a group deadline tie?
220 * Then break by PID, which are unique.
221 */
222 (cur_group_deadline(first) ==
223 cur_group_deadline(second) &&
224 first->pid < second->pid))))));
225}
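
/* Worked illustration of the tie-breaking chain above (illustrative
 * example): suppose two subtasks share a deadline at quantum 10. The one
 * with b-bit 1 beats the one with b-bit 0, because its window overlaps the
 * successor window and postponing it would squeeze that successor. If both
 * b-bits are 1, the subtask with the *later* group deadline wins (note the
 * use of time_after() above), since it heads the longer cascade of
 * overlapping windows. Only when deadline, b-bit, and group deadline all
 * tie does the PID comparison decide, which keeps the priority order total.
 */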
226
227int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
228{
229 return pfair_higher_prio(bheap2task(a), bheap2task(b));
230}
231
232static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
233{
234 struct pfair_cluster* cluster = from_domain(rt);
235 unsigned long flags;
236
237 raw_spin_lock_irqsave(&cluster->release_lock, flags);
238
239 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
240
241 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
242}
243
244static void prepare_release(struct task_struct* t, quanta_t at)
245{
246 tsk_pfair(t)->release = at;
247 tsk_pfair(t)->cur = 0;
248}
249
250/* pull released tasks from the release queue */
251static void poll_releases(struct pfair_cluster* cluster)
252{
253 raw_spin_lock(&cluster->release_lock);
254 __merge_ready(&cluster->pfair, &cluster->release_queue);
255 raw_spin_unlock(&cluster->release_lock);
256}
257
258static void check_preempt(struct task_struct* t)
259{
260 int cpu = NO_CPU;
261 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
262 is_present(t)) {
263 /* the task can be scheduled and
264 * is not scheduled where it ought to be scheduled
265 */
266 cpu = tsk_rt(t)->linked_on != NO_CPU ?
267 tsk_rt(t)->linked_on :
268 tsk_rt(t)->scheduled_on;
269 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
270 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
271 /* preempt */
272 litmus_reschedule(cpu);
273 }
274}
275
276/* caller must hold pfair.ready_lock */
277static void drop_all_references(struct task_struct *t)
278{
279 int cpu;
280 struct pfair_state* s;
281 struct pfair_cluster* cluster;
282 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
283		/* It must be in the ready queue; drop_all_references() isn't
284		 * called when the job is in a release queue. */
285 cluster = tsk_pfair(t)->cluster;
286 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
287 tsk_rt(t)->heap_node);
288 }
289 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
290 s = &per_cpu(pfair_state, cpu);
291 if (s->linked == t)
292 s->linked = NULL;
293 if (s->local == t)
294 s->local = NULL;
295 if (s->scheduled == t)
296 s->scheduled = NULL;
297 }
298 /* make sure we don't have a stale linked_on field */
299 tsk_rt(t)->linked_on = NO_CPU;
300
301 /* make sure we're not queued for re-releasing */
302 if (in_list(&tsk_rt(t)->list))
303 {
304 TRACE_TASK(t, "removing from out_of_budget queue\n");
305 list_del(&tsk_rt(t)->list);
306 }
307}
308
309static void pfair_prepare_next_period(struct task_struct* t)
310{
311 struct pfair_param* p = tsk_pfair(t);
312
313 prepare_for_next_period(t);
314 tsk_rt(t)->completed = 0;
315 p->release = time2quanta(get_release(t), CEIL);
316}
317
318/* returns 1 if the task needs to go to the release queue */
319static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
320{
321 struct pfair_param* p = tsk_pfair(t);
322 int to_relq;
323 p->cur = (p->cur + 1) % p->quanta;
324 if (!p->cur) {
325 if (is_present(t)) {
326 /* The job overran; we start a new budget allocation. */
327 TRACE_TASK(t, "overran budget, preparing next period\n");
328 sched_trace_task_completion(t, 1);
329 pfair_prepare_next_period(t);
330 } else {
331 /* remove task from system until it wakes */
332 drop_all_references(t);
333 p->needs_requeue = 1;
334 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
335 cpu, p->cur);
336 return 0;
337 }
338 }
339 to_relq = time_after(cur_release(t), time);
340 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d "
341 "(cur_release:%lu time:%lu present:%d on_cpu=%d)\n",
342 cpu, p->cur, to_relq, cur_release(t), time,
343 tsk_rt(t)->present, tsk_rt(t)->scheduled_on);
344 return to_relq;
345}
346
347static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
348{
349 struct task_struct* l;
350 struct pfair_param* p;
351 struct list_head* pos;
352 struct pfair_state* cpu;
353
354 list_for_each(pos, &cluster->topology.cpus) {
355 cpu = from_cluster_list(pos);
356 l = cpu->linked;
357 cpu->missed_updates += cpu->linked != cpu->local;
358 if (l) {
359 p = tsk_pfair(l);
360 p->last_quantum = time;
361 p->last_cpu = cpu_id(cpu);
362 if (advance_subtask(time, l, cpu_id(cpu))) {
363 cpu->linked = NULL;
364 tsk_rt(l)->linked_on = NO_CPU;
365 PTRACE_TASK(l, "should go to release queue. "
366 "scheduled_on=%d present=%d\n",
367 tsk_rt(l)->scheduled_on,
368 tsk_rt(l)->present);
369 list_add(&tsk_rt(l)->list, &cpu->out_of_budget);
370 }
371 }
372 }
373}
374
375static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
376{
377 int cpu;
378 if (tsk_rt(t)->scheduled_on != NO_CPU) {
379 /* always observe scheduled_on linkage */
380 default_cpu = tsk_rt(t)->scheduled_on;
381 } else if (tsk_pfair(t)->last_quantum == time - 1) {
382 /* back2back quanta */
383 /* Only observe last_quantum if no scheduled_on is in the way.
384 * This should only kick in if a CPU missed quanta, and that
385 * *should* only happen in QEMU.
386 */
387 cpu = tsk_pfair(t)->last_cpu;
388 if (!pstate[cpu]->linked ||
389 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
390 default_cpu = cpu;
391 }
392 }
393 return default_cpu;
394}
395
396/* returns one if linking was redirected */
397static int pfair_link(quanta_t time, int cpu,
398 struct task_struct* t)
399{
400 int target = target_cpu(time, t, cpu);
401 struct task_struct* prev = pstate[cpu]->linked;
402 struct task_struct* other;
403 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
404
405 if (target != cpu) {
406 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
407 other = pstate[target]->linked;
408 pstate[target]->linked = t;
409 tsk_rt(t)->linked_on = target;
410 if (!other)
411 /* linked ok, but reschedule this CPU */
412 return 1;
413 if (target < cpu) {
414 /* link other to cpu instead */
415 tsk_rt(other)->linked_on = cpu;
416 pstate[cpu]->linked = other;
417 if (prev) {
418 /* prev got pushed back into the ready queue */
419 tsk_rt(prev)->linked_on = NO_CPU;
420 __add_ready(&cluster->pfair, prev);
421 }
422 /* we are done with this cpu */
423 return 0;
424 } else {
425			/* re-add other, its original CPU has not been considered yet */
426 tsk_rt(other)->linked_on = NO_CPU;
427 __add_ready(&cluster->pfair, other);
428 /* reschedule this CPU */
429 return 1;
430 }
431 } else {
432 pstate[cpu]->linked = t;
433 tsk_rt(t)->linked_on = cpu;
434 if (prev) {
435 /* prev got pushed back into the ready queue */
436 tsk_rt(prev)->linked_on = NO_CPU;
437 __add_ready(&cluster->pfair, prev);
438 }
439 /* we are done with this CPU */
440 return 0;
441 }
442}
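
/* Illustrative walk-through of the redirection above: suppose
 * schedule_subtasks() tries to link task T to CPU 2, but T is still
 * scheduled on CPU 0, so target_cpu() returns 0. T is then linked to
 * CPU 0; if CPU 0 already had a task U linked and 0 < 2, U is re-linked
 * to CPU 2 (displacing whatever CPU 2 had linked back into the ready
 * queue) and the function returns 0, because CPU 2 now has a task and
 * needs no further linking attempt. Had the target index been greater
 * than 2 instead, U would be pushed back into the ready queue to be
 * reconsidered when its own CPU is visited, and the function would
 * return 1 so that CPU 2 retries with the next ready task.
 */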
443
444static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
445{
446 int retry;
447 struct list_head *pos;
448 struct pfair_state *cpu_state;
449
450 list_for_each(pos, &cluster->topology.cpus) {
451 cpu_state = from_cluster_list(pos);
452 retry = 1;
453#ifdef CONFIG_RELEASE_MASTER
454 /* skip release master */
455 if (cluster->pfair.release_master == cpu_id(cpu_state))
456 continue;
457#endif
458 while (retry) {
459 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
460 cpu_state->linked))
461 retry = pfair_link(time, cpu_id(cpu_state),
462 __take_ready(&cluster->pfair));
463 else
464 retry = 0;
465 }
466 }
467}
468
469static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
470{
471 struct pfair_state *cpu;
472 struct list_head* pos;
473
474 /* called with interrupts disabled */
475 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
476 time, litmus_clock());
477 raw_spin_lock(cluster_lock(cluster));
478 PTRACE("<<< Q %lu at %llu\n",
479 time, litmus_clock());
480
481 sched_trace_quantum_boundary();
482
483 advance_subtasks(cluster, time);
484 poll_releases(cluster);
485 schedule_subtasks(cluster, time);
486
487 list_for_each(pos, &cluster->topology.cpus) {
488 cpu = from_cluster_list(pos);
489 if (cpu->linked)
490 PTRACE_TASK(cpu->linked,
491 " linked on %d.\n", cpu_id(cpu));
492 else
493 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
494 }
495 /* We are done. Advance time. */
496 mb();
497 list_for_each(pos, &cluster->topology.cpus) {
498 cpu = from_cluster_list(pos);
499 if (cpu->local_tick != cpu->cur_tick) {
500 TRACE("BAD Quantum not acked on %d "
501 "(l:%lu c:%lu p:%lu)\n",
502 cpu_id(cpu),
503 cpu->local_tick,
504 cpu->cur_tick,
505 cluster->pfair_time);
506 cpu->missed_quanta++;
507 }
508 cpu->cur_tick = time;
509 }
510 PTRACE(">>> Q %lu at %llu\n",
511 time, litmus_clock());
512 raw_spin_unlock(cluster_lock(cluster));
513}
514
515static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
516{
517 quanta_t loc;
518
519 goto first; /* skip mb() on first iteration */
520 do {
521 cpu_relax();
522 mb();
523 first: loc = state->cur_tick;
524 /* FIXME: what if loc > cur? */
525 } while (time_before(loc, q));
526 PTRACE("observed cur_tick:%lu >= q:%lu\n",
527 loc, q);
528}
529
530static quanta_t current_quantum(struct pfair_state* state)
531{
532 lt_t t = litmus_clock() - state->offset;
533 return time2quanta(t, FLOOR);
534}
535
536static void catchup_quanta(quanta_t from, quanta_t target,
537 struct pfair_state* state)
538{
539 quanta_t cur = from, time;
540 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
541 from, target);
542 while (time_before(cur, target)) {
543 wait_for_quantum(cur, state);
544 cur++;
545 time = cmpxchg(&cpu_cluster(state)->pfair_time,
546 cur - 1, /* expected */
547 cur /* next */
548 );
549 if (time == cur - 1)
550 schedule_next_quantum(cpu_cluster(state), cur);
551 }
552 TRACE("+++> catching up done\n");
553}
554
555/* pfair_tick - this function is called for every local timer
556 * interrupt.
557 */
558static void pfair_tick(struct task_struct* t)
559{
560 struct pfair_state* state = this_cpu_ptr(&pfair_state);
561 quanta_t time, cur;
562 int retry = 10;
563
564 do {
565 cur = current_quantum(state);
566 PTRACE("q %lu at %llu\n", cur, litmus_clock());
567
568 /* Attempt to advance time. First CPU to get here
569 * will prepare the next quantum.
570 */
571 time = cpu_cluster(state)->pfair_time;
572 if (time == cur - 1)
573 {
574 /* looks good, see if we can advance the time */
575 time = cmpxchg(&cpu_cluster(state)->pfair_time,
576 cur - 1, /* expected */
577 cur /* next */
578 );
579 }
580
581 if (time == cur - 1) {
582 /* exchange succeeded */
583 wait_for_quantum(cur - 1, state);
584 schedule_next_quantum(cpu_cluster(state), cur);
585 retry = 0;
586 } else if (time_before(time, cur - 1)) {
587 /* the whole system missed a tick !? */
588 catchup_quanta(time, cur, state);
589 retry--;
590 } else if (time_after(time, cur)) {
591			/* our timer is lagging behind!? */
592 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
593 retry--;
594 } else {
595 /* Some other CPU already started scheduling
596 * this quantum. Let it do its job and then update.
597 */
598 retry = 0;
599 }
600 } while (retry);
601
602 /* Spin locally until time advances. */
603 wait_for_quantum(cur, state);
604
605 /* copy assignment */
606 /* FIXME: what if we race with a future update? Corrupted state? */
607 state->local = state->linked;
608 /* signal that we are done */
609 mb();
610 state->local_tick = state->cur_tick;
611
612 if (state->local != current
613 && (is_realtime(current) || is_present(state->local)))
614 litmus_reschedule_local();
615}
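
/* Illustrative example of the advancement protocol above: if two CPUs in
 * the same cluster take the tick for quantum Q concurrently, only the CPU
 * whose cmpxchg() observes pfair_time == Q - 1 prepares quantum Q via
 * schedule_next_quantum(). The other CPU, whether it loses the cmpxchg()
 * race or simply reads the already-advanced value, ends up in the final
 * branch and merely spins in wait_for_quantum() until cur_tick reaches Q
 * before copying its ->linked assignment into ->local.
 */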
616
617static void process_out_of_budget_tasks(
618 struct pfair_state* state,
619 struct task_struct* prev,
620 unsigned int blocks)
621{
622 struct task_struct *t;
623
624 while (!list_empty(&state->out_of_budget))
625 {
626
627 t = list_first_entry(&state->out_of_budget,
628 struct task_struct, rt_param.list);
629 TRACE_TASK(t, "found on out_of_budget queue is_prev=%d\n", t == prev);
630 list_del(&tsk_rt(t)->list);
631 if (t != prev || !blocks)
632 {
633 if (time_after(cur_release(t), state->local_tick)) {
634 TRACE_TASK(t, "adding to release queue (budget exhausted)\n");
635 add_release(&cpu_cluster(state)->pfair, t);
636 } else {
637 TRACE_TASK(t, "adding to ready queue (budget exhausted)\n");
638 sched_trace_task_release(t);
639 __add_ready(&cpu_cluster(state)->pfair, t);
640 }
641 } else {
642 TRACE_TASK(t, "not added to release queue (blocks=%d)\n", blocks);
643 tsk_pfair(t)->needs_requeue = 1;
644 }
645 if (unlikely(state->local == t)) {
646 TRACE_TASK(t, "still linked as ->local, cleaning up\n");
647 state->local = NULL;
648 }
649 }
650}
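
/* Note on the release-vs-ready decision above: as described in the commit
 * message, a timer-based release via add_release() is only used when the
 * task's next release still lies in the future (cur_release(t) is after
 * state->local_tick). A tardy task in an overloaded system has already
 * passed its release point, so it is put straight back into the ready
 * queue with __add_ready() instead of arming a timer for a time that has
 * already elapsed.
 */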
651
652/* Custom scheduling tick: called on each quantum boundary. */
653static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
654{
655 TS_QUANTUM_BOUNDARY_START;
656
657 pfair_tick(current);
658 hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);
659
660 TS_QUANTUM_BOUNDARY_END;
661 return HRTIMER_RESTART;
662}
663
664static int safe_to_schedule(struct task_struct* t, int cpu)
665{
666 int where = tsk_rt(t)->scheduled_on;
667 if (where != NO_CPU && where != cpu) {
668 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
669 "scheduled already on %d.\n", cpu, where);
670 return 0;
671 } else
672 return is_present(t) && !is_completed(t);
673}
674
675static struct task_struct* pfair_schedule(struct task_struct * prev)
676{
677 struct pfair_state* state = this_cpu_ptr(&pfair_state);
678 struct pfair_cluster* cluster = cpu_cluster(state);
679 int blocks, completion, out_of_time;
680 struct task_struct* next = NULL;
681
682#ifdef CONFIG_RELEASE_MASTER
683 /* Bail out early if we are the release master.
684 * The release master never schedules any real-time tasks.
685 */
686 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
687 goto out;
688 }
689#endif
690
691 raw_spin_lock(cpu_lock(state));
692
693 blocks = is_realtime(prev) && !is_current_running();
694 completion = is_realtime(prev) && is_completed(prev);
695 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
696 state->local_tick);
697
698 if (is_realtime(prev))
699 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
700 blocks, completion, out_of_time);
701
702 if (completion && !out_of_time) {
703 sched_trace_task_completion(prev, 0);
704 pfair_prepare_next_period(prev);
705 prepare_release(prev, cur_release(prev));
706 drop_all_references(prev);
707 list_add(&tsk_rt(prev)->list, &state->out_of_budget);
708 }
709
710 process_out_of_budget_tasks(state, prev, blocks);
711
712 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
713 next = state->local;
714
715 if (prev != next) {
716 tsk_rt(prev)->scheduled_on = NO_CPU;
717 if (next)
718 tsk_rt(next)->scheduled_on = cpu_id(state);
719 }
720 sched_state_task_picked();
721 raw_spin_unlock(cpu_lock(state));
722
723 if (next)
724 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
725 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
726 else if (is_realtime(prev))
727 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
728
729#ifdef CONFIG_RELEASE_MASTER
730out:
731#endif
732
733 if (unlikely(!hrtimer_active(&state->quantum_timer))) {
734 TRACE("activating quantum timer start=%llu\n",
735 hrtimer_get_expires(&state->quantum_timer));
736 hrtimer_start(&state->quantum_timer,
737 hrtimer_get_expires(&state->quantum_timer),
738 HRTIMER_MODE_ABS);
739 }
740
741 return next;
742}
743
744static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
745{
746 unsigned long flags;
747 struct pfair_cluster* cluster;
748
749 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
750
751 cluster = tsk_pfair(t)->cluster;
752
753 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
754
755 prepare_release(t, cluster->pfair_time + 1);
756 release_at(t, quanta2time(cur_release(t)));
757
758 t->rt_param.scheduled_on = NO_CPU;
759 t->rt_param.linked_on = NO_CPU;
760
761 if (is_scheduled) {
762#ifdef CONFIG_RELEASE_MASTER
763 if (task_cpu(t) != cluster->pfair.release_master)
764#endif
765 t->rt_param.scheduled_on = task_cpu(t);
766 }
767
768 if (on_rq || is_scheduled) {
769 tsk_rt(t)->present = 1;
770 __add_ready(&cluster->pfair, t);
771 } else {
772 tsk_rt(t)->present = 0;
773 tsk_pfair(t)->needs_requeue = 1;
774 }
775
776 check_preempt(t);
777
778 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
779}
780
781static void pfair_task_wake_up(struct task_struct *t)
782{
783 unsigned long flags;
784 lt_t now;
785 struct pfair_cluster* cluster;
786 struct pfair_state* state;
787 int sporadic_release = 0;
788
789 cluster = tsk_pfair(t)->cluster;
790
791 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
792 litmus_clock(), cur_release(t), cluster->pfair_time);
793
794 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
795
796 state = this_cpu_ptr(&pfair_state);
797
798 /* If a task blocks and wakes before its next job release,
799 * then it may resume if it is currently linked somewhere
800 * (as if it never blocked at all). Otherwise, we have a
801 * new sporadic job release.
802 */
803 now = litmus_clock();
804 if (is_tardy(t, now)) {
805 TRACE_TASK(t, "sporadic release!\n");
806 sporadic_release = 1;
807 inferred_sporadic_job_release_at(t, now);
808 prepare_release(t, time2quanta(now, CEIL));
809 }
810
811 /* only add to ready queue if the task isn't still linked somewhere */
812 if (tsk_pfair(t)->needs_requeue) {
813 tsk_pfair(t)->needs_requeue = 0;
814 TRACE_TASK(t, "requeueing required (released:%d)\n",
815 !time_after(cur_release(t), state->local_tick));
816 tsk_rt(t)->completed = 0;
817 if (time_after(cur_release(t), state->local_tick)
818 && !sporadic_release)
819 add_release(&cluster->pfair, t);
820 else
821 __add_ready(&cluster->pfair, t);
822 }
823
824 check_preempt(t);
825
826 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
827 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
828}
829
830static void pfair_task_block(struct task_struct *t)
831{
832 BUG_ON(!is_realtime(t));
833 TRACE_TASK(t, "blocks at %llu, state:%d\n",
834 litmus_clock(), t->state);
835}
836
837static void pfair_task_exit(struct task_struct * t)
838{
839 unsigned long flags;
840 struct pfair_cluster *cluster;
841
842 BUG_ON(!is_realtime(t));
843
844 cluster = tsk_pfair(t)->cluster;
845
846	/* Remove the task from the release or ready queue, and ensure
847	 * that it is not the scheduled task for ANY CPU. We
848	 * do this blanket check because occasionally when
849 * tasks exit while blocked, the task_cpu of the task
850 * might not be the same as the CPU that the PFAIR scheduler
851 * has chosen for it.
852 */
853 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
854
855 TRACE_TASK(t, "RIP, state:%d\n", t->state);
856 drop_all_references(t);
857
858 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
859
860 kfree(t->rt_param.pfair);
861 t->rt_param.pfair = NULL;
862}
863
864static void init_subtask(struct subtask* sub, unsigned long i,
865 lt_t quanta, lt_t period)
866{
867 /* since i is zero-based, the formulas are shifted by one */
868 lt_t tmp;
869
870 /* release */
871 tmp = period * i;
872 do_div(tmp, quanta); /* floor */
873 sub->release = (quanta_t) tmp;
874
875 /* deadline */
876 tmp = period * (i + 1);
877 if (do_div(tmp, quanta)) /* ceil */
878 tmp++;
879 sub->deadline = (quanta_t) tmp;
880
881 /* next release */
882 tmp = period * (i + 1);
883 do_div(tmp, quanta); /* floor */
884 sub->overlap = sub->deadline - (quanta_t) tmp;
885
886 /* Group deadline.
887 * Based on the formula given in Uma's thesis.
888 */
889 if (2 * quanta >= period) {
890 /* heavy */
891 tmp = (sub->deadline - (i + 1)) * period;
892 if (period > quanta &&
893 do_div(tmp, (period - quanta))) /* ceil */
894 tmp++;
895 sub->group_deadline = (quanta_t) tmp;
896 } else
897 sub->group_deadline = 0;
898}
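
/* Worked example for the formulas above, for a task of weight 3/5
 * (quanta = 3, period = 5; i is the zero-based subtask index):
 *
 *   i | release = floor(5*i/3) | deadline = ceil(5*(i+1)/3) | b-bit | group deadline
 *   0 |           0            |             2              |   1   |       3
 *   1 |           1            |             4              |   1   |       5
 *   2 |           3            |             5              |   0   |       5
 *
 * Since 2 * quanta = 6 >= period = 5, the task is "heavy" and the group
 * deadlines are non-zero, e.g. for i = 0: ceil((2 - 1) * 5 / (5 - 3)) = 3.
 */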
899
900static void dump_subtasks(struct task_struct* t)
901{
902 unsigned long i;
903 for (i = 0; i < t->rt_param.pfair->quanta; i++)
904 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
905 i + 1,
906 t->rt_param.pfair->subtasks[i].release,
907 t->rt_param.pfair->subtasks[i].deadline,
908 t->rt_param.pfair->subtasks[i].overlap,
909 t->rt_param.pfair->subtasks[i].group_deadline);
910}
911
912static long pfair_admit_task(struct task_struct* t)
913{
914 lt_t quanta;
915 lt_t period;
916 s64 quantum_length = LITMUS_QUANTUM_LENGTH_NS;
917 struct pfair_param* param;
918 unsigned long i;
919
920 /* first check that the task is in the right cluster */
921 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
922 cpu_cluster(pstate[task_cpu(t)]))
923 return -EINVAL;
924
925 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
926 printk(KERN_INFO "%s: Admission rejected. "
927 "Only implicit deadlines are currently supported.\n",
928 litmus->plugin_name);
929 return -EINVAL;
930 }
931
932 /* Pfair is a tick-based scheduler, so the unit of time
933 * is one quantum. Calculate quantum-based parameters for everything.
934 * (Ceiling of exec cost, floor of period.)
935 */
936
937 quanta = get_exec_cost(t);
938 period = get_rt_period(t);
939
940 quanta = time2quanta(get_exec_cost(t), CEIL);
941
942 if (do_div(period, quantum_length))
943 printk(KERN_WARNING
944 "The period of %s/%d is not a multiple of %llu.\n",
945 t->comm, t->pid, (unsigned long long) quantum_length);
946
947 if (quanta == period) {
948 PTRACE_TASK(t, "Admitting weight 1.0 task. (%llu, %llu).\n", quanta, period);
949 }
950
951 param = kzalloc(sizeof(*param) +
952 quanta * sizeof(struct subtask), GFP_ATOMIC);
953
954 if (!param)
955 return -ENOMEM;
956
957 param->quanta = quanta;
958 param->period = period;
959
960 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
961
962 for (i = 0; i < quanta; i++)
963 init_subtask(param->subtasks + i, i, quanta, period);
964
965 if (t->rt_param.pfair)
966 /* get rid of stale allocation */
967 kfree(t->rt_param.pfair);
968
969 t->rt_param.pfair = param;
970
971 /* spew out some debug info */
972 dump_subtasks(t);
973
974 /* Disable generic budget enforcement (if enabled).
975 * The plugin provides its own (non-optional) enforcement
976 * of allocations at quantum granularity. */
977 tsk_rt(t)->task_params.budget_policy = NO_ENFORCEMENT;
978
979 return 0;
980}
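
/* Example of the parameter conversion above, assuming a 1 ms quantum
 * (LITMUS_QUANTUM_LENGTH_NS = 1,000,000): a task with an execution cost of
 * 2.5 ms and a period of 10 ms is admitted with
 * quanta = time2quanta(2,500,000 ns, CEIL) = 3 and period = 10 quanta.
 * Had the period been 10.5 ms, do_div() would leave a non-zero remainder,
 * the "not a multiple" warning above would be printed, and the period
 * would silently be rounded down to 10 quanta.
 */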
981
982static void pfair_init_cluster(struct pfair_cluster* cluster)
983{
984 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
985 bheap_init(&cluster->release_queue);
986 raw_spin_lock_init(&cluster->release_lock);
987 INIT_LIST_HEAD(&cluster->topology.cpus);
988}
989
990static void cleanup_clusters(void)
991{
992 int i;
993
994 if (num_pfair_clusters)
995 kfree(pfair_clusters);
996 pfair_clusters = NULL;
997 num_pfair_clusters = 0;
998
999 /* avoid stale pointers */
1000 for (i = 0; i < num_online_cpus(); i++) {
1001 pstate[i]->topology.cluster = NULL;
1002 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
1003 pstate[i]->missed_updates, pstate[i]->missed_quanta);
1004 }
1005}
1006
1007static struct domain_proc_info pfair_domain_proc_info;
1008static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
1009{
1010 *ret = &pfair_domain_proc_info;
1011 return 0;
1012}
1013
1014static void pfair_setup_domain_proc(void)
1015{
1016 int i, cpu, domain;
1017#ifdef CONFIG_RELEASE_MASTER
1018 int release_master = atomic_read(&release_master_cpu);
1019 /* skip over the domain with the release master if cluster size is 1 */
1020 int cluster_size = num_online_cpus() / num_pfair_clusters;
1021 int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
1022 release_master : NO_CPU;
1023#else
1024 int release_master = NO_CPU;
1025 int skip_domain = NO_CPU;
1026#endif
1027 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
1028 int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
1029 struct cd_mapping *map;
1030
1031 memset(&pfair_domain_proc_info, 0, sizeof(pfair_domain_proc_info));
1032 init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
1033 pfair_domain_proc_info.num_cpus = num_rt_cpus;
1034 pfair_domain_proc_info.num_domains = num_rt_domains;
1035
1036 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
1037 if (cpu == release_master)
1038 continue;
1039 map = &pfair_domain_proc_info.cpu_to_domains[i];
1040 /* pointer math to figure out the domain index */
1041 domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
1042 map->id = cpu;
1043 cpumask_set_cpu(domain, map->mask);
1044 ++i;
1045 }
1046
1047 for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
1048 struct pfair_cluster *cluster;
1049 struct list_head *pos;
1050
1051 if (domain == skip_domain)
1052 continue;
1053
1054 cluster = &pfair_clusters[domain];
1055 map = &pfair_domain_proc_info.domain_to_cpus[i];
1056 map->id = i;
1057
1058 list_for_each(pos, &cluster->topology.cpus) {
1059 cpu = cpu_id(from_cluster_list(pos));
1060 if (cpu != release_master)
1061 cpumask_set_cpu(cpu, map->mask);
1062 }
1063 ++i;
1064 }
1065}
1066
1067static long pfair_activate_plugin(void)
1068{
1069 int err, i;
1070 struct pfair_state* state;
1071 struct pfair_cluster* cluster;
1072 quanta_t now, start;
1073 int cluster_size;
1074 struct cluster_cpu* cpus[NR_CPUS];
1075 struct scheduling_cluster* clust[NR_CPUS];
1076 lt_t quantum_timer_start;
1077
1078 cluster_size = get_cluster_size(pfair_cluster_level);
1079
1080 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
1081 return -EINVAL;
1082
1083 num_pfair_clusters = num_online_cpus() / cluster_size;
1084
1085 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
1086 if (!pfair_clusters) {
1087 num_pfair_clusters = 0;
1088 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
1089 return -ENOMEM;
1090 }
1091
1092 state = this_cpu_ptr(&pfair_state);
1093 now = current_quantum(state);
1094 start = now + 50;
1095 quantum_timer_start = quanta2time(start);
1096 TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
1097 litmus_clock(),
1098 now,
1099 quantum_timer_start,
1100 time2quanta(quantum_timer_start, CEIL));
1101
1102 for (i = 0; i < num_pfair_clusters; i++) {
1103 cluster = &pfair_clusters[i];
1104 pfair_init_cluster(cluster);
1105 cluster->pfair_time = start;
1106 clust[i] = &cluster->topology;
1107#ifdef CONFIG_RELEASE_MASTER
1108 cluster->pfair.release_master = atomic_read(&release_master_cpu);
1109#endif
1110 }
1111
1112 for_each_online_cpu(i) {
1113 state = &per_cpu(pfair_state, i);
1114 state->cur_tick = start;
1115 state->local_tick = start;
1116 state->missed_quanta = 0;
1117 state->missed_updates = 0;
1118 state->offset = cpu_stagger_offset(i);
1119 hrtimer_set_expires(&state->quantum_timer,
1120 ns_to_ktime(quantum_timer_start + state->offset));
1121 cpus[i] = &state->topology;
1122 TRACE("cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
1123 INIT_LIST_HEAD(&state->out_of_budget);
1124 /* force rescheduling to start quantum timer */
1125 litmus_reschedule(i);
1126
1127 WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
1128 KERN_ERR "WARNING: no high resolution timers available!?\n");
1129 }
1130
1131 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
1132 cpus, num_online_cpus());
1133
1134 if (err < 0)
1135 cleanup_clusters();
1136 else
1137 pfair_setup_domain_proc();
1138
1139 return err;
1140}
1141
1142static long pfair_deactivate_plugin(void)
1143{
1144 int cpu;
1145 struct pfair_state* state;
1146
1147 for_each_online_cpu(cpu) {
1148 state = &per_cpu(pfair_state, cpu);
1149 TRACE("stopping quantum timer on CPU%d\n", cpu);
1150 hrtimer_cancel(&state->quantum_timer);
1151 }
1152 cleanup_clusters();
1153 destroy_domain_proc_info(&pfair_domain_proc_info);
1154 return 0;
1155}
1156
1157/* Plugin object */
1158static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1159 .plugin_name = "PFAIR",
1160 .task_new = pfair_task_new,
1161 .task_exit = pfair_task_exit,
1162 .schedule = pfair_schedule,
1163 .task_wake_up = pfair_task_wake_up,
1164 .task_block = pfair_task_block,
1165 .admit_task = pfair_admit_task,
1166 .complete_job = complete_job,
1167 .activate_plugin = pfair_activate_plugin,
1168 .deactivate_plugin = pfair_deactivate_plugin,
1169 .get_domain_proc_info = pfair_get_domain_proc_info,
1170};
1171
1172
1173static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1174
1175static int __init init_pfair(void)
1176{
1177 int cpu, err, fs;
1178 struct pfair_state *state;
1179
1180 /*
1181	 * Initialize the pstate shortcut array for per-CPU pfair state.
1182	 * There could be a problem here if a CPU is removed while we are
1183	 * doing this initialization, or if CPUs are added or removed
1184	 * later, but we do not support CPU hotplug at the moment anyway.
1185 */
1186 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1187
1188 /* initialize CPU state */
1189 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1190 state = &per_cpu(pfair_state, cpu);
1191 hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1192 state->quantum_timer.function = on_quantum_boundary;
1193 state->topology.id = cpu;
1194 state->cur_tick = 0;
1195 state->local_tick = 0;
1196 state->linked = NULL;
1197 state->local = NULL;
1198 state->scheduled = NULL;
1199 state->missed_quanta = 0;
1200 state->offset = cpu_stagger_offset(cpu);
1201 pstate[cpu] = state;
1202 }
1203
1204 pfair_clusters = NULL;
1205 num_pfair_clusters = 0;
1206
1207 err = register_sched_plugin(&pfair_plugin);
1208 if (!err) {
1209 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1210 if (!fs)
1211 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1212 else
1213 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1214 }
1215
1216 return err;
1217}
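
/* Usage note (assuming the standard LITMUS^RT proc interface and
 * liblitmus tooling, which are not part of this file): once the module is
 * initialized, the plugin is typically activated while no real-time tasks
 * are present, e.g. via
 *
 *     echo PFAIR > /proc/litmus/active_plugin
 *
 * or the liblitmus `setsched PFAIR` helper, which in turn triggers
 * pfair_activate_plugin() above.
 */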
1218
1219static void __exit clean_pfair(void)
1220{
1221 kfree(pstate);
1222
1223 if (cluster_file)
1224 remove_proc_entry("cluster", pfair_dir);
1225 if (pfair_dir)
1226 remove_plugin_proc_dir(&pfair_plugin);
1227}
1228
1229module_init(init_pfair);
1230module_exit(clean_pfair);