aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBjoern Brandenburg <bbb@mpi-sws.org>2013-02-12 13:21:11 -0500
committerNamhoon Kim <namhoonk@cs.unc.edu>2014-10-21 10:08:43 -0400
commit3e37b4b502634d6598bbc45d89fef854a2a13ae6 (patch)
treec23cac1f0cad20753a7e89586a4a3a15d93faaaf
parentdcd52da5373b0afb556b0d4fb006568dc44f2ba0 (diff)
Add PD^2 scheduler plugin
-rw-r--r--litmus/Kconfig13
-rw-r--r--litmus/Makefile1
-rw-r--r--litmus/sched_pfair.c1165
3 files changed, 1179 insertions, 0 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 38d9e433b345..babb43deffb5 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,19 @@ config PLUGIN_CEDF
12 On smaller platforms (e.g., ARM PB11MPCore), using C-EDF 12 On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
13 makes little sense since there aren't any shared caches. 13 makes little sense since there aren't any shared caches.
14 14
15config PLUGIN_PFAIR
16 bool "PFAIR"
17 default y
18 help
19 Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
20 The PFAIR plugin requires high resolution timers (for staggered
21 quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
22 even if a processor is idle, as quanta could be missed otherwise).
23 Further, the PFAIR plugin uses the system tick and thus requires
 24	  HZ=1000 to achieve reasonable granularity.
25
26 If unsure, say Yes.
27
15config RELEASE_MASTER 28config RELEASE_MASTER
16 bool "Release-master Support" 29 bool "Release-master Support"
17 depends on ARCH_HAS_SEND_PULL_TIMERS && SMP 30 depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
diff --git a/litmus/Makefile b/litmus/Makefile
index 8110a5ae1589..84b173a57b7d 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -24,6 +24,7 @@ obj-y = sched_plugin.o litmus.o \
24 sched_pfp.o 24 sched_pfp.o
25 25
26obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o 26obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
27obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
27 28
28obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o 29obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
29 30
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..91f1e08cdd5c
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1165 @@
1/*
2 * kernel/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24#include <litmus/trace.h>
25
26#include <litmus/bheap.h>
27
28/* to configure the cluster size */
29#include <litmus/litmus_proc.h>
30
31#include <litmus/clustered.h>
32
33static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
34
35struct subtask {
36 /* measured in quanta relative to job release */
37 quanta_t release;
38 quanta_t deadline;
39 quanta_t overlap; /* called "b bit" by PD^2 */
40 quanta_t group_deadline;
41};
42
43struct pfair_param {
44 quanta_t quanta; /* number of subtasks */
45 quanta_t cur; /* index of current subtask */
46
47 quanta_t release; /* in quanta */
48 quanta_t period; /* in quanta */
49
50 quanta_t last_quantum; /* when scheduled last */
51 int last_cpu; /* where scheduled last */
52
53 struct pfair_cluster* cluster; /* where this task is scheduled */
54
55 struct subtask subtasks[0]; /* allocate together with pfair_param */
56};
57
58#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
59
60struct pfair_state {
61 struct cluster_cpu topology;
62
63 struct hrtimer quantum_timer;
64
65 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
66 * the time */
67 volatile quanta_t local_tick; /* What tick is the local CPU currently
68 * executing? Updated only by the local
69 * CPU. In QEMU, this may lag behind the
70 * current tick. In a real system, with
71 * proper timers and aligned quanta,
72 * that should only be the case for a
73 * very short time after the time
74 * advanced. With staggered quanta, it
75 * will lag for the duration of the
76 * offset.
77 */
78
79 struct task_struct* linked; /* the task that should be executing */
80 struct task_struct* local; /* the local copy of linked */
81 struct task_struct* scheduled; /* what is actually scheduled */
82
83 lt_t offset; /* stagger offset */
84 unsigned int missed_updates;
85 unsigned int missed_quanta;
86};
87
88struct pfair_cluster {
89 struct scheduling_cluster topology;
90
91 /* The "global" time in this cluster. */
92 quanta_t pfair_time; /* the "official" PFAIR clock */
93
94 /* The ready queue for this cluster. */
95 rt_domain_t pfair;
96
97 /* The set of jobs that should have their release enacted at the next
98 * quantum boundary.
99 */
100 struct bheap release_queue;
101 raw_spinlock_t release_lock;
102};
103
104#define FLAGS_NEED_REQUEUE 0x1
105
106static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
107{
108 return container_of(state->topology.cluster, struct pfair_cluster, topology);
109}
110
111static inline int cpu_id(struct pfair_state* state)
112{
113 return state->topology.id;
114}
115
116static inline struct pfair_state* from_cluster_list(struct list_head* pos)
117{
118 return list_entry(pos, struct pfair_state, topology.cluster_list);
119}
120
121static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
122{
123 return container_of(rt, struct pfair_cluster, pfair);
124}
125
126static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
127{
128 /* The ready_lock is used to serialize all scheduling events. */
129 return &cluster->pfair.ready_lock;
130}
131
132static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
133{
134 return cluster_lock(cpu_cluster(state));
135}
136
137DEFINE_PER_CPU(struct pfair_state, pfair_state);
138struct pfair_state* *pstate; /* short cut */
139
140static struct pfair_cluster* pfair_clusters;
141static int num_pfair_clusters;
142
143/* Enable for lots of trace info.
144 * #define PFAIR_DEBUG
145 */
146
147#ifdef PFAIR_DEBUG
148#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
149#define PTRACE(f, args...) TRACE(f, ## args)
150#else
151#define PTRACE_TASK(t, f, args...)
152#define PTRACE(f, args...)
153#endif
154
155/* gcc will inline all of these accessor functions... */
156static struct subtask* cur_subtask(struct task_struct* t)
157{
158 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
159}
160
161static quanta_t cur_deadline(struct task_struct* t)
162{
163 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
164}
165
166static quanta_t cur_release(struct task_struct* t)
167{
168 /* This is early releasing: only the release of the first subtask
169 * counts. */
170 return tsk_pfair(t)->release;
171}
172
173static quanta_t cur_overlap(struct task_struct* t)
174{
175 return cur_subtask(t)->overlap;
176}
177
178static quanta_t cur_group_deadline(struct task_struct* t)
179{
180 quanta_t gdl = cur_subtask(t)->group_deadline;
181 if (gdl)
182 return gdl + tsk_pfair(t)->release;
183 else
184 return gdl;
185}
186
187
188static int pfair_higher_prio(struct task_struct* first,
189 struct task_struct* second)
190{
191 return /* first task must exist */
192 first && (
193 /* Does the second task exist and is it a real-time task? If
194 * not, the first task (which is a RT task) has higher
195 * priority.
196 */
197 !second || !is_realtime(second) ||
198
199 /* Is the (subtask) deadline of the first task earlier?
200 * Then it has higher priority.
201 */
202 time_before(cur_deadline(first), cur_deadline(second)) ||
203
204 /* Do we have a deadline tie?
205 * Then break by B-bit.
206 */
207 (cur_deadline(first) == cur_deadline(second) &&
208 (cur_overlap(first) > cur_overlap(second) ||
209
210 /* Do we have a B-bit tie?
211 * Then break by group deadline.
212 */
213 (cur_overlap(first) == cur_overlap(second) &&
214 (time_after(cur_group_deadline(first),
215 cur_group_deadline(second)) ||
216
217 /* Do we have a group deadline tie?
218 * Then break by PID, which are unique.
219 */
220 (cur_group_deadline(first) ==
221 cur_group_deadline(second) &&
222 first->pid < second->pid))))));
223}
224
/* bheap comparison callback wrapping pfair_higher_prio() */
int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
{
	return pfair_higher_prio(bheap2task(a), bheap2task(b));
}
229
230static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
231{
232 struct pfair_cluster* cluster = from_domain(rt);
233 unsigned long flags;
234
235 raw_spin_lock_irqsave(&cluster->release_lock, flags);
236
237 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
238
239 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
240}
241
242static void prepare_release(struct task_struct* t, quanta_t at)
243{
244 tsk_pfair(t)->release = at;
245 tsk_pfair(t)->cur = 0;
246}
247
248/* pull released tasks from the release queue */
249static void poll_releases(struct pfair_cluster* cluster)
250{
251 raw_spin_lock(&cluster->release_lock);
252 __merge_ready(&cluster->pfair, &cluster->release_queue);
253 raw_spin_unlock(&cluster->release_lock);
254}
255
256static void check_preempt(struct task_struct* t)
257{
258 int cpu = NO_CPU;
259 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
260 is_present(t)) {
261 /* the task can be scheduled and
262 * is not scheduled where it ought to be scheduled
263 */
264 cpu = tsk_rt(t)->linked_on != NO_CPU ?
265 tsk_rt(t)->linked_on :
266 tsk_rt(t)->scheduled_on;
267 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
268 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
269 /* preempt */
270 litmus_reschedule(cpu);
271 }
272}
273
274/* caller must hold pfair.ready_lock */
275static void drop_all_references(struct task_struct *t)
276{
277 int cpu;
278 struct pfair_state* s;
279 struct pfair_cluster* cluster;
280 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
281 /* It must be in the ready queue; drop references isn't called
282 * when the job is in a release queue. */
283 cluster = tsk_pfair(t)->cluster;
284 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
285 tsk_rt(t)->heap_node);
286 }
287 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
288 s = &per_cpu(pfair_state, cpu);
289 if (s->linked == t)
290 s->linked = NULL;
291 if (s->local == t)
292 s->local = NULL;
293 if (s->scheduled == t)
294 s->scheduled = NULL;
295 }
296 /* make sure we don't have a stale linked_on field */
297 tsk_rt(t)->linked_on = NO_CPU;
298}
299
300static void pfair_prepare_next_period(struct task_struct* t)
301{
302 struct pfair_param* p = tsk_pfair(t);
303
304 prepare_for_next_period(t);
305 tsk_rt(t)->completed = 0;
306 p->release = time2quanta(get_release(t), CEIL);
307}
308
309/* returns 1 if the task needs to go the release queue */
310static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
311{
312 struct pfair_param* p = tsk_pfair(t);
313 int to_relq;
314 p->cur = (p->cur + 1) % p->quanta;
315 if (!p->cur) {
316 if (is_present(t)) {
317 /* The job overran; we start a new budget allocation. */
318 pfair_prepare_next_period(t);
319 } else {
320 /* remove task from system until it wakes */
321 drop_all_references(t);
322 tsk_rt(t)->flags |= FLAGS_NEED_REQUEUE;
323 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
324 cpu, p->cur);
325 return 0;
326 }
327 }
328 to_relq = time_after(cur_release(t), time);
329 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
330 cpu, p->cur, to_relq, cur_release(t), time);
331 return to_relq;
332}
333
334static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
335{
336 struct task_struct* l;
337 struct pfair_param* p;
338 struct list_head* pos;
339 struct pfair_state* cpu;
340
341 list_for_each(pos, &cluster->topology.cpus) {
342 cpu = from_cluster_list(pos);
343 l = cpu->linked;
344 cpu->missed_updates += cpu->linked != cpu->local;
345 if (l) {
346 p = tsk_pfair(l);
347 p->last_quantum = time;
348 p->last_cpu = cpu_id(cpu);
349 if (advance_subtask(time, l, cpu_id(cpu))) {
350 //cpu->linked = NULL;
351 PTRACE_TASK(l, "should go to release queue. "
352 "scheduled_on=%d present=%d\n",
353 tsk_rt(l)->scheduled_on,
354 tsk_rt(l)->present);
355 }
356 }
357 }
358}
359
360static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
361{
362 int cpu;
363 if (tsk_rt(t)->scheduled_on != NO_CPU) {
364 /* always observe scheduled_on linkage */
365 default_cpu = tsk_rt(t)->scheduled_on;
366 } else if (tsk_pfair(t)->last_quantum == time - 1) {
367 /* back2back quanta */
368 /* Only observe last_quantum if no scheduled_on is in the way.
369 * This should only kick in if a CPU missed quanta, and that
370 * *should* only happen in QEMU.
371 */
372 cpu = tsk_pfair(t)->last_cpu;
373 if (!pstate[cpu]->linked ||
374 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
375 default_cpu = cpu;
376 }
377 }
378 return default_cpu;
379}
380
381/* returns one if linking was redirected */
382static int pfair_link(quanta_t time, int cpu,
383 struct task_struct* t)
384{
385 int target = target_cpu(time, t, cpu);
386 struct task_struct* prev = pstate[cpu]->linked;
387 struct task_struct* other;
388 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
389
390 if (target != cpu) {
391 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
392 other = pstate[target]->linked;
393 pstate[target]->linked = t;
394 tsk_rt(t)->linked_on = target;
395 if (!other)
396 /* linked ok, but reschedule this CPU */
397 return 1;
398 if (target < cpu) {
399 /* link other to cpu instead */
400 tsk_rt(other)->linked_on = cpu;
401 pstate[cpu]->linked = other;
402 if (prev) {
403 /* prev got pushed back into the ready queue */
404 tsk_rt(prev)->linked_on = NO_CPU;
405 __add_ready(&cluster->pfair, prev);
406 }
407 /* we are done with this cpu */
408 return 0;
409 } else {
410 /* re-add other, it's original CPU was not considered yet */
411 tsk_rt(other)->linked_on = NO_CPU;
412 __add_ready(&cluster->pfair, other);
413 /* reschedule this CPU */
414 return 1;
415 }
416 } else {
417 pstate[cpu]->linked = t;
418 tsk_rt(t)->linked_on = cpu;
419 if (prev) {
420 /* prev got pushed back into the ready queue */
421 tsk_rt(prev)->linked_on = NO_CPU;
422 __add_ready(&cluster->pfair, prev);
423 }
424 /* we are done with this CPU */
425 return 0;
426 }
427}
428
429static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
430{
431 int retry;
432 struct list_head *pos;
433 struct pfair_state *cpu_state;
434
435 list_for_each(pos, &cluster->topology.cpus) {
436 cpu_state = from_cluster_list(pos);
437 retry = 1;
438#ifdef CONFIG_RELEASE_MASTER
439 /* skip release master */
440 if (cluster->pfair.release_master == cpu_id(cpu_state))
441 continue;
442#endif
443 while (retry) {
444 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
445 cpu_state->linked))
446 retry = pfair_link(time, cpu_id(cpu_state),
447 __take_ready(&cluster->pfair));
448 else
449 retry = 0;
450 }
451 }
452}
453
454static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
455{
456 struct pfair_state *cpu;
457 struct list_head* pos;
458
459 /* called with interrupts disabled */
460 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
461 time, litmus_clock());
462 raw_spin_lock(cluster_lock(cluster));
463 PTRACE("<<< Q %lu at %llu\n",
464 time, litmus_clock());
465
466 sched_trace_quantum_boundary();
467
468 advance_subtasks(cluster, time);
469 poll_releases(cluster);
470 schedule_subtasks(cluster, time);
471
472 list_for_each(pos, &cluster->topology.cpus) {
473 cpu = from_cluster_list(pos);
474 if (cpu->linked)
475 PTRACE_TASK(cpu->linked,
476 " linked on %d.\n", cpu_id(cpu));
477 else
478 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
479 }
480 /* We are done. Advance time. */
481 mb();
482 list_for_each(pos, &cluster->topology.cpus) {
483 cpu = from_cluster_list(pos);
484 if (cpu->local_tick != cpu->cur_tick) {
485 TRACE("BAD Quantum not acked on %d "
486 "(l:%lu c:%lu p:%lu)\n",
487 cpu_id(cpu),
488 cpu->local_tick,
489 cpu->cur_tick,
490 cluster->pfair_time);
491 cpu->missed_quanta++;
492 }
493 cpu->cur_tick = time;
494 }
495 PTRACE(">>> Q %lu at %llu\n",
496 time, litmus_clock());
497 raw_spin_unlock(cluster_lock(cluster));
498}
499
500static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
501{
502 quanta_t loc;
503
504 goto first; /* skip mb() on first iteration */
505 do {
506 cpu_relax();
507 mb();
508 first: loc = state->cur_tick;
509 /* FIXME: what if loc > cur? */
510 } while (time_before(loc, q));
511 PTRACE("observed cur_tick:%lu >= q:%lu\n",
512 loc, q);
513}
514
515static quanta_t current_quantum(struct pfair_state* state)
516{
517 lt_t t = litmus_clock() - state->offset;
518 return time2quanta(t, FLOOR);
519}
520
521static void catchup_quanta(quanta_t from, quanta_t target,
522 struct pfair_state* state)
523{
524 quanta_t cur = from, time;
525 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
526 from, target);
527 while (time_before(cur, target)) {
528 wait_for_quantum(cur, state);
529 cur++;
530 time = cmpxchg(&cpu_cluster(state)->pfair_time,
531 cur - 1, /* expected */
532 cur /* next */
533 );
534 if (time == cur - 1)
535 schedule_next_quantum(cpu_cluster(state), cur);
536 }
537 TRACE("+++> catching up done\n");
538}
539
540/* pfair_tick - this function is called for every local timer
541 * interrupt.
542 */
543static void pfair_tick(struct task_struct* t)
544{
545 struct pfair_state* state = &__get_cpu_var(pfair_state);
546 quanta_t time, cur;
547 int retry = 10;
548
549 do {
550 cur = current_quantum(state);
551 PTRACE("q %lu at %llu\n", cur, litmus_clock());
552
553 /* Attempt to advance time. First CPU to get here
554 * will prepare the next quantum.
555 */
556 time = cmpxchg(&cpu_cluster(state)->pfair_time,
557 cur - 1, /* expected */
558 cur /* next */
559 );
560 if (time == cur - 1) {
561 /* exchange succeeded */
562 wait_for_quantum(cur - 1, state);
563 schedule_next_quantum(cpu_cluster(state), cur);
564 retry = 0;
565 } else if (time_before(time, cur - 1)) {
566 /* the whole system missed a tick !? */
567 catchup_quanta(time, cur, state);
568 retry--;
569 } else if (time_after(time, cur)) {
570 /* our timer lagging behind!? */
571 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
572 retry--;
573 } else {
574 /* Some other CPU already started scheduling
575 * this quantum. Let it do its job and then update.
576 */
577 retry = 0;
578 }
579 } while (retry);
580
581 /* Spin locally until time advances. */
582 wait_for_quantum(cur, state);
583
584 /* copy assignment */
585 /* FIXME: what if we race with a future update? Corrupted state? */
586 state->local = state->linked;
587 /* signal that we are done */
588 mb();
589 state->local_tick = state->cur_tick;
590
591 if (state->local != current
592 && (is_realtime(current) || is_present(state->local)))
593 litmus_reschedule_local();
594}
595
596/* Custom scheduling tick: called on each quantum boundary. */
597static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
598{
599 TS_QUANTUM_BOUNDARY_START;
600
601 pfair_tick(current);
602 hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);
603
604 TS_QUANTUM_BOUNDARY_END;
605 return HRTIMER_RESTART;
606}
607
608static int safe_to_schedule(struct task_struct* t, int cpu)
609{
610 int where = tsk_rt(t)->scheduled_on;
611 if (where != NO_CPU && where != cpu) {
612 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
613 "scheduled already on %d.\n", cpu, where);
614 return 0;
615 } else
616 return is_present(t) && !is_completed(t);
617}
618
619static struct task_struct* pfair_schedule(struct task_struct * prev)
620{
621 struct pfair_state* state = &__get_cpu_var(pfair_state);
622 struct pfair_cluster* cluster = cpu_cluster(state);
623 int blocks, completion, out_of_time;
624 struct task_struct* next = NULL;
625
626#ifdef CONFIG_RELEASE_MASTER
627 /* Bail out early if we are the release master.
628 * The release master never schedules any real-time tasks.
629 */
630 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
631 sched_state_task_picked();
632 return NULL;
633 }
634#endif
635
636 raw_spin_lock(cpu_lock(state));
637
638 blocks = is_realtime(prev) && !is_running(prev);
639 completion = is_realtime(prev) && is_completed(prev);
640 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
641 state->local_tick);
642
643 if (is_realtime(prev))
644 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
645 blocks, completion, out_of_time);
646
647 if (completion) {
648 sched_trace_task_completion(prev, 0);
649 pfair_prepare_next_period(prev);
650 prepare_release(prev, cur_release(prev));
651 }
652
653 if (!blocks && (completion || out_of_time)) {
654 drop_all_references(prev);
655 sched_trace_task_release(prev);
656 add_release(&cluster->pfair, prev);
657 }
658
659 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
660 next = state->local;
661
662 if (prev != next) {
663 tsk_rt(prev)->scheduled_on = NO_CPU;
664 if (next)
665 tsk_rt(next)->scheduled_on = cpu_id(state);
666 }
667 sched_state_task_picked();
668 raw_spin_unlock(cpu_lock(state));
669
670 if (next)
671 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
672 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
673 else if (is_realtime(prev))
674 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
675
676 if (unlikely(!hrtimer_active(&state->quantum_timer))) {
677 TRACE("activating quantum timer start=%llu\n",
678 hrtimer_get_expires(&state->quantum_timer));
679 hrtimer_start(&state->quantum_timer,
680 hrtimer_get_expires(&state->quantum_timer),
681 HRTIMER_MODE_ABS_PINNED);
682 }
683
684 return next;
685}
686
687static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
688{
689 unsigned long flags;
690 struct pfair_cluster* cluster;
691
692 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
693
694 cluster = tsk_pfair(t)->cluster;
695
696 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
697
698 prepare_release(t, cluster->pfair_time + 1);
699
700 t->rt_param.scheduled_on = NO_CPU;
701 t->rt_param.linked_on = NO_CPU;
702
703 if (is_scheduled) {
704#ifdef CONFIG_RELEASE_MASTER
705 if (task_cpu(t) != cluster->pfair.release_master)
706#endif
707 t->rt_param.scheduled_on = task_cpu(t);
708 }
709
710 if (is_running(t)) {
711 tsk_rt(t)->present = 1;
712 __add_ready(&cluster->pfair, t);
713 } else {
714 tsk_rt(t)->present = 0;
715 tsk_rt(t)->flags |= FLAGS_NEED_REQUEUE;
716 }
717
718 check_preempt(t);
719
720 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
721}
722
723static void pfair_task_wake_up(struct task_struct *t)
724{
725 unsigned long flags;
726 lt_t now;
727 struct pfair_cluster* cluster;
728
729 cluster = tsk_pfair(t)->cluster;
730
731 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
732 litmus_clock(), cur_release(t), cluster->pfair_time);
733
734 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
735
736 /* If a task blocks and wakes before its next job release,
737 * then it may resume if it is currently linked somewhere
738 * (as if it never blocked at all). Otherwise, we have a
739 * new sporadic job release.
740 */
741 now = litmus_clock();
742 if (is_tardy(t, now)) {
743 TRACE_TASK(t, "sporadic release!\n");
744 release_at(t, now);
745 prepare_release(t, time2quanta(now, CEIL));
746 sched_trace_task_release(t);
747 }
748
749 /* only add to ready queue if the task isn't still linked somewhere */
750 if (tsk_rt(t)->flags & FLAGS_NEED_REQUEUE) {
751 tsk_rt(t)->flags &= ~FLAGS_NEED_REQUEUE;
752 TRACE_TASK(t, "requeueing required\n");
753 tsk_rt(t)->completed = 0;
754 __add_ready(&cluster->pfair, t);
755 }
756
757 check_preempt(t);
758
759 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
760 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
761}
762
763static void pfair_task_block(struct task_struct *t)
764{
765 BUG_ON(!is_realtime(t));
766 TRACE_TASK(t, "blocks at %llu, state:%d\n",
767 litmus_clock(), t->state);
768}
769
770static void pfair_task_exit(struct task_struct * t)
771{
772 unsigned long flags;
773 struct pfair_cluster *cluster;
774
775 BUG_ON(!is_realtime(t));
776
777 cluster = tsk_pfair(t)->cluster;
778
779 /* Remote task from release or ready queue, and ensure
780 * that it is not the scheduled task for ANY CPU. We
781 * do this blanket check because occassionally when
782 * tasks exit while blocked, the task_cpu of the task
783 * might not be the same as the CPU that the PFAIR scheduler
784 * has chosen for it.
785 */
786 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
787
788 TRACE_TASK(t, "RIP, state:%d\n", t->state);
789 drop_all_references(t);
790
791 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
792
793 kfree(t->rt_param.pfair);
794 t->rt_param.pfair = NULL;
795}
796
797static void init_subtask(struct subtask* sub, unsigned long i,
798 lt_t quanta, lt_t period)
799{
800 /* since i is zero-based, the formulas are shifted by one */
801 lt_t tmp;
802
803 /* release */
804 tmp = period * i;
805 do_div(tmp, quanta); /* floor */
806 sub->release = (quanta_t) tmp;
807
808 /* deadline */
809 tmp = period * (i + 1);
810 if (do_div(tmp, quanta)) /* ceil */
811 tmp++;
812 sub->deadline = (quanta_t) tmp;
813
814 /* next release */
815 tmp = period * (i + 1);
816 do_div(tmp, quanta); /* floor */
817 sub->overlap = sub->deadline - (quanta_t) tmp;
818
819 /* Group deadline.
820 * Based on the formula given in Uma's thesis.
821 */
822 if (2 * quanta >= period) {
823 /* heavy */
824 tmp = (sub->deadline - (i + 1)) * period;
825 if (period > quanta &&
826 do_div(tmp, (period - quanta))) /* ceil */
827 tmp++;
828 sub->group_deadline = (quanta_t) tmp;
829 } else
830 sub->group_deadline = 0;
831}
832
833static void dump_subtasks(struct task_struct* t)
834{
835 unsigned long i;
836 for (i = 0; i < t->rt_param.pfair->quanta; i++)
837 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
838 i + 1,
839 t->rt_param.pfair->subtasks[i].release,
840 t->rt_param.pfair->subtasks[i].deadline,
841 t->rt_param.pfair->subtasks[i].overlap,
842 t->rt_param.pfair->subtasks[i].group_deadline);
843}
844
845static long pfair_admit_task(struct task_struct* t)
846{
847 lt_t quanta;
848 lt_t period;
849 s64 quantum_length = LITMUS_QUANTUM_LENGTH_NS;
850 struct pfair_param* param;
851 unsigned long i;
852
853 /* first check that the task is in the right cluster */
854 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
855 cpu_cluster(pstate[task_cpu(t)]))
856 return -EINVAL;
857
858 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
859 printk(KERN_INFO "%s: Admission rejected. "
860 "Only implicit deadlines are currently supported.\n",
861 litmus->plugin_name);
862 return -EINVAL;
863 }
864
865 /* Pfair is a tick-based method, so the time
866 * of interest is jiffies. Calculate tick-based
867 * times for everything.
868 * (Ceiling of exec cost, floor of period.)
869 */
870
871 quanta = get_exec_cost(t);
872 period = get_rt_period(t);
873
874 quanta = time2quanta(get_exec_cost(t), CEIL);
875
876 if (do_div(period, quantum_length))
877 printk(KERN_WARNING
878 "The period of %s/%d is not a multiple of %llu.\n",
879 t->comm, t->pid, (unsigned long long) quantum_length);
880
881 if (quanta == period) {
882 /* special case: task has weight 1.0 */
883 printk(KERN_INFO
884 "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
885 t->comm, t->pid, quanta, period);
886 quanta = 1;
887 period = 1;
888 }
889
890 param = kmalloc(sizeof(*param) +
891 quanta * sizeof(struct subtask), GFP_ATOMIC);
892
893 if (!param)
894 return -ENOMEM;
895
896 param->quanta = quanta;
897 param->cur = 0;
898 param->release = 0;
899 param->period = period;
900
901 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
902
903 for (i = 0; i < quanta; i++)
904 init_subtask(param->subtasks + i, i, quanta, period);
905
906 if (t->rt_param.pfair)
907 /* get rid of stale allocation */
908 kfree(t->rt_param.pfair);
909
910 t->rt_param.pfair = param;
911
912 /* spew out some debug info */
913 dump_subtasks(t);
914
915 return 0;
916}
917
918static void pfair_init_cluster(struct pfair_cluster* cluster)
919{
920 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
921 bheap_init(&cluster->release_queue);
922 raw_spin_lock_init(&cluster->release_lock);
923 INIT_LIST_HEAD(&cluster->topology.cpus);
924}
925
926static void cleanup_clusters(void)
927{
928 int i;
929
930 if (num_pfair_clusters)
931 kfree(pfair_clusters);
932 pfair_clusters = NULL;
933 num_pfair_clusters = 0;
934
935 /* avoid stale pointers */
936 for (i = 0; i < num_online_cpus(); i++) {
937 pstate[i]->topology.cluster = NULL;
938 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
939 pstate[i]->missed_updates, pstate[i]->missed_quanta);
940 }
941}
942
943static struct domain_proc_info pfair_domain_proc_info;
944static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
945{
946 *ret = &pfair_domain_proc_info;
947 return 0;
948}
949
950static void pfair_setup_domain_proc(void)
951{
952 int i, cpu, domain;
953#ifdef CONFIG_RELEASE_MASTER
954 int release_master = atomic_read(&release_master_cpu);
955 /* skip over the domain with the release master if cluster size is 1 */
956 int cluster_size = num_online_cpus() / num_pfair_clusters;
957 int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
958 release_master : NO_CPU;
959#else
960 int release_master = NO_CPU;
961 int skip_domain = NO_CPU;
962#endif
963 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
964 int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
965 struct cd_mapping *map;
966
967 memset(&pfair_domain_proc_info, sizeof(pfair_domain_proc_info), 0);
968 init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
969 pfair_domain_proc_info.num_cpus = num_rt_cpus;
970 pfair_domain_proc_info.num_domains = num_rt_domains;
971
972 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
973 if (cpu == release_master)
974 continue;
975 map = &pfair_domain_proc_info.cpu_to_domains[i];
976 /* pointer math to figure out the domain index */
977 domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
978 map->id = cpu;
979 cpumask_set_cpu(domain, map->mask);
980 ++i;
981 }
982
983 for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
984 struct pfair_cluster *cluster;
985 struct list_head *pos;
986
987 if (domain == skip_domain)
988 continue;
989
990 cluster = &pfair_clusters[domain];
991 map = &pfair_domain_proc_info.domain_to_cpus[i];
992 map->id = i;
993
994 list_for_each(pos, &cluster->topology.cpus) {
995 cpu = cpu_id(from_cluster_list(pos));
996 if (cpu != release_master)
997 cpumask_set_cpu(cpu, map->mask);
998 }
999 ++i;
1000 }
1001}
1002
1003static long pfair_activate_plugin(void)
1004{
1005 int err, i;
1006 struct pfair_state* state;
1007 struct pfair_cluster* cluster;
1008 quanta_t now, start;
1009 int cluster_size;
1010 struct cluster_cpu* cpus[NR_CPUS];
1011 struct scheduling_cluster* clust[NR_CPUS];
1012 lt_t quantum_timer_start;
1013
1014 cluster_size = get_cluster_size(pfair_cluster_level);
1015
1016 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
1017 return -EINVAL;
1018
1019 num_pfair_clusters = num_online_cpus() / cluster_size;
1020
1021 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
1022 if (!pfair_clusters) {
1023 num_pfair_clusters = 0;
1024 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
1025 return -ENOMEM;
1026 }
1027
1028 state = &__get_cpu_var(pfair_state);
1029 now = current_quantum(state);
1030 start = now + 50;
1031 quantum_timer_start = quanta2time(start);
1032 TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
1033 litmus_clock(),
1034 now,
1035 quantum_timer_start,
1036 time2quanta(quantum_timer_start, CEIL));
1037
1038 for (i = 0; i < num_pfair_clusters; i++) {
1039 cluster = &pfair_clusters[i];
1040 pfair_init_cluster(cluster);
1041 cluster->pfair_time = start;
1042 clust[i] = &cluster->topology;
1043#ifdef CONFIG_RELEASE_MASTER
1044 cluster->pfair.release_master = atomic_read(&release_master_cpu);
1045#endif
1046 }
1047
1048 for_each_online_cpu(i) {
1049 state = &per_cpu(pfair_state, i);
1050 state->cur_tick = start;
1051 state->local_tick = start;
1052 state->missed_quanta = 0;
1053 state->missed_updates = 0;
1054 state->offset = cpu_stagger_offset(i);
1055 hrtimer_set_expires(&state->quantum_timer,
1056 ns_to_ktime(quantum_timer_start + state->offset));
1057 printk(KERN_ERR "cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
1058 cpus[i] = &state->topology;
1059 /* force rescheduling to start quantum timer */
1060 litmus_reschedule(i);
1061
1062 WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
1063 KERN_ERR "WARNING: no high resolution timers available!?\n");
1064 }
1065
1066 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
1067 cpus, num_online_cpus());
1068
1069 if (err < 0)
1070 cleanup_clusters();
1071 else
1072 pfair_setup_domain_proc();
1073
1074 return err;
1075}
1076
1077static long pfair_deactivate_plugin(void)
1078{
1079 int cpu;
1080 struct pfair_state* state;
1081
1082 for_each_online_cpu(cpu) {
1083 state = &per_cpu(pfair_state, cpu);
1084 TRACE("stopping quantum timer on CPU%d\n", cpu);
1085 hrtimer_cancel(&state->quantum_timer);
1086 }
1087 cleanup_clusters();
1088 destroy_domain_proc_info(&pfair_domain_proc_info);
1089 return 0;
1090}
1091
1092/* Plugin object */
1093static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1094 .plugin_name = "PFAIR",
1095 .task_new = pfair_task_new,
1096 .task_exit = pfair_task_exit,
1097 .schedule = pfair_schedule,
1098 .task_wake_up = pfair_task_wake_up,
1099 .task_block = pfair_task_block,
1100 .admit_task = pfair_admit_task,
1101 .complete_job = complete_job,
1102 .activate_plugin = pfair_activate_plugin,
1103 .deactivate_plugin = pfair_deactivate_plugin,
1104 .get_domain_proc_info = pfair_get_domain_proc_info,
1105};
1106
1107
/* /proc state: the plugin's proc directory and its "cluster" file
 * (backed by pfair_cluster_level); created in init_pfair() and
 * removed in clean_pfair(). */
static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1109
1110static int __init init_pfair(void)
1111{
1112 int cpu, err, fs;
1113 struct pfair_state *state;
1114
1115 /*
1116 * initialize short_cut for per-cpu pfair state;
1117 * there may be a problem here if someone removes a cpu
1118 * while we are doing this initialization... and if cpus
1119 * are added / removed later... but we don't support CPU hotplug atm anyway.
1120 */
1121 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1122
1123 /* initialize CPU state */
1124 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1125 state = &per_cpu(pfair_state, cpu);
1126 hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1127 state->quantum_timer.function = on_quantum_boundary;
1128 state->topology.id = cpu;
1129 state->cur_tick = 0;
1130 state->local_tick = 0;
1131 state->linked = NULL;
1132 state->local = NULL;
1133 state->scheduled = NULL;
1134 state->missed_quanta = 0;
1135 state->offset = cpu_stagger_offset(cpu);
1136 pstate[cpu] = state;
1137 }
1138
1139 pfair_clusters = NULL;
1140 num_pfair_clusters = 0;
1141
1142 err = register_sched_plugin(&pfair_plugin);
1143 if (!err) {
1144 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1145 if (!fs)
1146 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1147 else
1148 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1149 }
1150
1151 return err;
1152}
1153
1154static void __exit clean_pfair(void)
1155{
1156 kfree(pstate);
1157
1158 if (cluster_file)
1159 remove_proc_entry("cluster", pfair_dir);
1160 if (pfair_dir)
1161 remove_plugin_proc_dir(&pfair_plugin);
1162}
1163
/* register module entry and exit points with the kernel */
module_init(init_pfair);
module_exit(clean_pfair);