Diffstat (limited to 'litmus/sched_pfair.c')
-rw-r--r--	litmus/sched_pfair.c	1074
1 file changed, 1074 insertions(+), 0 deletions(-)
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 00000000000..72c06a492ef
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1074 @@
1/*
2 * litmus/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24
25#include <litmus/bheap.h>
26
27/* to configure the cluster size */
28#include <litmus/litmus_proc.h>
29
30#include <litmus/clustered.h>
31
32static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
33
34struct subtask {
35 /* measured in quanta relative to job release */
36 quanta_t release;
37 quanta_t deadline;
38 quanta_t overlap; /* called "b bit" by PD^2 */
39 quanta_t group_deadline;
40};
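/* Note: these fields describe the PD^2 "window" of a subtask, all measured
 * in quanta relative to the enclosing job's release: [release, deadline) is
 * the interval in which the subtask should receive its quantum, overlap
 * (the "b bit") is non-zero when this window overlaps the next subtask's
 * window, and group_deadline bounds how far a chain of overlapping windows
 * extends; it is left at 0 for light tasks, i.e., tasks with utilization
 * below 1/2 (see init_subtask() below).
 */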
41
42struct pfair_param {
43 quanta_t quanta; /* number of subtasks */
44 quanta_t cur; /* index of current subtask */
45
46 quanta_t release; /* in quanta */
47 quanta_t period; /* in quanta */
48
49 quanta_t last_quantum; /* when scheduled last */
50 int last_cpu; /* where scheduled last */
51
52 struct pfair_cluster* cluster; /* where this task is scheduled */
53
54 struct subtask subtasks[0]; /* allocate together with pfair_param */
55};
56
57#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
58
59struct pfair_state {
60 struct cluster_cpu topology;
61
62 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
63 * the time */
64 volatile quanta_t local_tick; /* What tick is the local CPU currently
65 * executing? Updated only by the local
66 * CPU. In QEMU, this may lag behind the
67 * current tick. In a real system, with
68 * proper timers and aligned quanta,
69 * that should only be the case for a
70 * very short time after the time
71 * advanced. With staggered quanta, it
72 * will lag for the duration of the
73 * offset.
74 */
75
76 struct task_struct* linked; /* the task that should be executing */
77 struct task_struct* local; /* the local copy of linked */
78 struct task_struct* scheduled; /* what is actually scheduled */
79
80 lt_t offset; /* stagger offset */
81 unsigned int missed_updates;
82 unsigned int missed_quanta;
83};
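/* Note on how the three task pointers relate (cf. schedule_next_quantum()
 * and pfair_tick() below): "linked" is assigned at a quantum boundary by
 * whichever CPU advances the cluster clock, while holding the cluster lock;
 * "local" is this CPU's private snapshot of "linked", copied in pfair_tick()
 * once the new quantum has been observed; "scheduled" is the task actually
 * running on this CPU. missed_updates counts quanta at whose boundary
 * "local" still differed from "linked", and missed_quanta counts quantum
 * boundaries that this CPU failed to acknowledge in time.
 */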
84
85struct pfair_cluster {
86 struct scheduling_cluster topology;
87
88 /* The "global" time in this cluster. */
89 quanta_t pfair_time; /* the "official" PFAIR clock */
90
91 /* The ready queue for this cluster. */
92 rt_domain_t pfair;
93
94 /* The set of jobs that should have their release enacted at the next
95 * quantum boundary.
96 */
97 struct bheap release_queue;
98 raw_spinlock_t release_lock;
99};
100
101#define RT_F_REQUEUE 0x2
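/* Note: RT_F_REQUEUE is set in advance_subtask() when a task that is no
 * longer present is dropped from the cluster; pfair_task_wake_up() checks
 * for it to decide whether the task must be re-added to the ready queue.
 */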
102
103static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
104{
105 return container_of(state->topology.cluster, struct pfair_cluster, topology);
106}
107
108static inline int cpu_id(struct pfair_state* state)
109{
110 return state->topology.id;
111}
112
113static inline struct pfair_state* from_cluster_list(struct list_head* pos)
114{
115 return list_entry(pos, struct pfair_state, topology.cluster_list);
116}
117
118static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
119{
120 return container_of(rt, struct pfair_cluster, pfair);
121}
122
123static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
124{
125 /* The ready_lock is used to serialize all scheduling events. */
126 return &cluster->pfair.ready_lock;
127}
128
129static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
130{
131 return cluster_lock(cpu_cluster(state));
132}
133
134DEFINE_PER_CPU(struct pfair_state, pfair_state);
135struct pfair_state **pstate; /* shortcut */
136
137static struct pfair_cluster* pfair_clusters;
138static int num_pfair_clusters;
139
140/* Enable for lots of trace info.
141 * #define PFAIR_DEBUG
142 */
143
144#ifdef PFAIR_DEBUG
145#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
146#define PTRACE(f, args...) TRACE(f, ## args)
147#else
148#define PTRACE_TASK(t, f, args...)
149#define PTRACE(f, args...)
150#endif
151
152/* gcc will inline all of these accessor functions... */
153static struct subtask* cur_subtask(struct task_struct* t)
154{
155 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
156}
157
158static quanta_t cur_deadline(struct task_struct* t)
159{
160 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
161}
162
163static quanta_t cur_release(struct task_struct* t)
164{
165 /* This is early releasing: only the release of the first subtask
166 * counts. */
167 return tsk_pfair(t)->release;
168}
169
170static quanta_t cur_overlap(struct task_struct* t)
171{
172 return cur_subtask(t)->overlap;
173}
174
175static quanta_t cur_group_deadline(struct task_struct* t)
176{
177 quanta_t gdl = cur_subtask(t)->group_deadline;
178 if (gdl)
179 return gdl + tsk_pfair(t)->release;
180 else
181 return gdl;
182}
183
184
185static int pfair_higher_prio(struct task_struct* first,
186 struct task_struct* second)
187{
188 return /* first task must exist */
189 first && (
190 /* Does the second task exist and is it a real-time task? If
191 * not, the first task (which is a RT task) has higher
192 * priority.
193 */
194 !second || !is_realtime(second) ||
195
196 /* Is the (subtask) deadline of the first task earlier?
197 * Then it has higher priority.
198 */
199 time_before(cur_deadline(first), cur_deadline(second)) ||
200
201 /* Do we have a deadline tie?
202 * Then break by B-bit.
203 */
204 (cur_deadline(first) == cur_deadline(second) &&
205 (cur_overlap(first) > cur_overlap(second) ||
206
207 /* Do we have a B-bit tie?
208 * Then break by group deadline.
209 */
210 (cur_overlap(first) == cur_overlap(second) &&
211 (time_after(cur_group_deadline(first),
212 cur_group_deadline(second)) ||
213
214 /* Do we have a group deadline tie?
215 * Then break by PID, which is unique.
216 */
217 (cur_group_deadline(first) ==
218 cur_group_deadline(second) &&
219 first->pid < second->pid))))));
220}
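/* In short, the PD^2 order implemented above is: earlier subtask deadline
 * first; on a deadline tie, the larger b-bit wins; on a b-bit tie, the
 * later group deadline wins; any remaining tie is broken in favor of the
 * smaller PID. Non-real-time (or missing) "second" tasks always lose.
 */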
221
222int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
223{
224 return pfair_higher_prio(bheap2task(a), bheap2task(b));
225}
226
227static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
228{
229 struct pfair_cluster* cluster = from_domain(rt);
230 unsigned long flags;
231
232 raw_spin_lock_irqsave(&cluster->release_lock, flags);
233
234 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
235
236 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
237}
238
239static void prepare_release(struct task_struct* t, quanta_t at)
240{
241 tsk_pfair(t)->release = at;
242 tsk_pfair(t)->cur = 0;
243}
244
245/* pull released tasks from the release queue */
246static void poll_releases(struct pfair_cluster* cluster)
247{
248 raw_spin_lock(&cluster->release_lock);
249 __merge_ready(&cluster->pfair, &cluster->release_queue);
250 raw_spin_unlock(&cluster->release_lock);
251}
252
253static void check_preempt(struct task_struct* t)
254{
255 int cpu = NO_CPU;
256 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
257 tsk_rt(t)->present) {
258 /* the task can be scheduled and
259 * is not scheduled where it ought to be scheduled
260 */
261 cpu = tsk_rt(t)->linked_on != NO_CPU ?
262 tsk_rt(t)->linked_on :
263 tsk_rt(t)->scheduled_on;
264 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
265 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
266 /* preempt */
267 litmus_reschedule(cpu);
268 }
269}
270
271/* caller must hold pfair.ready_lock */
272static void drop_all_references(struct task_struct *t)
273{
274 int cpu;
275 struct pfair_state* s;
276 struct pfair_cluster* cluster;
277 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
278 /* It must be in the ready queue; drop_all_references() isn't called
279 * when the job is in a release queue. */
280 cluster = tsk_pfair(t)->cluster;
281 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
282 tsk_rt(t)->heap_node);
283 }
284 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
285 s = &per_cpu(pfair_state, cpu);
286 if (s->linked == t)
287 s->linked = NULL;
288 if (s->local == t)
289 s->local = NULL;
290 if (s->scheduled == t)
291 s->scheduled = NULL;
292 }
293 /* make sure we don't have a stale linked_on field */
294 tsk_rt(t)->linked_on = NO_CPU;
295}
296
297static void pfair_prepare_next_period(struct task_struct* t)
298{
299 struct pfair_param* p = tsk_pfair(t);
300
301 prepare_for_next_period(t);
302 get_rt_flags(t) = RT_F_RUNNING;
303 p->release += p->period;
304}
305
306/* returns 1 if the task needs to go to the release queue */
307static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
308{
309 struct pfair_param* p = tsk_pfair(t);
310 int to_relq;
311 p->cur = (p->cur + 1) % p->quanta;
312 if (!p->cur) {
313 if (tsk_rt(t)->present) {
314 /* The job overran; we start a new budget allocation. */
315 pfair_prepare_next_period(t);
316 } else {
317 /* remove task from system until it wakes */
318 drop_all_references(t);
319 tsk_rt(t)->flags = RT_F_REQUEUE;
320 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
321 cpu, p->cur);
322 return 0;
323 }
324 }
325 to_relq = time_after(cur_release(t), time);
326 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
327 cpu, p->cur, to_relq, cur_release(t), time);
328 return to_relq;
329}
330
331static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
332{
333 struct task_struct* l;
334 struct pfair_param* p;
335 struct list_head* pos;
336 struct pfair_state* cpu;
337
338 list_for_each(pos, &cluster->topology.cpus) {
339 cpu = from_cluster_list(pos);
340 l = cpu->linked;
341 cpu->missed_updates += cpu->linked != cpu->local;
342 if (l) {
343 p = tsk_pfair(l);
344 p->last_quantum = time;
345 p->last_cpu = cpu_id(cpu);
346 if (advance_subtask(time, l, cpu_id(cpu))) {
347 //cpu->linked = NULL;
348 PTRACE_TASK(l, "should go to release queue. "
349 "scheduled_on=%d present=%d\n",
350 tsk_rt(l)->scheduled_on,
351 tsk_rt(l)->present);
352 }
353 }
354 }
355}
356
357static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
358{
359 int cpu;
360 if (tsk_rt(t)->scheduled_on != NO_CPU) {
361 /* always observe scheduled_on linkage */
362 default_cpu = tsk_rt(t)->scheduled_on;
363 } else if (tsk_pfair(t)->last_quantum == time - 1) {
364 /* back2back quanta */
365 /* Only observe last_quantum if no scheduled_on is in the way.
366 * This should only kick in if a CPU missed quanta, and that
367 * *should* only happen in QEMU.
368 */
369 cpu = tsk_pfair(t)->last_cpu;
370 if (!pstate[cpu]->linked ||
371 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
372 default_cpu = cpu;
373 }
374 }
375 return default_cpu;
376}
377
378/* returns one if linking was redirected */
379static int pfair_link(quanta_t time, int cpu,
380 struct task_struct* t)
381{
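	/* Note: target_cpu() may redirect the link away from "cpu", e.g., to
	 * honor an existing scheduled_on binding. Returning 1 tells
	 * schedule_subtasks() that this CPU's link is still unresolved and
	 * that it should retry with the next ready task; returning 0 means
	 * this CPU's link slot has been settled (possibly with a task
	 * swapped in from the redirect target).
	 */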
382 int target = target_cpu(time, t, cpu);
383 struct task_struct* prev = pstate[cpu]->linked;
384 struct task_struct* other;
385 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
386
387 if (target != cpu) {
388 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
389 other = pstate[target]->linked;
390 pstate[target]->linked = t;
391 tsk_rt(t)->linked_on = target;
392 if (!other)
393 /* linked ok, but reschedule this CPU */
394 return 1;
395 if (target < cpu) {
396 /* link other to cpu instead */
397 tsk_rt(other)->linked_on = cpu;
398 pstate[cpu]->linked = other;
399 if (prev) {
400 /* prev got pushed back into the ready queue */
401 tsk_rt(prev)->linked_on = NO_CPU;
402 __add_ready(&cluster->pfair, prev);
403 }
404 /* we are done with this cpu */
405 return 0;
406 } else {
407 /* re-add other, its original CPU was not considered yet */
408 tsk_rt(other)->linked_on = NO_CPU;
409 __add_ready(&cluster->pfair, other);
410 /* reschedule this CPU */
411 return 1;
412 }
413 } else {
414 pstate[cpu]->linked = t;
415 tsk_rt(t)->linked_on = cpu;
416 if (prev) {
417 /* prev got pushed back into the ready queue */
418 tsk_rt(prev)->linked_on = NO_CPU;
419 __add_ready(&cluster->pfair, prev);
420 }
421 /* we are done with this CPU */
422 return 0;
423 }
424}
425
426static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
427{
428 int retry;
429 struct list_head *pos;
430 struct pfair_state *cpu_state;
431
432 list_for_each(pos, &cluster->topology.cpus) {
433 cpu_state = from_cluster_list(pos);
434 retry = 1;
435#ifdef CONFIG_RELEASE_MASTER
436 /* skip release master */
437 if (cluster->pfair.release_master == cpu_id(cpu_state))
438 continue;
439#endif
440 while (retry) {
441 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
442 cpu_state->linked))
443 retry = pfair_link(time, cpu_id(cpu_state),
444 __take_ready(&cluster->pfair));
445 else
446 retry = 0;
447 }
448 }
449}
450
451static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
452{
453 struct pfair_state *cpu;
454 struct list_head* pos;
455
456 /* called with interrupts disabled */
457 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
458 time, litmus_clock());
459 raw_spin_lock(cluster_lock(cluster));
460 PTRACE("<<< Q %lu at %llu\n",
461 time, litmus_clock());
462
463 sched_trace_quantum_boundary();
464
465 advance_subtasks(cluster, time);
466 poll_releases(cluster);
467 schedule_subtasks(cluster, time);
468
469 list_for_each(pos, &cluster->topology.cpus) {
470 cpu = from_cluster_list(pos);
471 if (cpu->linked)
472 PTRACE_TASK(cpu->linked,
473 " linked on %d.\n", cpu_id(cpu));
474 else
475 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
476 }
477 /* We are done. Advance time. */
478 mb();
479 list_for_each(pos, &cluster->topology.cpus) {
480 cpu = from_cluster_list(pos);
481 if (cpu->local_tick != cpu->cur_tick) {
482 TRACE("BAD Quantum not acked on %d "
483 "(l:%lu c:%lu p:%lu)\n",
484 cpu_id(cpu),
485 cpu->local_tick,
486 cpu->cur_tick,
487 cluster->pfair_time);
488 cpu->missed_quanta++;
489 }
490 cpu->cur_tick = time;
491 }
492 PTRACE(">>> Q %lu at %llu\n",
493 time, litmus_clock());
494 raw_spin_unlock(cluster_lock(cluster));
495}
496
497static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
498{
499 quanta_t loc;
500
501 goto first; /* skip mb() on first iteration */
502 do {
503 cpu_relax();
504 mb();
505 first: loc = state->cur_tick;
506 /* FIXME: what if loc > cur? */
507 } while (time_before(loc, q));
508 PTRACE("observed cur_tick:%lu >= q:%lu\n",
509 loc, q);
510}
511
512static quanta_t current_quantum(struct pfair_state* state)
513{
514 lt_t t = litmus_clock() - state->offset;
515 return time2quanta(t, FLOOR);
516}
517
518static void catchup_quanta(quanta_t from, quanta_t target,
519 struct pfair_state* state)
520{
521 quanta_t cur = from, time;
522 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
523 from, target);
524 while (time_before(cur, target)) {
525 wait_for_quantum(cur, state);
526 cur++;
527 time = cmpxchg(&cpu_cluster(state)->pfair_time,
528 cur - 1, /* expected */
529 cur /* next */
530 );
531 if (time == cur - 1)
532 schedule_next_quantum(cpu_cluster(state), cur);
533 }
534 TRACE("+++> catching up done\n");
535}
536
537/* pfair_tick - this function is called for every local timer
538 * interrupt.
539 */
540static void pfair_tick(struct task_struct* t)
541{
542 struct pfair_state* state = &__get_cpu_var(pfair_state);
543 quanta_t time, cur;
544 int retry = 10;
545
546 do {
547 cur = current_quantum(state);
548 PTRACE("q %lu at %llu\n", cur, litmus_clock());
549
550 /* Attempt to advance time. First CPU to get here
551 * will prepare the next quantum.
552 */
553 time = cmpxchg(&cpu_cluster(state)->pfair_time,
554 cur - 1, /* expected */
555 cur /* next */
556 );
557 if (time == cur - 1) {
558 /* exchange succeeded */
559 wait_for_quantum(cur - 1, state);
560 schedule_next_quantum(cpu_cluster(state), cur);
561 retry = 0;
562 } else if (time_before(time, cur - 1)) {
563 /* the whole system missed a tick!? */
564 catchup_quanta(time, cur, state);
565 retry--;
566 } else if (time_after(time, cur)) {
567 /* is our timer lagging behind!? */
568 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
569 retry--;
570 } else {
571 /* Some other CPU already started scheduling
572 * this quantum. Let it do its job and then update.
573 */
574 retry = 0;
575 }
576 } while (retry);
577
578 /* Spin locally until time advances. */
579 wait_for_quantum(cur, state);
580
581 /* copy assignment */
582 /* FIXME: what if we race with a future update? Corrupted state? */
583 state->local = state->linked;
584 /* signal that we are done */
585 mb();
586 state->local_tick = state->cur_tick;
587
588 if (state->local != current
589 && (is_realtime(current) || is_present(state->local)))
590 litmus_reschedule_local();
591}
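/* Note on the protocol above: pfair_time acts as the cluster-wide quantum
 * counter. The cmpxchg() ensures that exactly one CPU per cluster wins the
 * race to advance it from cur - 1 to cur and therefore calls
 * schedule_next_quantum(); every other CPU simply spins in
 * wait_for_quantum() until its cur_tick catches up, and then publishes its
 * local copy of the link.
 */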
592
593static int safe_to_schedule(struct task_struct* t, int cpu)
594{
595 int where = tsk_rt(t)->scheduled_on;
596 if (where != NO_CPU && where != cpu) {
597 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
598 "scheduled already on %d.\n", cpu, where);
599 return 0;
600 } else
601 return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING;
602}
603
604static struct task_struct* pfair_schedule(struct task_struct * prev)
605{
606 struct pfair_state* state = &__get_cpu_var(pfair_state);
607 struct pfair_cluster* cluster = cpu_cluster(state);
608 int blocks, completion, out_of_time;
609 struct task_struct* next = NULL;
610
611#ifdef CONFIG_RELEASE_MASTER
612 /* Bail out early if we are the release master.
613 * The release master never schedules any real-time tasks.
614 */
615 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
616 sched_state_task_picked();
617 return NULL;
618 }
619#endif
620
621 raw_spin_lock(cpu_lock(state));
622
623 blocks = is_realtime(prev) && !is_running(prev);
624 completion = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP;
625 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
626 state->local_tick);
627
628 if (is_realtime(prev))
629 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
630 blocks, completion, out_of_time);
631
632 if (completion) {
633 sched_trace_task_completion(prev, 0);
634 pfair_prepare_next_period(prev);
635 prepare_release(prev, cur_release(prev));
636 }
637
638 if (!blocks && (completion || out_of_time)) {
639 drop_all_references(prev);
640 sched_trace_task_release(prev);
641 add_release(&cluster->pfair, prev);
642 }
643
644 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
645 next = state->local;
646
647 if (prev != next) {
648 tsk_rt(prev)->scheduled_on = NO_CPU;
649 if (next)
650 tsk_rt(next)->scheduled_on = cpu_id(state);
651 }
652 sched_state_task_picked();
653 raw_spin_unlock(cpu_lock(state));
654
655 if (next)
656 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
657 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
658 else if (is_realtime(prev))
659 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
660
661 return next;
662}
663
664static void pfair_task_new(struct task_struct * t, int on_rq, int running)
665{
666 unsigned long flags;
667 struct pfair_cluster* cluster;
668
669 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
670
671 cluster = tsk_pfair(t)->cluster;
672
673 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
674
675 prepare_release(t, cluster->pfair_time + 1);
676
677 t->rt_param.scheduled_on = NO_CPU;
678
679 if (running) {
680#ifdef CONFIG_RELEASE_MASTER
681 if (task_cpu(t) != cluster->pfair.release_master)
682#endif
683 t->rt_param.scheduled_on = task_cpu(t);
684 __add_ready(&cluster->pfair, t);
685 }
686
687 check_preempt(t);
688
689 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
690}
691
692static void pfair_task_wake_up(struct task_struct *t)
693{
694 unsigned long flags;
695 lt_t now;
696 int requeue = 0;
697 struct pfair_cluster* cluster;
698
699 cluster = tsk_pfair(t)->cluster;
700
701 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
702 litmus_clock(), cur_release(t), cluster->pfair_time);
703
704 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
705
706 /* If a task blocks and wakes before its next job release,
707 * then it may resume if it is currently linked somewhere
708 * (as if it never blocked at all). Otherwise, we have a
709 * new sporadic job release.
710 */
711 requeue = tsk_rt(t)->flags == RT_F_REQUEUE;
712 now = litmus_clock();
713 if (lt_before(get_deadline(t), now)) {
714 TRACE_TASK(t, "sporadic release!\n");
715 release_at(t, now);
716 prepare_release(t, time2quanta(now, CEIL));
717 sched_trace_task_release(t);
718 }
719
720 /* only add to ready queue if the task isn't still linked somewhere */
721 if (requeue) {
722 TRACE_TASK(t, "requeueing required\n");
723 tsk_rt(t)->flags = RT_F_RUNNING;
724 __add_ready(&cluster->pfair, t);
725 }
726
727 check_preempt(t);
728
729 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
730 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
731}
732
733static void pfair_task_block(struct task_struct *t)
734{
735 BUG_ON(!is_realtime(t));
736 TRACE_TASK(t, "blocks at %llu, state:%d\n",
737 litmus_clock(), t->state);
738}
739
740static void pfair_task_exit(struct task_struct * t)
741{
742 unsigned long flags;
743 struct pfair_cluster *cluster;
744
745 BUG_ON(!is_realtime(t));
746
747 cluster = tsk_pfair(t)->cluster;
748
749 /* Remove task from release or ready queue, and ensure
750 * that it is not the scheduled task for ANY CPU. We
751 * do this blanket check because occasionally when
752 * tasks exit while blocked, the task_cpu of the task
753 * might not be the same as the CPU that the PFAIR scheduler
754 * has chosen for it.
755 */
756 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
757
758 TRACE_TASK(t, "RIP, state:%d\n", t->state);
759 drop_all_references(t);
760
761 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
762
763 kfree(t->rt_param.pfair);
764 t->rt_param.pfair = NULL;
765}
766
767
768static void pfair_release_at(struct task_struct* task, lt_t start)
769{
770 unsigned long flags;
771 quanta_t release;
772
773 struct pfair_cluster *cluster;
774
775 cluster = tsk_pfair(task)->cluster;
776
777 BUG_ON(!is_realtime(task));
778
779 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
780 release_at(task, start);
781 release = time2quanta(start, CEIL);
782
783 TRACE_TASK(task, "sys release at %lu\n", release);
784
785 drop_all_references(task);
786 prepare_release(task, release);
787 add_release(&cluster->pfair, task);
788
789 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
790}
791
792static void init_subtask(struct subtask* sub, unsigned long i,
793 lt_t quanta, lt_t period)
794{
795 /* since i is zero-based, the formulas are shifted by one */
796 lt_t tmp;
797
798 /* release */
799 tmp = period * i;
800 do_div(tmp, quanta); /* floor */
801 sub->release = (quanta_t) tmp;
802
803 /* deadline */
804 tmp = period * (i + 1);
805 if (do_div(tmp, quanta)) /* ceil */
806 tmp++;
807 sub->deadline = (quanta_t) tmp;
808
809 /* next release */
810 tmp = period * (i + 1);
811 do_div(tmp, quanta); /* floor */
812 sub->overlap = sub->deadline - (quanta_t) tmp;
813
814 /* Group deadline.
815 * Based on the formula given in Uma's thesis.
816 */
817 if (2 * quanta >= period) {
818 /* heavy */
819 tmp = (sub->deadline - (i + 1)) * period;
820 if (period > quanta &&
821 do_div(tmp, (period - quanta))) /* ceil */
822 tmp++;
823 sub->group_deadline = (quanta_t) tmp;
824 } else
825 sub->group_deadline = 0;
826}
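/* Worked example of the formulas above for a hypothetical task with
 * quanta = 3 and period = 4 (weight 3/4): its three subtasks get
 * release = {0, 1, 2}, deadline = {2, 3, 4}, b-bit = {1, 1, 0}, and
 * group deadline = {4, 4, 4}, all in quanta relative to the job release.
 * A task with quanta = 3 and period = 10 is light (2 * 3 < 10), so all of
 * its group deadlines are 0.
 */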
827
828static void dump_subtasks(struct task_struct* t)
829{
830 unsigned long i;
831 for (i = 0; i < t->rt_param.pfair->quanta; i++)
832 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
833 i + 1,
834 t->rt_param.pfair->subtasks[i].release,
835 t->rt_param.pfair->subtasks[i].deadline,
836 t->rt_param.pfair->subtasks[i].overlap,
837 t->rt_param.pfair->subtasks[i].group_deadline);
838}
839
840static long pfair_admit_task(struct task_struct* t)
841{
842 lt_t quanta;
843 lt_t period;
844 s64 quantum_length = ktime_to_ns(tick_period);
845 struct pfair_param* param;
846 unsigned long i;
847
848 /* first check that the task is in the right cluster */
849 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
850 cpu_cluster(pstate[task_cpu(t)]))
851 return -EINVAL;
852
853 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
854 printk(KERN_INFO "%s: Admission rejected. "
855 "Only implicit deadlines are currently supported.\n",
856 litmus->plugin_name);
857 return -EINVAL;
858 }
859
860 /* Pfair is a tick-based method, so the time
861 * of interest is jiffies. Calculate tick-based
862 * times for everything.
863 * (Ceiling of exec cost, floor of period.)
864 */
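	/* For example, assuming HZ=1000 (1 ms quanta): an execution cost of
	 * 1.5 ms is rounded up to quanta = 2, and a 10 ms period becomes
	 * period = 10; the do_div() below warns if the period is not a
	 * whole number of quanta.
	 */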
865
866 quanta = get_exec_cost(t);
867 period = get_rt_period(t);
868
869 quanta = time2quanta(get_exec_cost(t), CEIL);
870
871 if (do_div(period, quantum_length))
872 printk(KERN_WARNING
873 "The period of %s/%d is not a multiple of %llu.\n",
874 t->comm, t->pid, (unsigned long long) quantum_length);
875
876 if (quanta == period) {
877 /* special case: task has weight 1.0 */
878 printk(KERN_INFO
879 "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
880 t->comm, t->pid, quanta, period);
881 quanta = 1;
882 period = 1;
883 }
884
885 param = kmalloc(sizeof(*param) +
886 quanta * sizeof(struct subtask), GFP_ATOMIC);
887
888 if (!param)
889 return -ENOMEM;
890
891 param->quanta = quanta;
892 param->cur = 0;
893 param->release = 0;
894 param->period = period;
895
896 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
897
898 for (i = 0; i < quanta; i++)
899 init_subtask(param->subtasks + i, i, quanta, period);
900
901 if (t->rt_param.pfair)
902 /* get rid of stale allocation */
903 kfree(t->rt_param.pfair);
904
905 t->rt_param.pfair = param;
906
907 /* spew out some debug info */
908 dump_subtasks(t);
909
910 return 0;
911}
912
913static void pfair_init_cluster(struct pfair_cluster* cluster)
914{
915 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
916 bheap_init(&cluster->release_queue);
917 raw_spin_lock_init(&cluster->release_lock);
918 INIT_LIST_HEAD(&cluster->topology.cpus);
919}
920
921static void cleanup_clusters(void)
922{
923 int i;
924
925 if (num_pfair_clusters)
926 kfree(pfair_clusters);
927 pfair_clusters = NULL;
928 num_pfair_clusters = 0;
929
930 /* avoid stale pointers */
931 for (i = 0; i < num_online_cpus(); i++) {
932 pstate[i]->topology.cluster = NULL;
933 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
934 pstate[i]->missed_updates, pstate[i]->missed_quanta);
935 }
936}
937
938static long pfair_activate_plugin(void)
939{
940 int err, i;
941 struct pfair_state* state;
942 struct pfair_cluster* cluster;
943 quanta_t now;
944 int cluster_size;
945 struct cluster_cpu* cpus[NR_CPUS];
946 struct scheduling_cluster* clust[NR_CPUS];
947
948 cluster_size = get_cluster_size(pfair_cluster_level);
949
950 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
951 return -EINVAL;
952
953 num_pfair_clusters = num_online_cpus() / cluster_size;
954
955 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
956 if (!pfair_clusters) {
957 num_pfair_clusters = 0;
958 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
959 return -ENOMEM;
960 }
961
962 state = &__get_cpu_var(pfair_state);
963 now = current_quantum(state);
964 TRACE("Activating PFAIR at q=%lu\n", now);
965
966 for (i = 0; i < num_pfair_clusters; i++) {
967 cluster = &pfair_clusters[i];
968 pfair_init_cluster(cluster);
969 cluster->pfair_time = now;
970 clust[i] = &cluster->topology;
971#ifdef CONFIG_RELEASE_MASTER
972 cluster->pfair.release_master = atomic_read(&release_master_cpu);
973#endif
974 }
975
976 for (i = 0; i < num_online_cpus(); i++) {
977 state = &per_cpu(pfair_state, i);
978 state->cur_tick = now;
979 state->local_tick = now;
980 state->missed_quanta = 0;
981 state->missed_updates = 0;
982 state->offset = cpu_stagger_offset(i);
983 printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus());
984 cpus[i] = &state->topology;
985 }
986
987 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
988 cpus, num_online_cpus());
989
990 if (err < 0)
991 cleanup_clusters();
992
993 return err;
994}
995
996static long pfair_deactivate_plugin(void)
997{
998 cleanup_clusters();
999 return 0;
1000}
1001
1002/* Plugin object */
1003static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1004 .plugin_name = "PFAIR",
1005 .tick = pfair_tick,
1006 .task_new = pfair_task_new,
1007 .task_exit = pfair_task_exit,
1008 .schedule = pfair_schedule,
1009 .task_wake_up = pfair_task_wake_up,
1010 .task_block = pfair_task_block,
1011 .admit_task = pfair_admit_task,
1012 .release_at = pfair_release_at,
1013 .complete_job = complete_job,
1014 .activate_plugin = pfair_activate_plugin,
1015 .deactivate_plugin = pfair_deactivate_plugin,
1016};
1017
1018
1019static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1020
1021static int __init init_pfair(void)
1022{
1023 int cpu, err, fs;
1024 struct pfair_state *state;
1025
1026 /*
1027 * Initialize the shortcut to the per-cpu pfair state. There may be a
1028 * problem here if someone removes a CPU while we are doing this
1029 * initialization, and if CPUs are added or removed later, but we do
1030 * not support CPU hotplug at the moment anyway.
1031 */
1032 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1033
1034 /* initialize CPU state */
1035 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1036 state = &per_cpu(pfair_state, cpu);
1037 state->topology.id = cpu;
1038 state->cur_tick = 0;
1039 state->local_tick = 0;
1040 state->linked = NULL;
1041 state->local = NULL;
1042 state->scheduled = NULL;
1043 state->missed_quanta = 0;
1044 state->offset = cpu_stagger_offset(cpu);
1045 pstate[cpu] = state;
1046 }
1047
1048 pfair_clusters = NULL;
1049 num_pfair_clusters = 0;
1050
1051 err = register_sched_plugin(&pfair_plugin);
1052 if (!err) {
1053 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1054 if (!fs)
1055 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1056 else
1057 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1058 }
1059
1060 return err;
1061}
1062
1063static void __exit clean_pfair(void)
1064{
1065 kfree(pstate);
1066
1067 if (cluster_file)
1068 remove_proc_entry("cluster", pfair_dir);
1069 if (pfair_dir)
1070 remove_plugin_proc_dir(&pfair_plugin);
1071}
1072
1073module_init(init_pfair);
1074module_exit(clean_pfair);