Diffstat (limited to 'litmus/sched_pfair.c')
-rw-r--r--  litmus/sched_pfair.c  1056
1 file changed, 1056 insertions, 0 deletions
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..c95bde87b5d7
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1056 @@
1/*
2 * kernel/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24
25#include <litmus/bheap.h>
26
27/* to configure the cluster size */
28#include <litmus/litmus_proc.h>
29
30#include <litmus/clustered.h>
31
32static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
33
34struct subtask {
35 /* measured in quanta relative to job release */
36 quanta_t release;
37 quanta_t deadline;
38 quanta_t overlap; /* called "b bit" by PD^2 */
39 quanta_t group_deadline;
40};
41
42struct pfair_param {
43 quanta_t quanta; /* number of subtasks */
44 quanta_t cur; /* index of current subtask */
45
46 quanta_t release; /* in quanta */
47 quanta_t period; /* in quanta */
48
49 quanta_t last_quantum; /* when scheduled last */
50 int last_cpu; /* where scheduled last */
51
52 struct pfair_cluster* cluster; /* where this task is scheduled */
53
54 struct subtask subtasks[0]; /* allocate together with pfair_param */
55};
56
57#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
58
59struct pfair_state {
60 struct cluster_cpu topology;
61
62 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
63 * the time */
64 volatile quanta_t local_tick; /* What tick is the local CPU currently
65 * executing? Updated only by the local
66 * CPU. In QEMU, this may lag behind the
67 * current tick. In a real system, with
68 * proper timers and aligned quanta,
69 * that should only be the case for a
70 * very short time after the time
71 * advanced. With staggered quanta, it
72 * will lag for the duration of the
73 * offset.
74 */
75
76 struct task_struct* linked; /* the task that should be executing */
77 struct task_struct* local; /* the local copy of linked */
78 struct task_struct* scheduled; /* what is actually scheduled */
79
80 lt_t offset; /* stagger offset */
81 unsigned int missed_updates;
82 unsigned int missed_quanta;
83};
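/* Illustration of the stagger offset mentioned above. This is only a
 * sketch; it assumes that cpu_stagger_offset() spreads the CPUs evenly
 * across one quantum, which depends on how LITMUS^RT is configured:
 *
 *	lt_t quantum_len = ktime_to_ns(tick_period);	/* e.g., 1 ms */
 *	lt_t offset      = (quantum_len / num_online_cpus()) * cpu;
 *
 * With four CPUs and 1 ms quanta, CPUs 0..3 would then cross quantum
 * boundaries 0, 250, 500, and 750 us into each global quantum, so
 * local_tick may lag cur_tick by at most that per-CPU offset.
 */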
84
85struct pfair_cluster {
86 struct scheduling_cluster topology;
87
88 /* The "global" time in this cluster. */
89 quanta_t pfair_time; /* the "official" PFAIR clock */
90
91 /* The ready queue for this cluster. */
92 rt_domain_t pfair;
93
94 /* The set of jobs that should have their release enacted at the next
95 * quantum boundary.
96 */
97 struct bheap release_queue;
98 raw_spinlock_t release_lock;
99};
100
101static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
102{
103 return container_of(state->topology.cluster, struct pfair_cluster, topology);
104}
105
106static inline int cpu_id(struct pfair_state* state)
107{
108 return state->topology.id;
109}
110
111static inline struct pfair_state* from_cluster_list(struct list_head* pos)
112{
113 return list_entry(pos, struct pfair_state, topology.cluster_list);
114}
115
116static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
117{
118 return container_of(rt, struct pfair_cluster, pfair);
119}
120
121static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
122{
123 /* The ready_lock is used to serialize all scheduling events. */
124 return &cluster->pfair.ready_lock;
125}
126
127static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
128{
129 return cluster_lock(cpu_cluster(state));
130}
131
132DEFINE_PER_CPU(struct pfair_state, pfair_state);
133struct pfair_state **pstate; /* shortcut */
134
135static struct pfair_cluster* pfair_clusters;
136static int num_pfair_clusters;
137
138/* Enable for lots of trace info.
139 * #define PFAIR_DEBUG
140 */
141
142#ifdef PFAIR_DEBUG
143#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
144#define PTRACE(f, args...) TRACE(f, ## args)
145#else
146#define PTRACE_TASK(t, f, args...)
147#define PTRACE(f, args...)
148#endif
149
150/* gcc will inline all of these accessor functions... */
151static struct subtask* cur_subtask(struct task_struct* t)
152{
153 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
154}
155
156static quanta_t cur_deadline(struct task_struct* t)
157{
158 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
159}
160
161static quanta_t cur_release(struct task_struct* t)
162{
163 /* This is early releasing: only the release of the first subtask
164 * counts. */
165 return tsk_pfair(t)->release;
166}
167
168static quanta_t cur_overlap(struct task_struct* t)
169{
170 return cur_subtask(t)->overlap;
171}
172
173static quanta_t cur_group_deadline(struct task_struct* t)
174{
175 quanta_t gdl = cur_subtask(t)->group_deadline;
176 if (gdl)
177 return gdl + tsk_pfair(t)->release;
178 else
179 return gdl;
180}
181
182
183static int pfair_higher_prio(struct task_struct* first,
184 struct task_struct* second)
185{
186 return /* first task must exist */
187 first && (
188 /* Does the second task exist and is it a real-time task? If
189 * not, the first task (which is an RT task) has higher
190 * priority.
191 */
192 !second || !is_realtime(second) ||
193
194 /* Is the (subtask) deadline of the first task earlier?
195 * Then it has higher priority.
196 */
197 time_before(cur_deadline(first), cur_deadline(second)) ||
198
199 /* Do we have a deadline tie?
200 * Then break by B-bit.
201 */
202 (cur_deadline(first) == cur_deadline(second) &&
203 (cur_overlap(first) > cur_overlap(second) ||
204
205 /* Do we have a B-bit tie?
206 * Then break by group deadline.
207 */
208 (cur_overlap(first) == cur_overlap(second) &&
209 (time_after(cur_group_deadline(first),
210 cur_group_deadline(second)) ||
211
212 /* Do we have a group deadline tie?
213 * Then break by PID, which is unique.
214 */
215 (cur_group_deadline(first) ==
216 cur_group_deadline(second) &&
217 first->pid < second->pid))))));
218}
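/* Worked example of the PD^2 tie-breaking chain above (all numbers are
 * hypothetical): two subtasks with equal deadline 10 are ordered by the
 * larger b-bit; with equal b-bits, the *later* group deadline wins
 * (e.g., 12 beats 11); with equal group deadlines, the smaller PID
 * wins. Written as a lexicographic comparison, the chain reads:
 *
 *	if (dl1 != dl2)   return time_before(dl1, dl2);   // earlier deadline
 *	if (b1 != b2)     return b1 > b2;                 // larger b-bit
 *	if (gdl1 != gdl2) return time_after(gdl1, gdl2);  // later group deadline
 *	return pid1 < pid2;                               // smaller PID
 */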
219
220int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
221{
222 return pfair_higher_prio(bheap2task(a), bheap2task(b));
223}
224
225static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
226{
227 struct pfair_cluster* cluster = from_domain(rt);
228 unsigned long flags;
229
230 raw_spin_lock_irqsave(&cluster->release_lock, flags);
231
232 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
233
234 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
235}
236
237static void prepare_release(struct task_struct* t, quanta_t at)
238{
239 tsk_pfair(t)->release = at;
240 tsk_pfair(t)->cur = 0;
241}
242
243/* pull released tasks from the release queue */
244static void poll_releases(struct pfair_cluster* cluster)
245{
246 raw_spin_lock(&cluster->release_lock);
247 __merge_ready(&cluster->pfair, &cluster->release_queue);
248 raw_spin_unlock(&cluster->release_lock);
249}
250
251static void check_preempt(struct task_struct* t)
252{
253 int cpu = NO_CPU;
254 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
255 tsk_rt(t)->present) {
256 /* the task can be scheduled and
257 * is not scheduled where it ought to be scheduled
258 */
259 cpu = tsk_rt(t)->linked_on != NO_CPU ?
260 tsk_rt(t)->linked_on :
261 tsk_rt(t)->scheduled_on;
262 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
263 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
264 /* preempt */
265 litmus_reschedule(cpu);
266 }
267}
268
269/* caller must hold pfair.ready_lock */
270static void drop_all_references(struct task_struct *t)
271{
272 int cpu;
273 struct pfair_state* s;
274 struct pfair_cluster* cluster;
275 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
276 /* It must be in the ready queue; drop references isn't called
277 * when the job is in a release queue. */
278 cluster = tsk_pfair(t)->cluster;
279 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
280 tsk_rt(t)->heap_node);
281 }
282 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
283 s = &per_cpu(pfair_state, cpu);
284 if (s->linked == t)
285 s->linked = NULL;
286 if (s->local == t)
287 s->local = NULL;
288 if (s->scheduled == t)
289 s->scheduled = NULL;
290 }
291}
292
293static void pfair_prepare_next_period(struct task_struct* t)
294{
295 struct pfair_param* p = tsk_pfair(t);
296
297 prepare_for_next_period(t);
298 get_rt_flags(t) = RT_F_RUNNING;
299 p->release += p->period;
300}
301
302/* returns 1 if the task needs to go to the release queue */
303static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
304{
305 struct pfair_param* p = tsk_pfair(t);
306 int to_relq;
307 p->cur = (p->cur + 1) % p->quanta;
308 if (!p->cur) {
309 if (tsk_rt(t)->present) {
310 /* The job overran; we start a new budget allocation. */
311 pfair_prepare_next_period(t);
312 } else {
313 /* remove task from system until it wakes */
314 drop_all_references(t);
315 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
316 cpu, p->cur);
317 return 0;
318 }
319 }
320 to_relq = time_after(cur_release(t), time);
321 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
322 cpu, p->cur, to_relq, cur_release(t), time);
323 return to_relq;
324}
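/* Worked example: for a task with quanta = 3 and period = 5 (see
 * init_subtask() below), the third quantum of a job wraps p->cur back
 * to 0 and pfair_prepare_next_period() advances p->release by 5 quanta.
 * Because of early releasing, to_relq is set only if that job release
 * still lies in the future; a tardy job therefore keeps executing
 * back-to-back quanta instead of being parked in the release queue.
 */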
325
326static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
327{
328 struct task_struct* l;
329 struct pfair_param* p;
330 struct list_head* pos;
331 struct pfair_state* cpu;
332
333 list_for_each(pos, &cluster->topology.cpus) {
334 cpu = from_cluster_list(pos);
335 l = cpu->linked;
336 cpu->missed_updates += cpu->linked != cpu->local;
337 if (l) {
338 p = tsk_pfair(l);
339 p->last_quantum = time;
340 p->last_cpu = cpu_id(cpu);
341 if (advance_subtask(time, l, cpu_id(cpu))) {
342 //cpu->linked = NULL;
343 PTRACE_TASK(l, "should go to release queue. "
344 "scheduled_on=%d present=%d\n",
345 tsk_rt(l)->scheduled_on,
346 tsk_rt(l)->present);
347 }
348 }
349 }
350}
351
352static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
353{
354 int cpu;
355 if (tsk_rt(t)->scheduled_on != NO_CPU) {
356 /* always observe scheduled_on linkage */
357 default_cpu = tsk_rt(t)->scheduled_on;
358 } else if (tsk_pfair(t)->last_quantum == time - 1) {
359 /* back2back quanta */
360 /* Only observe last_quantum if no scheduled_on is in the way.
361 * This should only kick in if a CPU missed quanta, and that
362 * *should* only happen in QEMU.
363 */
364 cpu = tsk_pfair(t)->last_cpu;
365 if (!pstate[cpu]->linked ||
366 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
367 default_cpu = cpu;
368 }
369 }
370 return default_cpu;
371}
372
373/* returns one if linking was redirected */
374static int pfair_link(quanta_t time, int cpu,
375 struct task_struct* t)
376{
377 int target = target_cpu(time, t, cpu);
378 struct task_struct* prev = pstate[cpu]->linked;
379 struct task_struct* other;
380 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
381
382 if (target != cpu) {
383 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
384 other = pstate[target]->linked;
385 pstate[target]->linked = t;
386 tsk_rt(t)->linked_on = target;
387 if (!other)
388 /* linked ok, but reschedule this CPU */
389 return 1;
390 if (target < cpu) {
391 /* link other to cpu instead */
392 tsk_rt(other)->linked_on = cpu;
393 pstate[cpu]->linked = other;
394 if (prev) {
395 /* prev got pushed back into the ready queue */
396 tsk_rt(prev)->linked_on = NO_CPU;
397 __add_ready(&cluster->pfair, prev);
398 }
399 /* we are done with this cpu */
400 return 0;
401 } else {
402 /* re-add other; its original CPU was not considered yet */
403 tsk_rt(other)->linked_on = NO_CPU;
404 __add_ready(&cluster->pfair, other);
405 /* reschedule this CPU */
406 return 1;
407 }
408 } else {
409 pstate[cpu]->linked = t;
410 tsk_rt(t)->linked_on = cpu;
411 if (prev) {
412 /* prev got pushed back into the ready queue */
413 tsk_rt(prev)->linked_on = NO_CPU;
414 __add_ready(&cluster->pfair, prev);
415 }
416 /* we are done with this CPU */
417 return 0;
418 }
419}
420
421static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
422{
423 int retry;
424 struct list_head *pos;
425 struct pfair_state *cpu_state;
426
427 list_for_each(pos, &cluster->topology.cpus) {
428 cpu_state = from_cluster_list(pos);
429 retry = 1;
430#ifdef CONFIG_RELEASE_MASTER
431 /* skip release master */
432 if (cluster->pfair.release_master == cpu_id(cpu_state))
433 continue;
434#endif
435 while (retry) {
436 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
437 cpu_state->linked))
438 retry = pfair_link(time, cpu_id(cpu_state),
439 __take_ready(&cluster->pfair));
440 else
441 retry = 0;
442 }
443 }
444}
445
446static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
447{
448 struct pfair_state *cpu;
449 struct list_head* pos;
450
451 /* called with interrupts disabled */
452 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
453 time, litmus_clock());
454 raw_spin_lock(cluster_lock(cluster));
455 PTRACE("<<< Q %lu at %llu\n",
456 time, litmus_clock());
457
458 sched_trace_quantum_boundary();
459
460 advance_subtasks(cluster, time);
461 poll_releases(cluster);
462 schedule_subtasks(cluster, time);
463
464 list_for_each(pos, &cluster->topology.cpus) {
465 cpu = from_cluster_list(pos);
466 if (cpu->linked)
467 PTRACE_TASK(cpu->linked,
468 " linked on %d.\n", cpu_id(cpu));
469 else
470 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
471 }
472 /* We are done. Advance time. */
473 mb();
474 list_for_each(pos, &cluster->topology.cpus) {
475 cpu = from_cluster_list(pos);
476 if (cpu->local_tick != cpu->cur_tick) {
477 TRACE("BAD Quantum not acked on %d "
478 "(l:%lu c:%lu p:%lu)\n",
479 cpu_id(cpu),
480 cpu->local_tick,
481 cpu->cur_tick,
482 cluster->pfair_time);
483 cpu->missed_quanta++;
484 }
485 cpu->cur_tick = time;
486 }
487 PTRACE(">>> Q %lu at %llu\n",
488 time, litmus_clock());
489 raw_spin_unlock(cluster_lock(cluster));
490}
491
492static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
493{
494 quanta_t loc;
495
496 goto first; /* skip mb() on first iteration */
497 do {
498 cpu_relax();
499 mb();
500 first: loc = state->cur_tick;
501 /* FIXME: what if loc > q? */
502 } while (time_before(loc, q));
503 PTRACE("observed cur_tick:%lu >= q:%lu\n",
504 loc, q);
505}
506
507static quanta_t current_quantum(struct pfair_state* state)
508{
509 lt_t t = litmus_clock() - state->offset;
510 return time2quanta(t, FLOOR);
511}
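/* Sketch of the time-to-quanta mapping used above. time2quanta() is
 * provided by litmus.h; the lines below only illustrate the assumed
 * semantics (division by the quantum length with FLOOR/CEIL rounding)
 * and are not its actual implementation:
 *
 *	lt_t q_len  = ktime_to_ns(tick_period);
 *	quanta_t fl = div64_u64(t, q_len);                 // FLOOR
 *	quanta_t cl = div64_u64(t + q_len - 1, q_len);     // CEIL
 *
 * Subtracting state->offset first makes each CPU evaluate quantum
 * boundaries on its own staggered grid.
 */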
512
513static void catchup_quanta(quanta_t from, quanta_t target,
514 struct pfair_state* state)
515{
516 quanta_t cur = from, time;
517 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
518 from, target);
519 while (time_before(cur, target)) {
520 wait_for_quantum(cur, state);
521 cur++;
522 time = cmpxchg(&cpu_cluster(state)->pfair_time,
523 cur - 1, /* expected */
524 cur /* next */
525 );
526 if (time == cur - 1)
527 schedule_next_quantum(cpu_cluster(state), cur);
528 }
529 TRACE("+++> catching up done\n");
530}
531
532/* pfair_tick - this function is called for every local timer
533 * interrupt.
534 */
535static void pfair_tick(struct task_struct* t)
536{
537 struct pfair_state* state = &__get_cpu_var(pfair_state);
538 quanta_t time, cur;
539 int retry = 10;
540
541 do {
542 cur = current_quantum(state);
543 PTRACE("q %lu at %llu\n", cur, litmus_clock());
544
545 /* Attempt to advance time. First CPU to get here
546 * will prepare the next quantum.
547 */
548 time = cmpxchg(&cpu_cluster(state)->pfair_time,
549 cur - 1, /* expected */
550 cur /* next */
551 );
552 if (time == cur - 1) {
553 /* exchange succeeded */
554 wait_for_quantum(cur - 1, state);
555 schedule_next_quantum(cpu_cluster(state), cur);
556 retry = 0;
557 } else if (time_before(time, cur - 1)) {
558 /* the whole system missed a tick !? */
559 catchup_quanta(time, cur, state);
560 retry--;
561 } else if (time_after(time, cur)) {
562 /* our timer is lagging behind!? */
563 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
564 retry--;
565 } else {
566 /* Some other CPU already started scheduling
567 * this quantum. Let it do its job and then update.
568 */
569 retry = 0;
570 }
571 } while (retry);
572
573 /* Spin locally until time advances. */
574 wait_for_quantum(cur, state);
575
576 /* copy assignment */
577 /* FIXME: what if we race with a future update? Corrupted state? */
578 state->local = state->linked;
579 /* signal that we are done */
580 mb();
581 state->local_tick = state->cur_tick;
582
583 if (state->local != current
584 && (is_realtime(current) || is_present(state->local)))
585 litmus_reschedule_local();
586}
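/* The quantum hand-off in pfair_tick() relies on cmpxchg() returning
 * the old value and succeeding only when it matches the expected one,
 * so at most one CPU advances pfair_time per quantum. A minimal sketch
 * with hypothetical values:
 *
 *	// pfair_time == 16
 *	time = cmpxchg(&pfair_time, 16, 17);
 *	// winner: time == 16, pfair_time is now 17 -> schedules quantum 17
 *	// losers: time == 17 (or later) -> another CPU is handling it
 */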
587
588static int safe_to_schedule(struct task_struct* t, int cpu)
589{
590 int where = tsk_rt(t)->scheduled_on;
591 if (where != NO_CPU && where != cpu) {
592 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
593 "scheduled already on %d.\n", cpu, where);
594 return 0;
595 } else
596 return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING;
597}
598
599static struct task_struct* pfair_schedule(struct task_struct * prev)
600{
601 struct pfair_state* state = &__get_cpu_var(pfair_state);
602 struct pfair_cluster* cluster = cpu_cluster(state);
603 int blocks, completion, out_of_time;
604 struct task_struct* next = NULL;
605
606#ifdef CONFIG_RELEASE_MASTER
607 /* Bail out early if we are the release master.
608 * The release master never schedules any real-time tasks.
609 */
610 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
611 sched_state_task_picked();
612 return NULL;
613 }
614#endif
615
616 raw_spin_lock(cpu_lock(state));
617
618 blocks = is_realtime(prev) && !is_running(prev);
619 completion = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP;
620 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
621 state->local_tick);
622
623 if (is_realtime(prev))
624 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
625 blocks, completion, out_of_time);
626
627 if (completion) {
628 sched_trace_task_completion(prev, 0);
629 pfair_prepare_next_period(prev);
630 prepare_release(prev, cur_release(prev));
631 }
632
633 if (!blocks && (completion || out_of_time)) {
634 drop_all_references(prev);
635 sched_trace_task_release(prev);
636 add_release(&cluster->pfair, prev);
637 }
638
639 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
640 next = state->local;
641
642 if (prev != next) {
643 tsk_rt(prev)->scheduled_on = NO_CPU;
644 if (next)
645 tsk_rt(next)->scheduled_on = cpu_id(state);
646 }
647 sched_state_task_picked();
648 raw_spin_unlock(cpu_lock(state));
649
650 if (next)
651 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
652 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
653 else if (is_realtime(prev))
654 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
655
656 return next;
657}
658
659static void pfair_task_new(struct task_struct * t, int on_rq, int running)
660{
661 unsigned long flags;
662 struct pfair_cluster* cluster;
663
664 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
665
666 cluster = tsk_pfair(t)->cluster;
667
668 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
669
670 prepare_release(t, cluster->pfair_time + 1);
671
672 t->rt_param.scheduled_on = NO_CPU;
673
674 if (running) {
675#ifdef CONFIG_RELEASE_MASTER
676 if (task_cpu(t) != cluster->pfair.release_master)
677#endif
678 t->rt_param.scheduled_on = task_cpu(t);
679 __add_ready(&cluster->pfair, t);
680 }
681
682 check_preempt(t);
683
684 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
685}
686
687static void pfair_task_wake_up(struct task_struct *t)
688{
689 unsigned long flags;
690 lt_t now;
691 struct pfair_cluster* cluster;
692
693 cluster = tsk_pfair(t)->cluster;
694
695 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
696 litmus_clock(), cur_release(t), cluster->pfair_time);
697
698 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
699
700 /* If a task blocks and wakes before its next job release,
701 * then it may resume if it is currently linked somewhere
702 * (as if it never blocked at all). Otherwise, we have a
703 * new sporadic job release.
704 */
705 now = litmus_clock();
706 if (lt_before(get_deadline(t), now)) {
707 release_at(t, now);
708 prepare_release(t, time2quanta(now, CEIL));
709 sched_trace_task_release(t);
710 }
711
712 /* only add to ready queue if the task isn't still linked somewhere */
713 if (tsk_rt(t)->linked_on == NO_CPU)
714 __add_ready(&cluster->pfair, t);
715
716 check_preempt(t);
717
718 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
719 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
720}
721
722static void pfair_task_block(struct task_struct *t)
723{
724 BUG_ON(!is_realtime(t));
725 TRACE_TASK(t, "blocks at %llu, state:%d\n",
726 litmus_clock(), t->state);
727}
728
729static void pfair_task_exit(struct task_struct * t)
730{
731 unsigned long flags;
732 struct pfair_cluster *cluster;
733
734 BUG_ON(!is_realtime(t));
735
736 cluster = tsk_pfair(t)->cluster;
737
738 /* Remove the task from the release or ready queue, and ensure
739 * that it is not the scheduled task for ANY CPU. We
740 * do this blanket check because occasionally, when
741 * tasks exit while blocked, the task_cpu of the task
742 * might not be the same as the CPU that the PFAIR scheduler
743 * has chosen for it.
744 */
745 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
746
747 TRACE_TASK(t, "RIP, state:%d\n", t->state);
748 drop_all_references(t);
749
750 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
751
752 kfree(t->rt_param.pfair);
753 t->rt_param.pfair = NULL;
754}
755
756
757static void pfair_release_at(struct task_struct* task, lt_t start)
758{
759 unsigned long flags;
760 quanta_t release;
761
762 struct pfair_cluster *cluster;
763
764 cluster = tsk_pfair(task)->cluster;
765
766 BUG_ON(!is_realtime(task));
767
768 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
769 release_at(task, start);
770 release = time2quanta(start, CEIL);
771
772 TRACE_TASK(task, "sys release at %lu\n", release);
773
774 drop_all_references(task);
775 prepare_release(task, release);
776 add_release(&cluster->pfair, task);
777
778 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
779}
780
781static void init_subtask(struct subtask* sub, unsigned long i,
782 lt_t quanta, lt_t period)
783{
784 /* since i is zero-based, the formulas are shifted by one */
785 lt_t tmp;
786
787 /* release */
788 tmp = period * i;
789 do_div(tmp, quanta); /* floor */
790 sub->release = (quanta_t) tmp;
791
792 /* deadline */
793 tmp = period * (i + 1);
794 if (do_div(tmp, quanta)) /* ceil */
795 tmp++;
796 sub->deadline = (quanta_t) tmp;
797
798 /* next release */
799 tmp = period * (i + 1);
800 do_div(tmp, quanta); /* floor */
801 sub->overlap = sub->deadline - (quanta_t) tmp;
802
803 /* Group deadline.
804 * Based on the formula given in Uma's thesis.
805 */
806 if (2 * quanta >= period) {
807 /* heavy */
808 tmp = (sub->deadline - (i + 1)) * period;
809 if (period > quanta &&
810 do_div(tmp, (period - quanta))) /* ceil */
811 tmp++;
812 sub->group_deadline = (quanta_t) tmp;
813 } else
814 sub->group_deadline = 0;
815}
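/* Worked example of the formulas above for a hypothetical task with
 * quanta = 3 and period = 5 (Pfair weight 3/5), i zero-based:
 *
 *	i | release floor(5i/3) | deadline ceil(5(i+1)/3) | b-bit | group deadline
 *	0 |          0          |            2            |   1   | ceil(1*5/2) = 3
 *	1 |          1          |            4            |   1   | ceil(2*5/2) = 5
 *	2 |          3          |            5            |   0   | ceil(2*5/2) = 5
 *
 * The b-bit is the deadline minus the next subtask's release (i.e.,
 * whether the two windows overlap), and the group deadline branch is
 * taken because 2 * quanta >= period ("heavy" task).
 */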
816
817static void dump_subtasks(struct task_struct* t)
818{
819 unsigned long i;
820 for (i = 0; i < t->rt_param.pfair->quanta; i++)
821 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
822 i + 1,
823 t->rt_param.pfair->subtasks[i].release,
824 t->rt_param.pfair->subtasks[i].deadline,
825 t->rt_param.pfair->subtasks[i].overlap,
826 t->rt_param.pfair->subtasks[i].group_deadline);
827}
828
829static long pfair_admit_task(struct task_struct* t)
830{
831 lt_t quanta;
832 lt_t period;
833 s64 quantum_length = ktime_to_ns(tick_period);
834 struct pfair_param* param;
835 unsigned long i;
836
837 /* first check that the task is in the right cluster */
838 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
839 cpu_cluster(pstate[task_cpu(t)]))
840 return -EINVAL;
841
842 /* Pfair is a tick-based method, so the time
843 * of interest is jiffies. Calculate tick-based
844 * times for everything.
845 * (Ceiling of exec cost, floor of period.)
846 */
847
848 period = get_rt_period(t);
849
850 quanta = time2quanta(get_exec_cost(t), CEIL);
852
853 if (do_div(period, quantum_length))
854 printk(KERN_WARNING
855 "The period of %s/%d is not a multiple of %llu.\n",
856 t->comm, t->pid, (unsigned long long) quantum_length);
857
858 if (quanta == period) {
859 /* special case: task has weight 1.0 */
860 printk(KERN_INFO
861 "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
862 t->comm, t->pid, quanta, period);
863 quanta = 1;
864 period = 1;
865 }
866
867 param = kmalloc(sizeof(*param) +
868 quanta * sizeof(struct subtask), GFP_ATOMIC);
869
870 if (!param)
871 return -ENOMEM;
872
873 param->quanta = quanta;
874 param->cur = 0;
875 param->release = 0;
876 param->period = period;
877
878 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
879
880 for (i = 0; i < quanta; i++)
881 init_subtask(param->subtasks + i, i, quanta, period);
882
883 if (t->rt_param.pfair)
884 /* get rid of stale allocation */
885 kfree(t->rt_param.pfair);
886
887 t->rt_param.pfair = param;
888
889 /* spew out some debug info */
890 dump_subtasks(t);
891
892 return 0;
893}
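/* Example of the admission-time conversion (hypothetical parameters,
 * assuming a 1 ms scheduling quantum): a task with an execution cost of
 * 2.3 ms and a period of 10 ms is admitted with
 *
 *	quanta = ceil(2.3 ms / 1 ms) = 3,   period = 10 ms / 1 ms = 10,
 *
 * i.e., Pfair weight 3/10. A 5 ms / 5 ms task triggers the weight-1.0
 * special case above and is stored as quanta = period = 1.
 */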
894
895static void pfair_init_cluster(struct pfair_cluster* cluster)
896{
897 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
898 bheap_init(&cluster->release_queue);
899 raw_spin_lock_init(&cluster->release_lock);
900 INIT_LIST_HEAD(&cluster->topology.cpus);
901}
902
903static void cleanup_clusters(void)
904{
905 int i;
906
907 if (num_pfair_clusters)
908 kfree(pfair_clusters);
909 pfair_clusters = NULL;
910 num_pfair_clusters = 0;
911
912 /* avoid stale pointers */
913 for (i = 0; i < num_online_cpus(); i++) {
914 pstate[i]->topology.cluster = NULL;
915 printk(KERN_INFO "P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
916 pstate[i]->missed_updates, pstate[i]->missed_quanta);
917 }
918}
919
920static long pfair_activate_plugin(void)
921{
922 int err, i;
923 struct pfair_state* state;
924 struct pfair_cluster* cluster;
925 quanta_t now;
926 int cluster_size;
927 struct cluster_cpu* cpus[NR_CPUS];
928 struct scheduling_cluster* clust[NR_CPUS];
929
930 cluster_size = get_cluster_size(pfair_cluster_level);
931
932 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
933 return -EINVAL;
934
935 num_pfair_clusters = num_online_cpus() / cluster_size;
936
937 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
938 if (!pfair_clusters) {
939 num_pfair_clusters = 0;
940 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
941 return -ENOMEM;
942 }
943
944 state = &__get_cpu_var(pfair_state);
945 now = current_quantum(state);
946 TRACE("Activating PFAIR at q=%lu\n", now);
947
948 for (i = 0; i < num_pfair_clusters; i++) {
949 cluster = &pfair_clusters[i];
950 pfair_init_cluster(cluster);
951 cluster->pfair_time = now;
952 clust[i] = &cluster->topology;
953#ifdef CONFIG_RELEASE_MASTER
954 cluster->pfair.release_master = atomic_read(&release_master_cpu);
955#endif
956 }
957
958 for (i = 0; i < num_online_cpus(); i++) {
959 state = &per_cpu(pfair_state, i);
960 state->cur_tick = now;
961 state->local_tick = now;
962 state->missed_quanta = 0;
963 state->missed_updates = 0;
964 state->offset = cpu_stagger_offset(i);
965 printk(KERN_INFO "cpus[%d] set; %d\n", i, num_online_cpus());
966 cpus[i] = &state->topology;
967 }
968
969 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
970 cpus, num_online_cpus());
971
972 if (err < 0)
973 cleanup_clusters();
974
975 return err;
976}
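/* Example of the resulting cluster layout (hypothetical topology):
 * with 8 online CPUs and a pfair_cluster_level whose cache level maps
 * to a cluster size of 2, activation creates 8 / 2 = 4 clusters, each
 * with its own rt_domain, release queue, and pfair_time. The default
 * GLOBAL_CLUSTER yields a single cluster spanning all CPUs; a CPU
 * count that is not a multiple of the cluster size is rejected with
 * -EINVAL above.
 */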
977
978static long pfair_deactivate_plugin(void)
979{
980 cleanup_clusters();
981 return 0;
982}
983
984/* Plugin object */
985static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
986 .plugin_name = "PFAIR",
987 .tick = pfair_tick,
988 .task_new = pfair_task_new,
989 .task_exit = pfair_task_exit,
990 .schedule = pfair_schedule,
991 .task_wake_up = pfair_task_wake_up,
992 .task_block = pfair_task_block,
993 .admit_task = pfair_admit_task,
994 .release_at = pfair_release_at,
995 .complete_job = complete_job,
996 .activate_plugin = pfair_activate_plugin,
997 .deactivate_plugin = pfair_deactivate_plugin,
998};
999
1000
1001static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1002
1003static int __init init_pfair(void)
1004{
1005 int cpu, err, fs;
1006 struct pfair_state *state;
1007
1008 /*
1009 * Initialize the short-cut array for per-CPU pfair state.
1010 * This could race with a CPU being removed while we are doing the
1011 * initialization, or with CPUs being added or removed later, but we
1012 * don't support CPU hotplug at the moment anyway.
1013 */
1014 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
	if (!pstate)
		return -ENOMEM;
1015
1016 /* initialize CPU state */
1017 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1018 state = &per_cpu(pfair_state, cpu);
1019 state->topology.id = cpu;
1020 state->cur_tick = 0;
1021 state->local_tick = 0;
1022 state->linked = NULL;
1023 state->local = NULL;
1024 state->scheduled = NULL;
1025 state->missed_quanta = 0;
1026 state->offset = cpu_stagger_offset(cpu);
1027 pstate[cpu] = state;
1028 }
1029
1030 pfair_clusters = NULL;
1031 num_pfair_clusters = 0;
1032
1033 err = register_sched_plugin(&pfair_plugin);
1034 if (!err) {
1035 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1036 if (!fs)
1037 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1038 else
1039 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1040 }
1041
1042 return err;
1043}
1044
1045static void __exit clean_pfair(void)
1046{
1047 kfree(pstate);
1048
1049 if (cluster_file)
1050 remove_proc_entry("cluster", pfair_dir);
1051 if (pfair_dir)
1052 remove_plugin_proc_dir(&pfair_plugin);
1053}
1054
1055module_init(init_pfair);
1056module_exit(clean_pfair);