author     Felipe Cerqueira <felipec@mpi-sws.org>     2013-02-12 13:21:11 -0500
committer  Bjoern Brandenburg <bbb@mpi-sws.org>       2013-08-07 03:47:07 -0400
commit     3bd7e43d778163e9e1b696fdb5030b7717aba236 (patch)
tree       007ea0c3b54bcd90f036b6a37f8ebfa82f9e66e6
parent     a6e8c66b436815b7a7abdfb38808fb94cc70006b (diff)
Add PD^2 scheduler plugin
 litmus/Kconfig       |   14 +
 litmus/Makefile      |    1 +
 litmus/sched_pfair.c | 1079 ++++++++++++++++++++++++
 3 files changed, 1094 insertions(+), 0 deletions(-)
diff --git a/litmus/Kconfig b/litmus/Kconfig
index c764857aec82..5d5d6eb29882 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,20 @@ config PLUGIN_CEDF
 	  On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
 	  makes little sense since there aren't any shared caches.
 
+config PLUGIN_PFAIR
+	bool "PFAIR"
+	depends on HIGH_RES_TIMERS && HZ_PERIODIC && HZ = "1000"
+	default y
+	help
+	  Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
+	  The PFAIR plugin requires high resolution timers (for staggered
+	  quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
+	  even if a processor is idle, as quanta could be missed otherwise).
+	  Further, the PFAIR plugin uses the system tick and thus requires
+	  HZ=1000 to achieve reasonable granularity.
+
+	  If unsure, say Yes.
+
 config RELEASE_MASTER
 	bool "Release-master Support"
 	depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
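For context, a kernel configuration that satisfies the dependencies of the new option would contain a fragment along these lines (a sketch only; the exact set of symbols beyond those named in the "depends on" line varies by architecture and tree):

    CONFIG_HIGH_RES_TIMERS=y
    CONFIG_HZ_PERIODIC=y
    CONFIG_HZ_1000=y
    CONFIG_HZ=1000
    CONFIG_PLUGIN_PFAIR=y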
diff --git a/litmus/Makefile b/litmus/Makefile
index bcb007d9b592..d26ca7076b62 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -23,6 +23,7 @@ obj-y = sched_plugin.o litmus.o \
 	   sched_pfp.o
 
 obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
+obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
 obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
 
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..efe5e130da15
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1079 @@
1/*
2 * kernel/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24
25#include <litmus/bheap.h>
26
27/* to configure the cluster size */
28#include <litmus/litmus_proc.h>
29
30#include <litmus/clustered.h>
31
32static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
33
34struct subtask {
35 /* measured in quanta relative to job release */
36 quanta_t release;
37 quanta_t deadline;
38 quanta_t overlap; /* called "b bit" by PD^2 */
39 quanta_t group_deadline;
40};
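As an aside (not part of the patch), these four fields store the standard PD^2 subtask windows. With execution cost $e$ and period $p$ in quanta and the zero-based subtask index $i$ used throughout this file (matching init_subtask() further below), they are

$$r(i) = \lfloor i\,p/e \rfloor, \qquad d(i) = \lceil (i+1)\,p/e \rceil, \qquad b(i) = d(i) - \lfloor (i+1)\,p/e \rfloor,$$

and, for heavy tasks ($2e \ge p$), the group deadline $D(i) = \lceil (d(i) - (i+1))\,p/(p-e) \rceil$, with $D(i) = 0$ otherwise; all values are in quanta relative to the job release.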
41
42struct pfair_param {
43 quanta_t quanta; /* number of subtasks */
44 quanta_t cur; /* index of current subtask */
45
46 quanta_t release; /* in quanta */
47 quanta_t period; /* in quanta */
48
49 quanta_t last_quantum; /* when scheduled last */
50 int last_cpu; /* where scheduled last */
51
52 struct pfair_cluster* cluster; /* where this task is scheduled */
53
54 struct subtask subtasks[0]; /* allocate together with pfair_param */
55};
56
57#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
58
59struct pfair_state {
60 struct cluster_cpu topology;
61
62 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
63 * the time */
64 volatile quanta_t local_tick; /* What tick is the local CPU currently
65 * executing? Updated only by the local
66 * CPU. In QEMU, this may lag behind the
67 * current tick. In a real system, with
68 * proper timers and aligned quanta,
69 * that should only be the case for a
70 * very short time after the time
71 * advanced. With staggered quanta, it
72 * will lag for the duration of the
73 * offset.
74 */
75
76 struct task_struct* linked; /* the task that should be executing */
77 struct task_struct* local; /* the local copy of linked */
78 struct task_struct* scheduled; /* what is actually scheduled */
79
80 lt_t offset; /* stagger offset */
81 unsigned int missed_updates;
82 unsigned int missed_quanta;
83};
84
85struct pfair_cluster {
86 struct scheduling_cluster topology;
87
88 /* The "global" time in this cluster. */
89 quanta_t pfair_time; /* the "official" PFAIR clock */
90
91 /* The ready queue for this cluster. */
92 rt_domain_t pfair;
93
94 /* The set of jobs that should have their release enacted at the next
95 * quantum boundary.
96 */
97 struct bheap release_queue;
98 raw_spinlock_t release_lock;
99};
100
101#define FLAGS_NEED_REQUEUE 0x1
102
103static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
104{
105 return container_of(state->topology.cluster, struct pfair_cluster, topology);
106}
107
108static inline int cpu_id(struct pfair_state* state)
109{
110 return state->topology.id;
111}
112
113static inline struct pfair_state* from_cluster_list(struct list_head* pos)
114{
115 return list_entry(pos, struct pfair_state, topology.cluster_list);
116}
117
118static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
119{
120 return container_of(rt, struct pfair_cluster, pfair);
121}
122
123static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
124{
125 /* The ready_lock is used to serialize all scheduling events. */
126 return &cluster->pfair.ready_lock;
127}
128
129static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
130{
131 return cluster_lock(cpu_cluster(state));
132}
133
134DEFINE_PER_CPU(struct pfair_state, pfair_state);
135struct pfair_state* *pstate; /* short cut */
136
137static struct pfair_cluster* pfair_clusters;
138static int num_pfair_clusters;
139
140/* Enable for lots of trace info.
141 * #define PFAIR_DEBUG
142 */
143
144#ifdef PFAIR_DEBUG
145#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
146#define PTRACE(f, args...) TRACE(f, ## args)
147#else
148#define PTRACE_TASK(t, f, args...)
149#define PTRACE(f, args...)
150#endif
151
152/* gcc will inline all of these accessor functions... */
153static struct subtask* cur_subtask(struct task_struct* t)
154{
155 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
156}
157
158static quanta_t cur_deadline(struct task_struct* t)
159{
160 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
161}
162
163static quanta_t cur_release(struct task_struct* t)
164{
165 /* This is early releasing: only the release of the first subtask
166 * counts. */
167 return tsk_pfair(t)->release;
168}
169
170static quanta_t cur_overlap(struct task_struct* t)
171{
172 return cur_subtask(t)->overlap;
173}
174
175static quanta_t cur_group_deadline(struct task_struct* t)
176{
177 quanta_t gdl = cur_subtask(t)->group_deadline;
178 if (gdl)
179 return gdl + tsk_pfair(t)->release;
180 else
181 return gdl;
182}
183
184
185static int pfair_higher_prio(struct task_struct* first,
186 struct task_struct* second)
187{
188 return /* first task must exist */
189 first && (
190 /* Does the second task exist and is it a real-time task? If
191 * not, the first task (which is a RT task) has higher
192 * priority.
193 */
194 !second || !is_realtime(second) ||
195
196 /* Is the (subtask) deadline of the first task earlier?
197 * Then it has higher priority.
198 */
199 time_before(cur_deadline(first), cur_deadline(second)) ||
200
201 /* Do we have a deadline tie?
202 * Then break by B-bit.
203 */
204 (cur_deadline(first) == cur_deadline(second) &&
205 (cur_overlap(first) > cur_overlap(second) ||
206
207 /* Do we have a B-bit tie?
208 * Then break by group deadline.
209 */
210 (cur_overlap(first) == cur_overlap(second) &&
211 (time_after(cur_group_deadline(first),
212 cur_group_deadline(second)) ||
213
214 /* Do we have a group deadline tie?
215 * Then break by PID, which are unique.
216 */
217 (cur_group_deadline(first) ==
218 cur_group_deadline(second) &&
219 first->pid < second->pid))))));
220}
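To make the tie-breaking chain above easier to follow, here is a minimal user-space sketch of the same comparison over plain integers. It is illustrative only and not part of the patch; the struct below is a hypothetical stand-in for the kernel's per-task state, and wrap-around handling (time_before()/time_after()) is omitted.

#include <stdio.h>

/* Hypothetical stand-in for the per-subtask state used by the comparison. */
struct prio {
	unsigned long deadline;        /* subtask deadline (quanta) */
	unsigned long b_bit;           /* overlap ("b") bit */
	unsigned long group_deadline;  /* group deadline (quanta) */
	int pid;                       /* unique tie-breaker */
};

/* Returns nonzero iff 'a' has higher PD^2 priority than 'b':
 * earlier deadline, then larger b-bit, then later group deadline,
 * then smaller PID. */
static int pd2_higher_prio(const struct prio *a, const struct prio *b)
{
	if (a->deadline != b->deadline)
		return a->deadline < b->deadline;
	if (a->b_bit != b->b_bit)
		return a->b_bit > b->b_bit;
	if (a->group_deadline != b->group_deadline)
		return a->group_deadline > b->group_deadline;
	return a->pid < b->pid;
}

int main(void)
{
	struct prio x = { .deadline = 2, .b_bit = 1, .group_deadline = 3, .pid = 100 };
	struct prio y = { .deadline = 2, .b_bit = 0, .group_deadline = 0, .pid = 99  };
	/* Equal deadlines, so the b-bit decides: x wins. */
	printf("x beats y: %d\n", pd2_higher_prio(&x, &y));
	return 0;
}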
221
222int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
223{
224 return pfair_higher_prio(bheap2task(a), bheap2task(b));
225}
226
227static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
228{
229 struct pfair_cluster* cluster = from_domain(rt);
230 unsigned long flags;
231
232 raw_spin_lock_irqsave(&cluster->release_lock, flags);
233
234 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
235
236 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
237}
238
239static void prepare_release(struct task_struct* t, quanta_t at)
240{
241 tsk_pfair(t)->release = at;
242 tsk_pfair(t)->cur = 0;
243}
244
245/* pull released tasks from the release queue */
246static void poll_releases(struct pfair_cluster* cluster)
247{
248 raw_spin_lock(&cluster->release_lock);
249 __merge_ready(&cluster->pfair, &cluster->release_queue);
250 raw_spin_unlock(&cluster->release_lock);
251}
252
253static void check_preempt(struct task_struct* t)
254{
255 int cpu = NO_CPU;
256 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
257 is_present(t)) {
258 /* the task can be scheduled and
259 * is not scheduled where it ought to be scheduled
260 */
261 cpu = tsk_rt(t)->linked_on != NO_CPU ?
262 tsk_rt(t)->linked_on :
263 tsk_rt(t)->scheduled_on;
264 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
265 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
266 /* preempt */
267 litmus_reschedule(cpu);
268 }
269}
270
271/* caller must hold pfair.ready_lock */
272static void drop_all_references(struct task_struct *t)
273{
274 int cpu;
275 struct pfair_state* s;
276 struct pfair_cluster* cluster;
277 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
278 /* It must be in the ready queue; drop references isn't called
279 * when the job is in a release queue. */
280 cluster = tsk_pfair(t)->cluster;
281 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
282 tsk_rt(t)->heap_node);
283 }
284 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
285 s = &per_cpu(pfair_state, cpu);
286 if (s->linked == t)
287 s->linked = NULL;
288 if (s->local == t)
289 s->local = NULL;
290 if (s->scheduled == t)
291 s->scheduled = NULL;
292 }
293 /* make sure we don't have a stale linked_on field */
294 tsk_rt(t)->linked_on = NO_CPU;
295}
296
297static void pfair_prepare_next_period(struct task_struct* t)
298{
299 struct pfair_param* p = tsk_pfair(t);
300
301 prepare_for_next_period(t);
302 tsk_rt(t)->completed = 0;
303 p->release += p->period;
304}
305
306/* returns 1 if the task needs to go to the release queue */
307static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
308{
309 struct pfair_param* p = tsk_pfair(t);
310 int to_relq;
311 p->cur = (p->cur + 1) % p->quanta;
312 if (!p->cur) {
313 if (is_present(t)) {
314 /* The job overran; we start a new budget allocation. */
315 pfair_prepare_next_period(t);
316 } else {
317 /* remove task from system until it wakes */
318 drop_all_references(t);
319 tsk_rt(t)->flags |= FLAGS_NEED_REQUEUE;
320 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
321 cpu, p->cur);
322 return 0;
323 }
324 }
325 to_relq = time_after(cur_release(t), time);
326 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
327 cpu, p->cur, to_relq, cur_release(t), time);
328 return to_relq;
329}
330
331static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
332{
333 struct task_struct* l;
334 struct pfair_param* p;
335 struct list_head* pos;
336 struct pfair_state* cpu;
337
338 list_for_each(pos, &cluster->topology.cpus) {
339 cpu = from_cluster_list(pos);
340 l = cpu->linked;
341 cpu->missed_updates += cpu->linked != cpu->local;
342 if (l) {
343 p = tsk_pfair(l);
344 p->last_quantum = time;
345 p->last_cpu = cpu_id(cpu);
346 if (advance_subtask(time, l, cpu_id(cpu))) {
347 //cpu->linked = NULL;
348 PTRACE_TASK(l, "should go to release queue. "
349 "scheduled_on=%d present=%d\n",
350 tsk_rt(l)->scheduled_on,
351 tsk_rt(l)->present);
352 }
353 }
354 }
355}
356
357static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
358{
359 int cpu;
360 if (tsk_rt(t)->scheduled_on != NO_CPU) {
361 /* always observe scheduled_on linkage */
362 default_cpu = tsk_rt(t)->scheduled_on;
363 } else if (tsk_pfair(t)->last_quantum == time - 1) {
364 /* back2back quanta */
365 /* Only observe last_quantum if no scheduled_on is in the way.
366 * This should only kick in if a CPU missed quanta, and that
367 * *should* only happen in QEMU.
368 */
369 cpu = tsk_pfair(t)->last_cpu;
370 if (!pstate[cpu]->linked ||
371 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
372 default_cpu = cpu;
373 }
374 }
375 return default_cpu;
376}
377
378/* returns one if linking was redirected */
379static int pfair_link(quanta_t time, int cpu,
380 struct task_struct* t)
381{
382 int target = target_cpu(time, t, cpu);
383 struct task_struct* prev = pstate[cpu]->linked;
384 struct task_struct* other;
385 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
386
387 if (target != cpu) {
388 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
389 other = pstate[target]->linked;
390 pstate[target]->linked = t;
391 tsk_rt(t)->linked_on = target;
392 if (!other)
393 /* linked ok, but reschedule this CPU */
394 return 1;
395 if (target < cpu) {
396 /* link other to cpu instead */
397 tsk_rt(other)->linked_on = cpu;
398 pstate[cpu]->linked = other;
399 if (prev) {
400 /* prev got pushed back into the ready queue */
401 tsk_rt(prev)->linked_on = NO_CPU;
402 __add_ready(&cluster->pfair, prev);
403 }
404 /* we are done with this cpu */
405 return 0;
406 } else {
407 /* re-add other, its original CPU was not considered yet */
408 tsk_rt(other)->linked_on = NO_CPU;
409 __add_ready(&cluster->pfair, other);
410 /* reschedule this CPU */
411 return 1;
412 }
413 } else {
414 pstate[cpu]->linked = t;
415 tsk_rt(t)->linked_on = cpu;
416 if (prev) {
417 /* prev got pushed back into the ready queue */
418 tsk_rt(prev)->linked_on = NO_CPU;
419 __add_ready(&cluster->pfair, prev);
420 }
421 /* we are done with this CPU */
422 return 0;
423 }
424}
425
426static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
427{
428 int retry;
429 struct list_head *pos;
430 struct pfair_state *cpu_state;
431
432 list_for_each(pos, &cluster->topology.cpus) {
433 cpu_state = from_cluster_list(pos);
434 retry = 1;
435#ifdef CONFIG_RELEASE_MASTER
436 /* skip release master */
437 if (cluster->pfair.release_master == cpu_id(cpu_state))
438 continue;
439#endif
440 while (retry) {
441 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
442 cpu_state->linked))
443 retry = pfair_link(time, cpu_id(cpu_state),
444 __take_ready(&cluster->pfair));
445 else
446 retry = 0;
447 }
448 }
449}
450
451static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
452{
453 struct pfair_state *cpu;
454 struct list_head* pos;
455
456 /* called with interrupts disabled */
457 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
458 time, litmus_clock());
459 raw_spin_lock(cluster_lock(cluster));
460 PTRACE("<<< Q %lu at %llu\n",
461 time, litmus_clock());
462
463 sched_trace_quantum_boundary();
464
465 advance_subtasks(cluster, time);
466 poll_releases(cluster);
467 schedule_subtasks(cluster, time);
468
469 list_for_each(pos, &cluster->topology.cpus) {
470 cpu = from_cluster_list(pos);
471 if (cpu->linked)
472 PTRACE_TASK(cpu->linked,
473 " linked on %d.\n", cpu_id(cpu));
474 else
475 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
476 }
477 /* We are done. Advance time. */
478 mb();
479 list_for_each(pos, &cluster->topology.cpus) {
480 cpu = from_cluster_list(pos);
481 if (cpu->local_tick != cpu->cur_tick) {
482 TRACE("BAD Quantum not acked on %d "
483 "(l:%lu c:%lu p:%lu)\n",
484 cpu_id(cpu),
485 cpu->local_tick,
486 cpu->cur_tick,
487 cluster->pfair_time);
488 cpu->missed_quanta++;
489 }
490 cpu->cur_tick = time;
491 }
492 PTRACE(">>> Q %lu at %llu\n",
493 time, litmus_clock());
494 raw_spin_unlock(cluster_lock(cluster));
495}
496
497static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
498{
499 quanta_t loc;
500
501 goto first; /* skip mb() on first iteration */
502 do {
503 cpu_relax();
504 mb();
505 first: loc = state->cur_tick;
506 /* FIXME: what if loc > cur? */
507 } while (time_before(loc, q));
508 PTRACE("observed cur_tick:%lu >= q:%lu\n",
509 loc, q);
510}
511
512static quanta_t current_quantum(struct pfair_state* state)
513{
514 lt_t t = litmus_clock() - state->offset;
515 return time2quanta(t, FLOOR);
516}
517
518static void catchup_quanta(quanta_t from, quanta_t target,
519 struct pfair_state* state)
520{
521 quanta_t cur = from, time;
522 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
523 from, target);
524 while (time_before(cur, target)) {
525 wait_for_quantum(cur, state);
526 cur++;
527 time = cmpxchg(&cpu_cluster(state)->pfair_time,
528 cur - 1, /* expected */
529 cur /* next */
530 );
531 if (time == cur - 1)
532 schedule_next_quantum(cpu_cluster(state), cur);
533 }
534 TRACE("+++> catching up done\n");
535}
536
537/* pfair_tick - this function is called for every local timer
538 * interrupt.
539 */
540static void pfair_tick(struct task_struct* t)
541{
542 struct pfair_state* state = &__get_cpu_var(pfair_state);
543 quanta_t time, cur;
544 int retry = 10;
545
546 do {
547 cur = current_quantum(state);
548 PTRACE("q %lu at %llu\n", cur, litmus_clock());
549
550 /* Attempt to advance time. First CPU to get here
551 * will prepare the next quantum.
552 */
553 time = cmpxchg(&cpu_cluster(state)->pfair_time,
554 cur - 1, /* expected */
555 cur /* next */
556 );
557 if (time == cur - 1) {
558 /* exchange succeeded */
559 wait_for_quantum(cur - 1, state);
560 schedule_next_quantum(cpu_cluster(state), cur);
561 retry = 0;
562 } else if (time_before(time, cur - 1)) {
563 /* the whole system missed a tick !? */
564 catchup_quanta(time, cur, state);
565 retry--;
566 } else if (time_after(time, cur)) {
567 /* our timer lagging behind!? */
568 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
569 retry--;
570 } else {
571 /* Some other CPU already started scheduling
572 * this quantum. Let it do its job and then update.
573 */
574 retry = 0;
575 }
576 } while (retry);
577
578 /* Spin locally until time advances. */
579 wait_for_quantum(cur, state);
580
581 /* copy assignment */
582 /* FIXME: what if we race with a future update? Corrupted state? */
583 state->local = state->linked;
584 /* signal that we are done */
585 mb();
586 state->local_tick = state->cur_tick;
587
588 if (state->local != current
589 && (is_realtime(current) || is_present(state->local)))
590 litmus_reschedule_local();
591}
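The cmpxchg() in pfair_tick() is what elects exactly one CPU to set up each quantum: the CPU that successfully swings pfair_time from cur - 1 to cur runs schedule_next_quantum(); the others spin in wait_for_quantum() until the winner is done. A rough user-space analogue of that election, using C11 atomics instead of the kernel's cmpxchg() (a sketch, not the plugin code):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long pfair_time = 41;  /* last fully scheduled quantum */

/* Try to claim responsibility for quantum 'cur'. Returns 1 if this
 * caller won the election and should prepare the quantum. */
static int claim_quantum(unsigned long cur)
{
	unsigned long expected = cur - 1;
	/* Succeeds for exactly one caller per quantum. */
	return atomic_compare_exchange_strong(&pfair_time, &expected, cur);
}

int main(void)
{
	printf("first claim of q42:  %d\n", claim_quantum(42)); /* 1: we advance time */
	printf("second claim of q42: %d\n", claim_quantum(42)); /* 0: someone already did */
	return 0;
}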
592
593static int safe_to_schedule(struct task_struct* t, int cpu)
594{
595 int where = tsk_rt(t)->scheduled_on;
596 if (where != NO_CPU && where != cpu) {
597 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
598 "scheduled already on %d.\n", cpu, where);
599 return 0;
600 } else
601 return is_present(t) && !is_completed(t);
602}
603
604static struct task_struct* pfair_schedule(struct task_struct * prev)
605{
606 struct pfair_state* state = &__get_cpu_var(pfair_state);
607 struct pfair_cluster* cluster = cpu_cluster(state);
608 int blocks, completion, out_of_time;
609 struct task_struct* next = NULL;
610
611#ifdef CONFIG_RELEASE_MASTER
612 /* Bail out early if we are the release master.
613 * The release master never schedules any real-time tasks.
614 */
615 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
616 sched_state_task_picked();
617 return NULL;
618 }
619#endif
620
621 raw_spin_lock(cpu_lock(state));
622
623 blocks = is_realtime(prev) && !is_running(prev);
624 completion = is_realtime(prev) && is_completed(prev);
625 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
626 state->local_tick);
627
628 if (is_realtime(prev))
629 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
630 blocks, completion, out_of_time);
631
632 if (completion) {
633 sched_trace_task_completion(prev, 0);
634 pfair_prepare_next_period(prev);
635 prepare_release(prev, cur_release(prev));
636 }
637
638 if (!blocks && (completion || out_of_time)) {
639 drop_all_references(prev);
640 sched_trace_task_release(prev);
641 add_release(&cluster->pfair, prev);
642 }
643
644 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
645 next = state->local;
646
647 if (prev != next) {
648 tsk_rt(prev)->scheduled_on = NO_CPU;
649 if (next)
650 tsk_rt(next)->scheduled_on = cpu_id(state);
651 }
652 sched_state_task_picked();
653 raw_spin_unlock(cpu_lock(state));
654
655 if (next)
656 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
657 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
658 else if (is_realtime(prev))
659 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
660
661 return next;
662}
663
664static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
665{
666 unsigned long flags;
667 struct pfair_cluster* cluster;
668
669 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
670
671 cluster = tsk_pfair(t)->cluster;
672
673 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
674
675 prepare_release(t, cluster->pfair_time + 1);
676
677 t->rt_param.scheduled_on = NO_CPU;
678 t->rt_param.linked_on = NO_CPU;
679
680 if (is_scheduled) {
681#ifdef CONFIG_RELEASE_MASTER
682 if (task_cpu(t) != cluster->pfair.release_master)
683#endif
684 t->rt_param.scheduled_on = task_cpu(t);
685 }
686
687 if (is_running(t)) {
688 tsk_rt(t)->present = 1;
689 __add_ready(&cluster->pfair, t);
690 } else {
691 tsk_rt(t)->present = 0;
692 tsk_rt(t)->flags |= FLAGS_NEED_REQUEUE;
693 }
694
695 check_preempt(t);
696
697 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
698}
699
700static void pfair_task_wake_up(struct task_struct *t)
701{
702 unsigned long flags;
703 lt_t now;
704 struct pfair_cluster* cluster;
705
706 cluster = tsk_pfair(t)->cluster;
707
708 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
709 litmus_clock(), cur_release(t), cluster->pfair_time);
710
711 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
712
713 /* If a task blocks and wakes before its next job release,
714 * then it may resume if it is currently linked somewhere
715 * (as if it never blocked at all). Otherwise, we have a
716 * new sporadic job release.
717 */
718 now = litmus_clock();
719 if (is_tardy(t, now)) {
720 TRACE_TASK(t, "sporadic release!\n");
721 release_at(t, now);
722 prepare_release(t, time2quanta(now, CEIL));
723 sched_trace_task_release(t);
724 }
725
726 /* only add to ready queue if the task isn't still linked somewhere */
727 if (tsk_rt(t)->flags & FLAGS_NEED_REQUEUE) {
728 tsk_rt(t)->flags &= ~FLAGS_NEED_REQUEUE;
729 TRACE_TASK(t, "requeueing required\n");
730 tsk_rt(t)->completed = 0;
731 __add_ready(&cluster->pfair, t);
732 }
733
734 check_preempt(t);
735
736 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
737 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
738}
739
740static void pfair_task_block(struct task_struct *t)
741{
742 BUG_ON(!is_realtime(t));
743 TRACE_TASK(t, "blocks at %llu, state:%d\n",
744 litmus_clock(), t->state);
745}
746
747static void pfair_task_exit(struct task_struct * t)
748{
749 unsigned long flags;
750 struct pfair_cluster *cluster;
751
752 BUG_ON(!is_realtime(t));
753
754 cluster = tsk_pfair(t)->cluster;
755
756 /* Remove the task from the release or ready queue, and ensure
757 * that it is not the scheduled task for ANY CPU. We
758 * do this blanket check because occasionally when
759 * tasks exit while blocked, the task_cpu of the task
760 * might not be the same as the CPU that the PFAIR scheduler
761 * has chosen for it.
762 */
763 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
764
765 TRACE_TASK(t, "RIP, state:%d\n", t->state);
766 drop_all_references(t);
767
768 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
769
770 kfree(t->rt_param.pfair);
771 t->rt_param.pfair = NULL;
772}
773
774
775static void pfair_release_at(struct task_struct* task, lt_t start)
776{
777 unsigned long flags;
778 quanta_t release;
779
780 struct pfair_cluster *cluster;
781
782 cluster = tsk_pfair(task)->cluster;
783
784 BUG_ON(!is_realtime(task));
785
786 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
787
788 release_at(task, start);
789 release = time2quanta(start, CEIL);
790 prepare_release(task, release);
791
792 TRACE_TASK(task, "sys release at %lu\n", release);
793
794 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
795}
796
797static void init_subtask(struct subtask* sub, unsigned long i,
798 lt_t quanta, lt_t period)
799{
800 /* since i is zero-based, the formulas are shifted by one */
801 lt_t tmp;
802
803 /* release */
804 tmp = period * i;
805 do_div(tmp, quanta); /* floor */
806 sub->release = (quanta_t) tmp;
807
808 /* deadline */
809 tmp = period * (i + 1);
810 if (do_div(tmp, quanta)) /* ceil */
811 tmp++;
812 sub->deadline = (quanta_t) tmp;
813
814 /* next release */
815 tmp = period * (i + 1);
816 do_div(tmp, quanta); /* floor */
817 sub->overlap = sub->deadline - (quanta_t) tmp;
818
819 /* Group deadline.
820 * Based on the formula given in Uma's thesis.
821 */
822 if (2 * quanta >= period) {
823 /* heavy */
824 tmp = (sub->deadline - (i + 1)) * period;
825 if (period > quanta &&
826 do_div(tmp, (period - quanta))) /* ceil */
827 tmp++;
828 sub->group_deadline = (quanta_t) tmp;
829 } else
830 sub->group_deadline = 0;
831}
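For a concrete feel for these formulas, consider a task with an execution cost of 3 quanta and a period of 5 quanta (weight 0.6, i.e., "heavy" since 2*3 >= 5). The following user-space sketch repeats the same integer arithmetic with plain 64-bit division instead of do_div(); it is not part of the patch.

#include <stdio.h>

/* Same window computation as init_subtask(), all values in quanta. */
static void print_subtasks(unsigned long long e, unsigned long long p)
{
	unsigned long long i, rel, dl, b, gdl;

	for (i = 0; i < e; i++) {
		rel = (p * i) / e;                       /* floor */
		dl  = (p * (i + 1) + e - 1) / e;         /* ceil  */
		b   = dl - (p * (i + 1)) / e;            /* b-bit */
		if (2 * e >= p && p > e)                 /* heavy task */
			gdl = ((dl - (i + 1)) * p + (p - e) - 1) / (p - e); /* ceil */
		else
			gdl = 0;
		printf("subtask %llu: rel=%llu dl=%llu b=%llu gdl=%llu\n",
		       i + 1, rel, dl, b, gdl);
	}
}

int main(void)
{
	/* cost = 3 quanta, period = 5 quanta:
	 * subtask 1: rel=0 dl=2 b=1 gdl=3
	 * subtask 2: rel=1 dl=4 b=1 gdl=5
	 * subtask 3: rel=3 dl=5 b=0 gdl=5
	 */
	print_subtasks(3, 5);
	return 0;
}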
832
833static void dump_subtasks(struct task_struct* t)
834{
835 unsigned long i;
836 for (i = 0; i < t->rt_param.pfair->quanta; i++)
837 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
838 i + 1,
839 t->rt_param.pfair->subtasks[i].release,
840 t->rt_param.pfair->subtasks[i].deadline,
841 t->rt_param.pfair->subtasks[i].overlap,
842 t->rt_param.pfair->subtasks[i].group_deadline);
843}
844
845static long pfair_admit_task(struct task_struct* t)
846{
847 lt_t quanta;
848 lt_t period;
849 s64 quantum_length = ktime_to_ns(tick_period);
850 struct pfair_param* param;
851 unsigned long i;
852
853 /* first check that the task is in the right cluster */
854 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
855 cpu_cluster(pstate[task_cpu(t)]))
856 return -EINVAL;
857
858 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
859 printk(KERN_INFO "%s: Admission rejected. "
860 "Only implicit deadlines are currently supported.\n",
861 litmus->plugin_name);
862 return -EINVAL;
863 }
864
865 /* Pfair is a tick-based method, so the time
866 * of interest is jiffies. Calculate tick-based
867 * times for everything.
868 * (Ceiling of exec cost, floor of period.)
869 */
870
871 quanta = get_exec_cost(t);
872 period = get_rt_period(t);
873
874 quanta = time2quanta(get_exec_cost(t), CEIL);
875
876 if (do_div(period, quantum_length))
877 printk(KERN_WARNING
878 "The period of %s/%d is not a multiple of %llu.\n",
879 t->comm, t->pid, (unsigned long long) quantum_length);
880
881 if (quanta == period) {
882 /* special case: task has weight 1.0 */
883 printk(KERN_INFO
884 "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
885 t->comm, t->pid, quanta, period);
886 quanta = 1;
887 period = 1;
888 }
889
890 param = kmalloc(sizeof(*param) +
891 quanta * sizeof(struct subtask), GFP_ATOMIC);
892
893 if (!param)
894 return -ENOMEM;
895
896 param->quanta = quanta;
897 param->cur = 0;
898 param->release = 0;
899 param->period = period;
900
901 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
902
903 for (i = 0; i < quanta; i++)
904 init_subtask(param->subtasks + i, i, quanta, period);
905
906 if (t->rt_param.pfair)
907 /* get rid of stale allocation */
908 kfree(t->rt_param.pfair);
909
910 t->rt_param.pfair = param;
911
912 /* spew out some debug info */
913 dump_subtasks(t);
914
915 return 0;
916}
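The admission path above boils down to a unit conversion: with HZ=1000 the scheduling quantum is 1 ms, the execution cost is rounded up to whole quanta, and the period is divided down (with a warning if it is not a multiple of the quantum). A user-space sketch of that arithmetic, ignoring the weight-1.0 special case and the implicit-deadline check (assumed 1 ms quantum; not the plugin code):

#include <stdio.h>

#define QUANTUM_NS 1000000ULL  /* 1 ms quantum, i.e., HZ=1000 */

/* Mirror of the admission-time conversion: cost is rounded up to
 * whole quanta, the period is divided down. */
static void to_quanta(unsigned long long cost_ns, unsigned long long period_ns)
{
	unsigned long long quanta = (cost_ns + QUANTUM_NS - 1) / QUANTUM_NS; /* ceil */
	unsigned long long period = period_ns / QUANTUM_NS;                  /* floor */

	if (period_ns % QUANTUM_NS)
		printf("warning: period is not a multiple of the quantum\n");
	printf("cost=%lluns period=%lluns -> %llu/%llu quanta (weight %.2f)\n",
	       cost_ns, period_ns, quanta, period, (double) quanta / period);
}

int main(void)
{
	to_quanta(1500000, 10000000); /* 1.5 ms cost, 10 ms period -> 2/10 quanta */
	return 0;
}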
917
918static void pfair_init_cluster(struct pfair_cluster* cluster)
919{
920 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
921 bheap_init(&cluster->release_queue);
922 raw_spin_lock_init(&cluster->release_lock);
923 INIT_LIST_HEAD(&cluster->topology.cpus);
924}
925
926static void cleanup_clusters(void)
927{
928 int i;
929
930 if (num_pfair_clusters)
931 kfree(pfair_clusters);
932 pfair_clusters = NULL;
933 num_pfair_clusters = 0;
934
935 /* avoid stale pointers */
936 for (i = 0; i < num_online_cpus(); i++) {
937 pstate[i]->topology.cluster = NULL;
938 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
939 pstate[i]->missed_updates, pstate[i]->missed_quanta);
940 }
941}
942
943static long pfair_activate_plugin(void)
944{
945 int err, i;
946 struct pfair_state* state;
947 struct pfair_cluster* cluster ;
948 quanta_t now;
949 int cluster_size;
950 struct cluster_cpu* cpus[NR_CPUS];
951 struct scheduling_cluster* clust[NR_CPUS];
952
953 cluster_size = get_cluster_size(pfair_cluster_level);
954
955 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
956 return -EINVAL;
957
958 num_pfair_clusters = num_online_cpus() / cluster_size;
959
960 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
961 if (!pfair_clusters) {
962 num_pfair_clusters = 0;
963 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
964 return -ENOMEM;
965 }
966
967 state = &__get_cpu_var(pfair_state);
968 now = current_quantum(state);
969 TRACE("Activating PFAIR at q=%lu\n", now);
970
971 for (i = 0; i < num_pfair_clusters; i++) {
972 cluster = &pfair_clusters[i];
973 pfair_init_cluster(cluster);
974 cluster->pfair_time = now;
975 clust[i] = &cluster->topology;
976#ifdef CONFIG_RELEASE_MASTER
977 cluster->pfair.release_master = atomic_read(&release_master_cpu);
978#endif
979 }
980
981 for (i = 0; i < num_online_cpus(); i++) {
982 state = &per_cpu(pfair_state, i);
983 state->cur_tick = now;
984 state->local_tick = now;
985 state->missed_quanta = 0;
986 state->missed_updates = 0;
987 state->offset = cpu_stagger_offset(i);
988 printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus());
989 cpus[i] = &state->topology;
990 }
991
992 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
993 cpus, num_online_cpus());
994
995 if (err < 0)
996 cleanup_clusters();
997
998 return err;
999}
1000
1001static long pfair_deactivate_plugin(void)
1002{
1003 cleanup_clusters();
1004 return 0;
1005}
1006
1007/* Plugin object */
1008static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1009 .plugin_name = "PFAIR",
1010 .tick = pfair_tick,
1011 .task_new = pfair_task_new,
1012 .task_exit = pfair_task_exit,
1013 .schedule = pfair_schedule,
1014 .task_wake_up = pfair_task_wake_up,
1015 .task_block = pfair_task_block,
1016 .admit_task = pfair_admit_task,
1017 .release_at = pfair_release_at,
1018 .complete_job = complete_job,
1019 .activate_plugin = pfair_activate_plugin,
1020 .deactivate_plugin = pfair_deactivate_plugin,
1021};
1022
1023
1024static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1025
1026static int __init init_pfair(void)
1027{
1028 int cpu, err, fs;
1029 struct pfair_state *state;
1030
1031 /*
1032 * initialize the short-cut array for per-cpu pfair state;
1033 * there may be a problem here if someone removes a cpu
1034 * while we are doing this initialization... and if cpus
1035 * are added / removed later... but we don't support CPU hotplug at the moment anyway.
1036 */
1037 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1038
1039 /* initialize CPU state */
1040 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1041 state = &per_cpu(pfair_state, cpu);
1042 state->topology.id = cpu;
1043 state->cur_tick = 0;
1044 state->local_tick = 0;
1045 state->linked = NULL;
1046 state->local = NULL;
1047 state->scheduled = NULL;
1048 state->missed_quanta = 0;
1049 state->offset = cpu_stagger_offset(cpu);
1050 pstate[cpu] = state;
1051 }
1052
1053 pfair_clusters = NULL;
1054 num_pfair_clusters = 0;
1055
1056 err = register_sched_plugin(&pfair_plugin);
1057 if (!err) {
1058 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1059 if (!fs)
1060 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1061 else
1062 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1063 }
1064
1065 return err;
1066}
1067
1068static void __exit clean_pfair(void)
1069{
1070 kfree(pstate);
1071
1072 if (cluster_file)
1073 remove_proc_entry("cluster", pfair_dir);
1074 if (pfair_dir)
1075 remove_plugin_proc_dir(&pfair_plugin);
1076}
1077
1078module_init(init_pfair);
1079module_exit(clean_pfair);