author     Bjoern Brandenburg <bbb@mpi-sws.org>    2015-08-09 07:18:56 -0400
committer  Bjoern Brandenburg <bbb@mpi-sws.org>    2015-08-09 07:20:36 -0400
commit     8e51b378224ae53244c6917964c88aa2a9d024ad
tree       2c79843a5054a32e23a3fc9e02309990e6f66c24
parent     b7215111b2f62ff312472de821c16904c518f921

Add PD^2 scheduler plugin (tag: 2015.1)

-rw-r--r--  litmus/Kconfig        |   13
-rw-r--r--  litmus/Makefile       |    1
-rw-r--r--  litmus/sched_pfair.c  | 1226
3 files changed, 1240 insertions, 0 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 38d9e433b345..babb43deffb5 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,19 @@ config PLUGIN_CEDF
 	  On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
 	  makes little sense since there aren't any shared caches.
 
+config PLUGIN_PFAIR
+	bool "PFAIR"
+	default y
+	help
+	  Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
+	  The PFAIR plugin requires high resolution timers (for staggered
+	  quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
+	  even if a processor is idle, as quanta could be missed otherwise).
+	  Further, the PFAIR plugin uses the system tick and thus requires
+	  HZ=1000 to achieve reasonable granularity.
+
+	  If unsure, say Yes.
+
 config RELEASE_MASTER
 	bool "Release-master Support"
 	depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
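
The help text above pins down the kernel configuration that PD^2 depends on. As a rough illustration only (this fragment is not part of the patch, and exact symbol names depend on the kernel version), a .config satisfying those requirements would contain options along these lines:

    # the plugin itself
    CONFIG_PLUGIN_PFAIR=y
    # high-resolution timers, needed for staggered quanta
    CONFIG_HIGH_RES_TIMERS=y
    # periodic timer ticks even on idle CPUs, so quanta are not missed
    CONFIG_HZ_PERIODIC=y
    # 1000 Hz tick for reasonable quantum granularity
    CONFIG_HZ_1000=y
    CONFIG_HZ=1000
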
diff --git a/litmus/Makefile b/litmus/Makefile
index 7d637197d736..7970cd55e7fd 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -24,6 +24,7 @@ obj-y = sched_plugin.o litmus.o \
 	   sched_pfp.o
 
 obj-$(CONFIG_PLUGIN_CEDF)  += sched_cedf.o
+obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
 
 obj-$(CONFIG_FEATHER_TRACE)    += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..3f82378f5ca8
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1226 @@
1/*
2 * kernel/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24#include <litmus/trace.h>
25
26#include <litmus/bheap.h>
27
28/* to configure the cluster size */
29#include <litmus/litmus_proc.h>
30
31#include <litmus/clustered.h>
32
33static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
34
35struct subtask {
36 /* measured in quanta relative to job release */
37 quanta_t release;
38 quanta_t deadline;
39 quanta_t overlap; /* called "b bit" by PD^2 */
40 quanta_t group_deadline;
41};
42
43struct pfair_param {
44 quanta_t quanta; /* number of subtasks */
45 quanta_t cur; /* index of current subtask */
46
47 quanta_t release; /* in quanta */
48 quanta_t period; /* in quanta */
49
50 quanta_t last_quantum; /* when scheduled last */
51 int last_cpu; /* where scheduled last */
52
53 unsigned int needs_requeue:1;
54
55 struct pfair_cluster* cluster; /* where this task is scheduled */
56
57 struct subtask subtasks[0]; /* allocate together with pfair_param */
58};
59
60#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
61
62struct pfair_state {
63 struct cluster_cpu topology;
64
65 struct hrtimer quantum_timer;
66
67 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
68 * the time */
69 volatile quanta_t local_tick; /* What tick is the local CPU currently
70 * executing? Updated only by the local
71 * CPU. In QEMU, this may lag behind the
72 * current tick. In a real system, with
73 * proper timers and aligned quanta,
74 * that should only be the case for a
75 * very short time after the time
76 * advanced. With staggered quanta, it
77 * will lag for the duration of the
78 * offset.
79 */
80
81 struct task_struct* linked; /* the task that should be executing */
82 struct task_struct* local; /* the local copy of linked */
83 struct task_struct* scheduled; /* what is actually scheduled */
84
85 struct list_head out_of_budget; /* list of tasks that exhausted their allocation */
86
87 lt_t offset; /* stagger offset */
88 unsigned int missed_updates;
89 unsigned int missed_quanta;
90};
91
92struct pfair_cluster {
93 struct scheduling_cluster topology;
94
95 /* The "global" time in this cluster. */
96 quanta_t pfair_time; /* the "official" PFAIR clock */
97
98 /* The ready queue for this cluster. */
99 rt_domain_t pfair;
100
101 /* The set of jobs that should have their release enacted at the next
102 * quantum boundary.
103 */
104 struct bheap release_queue;
105 raw_spinlock_t release_lock;
106};
107
108static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
109{
110 return container_of(state->topology.cluster, struct pfair_cluster, topology);
111}
112
113static inline int cpu_id(struct pfair_state* state)
114{
115 return state->topology.id;
116}
117
118static inline struct pfair_state* from_cluster_list(struct list_head* pos)
119{
120 return list_entry(pos, struct pfair_state, topology.cluster_list);
121}
122
123static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
124{
125 return container_of(rt, struct pfair_cluster, pfair);
126}
127
128static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
129{
130 /* The ready_lock is used to serialize all scheduling events. */
131 return &cluster->pfair.ready_lock;
132}
133
134static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
135{
136 return cluster_lock(cpu_cluster(state));
137}
138
139DEFINE_PER_CPU(struct pfair_state, pfair_state);
140struct pfair_state **pstate; /* shortcut: per-CPU pfair_state pointers, indexed by CPU id */
141
142static struct pfair_cluster* pfair_clusters;
143static int num_pfair_clusters;
144
145/* Enable for lots of trace info.
146 * #define PFAIR_DEBUG
147 */
148
149#ifdef PFAIR_DEBUG
150#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
151#define PTRACE(f, args...) TRACE(f, ## args)
152#else
153#define PTRACE_TASK(t, f, args...)
154#define PTRACE(f, args...)
155#endif
156
157/* gcc will inline all of these accessor functions... */
158static struct subtask* cur_subtask(struct task_struct* t)
159{
160 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
161}
162
163static quanta_t cur_deadline(struct task_struct* t)
164{
165 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
166}
167
168static quanta_t cur_release(struct task_struct* t)
169{
170 /* This is early releasing: only the release of the first subtask
171 * counts. */
172 return tsk_pfair(t)->release;
173}
174
175static quanta_t cur_overlap(struct task_struct* t)
176{
177 return cur_subtask(t)->overlap;
178}
179
180static quanta_t cur_group_deadline(struct task_struct* t)
181{
182 quanta_t gdl = cur_subtask(t)->group_deadline;
183 if (gdl)
184 return gdl + tsk_pfair(t)->release;
185 else
186 return gdl;
187}
188
189
190static int pfair_higher_prio(struct task_struct* first,
191 struct task_struct* second)
192{
193 return /* first task must exist */
194 first && (
195 /* Does the second task exist and is it a real-time task? If
196 * not, the first task (which is an RT task) has higher
197 * priority.
198 */
199 !second || !is_realtime(second) ||
200
201 /* Is the (subtask) deadline of the first task earlier?
202 * Then it has higher priority.
203 */
204 time_before(cur_deadline(first), cur_deadline(second)) ||
205
206 /* Do we have a deadline tie?
207 * Then break by B-bit.
208 */
209 (cur_deadline(first) == cur_deadline(second) &&
210 (cur_overlap(first) > cur_overlap(second) ||
211
212 /* Do we have a B-bit tie?
213 * Then break by group deadline.
214 */
215 (cur_overlap(first) == cur_overlap(second) &&
216 (time_after(cur_group_deadline(first),
217 cur_group_deadline(second)) ||
218
219 /* Do we have a group deadline tie?
220 * Then break by PID, which are unique.
221 */
222 (cur_group_deadline(first) ==
223 cur_group_deadline(second) &&
224 first->pid < second->pid))))));
225}
226
227int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
228{
229 return pfair_higher_prio(bheap2task(a), bheap2task(b));
230}
231
232static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
233{
234 struct pfair_cluster* cluster = from_domain(rt);
235 unsigned long flags;
236
237 raw_spin_lock_irqsave(&cluster->release_lock, flags);
238
239 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
240
241 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
242}
243
244static void prepare_release(struct task_struct* t, quanta_t at)
245{
246 tsk_pfair(t)->release = at;
247 tsk_pfair(t)->cur = 0;
248}
249
250/* pull released tasks from the release queue */
251static void poll_releases(struct pfair_cluster* cluster)
252{
253 raw_spin_lock(&cluster->release_lock);
254 __merge_ready(&cluster->pfair, &cluster->release_queue);
255 raw_spin_unlock(&cluster->release_lock);
256}
257
258static void check_preempt(struct task_struct* t)
259{
260 int cpu = NO_CPU;
261 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
262 is_present(t)) {
263 /* the task can be scheduled and
264 * is not scheduled where it ought to be scheduled
265 */
266 cpu = tsk_rt(t)->linked_on != NO_CPU ?
267 tsk_rt(t)->linked_on :
268 tsk_rt(t)->scheduled_on;
269 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
270 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
271 /* preempt */
272 litmus_reschedule(cpu);
273 }
274}
275
276/* caller must hold pfair.ready_lock */
277static void drop_all_references(struct task_struct *t)
278{
279 int cpu;
280 struct pfair_state* s;
281 struct pfair_cluster* cluster;
282 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
283 /* It must be in the ready queue; drop_all_references() isn't called
284 * when the job is in a release queue. */
285 cluster = tsk_pfair(t)->cluster;
286 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
287 tsk_rt(t)->heap_node);
288 }
289 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
290 s = &per_cpu(pfair_state, cpu);
291 if (s->linked == t)
292 s->linked = NULL;
293 if (s->local == t)
294 s->local = NULL;
295 if (s->scheduled == t)
296 s->scheduled = NULL;
297 }
298 /* make sure we don't have a stale linked_on field */
299 tsk_rt(t)->linked_on = NO_CPU;
300
301 /* make sure we're not queued for re-releasing */
302 if (in_list(&tsk_rt(t)->list))
303 {
304 TRACE_TASK(t, "removing from out_of_budget queue\n");
305 list_del(&tsk_rt(t)->list);
306 }
307}
308
309static void pfair_prepare_next_period(struct task_struct* t)
310{
311 struct pfair_param* p = tsk_pfair(t);
312
313 prepare_for_next_period(t);
314 tsk_rt(t)->completed = 0;
315 p->release = time2quanta(get_release(t), CEIL);
316}
317
318/* returns 1 if the task needs to go the release queue */
319static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
320{
321 struct pfair_param* p = tsk_pfair(t);
322 int to_relq;
323 p->cur = (p->cur + 1) % p->quanta;
324 if (!p->cur) {
325 if (is_present(t)) {
326 /* The job overran; we start a new budget allocation. */
327 TRACE_TASK(t, "overran budget, preparing next period\n");
328 sched_trace_task_completion(t, 1);
329 pfair_prepare_next_period(t);
330 } else {
331 /* remove task from system until it wakes */
332 drop_all_references(t);
333 p->needs_requeue = 1;
334 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
335 cpu, p->cur);
336 return 0;
337 }
338 }
339 to_relq = time_after(cur_release(t), time);
340 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d "
341 "(cur_release:%lu time:%lu present:%d on_cpu=%d)\n",
342 cpu, p->cur, to_relq, cur_release(t), time,
343 tsk_rt(t)->present, tsk_rt(t)->scheduled_on);
344 return to_relq;
345}
346
347static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
348{
349 struct task_struct* l;
350 struct pfair_param* p;
351 struct list_head* pos;
352 struct pfair_state* cpu;
353
354 list_for_each(pos, &cluster->topology.cpus) {
355 cpu = from_cluster_list(pos);
356 l = cpu->linked;
357 cpu->missed_updates += cpu->linked != cpu->local;
358 if (l) {
359 p = tsk_pfair(l);
360 p->last_quantum = time;
361 p->last_cpu = cpu_id(cpu);
362 if (advance_subtask(time, l, cpu_id(cpu))) {
363 cpu->linked = NULL;
364 tsk_rt(l)->linked_on = NO_CPU;
365 PTRACE_TASK(l, "should go to release queue. "
366 "scheduled_on=%d present=%d\n",
367 tsk_rt(l)->scheduled_on,
368 tsk_rt(l)->present);
369 list_add(&tsk_rt(l)->list, &cpu->out_of_budget);
370 }
371 }
372 }
373}
374
375static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
376{
377 int cpu;
378 if (tsk_rt(t)->scheduled_on != NO_CPU) {
379 /* always observe scheduled_on linkage */
380 default_cpu = tsk_rt(t)->scheduled_on;
381 } else if (tsk_pfair(t)->last_quantum == time - 1) {
382 /* back2back quanta */
383 /* Only observe last_quantum if no scheduled_on is in the way.
384 * This should only kick in if a CPU missed quanta, and that
385 * *should* only happen in QEMU.
386 */
387 cpu = tsk_pfair(t)->last_cpu;
388 if (!pstate[cpu]->linked ||
389 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
390 default_cpu = cpu;
391 }
392 }
393 return default_cpu;
394}
395
396/* returns one if linking was redirected */
397static int pfair_link(quanta_t time, int cpu,
398 struct task_struct* t)
399{
400 int target = target_cpu(time, t, cpu);
401 struct task_struct* prev = pstate[cpu]->linked;
402 struct task_struct* other;
403 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
404
405 if (target != cpu) {
406 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
407 other = pstate[target]->linked;
408 pstate[target]->linked = t;
409 tsk_rt(t)->linked_on = target;
410 if (!other)
411 /* linked ok, but reschedule this CPU */
412 return 1;
413 if (target < cpu) {
414 /* link other to cpu instead */
415 tsk_rt(other)->linked_on = cpu;
416 pstate[cpu]->linked = other;
417 if (prev) {
418 /* prev got pushed back into the ready queue */
419 tsk_rt(prev)->linked_on = NO_CPU;
420 __add_ready(&cluster->pfair, prev);
421 }
422 /* we are done with this cpu */
423 return 0;
424 } else {
425 /* re-add other, its original CPU was not considered yet */
426 tsk_rt(other)->linked_on = NO_CPU;
427 __add_ready(&cluster->pfair, other);
428 /* reschedule this CPU */
429 return 1;
430 }
431 } else {
432 pstate[cpu]->linked = t;
433 tsk_rt(t)->linked_on = cpu;
434 if (prev) {
435 /* prev got pushed back into the ready queue */
436 tsk_rt(prev)->linked_on = NO_CPU;
437 __add_ready(&cluster->pfair, prev);
438 }
439 /* we are done with this CPU */
440 return 0;
441 }
442}
443
444static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
445{
446 int retry;
447 struct list_head *pos;
448 struct pfair_state *cpu_state;
449
450 list_for_each(pos, &cluster->topology.cpus) {
451 cpu_state = from_cluster_list(pos);
452 retry = 1;
453#ifdef CONFIG_RELEASE_MASTER
454 /* skip release master */
455 if (cluster->pfair.release_master == cpu_id(cpu_state))
456 continue;
457#endif
458 while (retry) {
459 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
460 cpu_state->linked))
461 retry = pfair_link(time, cpu_id(cpu_state),
462 __take_ready(&cluster->pfair));
463 else
464 retry = 0;
465 }
466 }
467}
468
469static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
470{
471 struct pfair_state *cpu;
472 struct list_head* pos;
473
474 /* called with interrupts disabled */
475 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
476 time, litmus_clock());
477 raw_spin_lock(cluster_lock(cluster));
478 PTRACE("<<< Q %lu at %llu\n",
479 time, litmus_clock());
480
481 sched_trace_quantum_boundary();
482
483 advance_subtasks(cluster, time);
484 poll_releases(cluster);
485 schedule_subtasks(cluster, time);
486
487 list_for_each(pos, &cluster->topology.cpus) {
488 cpu = from_cluster_list(pos);
489 if (cpu->linked)
490 PTRACE_TASK(cpu->linked,
491 " linked on %d.\n", cpu_id(cpu));
492 else
493 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
494 }
495 /* We are done. Advance time. */
496 mb();
497 list_for_each(pos, &cluster->topology.cpus) {
498 cpu = from_cluster_list(pos);
499 if (cpu->local_tick != cpu->cur_tick) {
500 TRACE("BAD Quantum not acked on %d "
501 "(l:%lu c:%lu p:%lu)\n",
502 cpu_id(cpu),
503 cpu->local_tick,
504 cpu->cur_tick,
505 cluster->pfair_time);
506 cpu->missed_quanta++;
507 }
508 cpu->cur_tick = time;
509 }
510 PTRACE(">>> Q %lu at %llu\n",
511 time, litmus_clock());
512 raw_spin_unlock(cluster_lock(cluster));
513}
514
515static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
516{
517 quanta_t loc;
518
519 goto first; /* skip mb() on first iteration */
520 do {
521 cpu_relax();
522 mb();
523 first: loc = state->cur_tick;
524 /* FIXME: what if loc > cur? */
525 } while (time_before(loc, q));
526 PTRACE("observed cur_tick:%lu >= q:%lu\n",
527 loc, q);
528}
529
530static quanta_t current_quantum(struct pfair_state* state)
531{
532 lt_t t = litmus_clock() - state->offset;
533 return time2quanta(t, FLOOR);
534}
535
536static void catchup_quanta(quanta_t from, quanta_t target,
537 struct pfair_state* state)
538{
539 quanta_t cur = from, time;
540 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
541 from, target);
542 while (time_before(cur, target)) {
543 wait_for_quantum(cur, state);
544 cur++;
545 time = cmpxchg(&cpu_cluster(state)->pfair_time,
546 cur - 1, /* expected */
547 cur /* next */
548 );
549 if (time == cur - 1)
550 schedule_next_quantum(cpu_cluster(state), cur);
551 }
552 TRACE("+++> catching up done\n");
553}
554
555/* pfair_tick - this function is called for every local timer
556 * interrupt.
557 */
558static void pfair_tick(struct task_struct* t)
559{
560 struct pfair_state* state = this_cpu_ptr(&pfair_state);
561 quanta_t time, cur;
562 int retry = 10;
563
564 do {
565 cur = current_quantum(state);
566 PTRACE("q %lu at %llu\n", cur, litmus_clock());
567
568 /* Attempt to advance time. First CPU to get here
569 * will prepare the next quantum.
570 */
571 time = cpu_cluster(state)->pfair_time;
572 if (time == cur - 1)
573 {
574 /* looks good, see if we can advance the time */
575 time = cmpxchg(&cpu_cluster(state)->pfair_time,
576 cur - 1, /* expected */
577 cur /* next */
578 );
579 }
580
581 if (time == cur - 1) {
582 /* exchange succeeded */
583 wait_for_quantum(cur - 1, state);
584 schedule_next_quantum(cpu_cluster(state), cur);
585 retry = 0;
586 } else if (time_before(time, cur - 1)) {
587 /* the whole system missed a tick !? */
588 catchup_quanta(time, cur, state);
589 retry--;
590 } else if (time_after(time, cur)) {
591 /* our timer lagging behind!? */
592 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
593 retry--;
594 } else {
595 /* Some other CPU already started scheduling
596 * this quantum. Let it do its job and then update.
597 */
598 retry = 0;
599 }
600 } while (retry);
601
602 /* Spin locally until time advances. */
603 wait_for_quantum(cur, state);
604
605 /* copy assignment */
606 /* FIXME: what if we race with a future update? Corrupted state? */
607 state->local = state->linked;
608 /* signal that we are done */
609 mb();
610 state->local_tick = state->cur_tick;
611
612 if (state->local != current
613 && (is_realtime(current) || is_present(state->local)))
614 litmus_reschedule_local();
615}
616
617static void process_out_of_budget_tasks(
618 struct pfair_state* state,
619 struct task_struct* prev,
620 unsigned int blocks)
621{
622 struct task_struct *t;
623
624 while (!list_empty(&state->out_of_budget))
625 {
626
627 t = list_first_entry(&state->out_of_budget,
628 struct task_struct, rt_param.list);
629 TRACE_TASK(t, "found on out_of_budget queue is_prev=%d\n", t == prev);
630 list_del(&tsk_rt(t)->list);
631 if (t != prev || !blocks)
632 {
633 sched_trace_task_release(t);
634 add_release(&cpu_cluster(state)->pfair, t);
635 TRACE_TASK(t, "adding to release queue (budget exhausted)\n");
636 } else {
637 TRACE_TASK(t, "not added to release queue (blocks=%d)\n", blocks);
638 tsk_pfair(t)->needs_requeue = 1;
639 }
640 if (unlikely(state->local == t)) {
641 TRACE_TASK(t, "still linked as ->local, cleaning up\n");
642 state->local = NULL;
643 }
644 }
645}
646
647/* Custom scheduling tick: called on each quantum boundary. */
648static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
649{
650 TS_QUANTUM_BOUNDARY_START;
651
652 pfair_tick(current);
653 hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);
654
655 TS_QUANTUM_BOUNDARY_END;
656 return HRTIMER_RESTART;
657}
658
659static int safe_to_schedule(struct task_struct* t, int cpu)
660{
661 int where = tsk_rt(t)->scheduled_on;
662 if (where != NO_CPU && where != cpu) {
663 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
664 "scheduled already on %d.\n", cpu, where);
665 return 0;
666 } else
667 return is_present(t) && !is_completed(t);
668}
669
670static struct task_struct* pfair_schedule(struct task_struct * prev)
671{
672 struct pfair_state* state = this_cpu_ptr(&pfair_state);
673 struct pfair_cluster* cluster = cpu_cluster(state);
674 int blocks, completion, out_of_time;
675 struct task_struct* next = NULL;
676
677#ifdef CONFIG_RELEASE_MASTER
678 /* Bail out early if we are the release master.
679 * The release master never schedules any real-time tasks.
680 */
681 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
682 goto out;
683 }
684#endif
685
686 raw_spin_lock(cpu_lock(state));
687
688 blocks = is_realtime(prev) && !is_current_running();
689 completion = is_realtime(prev) && is_completed(prev);
690 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
691 state->local_tick);
692
693 if (is_realtime(prev))
694 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
695 blocks, completion, out_of_time);
696
697 if (completion && !out_of_time) {
698 sched_trace_task_completion(prev, 0);
699 pfair_prepare_next_period(prev);
700 prepare_release(prev, cur_release(prev));
701 drop_all_references(prev);
702 list_add(&tsk_rt(prev)->list, &state->out_of_budget);
703 }
704
705 process_out_of_budget_tasks(state, prev, blocks);
706
707 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
708 next = state->local;
709
710 if (prev != next) {
711 tsk_rt(prev)->scheduled_on = NO_CPU;
712 if (next)
713 tsk_rt(next)->scheduled_on = cpu_id(state);
714 }
715 sched_state_task_picked();
716 raw_spin_unlock(cpu_lock(state));
717
718 if (next)
719 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
720 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
721 else if (is_realtime(prev))
722 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
723
724#ifdef CONFIG_RELEASE_MASTER
725out:
726#endif
727
728 if (unlikely(!hrtimer_active(&state->quantum_timer))) {
729 TRACE("activating quantum timer start=%llu\n",
730 hrtimer_get_expires(&state->quantum_timer));
731 __hrtimer_start_range_ns(&state->quantum_timer,
732 hrtimer_get_expires(&state->quantum_timer),
733 0, HRTIMER_MODE_ABS, 0);
734 }
735
736 return next;
737}
738
739static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
740{
741 unsigned long flags;
742 struct pfair_cluster* cluster;
743
744 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
745
746 cluster = tsk_pfair(t)->cluster;
747
748 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
749
750 prepare_release(t, cluster->pfair_time + 1);
751 release_at(t, quanta2time(cur_release(t)));
752
753 t->rt_param.scheduled_on = NO_CPU;
754 t->rt_param.linked_on = NO_CPU;
755
756 if (is_scheduled) {
757#ifdef CONFIG_RELEASE_MASTER
758 if (task_cpu(t) != cluster->pfair.release_master)
759#endif
760 t->rt_param.scheduled_on = task_cpu(t);
761 }
762
763 if (on_rq || is_scheduled) {
764 tsk_rt(t)->present = 1;
765 __add_ready(&cluster->pfair, t);
766 } else {
767 tsk_rt(t)->present = 0;
768 tsk_pfair(t)->needs_requeue = 1;
769 }
770
771 check_preempt(t);
772
773 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
774}
775
776static void pfair_task_wake_up(struct task_struct *t)
777{
778 unsigned long flags;
779 lt_t now;
780 struct pfair_cluster* cluster;
781 struct pfair_state* state;
782 int sporadic_release = 0;
783
784 cluster = tsk_pfair(t)->cluster;
785
786 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
787 litmus_clock(), cur_release(t), cluster->pfair_time);
788
789 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
790
791 state = this_cpu_ptr(&pfair_state);
792
793 /* If a task blocks and wakes before its next job release,
794 * then it may resume if it is currently linked somewhere
795 * (as if it never blocked at all). Otherwise, we have a
796 * new sporadic job release.
797 */
798 now = litmus_clock();
799 if (is_tardy(t, now)) {
800 TRACE_TASK(t, "sporadic release!\n");
801 sporadic_release = 1;
802 release_at(t, now);
803 prepare_release(t, time2quanta(now, CEIL));
804 sched_trace_task_release(t);
805 }
806
807 /* only add to ready queue if the task isn't still linked somewhere */
808 if (tsk_pfair(t)->needs_requeue) {
809 tsk_pfair(t)->needs_requeue = 0;
810 TRACE_TASK(t, "requeueing required (released:%d)\n",
811 !time_after(cur_release(t), state->local_tick));
812 tsk_rt(t)->completed = 0;
813 if (time_after(cur_release(t), state->local_tick)
814 && !sporadic_release)
815 add_release(&cluster->pfair, t);
816 else
817 __add_ready(&cluster->pfair, t);
818 }
819
820 check_preempt(t);
821
822 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
823 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
824}
825
826static void pfair_task_block(struct task_struct *t)
827{
828 BUG_ON(!is_realtime(t));
829 TRACE_TASK(t, "blocks at %llu, state:%d\n",
830 litmus_clock(), t->state);
831}
832
833static void pfair_task_exit(struct task_struct * t)
834{
835 unsigned long flags;
836 struct pfair_cluster *cluster;
837
838 BUG_ON(!is_realtime(t));
839
840 cluster = tsk_pfair(t)->cluster;
841
842 /* Remove task from release or ready queue, and ensure
843 * that it is not the scheduled task for ANY CPU. We
844 * do this blanket check because occasionally when
845 * tasks exit while blocked, the task_cpu of the task
846 * might not be the same as the CPU that the PFAIR scheduler
847 * has chosen for it.
848 */
849 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
850
851 TRACE_TASK(t, "RIP, state:%d\n", t->state);
852 drop_all_references(t);
853
854 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
855
856 kfree(t->rt_param.pfair);
857 t->rt_param.pfair = NULL;
858}
859
860static void init_subtask(struct subtask* sub, unsigned long i,
861 lt_t quanta, lt_t period)
862{
863 /* since i is zero-based, the formulas are shifted by one */
864 lt_t tmp;
865
866 /* release */
867 tmp = period * i;
868 do_div(tmp, quanta); /* floor */
869 sub->release = (quanta_t) tmp;
870
871 /* deadline */
872 tmp = period * (i + 1);
873 if (do_div(tmp, quanta)) /* ceil */
874 tmp++;
875 sub->deadline = (quanta_t) tmp;
876
877 /* next release */
878 tmp = period * (i + 1);
879 do_div(tmp, quanta); /* floor */
880 sub->overlap = sub->deadline - (quanta_t) tmp;
881
882 /* Group deadline.
883 * Based on the formula given in Uma's thesis.
884 */
885 if (2 * quanta >= period) {
886 /* heavy */
887 tmp = (sub->deadline - (i + 1)) * period;
888 if (period > quanta &&
889 do_div(tmp, (period - quanta))) /* ceil */
890 tmp++;
891 sub->group_deadline = (quanta_t) tmp;
892 } else
893 sub->group_deadline = 0;
894}
895
896static void dump_subtasks(struct task_struct* t)
897{
898 unsigned long i;
899 for (i = 0; i < t->rt_param.pfair->quanta; i++)
900 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
901 i + 1,
902 t->rt_param.pfair->subtasks[i].release,
903 t->rt_param.pfair->subtasks[i].deadline,
904 t->rt_param.pfair->subtasks[i].overlap,
905 t->rt_param.pfair->subtasks[i].group_deadline);
906}
907
908static long pfair_admit_task(struct task_struct* t)
909{
910 lt_t quanta;
911 lt_t period;
912 s64 quantum_length = LITMUS_QUANTUM_LENGTH_NS;
913 struct pfair_param* param;
914 unsigned long i;
915
916 /* first check that the task is in the right cluster */
917 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
918 cpu_cluster(pstate[task_cpu(t)]))
919 return -EINVAL;
920
921 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
922 printk(KERN_INFO "%s: Admission rejected. "
923 "Only implicit deadlines are currently supported.\n",
924 litmus->plugin_name);
925 return -EINVAL;
926 }
927
928 /* Pfair is a tick-based scheduler, so the unit of time
929 * is one quantum. Calculate quantum-based parameters for everything.
930 * (Ceiling of exec cost, floor of period.)
931 */
932
933 quanta = get_exec_cost(t);
934 period = get_rt_period(t);
935
936 quanta = time2quanta(get_exec_cost(t), CEIL);
937
938 if (do_div(period, quantum_length))
939 printk(KERN_WARNING
940 "The period of %s/%d is not a multiple of %llu.\n",
941 t->comm, t->pid, (unsigned long long) quantum_length);
942
943 if (quanta == period) {
944 PTRACE_TASK(t, "Admitting weight 1.0 task. (%llu, %llu).\n", quanta, period);
945 }
946
947 param = kzalloc(sizeof(*param) +
948 quanta * sizeof(struct subtask), GFP_ATOMIC);
949
950 if (!param)
951 return -ENOMEM;
952
953 param->quanta = quanta;
954 param->period = period;
955
956 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
957
958 for (i = 0; i < quanta; i++)
959 init_subtask(param->subtasks + i, i, quanta, period);
960
961 if (t->rt_param.pfair)
962 /* get rid of stale allocation */
963 kfree(t->rt_param.pfair);
964
965 t->rt_param.pfair = param;
966
967 /* spew out some debug info */
968 dump_subtasks(t);
969
970 /* Disable generic budget enforcement (if enabled).
971 * The plugin provides its own (non-optional) enforcement
972 * of allocations at quantum granularity. */
973 tsk_rt(t)->task_params.budget_policy = NO_ENFORCEMENT;
974
975 return 0;
976}
977
978static void pfair_init_cluster(struct pfair_cluster* cluster)
979{
980 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
981 bheap_init(&cluster->release_queue);
982 raw_spin_lock_init(&cluster->release_lock);
983 INIT_LIST_HEAD(&cluster->topology.cpus);
984}
985
986static void cleanup_clusters(void)
987{
988 int i;
989
990 if (num_pfair_clusters)
991 kfree(pfair_clusters);
992 pfair_clusters = NULL;
993 num_pfair_clusters = 0;
994
995 /* avoid stale pointers */
996 for (i = 0; i < num_online_cpus(); i++) {
997 pstate[i]->topology.cluster = NULL;
998 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
999 pstate[i]->missed_updates, pstate[i]->missed_quanta);
1000 }
1001}
1002
1003static struct domain_proc_info pfair_domain_proc_info;
1004static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
1005{
1006 *ret = &pfair_domain_proc_info;
1007 return 0;
1008}
1009
1010static void pfair_setup_domain_proc(void)
1011{
1012 int i, cpu, domain;
1013#ifdef CONFIG_RELEASE_MASTER
1014 int release_master = atomic_read(&release_master_cpu);
1015 /* skip over the domain with the release master if cluster size is 1 */
1016 int cluster_size = num_online_cpus() / num_pfair_clusters;
1017 int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
1018 release_master : NO_CPU;
1019#else
1020 int release_master = NO_CPU;
1021 int skip_domain = NO_CPU;
1022#endif
1023 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
1024 int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
1025 struct cd_mapping *map;
1026
1027 memset(&pfair_domain_proc_info, 0, sizeof(pfair_domain_proc_info));
1028 init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
1029 pfair_domain_proc_info.num_cpus = num_rt_cpus;
1030 pfair_domain_proc_info.num_domains = num_rt_domains;
1031
1032 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
1033 if (cpu == release_master)
1034 continue;
1035 map = &pfair_domain_proc_info.cpu_to_domains[i];
1036 /* pointer math to figure out the domain index */
1037 domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
1038 map->id = cpu;
1039 cpumask_set_cpu(domain, map->mask);
1040 ++i;
1041 }
1042
1043 for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
1044 struct pfair_cluster *cluster;
1045 struct list_head *pos;
1046
1047 if (domain == skip_domain)
1048 continue;
1049
1050 cluster = &pfair_clusters[domain];
1051 map = &pfair_domain_proc_info.domain_to_cpus[i];
1052 map->id = i;
1053
1054 list_for_each(pos, &cluster->topology.cpus) {
1055 cpu = cpu_id(from_cluster_list(pos));
1056 if (cpu != release_master)
1057 cpumask_set_cpu(cpu, map->mask);
1058 }
1059 ++i;
1060 }
1061}
1062
1063static long pfair_activate_plugin(void)
1064{
1065 int err, i;
1066 struct pfair_state* state;
1067 struct pfair_cluster* cluster;
1068 quanta_t now, start;
1069 int cluster_size;
1070 struct cluster_cpu* cpus[NR_CPUS];
1071 struct scheduling_cluster* clust[NR_CPUS];
1072 lt_t quantum_timer_start;
1073
1074 cluster_size = get_cluster_size(pfair_cluster_level);
1075
1076 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
1077 return -EINVAL;
1078
1079 num_pfair_clusters = num_online_cpus() / cluster_size;
1080
1081 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
1082 if (!pfair_clusters) {
1083 num_pfair_clusters = 0;
1084 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
1085 return -ENOMEM;
1086 }
1087
1088 state = this_cpu_ptr(&pfair_state);
1089 now = current_quantum(state);
1090 start = now + 50;
1091 quantum_timer_start = quanta2time(start);
1092 TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
1093 litmus_clock(),
1094 now,
1095 quantum_timer_start,
1096 time2quanta(quantum_timer_start, CEIL));
1097
1098 for (i = 0; i < num_pfair_clusters; i++) {
1099 cluster = &pfair_clusters[i];
1100 pfair_init_cluster(cluster);
1101 cluster->pfair_time = start;
1102 clust[i] = &cluster->topology;
1103#ifdef CONFIG_RELEASE_MASTER
1104 cluster->pfair.release_master = atomic_read(&release_master_cpu);
1105#endif
1106 }
1107
1108 for_each_online_cpu(i) {
1109 state = &per_cpu(pfair_state, i);
1110 state->cur_tick = start;
1111 state->local_tick = start;
1112 state->missed_quanta = 0;
1113 state->missed_updates = 0;
1114 state->offset = cpu_stagger_offset(i);
1115 hrtimer_set_expires(&state->quantum_timer,
1116 ns_to_ktime(quantum_timer_start + state->offset));
1117 cpus[i] = &state->topology;
1118 TRACE("cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
1119 INIT_LIST_HEAD(&state->out_of_budget);
1120 /* force rescheduling to start quantum timer */
1121 litmus_reschedule(i);
1122
1123 WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
1124 KERN_ERR "WARNING: no high resolution timers available!?\n");
1125 }
1126
1127 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
1128 cpus, num_online_cpus());
1129
1130 if (err < 0)
1131 cleanup_clusters();
1132 else
1133 pfair_setup_domain_proc();
1134
1135 return err;
1136}
1137
1138static long pfair_deactivate_plugin(void)
1139{
1140 int cpu;
1141 struct pfair_state* state;
1142
1143 for_each_online_cpu(cpu) {
1144 state = &per_cpu(pfair_state, cpu);
1145 TRACE("stopping quantum timer on CPU%d\n", cpu);
1146 hrtimer_cancel(&state->quantum_timer);
1147 }
1148 cleanup_clusters();
1149 destroy_domain_proc_info(&pfair_domain_proc_info);
1150 return 0;
1151}
1152
1153/* Plugin object */
1154static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1155 .plugin_name = "PFAIR",
1156 .task_new = pfair_task_new,
1157 .task_exit = pfair_task_exit,
1158 .schedule = pfair_schedule,
1159 .task_wake_up = pfair_task_wake_up,
1160 .task_block = pfair_task_block,
1161 .admit_task = pfair_admit_task,
1162 .complete_job = complete_job,
1163 .activate_plugin = pfair_activate_plugin,
1164 .deactivate_plugin = pfair_deactivate_plugin,
1165 .get_domain_proc_info = pfair_get_domain_proc_info,
1166};
1167
1168
1169static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1170
1171static int __init init_pfair(void)
1172{
1173 int cpu, err, fs;
1174 struct pfair_state *state;
1175
1176 /*
1177 * initialize short_cut for per-cpu pfair state;
1178 * there may be a problem here if someone removes a cpu
1179 * while we are doing this initialization... and if cpus
1180 * are added / removed later... but we don't support CPU hotplug atm anyway.
1181 */
1182 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1183
1184 /* initialize CPU state */
1185 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1186 state = &per_cpu(pfair_state, cpu);
1187 hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1188 state->quantum_timer.function = on_quantum_boundary;
1189 state->topology.id = cpu;
1190 state->cur_tick = 0;
1191 state->local_tick = 0;
1192 state->linked = NULL;
1193 state->local = NULL;
1194 state->scheduled = NULL;
1195 state->missed_quanta = 0;
1196 state->offset = cpu_stagger_offset(cpu);
1197 pstate[cpu] = state;
1198 }
1199
1200 pfair_clusters = NULL;
1201 num_pfair_clusters = 0;
1202
1203 err = register_sched_plugin(&pfair_plugin);
1204 if (!err) {
1205 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1206 if (!fs)
1207 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1208 else
1209 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1210 }
1211
1212 return err;
1213}
1214
1215static void __exit clean_pfair(void)
1216{
1217 kfree(pstate);
1218
1219 if (cluster_file)
1220 remove_proc_entry("cluster", pfair_dir);
1221 if (pfair_dir)
1222 remove_plugin_proc_dir(&pfair_plugin);
1223}
1224
1225module_init(init_pfair);
1226module_exit(clean_pfair);
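
init_subtask() above computes each subtask's pseudo-release, pseudo-deadline, b-bit, and group deadline from the task's execution cost and period, both expressed in quanta. The following stand-alone user-space sketch replays those formulas for a hypothetical task with 3 quanta of execution per 5-quantum period (weight 3/5); the program, its names, and the chosen parameters are illustrative only and claim nothing beyond the arithmetic already in init_subtask().

#include <stdio.h>

/* Mirrors the arithmetic of init_subtask() above, using plain 64-bit
 * division instead of do_div(). Purely illustrative; not part of the patch. */
struct subtask_params {
	unsigned long release;        /* floor(i * period / quanta)       */
	unsigned long deadline;       /* ceil((i + 1) * period / quanta)  */
	unsigned long overlap;        /* the PD^2 "b bit"                 */
	unsigned long group_deadline; /* 0 for light tasks (weight < 1/2) */
};

static void compute_subtask(struct subtask_params *sub, unsigned long i,
			    unsigned long long quanta, unsigned long long period)
{
	unsigned long long tmp;

	sub->release  = period * i / quanta;                      /* floor */
	sub->deadline = (period * (i + 1) + quanta - 1) / quanta; /* ceil  */
	sub->overlap  = sub->deadline - period * (i + 1) / quanta;

	if (2 * quanta >= period && period > quanta) {
		/* heavy task: group deadline, rounded up as in init_subtask() */
		tmp = (sub->deadline - (i + 1)) * period;
		sub->group_deadline =
			(tmp + (period - quanta) - 1) / (period - quanta);
	} else
		sub->group_deadline = 0;
}

int main(void)
{
	/* hypothetical task: 3 quanta of execution per 5-quantum period */
	const unsigned long long quanta = 3, period = 5;
	struct subtask_params sub;
	unsigned long i;

	for (i = 0; i < quanta; i++) {
		compute_subtask(&sub, i, quanta, period);
		printf("SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
		       i + 1, sub.release, sub.deadline,
		       sub.overlap, sub.group_deadline);
	}
	/* Expected output:
	 *   SUBTASK 1: rel=0 dl=2 bbit:1 gdl:3
	 *   SUBTASK 2: rel=1 dl=4 bbit:1 gdl:5
	 *   SUBTASK 3: rel=3 dl=5 bbit:0 gdl:5
	 */
	return 0;
}

The printed values can be compared against the SUBTASK lines that dump_subtasks() emits via TRACE_TASK() when pfair_admit_task() admits such a task.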