2 files changed, 898 insertions, 1 deletions
diff --git a/litmus/Makefile b/litmus/Makefile
index 70c9684c3b98..26e0fdb5c2c1 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -11,7 +11,8 @@ obj-y     = sched_plugin.o litmus.o \
            srp.o \
            fmlp.o \
            bheap.o \
-            sched_gsn_edf.o
+            sched_gsn_edf.o \
+            sched_pfair.o
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..2ea39223e7f0
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,896 @@
+/*
+ * litmus/sched_pfair.c
+ *
+ * Implementation of the (global) Pfair scheduling algorithm.
+ *
+ */
+#include <asm/div64.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/rt_domain.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/bheap.h>
+/* Precomputed scheduling window of one subtask of a job.
+ * One entry exists per subtask (see pfair_param.quanta); the values are
+ * filled in by init_subtask() when a task is admitted.
+ */
+struct subtask {
+        /* measured in quanta relative to job release */
+        quanta_t release;
+        quanta_t deadline;
+        quanta_t overlap; /* called "b bit" by PD^2 */
+        quanta_t group_deadline; /* 0 if the task is not "heavy" */
+};
+/* Per-task Pfair state. Allocated in pfair_admit_task() with room for
+ * 'quanta' trailing subtask entries and freed in pfair_task_exit().
+ */
+struct pfair_param   {
+        quanta_t        quanta;       /* number of subtasks */
+        quanta_t        cur;          /* index of current subtask */
+        quanta_t        release;      /* in quanta */
+        quanta_t        period;       /* in quanta */
+        quanta_t        last_quantum; /* when scheduled last */
+        int             last_cpu;     /* where scheduled last */
+        unsigned int    sporadic_release; /* On wakeup, new sporadic release? */
+        struct subtask subtasks[0];   /* allocate together with pfair_param */
+};
+/* shortcut to the pfair_param attached to a task */
+#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
+/* Per-CPU scheduler state; one instance per CPU (see the per-CPU
+ * definition of pfair_state and the pstate[] shortcut array).
+ */
+struct pfair_state {
+        int cpu;                       /* id of the owning CPU, set in init_pfair() */
+        volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
+                                        * the time */
+        volatile quanta_t local_tick;  /* What tick is the local CPU currently
+                                        * executing? Updated only by the local
+                                        * CPU. In QEMU, this may lag behind the
+                                        * current tick. In a real system, with
+                                        * proper timers and aligned quanta,
+                                        * that should only be the
+                                        * case for a very short time after the
+                                        * time advanced. With staggered quanta,
+                                        * it will lag for the duration of the
+                                        * offset.
+                                        */
+        struct task_struct* linked;    /* the task that should be executing */
+        struct task_struct* local;     /* the local copy of linked          */
+        struct task_struct* scheduled; /* what is actually scheduled        */
+        unsigned long missed_quanta;   /* incremented by schedule_next_quantum()
+                                        * when local_tick != cur_tick at a
+                                        * quantum boundary */
+        lt_t offset;                    /* stagger offset */
+};
+/* Currently, we limit the period of any task to fewer than PFAIR_MAX_PERIOD
+ * (2000) quanta. This makes the implementation easier, since we do not
+ * need to reallocate the release wheel on task arrivals.
+ * This restriction may be lifted in the future.
+ */
+#define PFAIR_MAX_PERIOD 2000
+/* This is the release queue wheel. It is indexed by pfair_time %
+ * PFAIR_MAX_PERIOD.  Each heap is ordered by PFAIR priority, so that it can be
+ * merged with the ready queue.
+ */
+static struct bheap release_queue[PFAIR_MAX_PERIOD];
+/* per-CPU scheduler state */
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+/* Array of pointers to the per-CPU states, indexed by CPU id; allocated
+ * and filled in by init_pfair().
+ */
+struct pfair_state* *pstate; /* short cut */
+static quanta_t pfair_time = 0; /* the "official" PFAIR clock */
+static quanta_t merge_time = 0; /* Updated after the release queue has been
+                                 * merged. Used by drop_all_references().
+                                 */
+/* The real-time domain; its ready queue holds all released subtasks. */
+static rt_domain_t pfair;
+/* The pfair_lock is used to serialize all scheduling events.
+ */
+#define pfair_lock pfair.ready_lock
+/* Enable for lots of trace info.
+ * #define PFAIR_DEBUG
+ */
+/* PTRACE()/PTRACE_TASK() forward to TRACE()/TRACE_TASK() when PFAIR_DEBUG
+ * is defined and expand to nothing otherwise.
+ */
+#ifdef PFAIR_DEBUG
+#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
+#define PTRACE(f, args...) TRACE(f, ## args)
+#else
+#define PTRACE_TASK(t, f, args...)
+#define PTRACE(f, args...)
+#endif
+/* gcc will inline all of these accessor functions... */
+/* Return the subtask entry that t's current subtask index refers to. */
+static struct subtask* cur_subtask(struct task_struct* t)
+{
+        struct pfair_param* p = tsk_pfair(t);
+        return &p->subtasks[p->cur];
+}
+/* Absolute deadline of t's current subtask: the job release plus the
+ * subtask's relative deadline.
+ */
+static quanta_t cur_deadline(struct task_struct* t)
+{
+        quanta_t job_release = tsk_pfair(t)->release;
+        return job_release + cur_subtask(t)->deadline;
+}
+/* Absolute release of t's current subtask: the job release plus the
+ * subtask's relative release.
+ */
+static quanta_t cur_sub_release(struct task_struct* t)
+{
+        quanta_t job_release = tsk_pfair(t)->release;
+        return job_release + cur_subtask(t)->release;
+}
+/* Release time used for queueing decisions. Without EARLY_RELEASE this is
+ * the release of the current subtask; with EARLY_RELEASE only the release
+ * of the job (i.e., of its first subtask) counts.
+ */
+static quanta_t cur_release(struct task_struct* t)
+{
+#ifndef EARLY_RELEASE
+        return cur_sub_release(t);
+#else
+        return tsk_pfair(t)->release;
+#endif
+}
+/* Overlap ("b bit") of t's current subtask. */
+static quanta_t cur_overlap(struct task_struct* t)
+{
+        struct subtask* sub = cur_subtask(t);
+        return sub->overlap;
+}
+/* Absolute group deadline of t's current subtask. A stored value of zero
+ * is passed through unchanged rather than being offset by the job release.
+ */
+static quanta_t cur_group_deadline(struct task_struct* t)
+{
+        quanta_t gdl = cur_subtask(t)->group_deadline;
+        return gdl ? gdl + tsk_pfair(t)->release : gdl;
+}
+/* Priority comparison: returns 1 if 'first' has higher priority than
+ * 'second', 0 otherwise. Ties are broken in this order: subtask deadline
+ * (earlier wins), B-bit (larger wins), group deadline (later wins),
+ * PID (smaller wins).
+ */
+static int pfair_higher_prio(struct task_struct* first,
+                             struct task_struct* second)
+{
+        quanta_t dl_first, dl_second;
+        quanta_t bbit_first, bbit_second;
+        quanta_t gdl_first, gdl_second;
+        /* first task must exist */
+        if (!first)
+                return 0;
+        /* A missing or non-real-time second task always loses against
+         * the first task (which is a RT task).
+         */
+        if (!second || !is_realtime(second))
+                return 1;
+        /* earlier (subtask) deadline wins */
+        dl_first  = cur_deadline(first);
+        dl_second = cur_deadline(second);
+        if (time_before(dl_first, dl_second))
+                return 1;
+        if (dl_first != dl_second)
+                return 0;
+        /* deadline tie: break by B-bit */
+        bbit_first  = cur_overlap(first);
+        bbit_second = cur_overlap(second);
+        if (bbit_first > bbit_second)
+                return 1;
+        if (bbit_first != bbit_second)
+                return 0;
+        /* B-bit tie: break by group deadline */
+        gdl_first  = cur_group_deadline(first);
+        gdl_second = cur_group_deadline(second);
+        if (time_after(gdl_first, gdl_second))
+                return 1;
+        if (gdl_first != gdl_second)
+                return 0;
+        /* group deadline tie: break by PID, which are unique */
+        return first->pid < second->pid;
+}
+/* Heap ordering callback: compares the tasks behind two heap nodes using
+ * pfair_higher_prio().
+ */
+int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+        struct task_struct* task_a = bheap2task(a);
+        struct task_struct* task_b = bheap2task(b);
+        return pfair_higher_prio(task_a, task_b);
+}
+/* Map time t onto its slot of the release wheel (t modulo
+ * PFAIR_MAX_PERIOD) and return that slot's heap.
+ */
+static struct bheap* relq(quanta_t t)
+{
+        return release_queue + (t % PFAIR_MAX_PERIOD);
+}
+/* Start a new job of t at quantum 'at': record the job release and rewind
+ * to the first subtask.
+ */
+static void prepare_release(struct task_struct* t, quanta_t at)
+{
+        struct pfair_param* p = tsk_pfair(t);
+        p->cur     = 0;
+        p->release = at;
+}
+/* Insert t's heap node into the given heap, ordered by Pfair priority. */
+static void __pfair_add_release(struct task_struct* t, struct bheap* queue)
+{
+        bheap_insert(pfair_ready_order,
+                     queue,
+                     tsk_rt(t)->heap_node);
+}
+/* Queue t in the release-wheel slot that matches its current release.
+ * It is a bug if t's heap node is already in some heap.
+ */
+static void pfair_add_release(struct task_struct* t)
+{
+        struct bheap* slot;
+        BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
+        slot = relq(cur_release(t));
+        __pfair_add_release(t, slot);
+}
+/* Merge the release-wheel slot for 'time' into the ready queue and
+ * remember 'time' as the most recent merge time.
+ */
+static void poll_releases(quanta_t time)
+{
+        struct bheap* due = relq(time);
+        __merge_ready(&pfair, due);
+        merge_time = time;
+}
+/* Trigger a reschedule if t is present but is not scheduled on the CPU it
+ * is linked to. The CPU poked is the one t is linked to or, if t is not
+ * linked anywhere, the one it is still scheduled on.
+ */
+static void check_preempt(struct task_struct* t)
+{
+        int cpu;
+        /* nothing to do if t is where it ought to be or cannot run */
+        if (tsk_rt(t)->linked_on == tsk_rt(t)->scheduled_on ||
+            !tsk_rt(t)->present)
+                return;
+        if (tsk_rt(t)->linked_on != NO_CPU)
+                cpu = tsk_rt(t)->linked_on;
+        else
+                cpu = tsk_rt(t)->scheduled_on;
+        PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
+                   tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
+        /* preempt: remotely via IPI, locally via the resched flag */
+        if (cpu != smp_processor_id())
+                smp_send_reschedule(cpu);
+        else
+                set_tsk_need_resched(current);
+}
+/* Remove every reference the scheduler holds to task t: take its heap node
+ * out of whichever queue it is in and clear t from the linked/local/scheduled
+ * slots of all online CPUs.
+ *
+ * caller must hold pfair_lock
+ */
+static void drop_all_references(struct task_struct *t)
+{
+        int cpu;
+        struct pfair_state* s;
+        struct bheap* q;
+        if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
+                /* Figure out what queue the node is in: release slots up to
+                 * merge_time have already been merged into the ready queue.
+                 */
+                if (time_before_eq(cur_release(t), merge_time))
+                        q = &pfair.ready_queue;
+                else
+                        q = relq(cur_release(t));
+                bheap_delete(pfair_ready_order, q,
+                            tsk_rt(t)->heap_node);
+        }
+        /* Walk the online mask instead of counting 0..num_online_cpus()-1,
+         * which would skip CPUs if the online ids are not contiguous.
+         */
+        for_each_online_cpu(cpu) {
+                s = &per_cpu(pfair_state, cpu);
+                if (s->linked == t)
+                        s->linked = NULL;
+                if (s->local  == t)
+                        s->local  = NULL;
+                if (s->scheduled  == t)
+                        s->scheduled = NULL;
+        }
+}
+/* Advance t to its next subtask after it was linked for one quantum.
+ * When the subtask index wraps around to zero the job is complete: a
+ * present task starts its next job (release moves forward by one period),
+ * while a task that is not present is removed from all scheduler state and
+ * flagged for a sporadic release on its next wake-up.
+ *
+ * returns 1 if the task needs to go the release queue
+ */
+static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
+{
+        struct pfair_param* p = tsk_pfair(t);
+        int to_relq;
+        /* wraps to 0 after the last subtask of the job */
+        p->cur = (p->cur + 1) % p->quanta;
+        if (!p->cur) {
+                sched_trace_task_completion(t, 1);
+                if (tsk_rt(t)->present) {
+                        /* we start a new job */
+                        prepare_for_next_period(t);
+                        sched_trace_task_release(t);
+                        get_rt_flags(t) = RT_F_RUNNING;
+                        p->release += p->period;
+                } else {
+                        /* remove task from system until it wakes */
+                        drop_all_references(t);
+                        tsk_pfair(t)->sporadic_release = 1;
+                        TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
+                                   cpu, p->cur);
+                        return 0;
+                }
+        }
+        /* the next subtask is not eligible yet if its release is in the future */
+        to_relq = time_after(cur_release(t), time);
+        TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d\n",
+                   cpu, p->cur, to_relq);
+        return to_relq;
+}
+/* For every online CPU, account the quantum that just ended to the task
+ * linked there: record when and where it ran, advance it to its next
+ * subtask, and move it to the release queue if that subtask is not yet
+ * released (unlinking it from the CPU).
+ */
+static void advance_subtasks(quanta_t time)
+{
+        int cpu;
+        struct task_struct* l;
+        struct pfair_param* p;
+        for_each_online_cpu(cpu) {
+                l = pstate[cpu]->linked;
+                if (l) {
+                        p = tsk_pfair(l);
+                        p->last_quantum = time;
+                        p->last_cpu     =  cpu;
+                        if (advance_subtask(time, l, cpu)) {
+                                pstate[cpu]->linked = NULL;
+                                pfair_add_release(l);
+                        }
+                }
+        }
+}
+/* Pick the CPU that t should be linked to at quantum 'time'.
+ * Preference order: the CPU t is currently scheduled on; else, if t ran in
+ * the immediately preceding quantum, the CPU it ran on then (unless the
+ * task linked there is also scheduled there); else default_cpu.
+ */
+static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
+{
+        int last;
+        /* always observe scheduled_on linkage */
+        if (tsk_rt(t)->scheduled_on != NO_CPU)
+                return tsk_rt(t)->scheduled_on;
+        /* no back2back quanta: nothing to prefer */
+        if (tsk_pfair(t)->last_quantum != time - 1)
+                return default_cpu;
+        /* Only observe last_quantum if no scheduled_on is in the way.
+         * This should only kick in if a CPU missed quanta, and that
+         * *should* only happen in QEMU.
+         */
+        last = tsk_pfair(t)->last_cpu;
+        if (pstate[last]->linked &&
+            tsk_rt(pstate[last]->linked)->scheduled_on == last)
+                return default_cpu;
+        return last;
+}
+/* Link task t for quantum 'time', preferably to 'cpu'. target_cpu() may
+ * redirect the link to another CPU; any task displaced by the link is
+ * either moved to 'cpu' or pushed back into the ready queue.
+ *
+ * returns one if linking was redirected (i.e., 'cpu' still needs a task and
+ * the caller should try again), zero if 'cpu' has been dealt with
+ */
+static int pfair_link(quanta_t time, int cpu,
+                      struct task_struct* t)
+{
+        int target = target_cpu(time, t, cpu);
+        struct task_struct* prev  = pstate[cpu]->linked;
+        struct task_struct* other;
+        if (target != cpu) {
+                /* t goes to 'target'; remember who was linked there */
+                other = pstate[target]->linked;
+                pstate[target]->linked = t;
+                tsk_rt(t)->linked_on   = target;
+                if (!other)
+                        /* linked ok, but reschedule this CPU */
+                        return 1;
+                if (target < cpu) {
+                        /* link other to cpu instead */
+                        tsk_rt(other)->linked_on = cpu;
+                        pstate[cpu]->linked      = other;
+                        if (prev) {
+                                /* prev got pushed back into the ready queue */
+                                tsk_rt(prev)->linked_on = NO_CPU;
+                                __add_ready(&pfair, prev);
+                        }
+                        /* we are done with this cpu */
+                        return 0;
+                } else {
+                        /* re-add other, its original CPU was not considered yet */
+                        tsk_rt(other)->linked_on = NO_CPU;
+                        __add_ready(&pfair, other);
+                        /* reschedule this CPU */
+                        return 1;
+                }
+        } else {
+                /* no redirection: t simply replaces whatever was linked here */
+                pstate[cpu]->linked  = t;
+                tsk_rt(t)->linked_on = cpu;
+                if (prev) {
+                        /* prev got pushed back into the ready queue */
+                        tsk_rt(prev)->linked_on = NO_CPU;
+                        __add_ready(&pfair, prev);
+                }
+                /* we are done with this CPU */
+                return 0;
+        }
+}
+/* Assign ready tasks to CPUs for quantum 'time'. For each online CPU, the
+ * head of the ready queue is linked as long as it has higher priority than
+ * the task currently linked to that CPU; the attempt is repeated while
+ * pfair_link() reports that the link was redirected elsewhere.
+ */
+static void schedule_subtasks(quanta_t time)
+{
+        int cpu, redirected;
+        for_each_online_cpu(cpu) {
+                do {
+                        redirected = 0;
+                        if (pfair_higher_prio(__peek_ready(&pfair),
+                                              pstate[cpu]->linked))
+                                redirected = pfair_link(time, cpu,
+                                                        __take_ready(&pfair));
+                } while (redirected);
+        }
+}
+/* Compute the task-to-CPU assignment for quantum 'time' under pfair_lock:
+ * advance the subtasks that were linked in the previous quantum, merge
+ * newly released tasks into the ready queue, link tasks to CPUs, and
+ * finally publish 'time' in every CPU's cur_tick.
+ */
+static void schedule_next_quantum(quanta_t time)
+{
+        int cpu;
+        /* called with interrupts disabled */
+        PTRACE("--- Q %lu at %llu PRE-SPIN\n",
+               time, litmus_clock());
+        spin_lock(&pfair_lock);
+        PTRACE("<<< Q %lu at %llu\n",
+               time, litmus_clock());
+        sched_trace_quantum_boundary();
+        advance_subtasks(time);
+        poll_releases(time);
+        schedule_subtasks(time);
+        /* debug output only: dump the resulting assignment */
+        for (cpu = 0; cpu < num_online_cpus(); cpu++)
+                if (pstate[cpu]->linked)
+                        PTRACE_TASK(pstate[cpu]->linked,
+                                    " linked on %d.\n", cpu);
+                else
+                        PTRACE("(null) linked on %d.\n", cpu);
+        /* We are done. Advance time. */
+        mb();
+        for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+                /* a CPU that has not caught up with the previous tick
+                 * missed a quantum */
+                if (pstate[cpu]->local_tick != pstate[cpu]->cur_tick) {
+                        TRACE("BAD Quantum not acked on %d "
+                              "(l:%lu c:%lu p:%lu)\n",
+                              cpu,
+                              pstate[cpu]->local_tick,
+                              pstate[cpu]->cur_tick,
+                              pfair_time);
+                        pstate[cpu]->missed_quanta++;
+                }
+                pstate[cpu]->cur_tick = time;
+        }
+        PTRACE(">>> Q %lu at %llu\n",
+               time, litmus_clock());
+        spin_unlock(&pfair_lock);
+}
+/* Busy-wait until state->cur_tick has reached at least quantum q.
+ * The loop is entered at the 'first' label so that the initial check is
+ * done without cpu_relax()/mb(); each subsequent poll is preceded by both.
+ */
+static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
+{
+        quanta_t loc;
+        goto first; /* skip mb() on first iteration */
+        do {
+                cpu_relax();
+                mb();
+        first:  loc = state->cur_tick;
+                /* FIXME: what if loc > cur? */
+        } while (time_before(loc, q));
+        PTRACE("observed cur_tick:%lu >= q:%lu\n",
+               loc, q);
+}
+/* Quantum number that corresponds to "now" on the CPU described by
+ * 'state': the current time minus the CPU's stagger offset, rounded down
+ * to whole quanta.
+ */
+static quanta_t current_quantum(struct pfair_state* state)
+{
+        lt_t local_now = litmus_clock() - state->offset;
+        return time2quanta(local_now, FLOOR);
+}
+/* Replay the quanta in (from, target] one at a time after the whole system
+ * fell behind. For each step this CPU waits until the previous quantum has
+ * been published in state->cur_tick, then tries to advance pfair_time with
+ * cmpxchg; whichever CPU wins the exchange schedules that quantum.
+ */
+static void catchup_quanta(quanta_t from, quanta_t target,
+                           struct pfair_state* state)
+{
+        quanta_t cur = from, time;
+        TRACE("+++< BAD catching up quanta from %lu to %lu\n",
+              from, target);
+        while (time_before(cur, target)) {
+                wait_for_quantum(cur, state);
+                cur++;
+                time = cmpxchg(&pfair_time,
+                               cur - 1,   /* expected */
+                               cur        /* next     */
+                        );
+                /* only the CPU whose exchange succeeded does the scheduling */
+                if (time == cur - 1)
+                        schedule_next_quantum(cur);
+        }
+        TRACE("+++> catching up done\n");
+}
+/* pfair_tick - this function is called for every local timer
+ *                         interrupt.
+ *
+ * The CPUs race to advance pfair_time to the current quantum with cmpxchg;
+ * the winner computes the assignment for that quantum. Every CPU then waits
+ * until the quantum has been published, takes a local copy of its linked
+ * task, acknowledges the tick via local_tick and requests a reschedule if
+ * the assignment differs from what is currently running.
+ */
+static void pfair_tick(struct task_struct* t)
+{
+        struct pfair_state* state = &__get_cpu_var(pfair_state);
+        quanta_t time, cur;
+        int retry = 10; /* bound on retries after an anomaly */
+        do {
+                cur  = current_quantum(state);
+                PTRACE("q %lu at %llu\n", cur, litmus_clock());
+                /* Attempt to advance time. First CPU to get here
+                 * will prepare the next quantum.
+                 */
+                time = cmpxchg(&pfair_time,
+                               cur - 1,   /* expected */
+                               cur        /* next     */
+                        );
+                if (time == cur - 1) {
+                        /* exchange succeeded */
+                        wait_for_quantum(cur - 1, state);
+                        schedule_next_quantum(cur);
+                        retry = 0;
+                } else if (time_before(time, cur - 1)) {
+                        /* the whole system missed a tick !? */
+                        catchup_quanta(time, cur, state);
+                        retry--;
+                } else if (time_after(time, cur)) {
+                        /* our timer lagging behind!? */
+                        TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
+                        retry--;
+                } else {
+                        /* Some other CPU already started scheduling
+                         * this quantum. Let it do its job and then update.
+                         */
+                        retry = 0;
+                }
+        } while (retry);
+        /* Spin locally until time advances. */
+        wait_for_quantum(cur, state);
+        /* copy assignment */
+        /* FIXME: what if we race with a future update? Corrupted state? */
+        state->local      = state->linked;
+        /* signal that we are done */
+        mb();
+        state->local_tick = state->cur_tick;
+        /* reschedule if something other than the assigned task is running */
+        if (state->local != current
+            && (is_realtime(current) || is_present(state->local)))
+                set_tsk_need_resched(current);
+}
+/* Decide whether t may be dispatched on 'cpu'. A task that is still
+ * scheduled on a different CPU is refused (and the event is traced);
+ * otherwise t must be present and in the RT_F_RUNNING state.
+ */
+static int safe_to_schedule(struct task_struct* t, int cpu)
+{
+        int holder = tsk_rt(t)->scheduled_on;
+        if (holder == NO_CPU || holder == cpu)
+                return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING;
+        TRACE_TASK(t, "BAD: can't be scheduled on %d, "
+                   "scheduled already on %d.\n", cpu, holder);
+        return 0;
+}
+/* Plugin schedule() callback: pick the task this CPU should run next.
+ * The choice is the local copy of the linked task, provided it is safe to
+ * schedule here; otherwise NULL is returned. The scheduled_on fields of
+ * prev and next are updated under pfair_lock when the choice changes.
+ */
+static struct task_struct* pfair_schedule(struct task_struct * prev)
+{
+        struct pfair_state* state = &__get_cpu_var(pfair_state);
+        int blocks;
+        struct task_struct* next = NULL;
+        spin_lock(&pfair_lock);
+        /* NOTE(review): 'blocks' is computed but never read in this function */
+        blocks  = is_realtime(prev) && !is_running(prev);
+        if (state->local && safe_to_schedule(state->local, state->cpu))
+                next = state->local;
+        if (prev != next) {
+                /* prev leaves this CPU; next (if any) takes it over */
+                tsk_rt(prev)->scheduled_on = NO_CPU;
+                if (next)
+                        tsk_rt(next)->scheduled_on = state->cpu;
+        }
+        spin_unlock(&pfair_lock);
+        if (next)
+                TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
+                           tsk_pfair(next)->release, pfair_time, litmus_clock());
+        else if (is_realtime(prev))
+                TRACE("Becomes idle at %lu (%llu)\n", pfair_time, litmus_clock());
+        return next;
+}
+/* Plugin task_new() callback: bring a newly admitted task under Pfair
+ * control. Its first job is released at the quantum following the current
+ * pfair_time and queued in the release wheel.
+ */
+static void pfair_task_new(struct task_struct * t, int on_rq, int running)
+{
+        unsigned long           irq_flags;
+        TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
+        spin_lock_irqsave(&pfair_lock, irq_flags);
+        /* a task that is currently running occupies its CPU */
+        tsk_rt(t)->scheduled_on = running ? task_cpu(t) : NO_CPU;
+        prepare_release(t, pfair_time + 1);
+        tsk_pfair(t)->sporadic_release = 0;
+        pfair_add_release(t);
+        check_preempt(t);
+        spin_unlock_irqrestore(&pfair_lock, irq_flags);
+}
+/* Plugin task_wake_up() callback. If the task was flagged for a sporadic
+ * release (set by advance_subtask() when it finished a job while not
+ * present), a new job is released at the current time and the task is put
+ * back into the release wheel; otherwise only a preemption check is done.
+ */
+static void pfair_task_wake_up(struct task_struct *t)
+{
+        unsigned long flags;
+        lt_t now;
+        TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
+                   litmus_clock(), cur_release(t), pfair_time);
+        spin_lock_irqsave(&pfair_lock, flags);
+        /* It is a little unclear how to deal with Pfair
+         * tasks that block for a while and then wake. For now,
+         * if a task blocks and wakes before its next job release,
+         * then it may resume if it is currently linked somewhere
+         * (as if it never blocked at all). Otherwise, we have a
+         * new sporadic job release.
+         */
+        if (tsk_pfair(t)->sporadic_release) {
+                now = litmus_clock();
+                release_at(t, now);
+                /* release at the first quantum boundary at or after 'now' */
+                prepare_release(t, time2quanta(now, CEIL));
+                sched_trace_task_release(t);
+                /* FIXME: race with pfair_time advancing */
+                pfair_add_release(t);
+                tsk_pfair(t)->sporadic_release = 0;
+        }
+        check_preempt(t);
+        spin_unlock_irqrestore(&pfair_lock, flags);
+        TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
+}
+/* Plugin task_block() callback: only traces the event. The task must be a
+ * real-time task.
+ */
+static void pfair_task_block(struct task_struct *t)
+{
+        BUG_ON(!is_realtime(t));
+        TRACE_TASK(t,
+                   "blocks at %llu, state:%d\n",
+                   litmus_clock(),
+                   t->state);
+}
+/* Plugin task_exit() callback: detach t from all scheduler state and
+ * release its Pfair parameters.
+ */
+static void pfair_task_exit(struct task_struct * t)
+{
+        unsigned long irq_flags;
+        BUG_ON(!is_realtime(t));
+        /* Remove the task from the release or ready queue, and ensure
+         * that it is not the scheduled task for ANY CPU. This blanket
+         * check is needed because occasionally, when tasks exit while
+         * blocked, the task_cpu of the task might not be the same as
+         * the CPU that the PFAIR scheduler has chosen for it.
+         */
+        spin_lock_irqsave(&pfair_lock, irq_flags);
+        TRACE_TASK(t, "RIP, state:%d\n", t->state);
+        drop_all_references(t);
+        spin_unlock_irqrestore(&pfair_lock, irq_flags);
+        /* free the parameters and clear the stale pointer */
+        kfree(tsk_pfair(t));
+        tsk_pfair(t) = NULL;
+}
+/* Plugin release_at() callback: (re)release 'task' at time 'start'.
+ * The release is rounded up to a quantum boundary and limited to at most
+ * PFAIR_MAX_PERIOD quanta past pfair_time; the task is then removed from
+ * wherever it is queued or linked and re-queued for the new release.
+ */
+static void pfair_release_at(struct task_struct* task, lt_t start)
+{
+        unsigned long flags;
+        quanta_t release;
+        BUG_ON(!is_realtime(task));
+        spin_lock_irqsave(&pfair_lock, flags);
+        release_at(task, start);
+        release = time2quanta(start, CEIL);
+        /* clamp the distance from pfair_time to PFAIR_MAX_PERIOD quanta */
+        if (release - pfair_time >= PFAIR_MAX_PERIOD)
+                release = pfair_time + PFAIR_MAX_PERIOD;
+        TRACE_TASK(task, "sys release at %lu\n", release);
+        drop_all_references(task);
+        prepare_release(task, release);
+        pfair_add_release(task);
+        /* Clear sporadic release flag, since this release subsumes any
+         * sporadic release on wake.
+         */
+        tsk_pfair(task)->sporadic_release = 0;
+        spin_unlock_irqrestore(&pfair_lock, flags);
+}
+/* Fill in the window parameters of the subtask with zero-based index i for
+ * a task that executes 'quanta' quanta every 'period' quanta:
+ *   release        = floor(period * i / quanta)
+ *   deadline       = ceil(period * (i + 1) / quanta)
+ *   overlap        = deadline - floor(period * (i + 1) / quanta)
+ *   group_deadline = ceil((deadline - (i + 1)) * period / (period - quanta))
+ *                    for heavy tasks (2 * quanta >= period), 0 otherwise
+ * Each do_div() call divides tmp in place; its return value is used here
+ * as the remainder to decide whether to round up.
+ */
+static void init_subtask(struct subtask* sub, unsigned long i,
+                         lt_t quanta, lt_t period)
+{
+        /* since i is zero-based, the formulas are shifted by one */
+        lt_t tmp;
+        /* release */
+        tmp = period * i;
+        do_div(tmp, quanta); /* floor */
+        sub->release = (quanta_t) tmp;
+        /* deadline */
+        tmp = period * (i + 1);
+        if (do_div(tmp, quanta)) /* ceil */
+                tmp++;
+        sub->deadline = (quanta_t) tmp;
+        /* next release */
+        tmp = period * (i + 1);
+        do_div(tmp, quanta); /* floor */
+        sub->overlap =  sub->deadline - (quanta_t) tmp;
+        /* Group deadline.
+         * Based on the formula given in Uma's thesis.
+         */
+        if (2 * quanta >= period) {
+                /* heavy */
+                tmp = (sub->deadline - (i + 1)) * period;
+                /* the period > quanta test guards against dividing by zero */
+                if (period > quanta &&
+                    do_div(tmp, (period - quanta))) /* ceil */
+                        tmp++;
+                sub->group_deadline = (quanta_t) tmp;
+        } else
+                sub->group_deadline = 0;
+}
+/* Trace the window parameters of every subtask of t (numbered from 1). */
+static void dump_subtasks(struct task_struct* t)
+{
+        unsigned long idx = 0;
+        while (idx < tsk_pfair(t)->quanta) {
+                TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
+                           idx + 1,
+                           tsk_pfair(t)->subtasks[idx].release,
+                           tsk_pfair(t)->subtasks[idx].deadline,
+                           tsk_pfair(t)->subtasks[idx].overlap,
+                           tsk_pfair(t)->subtasks[idx].group_deadline);
+                idx++;
+        }
+}
+/* Plugin admit_task() callback: convert the task's execution cost and
+ * period into quanta, validate them, and allocate and precompute the
+ * per-subtask windows.
+ *
+ * Returns 0 on success, -EINVAL if the parameters cannot be scheduled by
+ * this plugin (zero cost, period shorter than one quantum, weight greater
+ * than one, or period not below PFAIR_MAX_PERIOD), and -ENOMEM if the
+ * parameter block cannot be allocated.
+ */
+static long pfair_admit_task(struct task_struct* t)
+{
+        lt_t quanta;
+        lt_t period;
+        s64  quantum_length = ktime_to_ns(tick_period);
+        struct pfair_param* param;
+        unsigned long i;
+        /* Pfair is a tick-based method, so the time
+         * of interest is jiffies. Calculate tick-based
+         * times for everything.
+         * (Ceiling of exec cost, floor of period.)
+         */
+        period = get_rt_period(t);
+        quanta = time2quanta(get_exec_cost(t), CEIL);
+        if (do_div(period, quantum_length))
+                printk(KERN_WARNING
+                       "The period of %s/%d is not a multiple of %llu.\n",
+                       t->comm, t->pid, (unsigned long long) quantum_length);
+        if (period >= PFAIR_MAX_PERIOD) {
+                printk(KERN_WARNING
+                       "PFAIR: Rejecting task %s/%d; its period is too long.\n",
+                       t->comm, t->pid);
+                return -EINVAL;
+        }
+        /* A zero quanta count would later divide by zero in
+         * advance_subtask() (cur % quanta); a zero period or a cost that
+         * exceeds the period describes a task with weight > 1, which
+         * cannot be scheduled.
+         */
+        if (!quanta || !period || quanta > period) {
+                printk(KERN_WARNING
+                       "PFAIR: Rejecting task %s/%d; invalid weight "
+                       "(%llu quanta every %llu quanta).\n",
+                       t->comm, t->pid, quanta, period);
+                return -EINVAL;
+        }
+        if (quanta == period) {
+                /* special case: task has weight 1.0 */
+                printk(KERN_INFO
+                       "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
+                       t->comm, t->pid, quanta, period);
+                quanta = 1;
+                period = 1;
+        }
+        param = kmalloc(sizeof(*param) +
+                        quanta * sizeof(struct subtask), GFP_ATOMIC);
+        if (!param)
+                return -ENOMEM;
+        param->quanta  = quanta;
+        param->cur     = 0;
+        param->release = 0;
+        param->period  = period;
+        /* kmalloc() does not zero memory: give the remaining fields defined
+         * values, since target_cpu() reads last_quantum and uses last_cpu
+         * as an index into pstate[].
+         */
+        param->last_quantum     = 0;
+        param->last_cpu         = 0;
+        param->sporadic_release = 0;
+        for (i = 0; i < quanta; i++)
+                init_subtask(param->subtasks + i, i, quanta, period);
+        if (t->rt_param.pfair)
+                /* get rid of stale allocation */
+                kfree(t->rt_param.pfair);
+        t->rt_param.pfair = param;
+        /* spew out some debug info */
+        dump_subtasks(t);
+        return 0;
+}
+/* Plugin activate_plugin() callback: restart the Pfair clock at the
+ * current quantum (as seen by the calling CPU) and reset the tick
+ * bookkeeping of every CPU to it. Always returns 0.
+ */
+static long pfair_activate_plugin(void)
+{
+        int cpu;
+        struct pfair_state* state = &__get_cpu_var(pfair_state);
+        pfair_time = current_quantum(state);
+        TRACE("Activating PFAIR at q=%lu\n", pfair_time);
+        cpu = 0;
+        while (cpu < num_online_cpus()) {
+                state = &per_cpu(pfair_state, cpu);
+                state->cur_tick   = pfair_time;
+                state->local_tick = pfair_time;
+                state->missed_quanta = 0;
+                state->offset     = cpu_stagger_offset(cpu);
+                cpu++;
+        }
+        return 0;
+}
+/*      Plugin object   */
+/* Callback table registered with the plugin framework by init_pfair().
+ * All callbacks are defined in this file except complete_job.
+ */
+static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
+        .plugin_name            = "PFAIR",
+        .tick                   = pfair_tick,
+        .task_new               = pfair_task_new,
+        .task_exit              = pfair_task_exit,
+        .schedule               = pfair_schedule,
+        .task_wake_up           = pfair_task_wake_up,
+        .task_block             = pfair_task_block,
+        .admit_task             = pfair_admit_task,
+        .release_at             = pfair_release_at,
+        .complete_job           = complete_job,
+        .activate_plugin        = pfair_activate_plugin,
+};
+/* Module init: allocate the pstate[] shortcut array, initialise the release
+ * wheel and the per-CPU state, set up the real-time domain and register the
+ * plugin.
+ *
+ * Returns -ENOMEM if pstate[] cannot be allocated, otherwise the result of
+ * register_sched_plugin().
+ */
+static int __init init_pfair(void)
+{
+        int cpu, i;
+        struct pfair_state *state;
+        /*
+         * initialize short_cut for per-cpu pfair state;
+         * there may be a problem here if someone removes a cpu
+         * while we are doing this initialization... and if cpus
+         * are added / removed later... is it a _real_ problem?
+         */
+        pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
+        /* pstate[] is dereferenced below and by every scheduling path */
+        if (!pstate)
+                return -ENOMEM;
+        /* initialize release queue */
+        for (i = 0; i < PFAIR_MAX_PERIOD; i++)
+                bheap_init(&release_queue[i]);
+        /* initialize CPU state */
+        for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
+                state = &per_cpu(pfair_state, cpu);
+                state->cpu        = cpu;
+                state->cur_tick   = 0;
+                state->local_tick = 0;
+                state->linked     = NULL;
+                state->local      = NULL;
+                state->scheduled  = NULL;
+                state->missed_quanta = 0;
+                state->offset     = cpu_stagger_offset(cpu);
+                pstate[cpu] = state;
+        }
+        rt_domain_init(&pfair, pfair_ready_order, NULL, NULL);
+        return register_sched_plugin(&pfair_plugin);
+}
+/* Module exit: free the pstate[] shortcut array allocated by init_pfair(). */
+static void __exit clean_pfair(void)
+{
+        kfree(pstate);
+}
+module_init(init_pfair);
+module_exit(clean_pfair);