| -rw-r--r-- | include/litmus/rt_param.h | 14 | ||||
| -rw-r--r-- | litmus/Makefile | 9 | ||||
| -rwxr-xr-x | litmus/sched_pfair.c | 785 |
3 files changed, 799 insertions, 9 deletions
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h index a7dd67a81a..7bb568432e 100644 --- a/include/litmus/rt_param.h +++ b/include/litmus/rt_param.h | |||
| @@ -64,6 +64,8 @@ struct rt_job { | |||
| 64 | }; | 64 | }; |
| 65 | 65 | ||
| 66 | 66 | ||
| 67 | struct pfair_param; | ||
| 68 | |||
| 67 | /* RT task parameters for scheduling extensions | 69 | /* RT task parameters for scheduling extensions |
| 68 | * These parameters are inherited during clone and therefore must | 70 | * These parameters are inherited during clone and therefore must |
| 69 | * be explicitly set up before the task set is launched. | 71 | * be explicitly set up before the task set is launched. |
| @@ -108,15 +110,12 @@ struct rt_param { | |||
| 108 | * is currently scheduled. It is the responsibility of the | 110 | * is currently scheduled. It is the responsibility of the |
| 109 | * plugin to avoid race conditions. | 111 | * plugin to avoid race conditions. |
| 110 | * | 112 | * |
| 111 | * Used by GSN-EDF. | 113 | * This is used by GSN-EDF and PFAIR. |
| 112 | */ | 114 | */ |
| 113 | volatile int scheduled_on; | 115 | volatile int scheduled_on; |
| 114 | 116 | ||
| 115 | /* Is the stack of the task currently in use? Currently, this | 117 | /* Is the stack of the task currently in use? This is updated by |
| 116 | * is the responsibility of the plugin to update this field. | 118 | * the LITMUS core. |
| 117 | * Maybe become part of the LITMUS core some day. | ||
| 118 | * | ||
| 119 | * Used by GSN-EDF. | ||
| 120 | * | 119 | * |
| 121 | * Be careful to avoid deadlocks! | 120 | * Be careful to avoid deadlocks! |
| 122 | */ | 121 | */ |
| @@ -130,6 +129,9 @@ struct rt_param { | |||
| 130 | */ | 129 | */ |
| 131 | volatile int linked_on; | 130 | volatile int linked_on; |
| 132 | 131 | ||
| 132 | /* PFAIR/PD^2 state. Allocated on demand. */ | ||
| 133 | struct pfair_param* pfair; | ||
| 134 | |||
| 133 | /* Fields saved before BE->RT transition. | 135 | /* Fields saved before BE->RT transition. |
| 134 | */ | 136 | */ |
| 135 | int old_policy; | 137 | int old_policy; |
diff --git a/litmus/Makefile b/litmus/Makefile index bfe393eb56..545203876a 100644 --- a/litmus/Makefile +++ b/litmus/Makefile | |||
| @@ -3,9 +3,12 @@ | |||
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | obj-y = sched_plugin.o litmus.o sched_trace.o \ | 5 | obj-y = sched_plugin.o litmus.o sched_trace.o \ |
| 6 | edf_common.o jobs.o\ | 6 | edf_common.o jobs.o \ |
| 7 | sched_gsn_edf.o sched_psn_edf.o sched_cedf.o \ | ||
| 8 | rt_domain.o fdso.o sync.o \ | 7 | rt_domain.o fdso.o sync.o \ |
| 9 | fmlp.o srp.o norqlock.o | 8 | fmlp.o srp.o norqlock.o \ |
| 9 | sched_gsn_edf.o \ | ||
| 10 | sched_psn_edf.o \ | ||
| 11 | sched_cedf.o \ | ||
| 12 | sched_pfair.o | ||
| 10 | 13 | ||
| 11 | obj-$(CONFIG_FEATHER_TRACE) += trace.o ft_event.o | 14 | obj-$(CONFIG_FEATHER_TRACE) += trace.o ft_event.o |
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c new file mode 100755 index 0000000000..6f95688508 --- /dev/null +++ b/litmus/sched_pfair.c | |||
| @@ -0,0 +1,785 @@ | |||
| 1 | /* | ||
| 2 | * kernel/sched_pfair.c | ||
| 3 | * | ||
| 4 | * Implementation of the (global) Pfair scheduling algorithm. | ||
| 5 | * | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <asm/div64.h> | ||
| 9 | #include <linux/delay.h> | ||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/spinlock.h> | ||
| 12 | #include <linux/percpu.h> | ||
| 13 | #include <linux/sched.h> | ||
| 14 | #include <linux/list.h> | ||
| 15 | |||
| 16 | #include <litmus/litmus.h> | ||
| 17 | #include <litmus/jobs.h> | ||
| 18 | #include <litmus/rt_domain.h> | ||
| 19 | #include <litmus/sched_plugin.h> | ||
| 20 | #include <litmus/sched_trace.h> | ||
| 21 | |||
| 22 | #include <litmus/heap.h> | ||
| 23 | |||
| 24 | /* Tick period is used to convert ns-specified execution | ||
| 25 | * costs and periods into tick-based equivalents. | ||
| 26 | */ | ||
| 27 | extern ktime_t tick_period; | ||
| 28 | |||
| 29 | /* make the unit explicit */ | ||
| 30 | typedef unsigned long quanta_t; | ||
| 31 | |||
| 32 | struct subtask { | ||
| 33 | /* measured in quanta relative to job release */ | ||
| 34 | quanta_t release; | ||
| 35 | quanta_t deadline; | ||
| 36 | quanta_t overlap; /* called "b bit" by PD^2 */ | ||
| 37 | quanta_t group_deadline; | ||
| 38 | }; | ||
| 39 | |||
| 40 | struct pfair_param { | ||
| 41 | quanta_t quanta; /* number of subtasks */ | ||
| 42 | quanta_t cur; /* index of current subtask */ | ||
| 43 | |||
| 44 | quanta_t release; /* in quanta */ | ||
| 45 | quanta_t period; /* in quanta */ | ||
| 46 | |||
| 47 | quanta_t last_quantum; /* when scheduled last */ | ||
| 48 | int last_cpu; /* where scheduled last */ | ||
| 49 | |||
| 50 | unsigned int present; /* Can the task be scheduled? */ | ||
| 51 | |||
| 52 | struct subtask subtasks[0]; /* allocate together with pfair_param */ | ||
| 53 | }; | ||
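The zero-length subtasks[0] member means the per-task parameters and all of their subtask windows live in a single allocation. A minimal sketch of how such an object is sized (it mirrors what pfair_admit_task() does further down; quanta here is just an illustrative variable):

    struct pfair_param* p;

    p = kmalloc(sizeof(struct pfair_param) +
                quanta * sizeof(struct subtask), GFP_ATOMIC);
    if (p)
            p->quanta = quanta;     /* p->subtasks[0..quanta-1] are now usable */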
| 54 | |||
| 55 | #define tsk_pfair(tsk) ((tsk)->rt_param.pfair) | ||
| 56 | |||
| 57 | struct pfair_state { | ||
| 58 | int cpu; | ||
| 59 | volatile quanta_t cur_tick; /* updated by the CPU that is advancing | ||
| 60 | * the time */ | ||
| 61 | volatile quanta_t local_tick; /* What tick is the local CPU currently | ||
| 62 | * executing? Updated only by the local | ||
| 63 | * CPU. In QEMU, this may lag behind the | ||
| 64 | * current tick. In a real system, with | ||
| 65 | * proper timers and aligned quanta, | ||
| 66 | * that should only be the | ||
| 67 | * case for a very short time after the | ||
| 68 | * time advanced. With staggered quanta, | ||
| 69 | * it will lag for the duration of the | ||
| 70 | * offset. | ||
| 71 | */ | ||
| 72 | |||
| 73 | struct task_struct* linked; /* the task that should be executing */ | ||
| 74 | struct task_struct* local; /* the local copy of linked */ | ||
| 75 | struct task_struct* scheduled; /* what is actually scheduled */ | ||
| 76 | |||
| 77 | unsigned long missed_quanta; | ||
| 78 | }; | ||
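The three task pointers track successive stages of the same per-quantum decision; the following summary is read off the code below rather than stated verbatim in the patch:

    /* linked    : the task chosen for this CPU at the last quantum boundary
     *             (written by whichever CPU ran schedule_next_quantum())
     * local     : this CPU's private snapshot of linked, taken in pfair_tick()
     * scheduled : the task the CPU is actually running, maintained by
     *             pfair_schedule(); it lags linked until the reschedule happens */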
| 79 | |||
| 80 | /* Currently, we limit the maximum period of any task to 1000 quanta. | ||
| 81 | * The reason is that it makes the implementation easier since we do not | ||
| 82 | * need to reallocate the release wheel on task arrivals. | ||
| 83 | * In the future, this restriction could be relaxed. | ||
| 84 | */ | ||
| 85 | #define PFAIR_MAX_PERIOD 1000 | ||
| 86 | |||
| 87 | /* This is the release queue wheel. It is indexed by pfair_time % | ||
| 88 | * PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR priority, so that it can be | ||
| 89 | * merged with the ready queue. | ||
| 90 | */ | ||
| 91 | static struct heap release_queue[PFAIR_MAX_PERIOD]; | ||
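To make the wheel indexing concrete (numbers hypothetical): a release scheduled for quantum 2500 is filed under slot 2500 % 1000 = 500, and because every admitted task has a period of fewer than PFAIR_MAX_PERIOD quanta, its next release can never wrap around onto a slot that poll_releases() has not yet drained. A minimal sketch of the lookup, equivalent to the relq() helper defined below:

    static struct heap* release_slot(quanta_t at)
    {
            return &release_queue[at % PFAIR_MAX_PERIOD];
    }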
| 92 | |||
| 93 | DEFINE_PER_CPU(struct pfair_state, pfair_state); | ||
| 94 | struct pfair_state* pstate[NR_CPUS]; /* short cut */ | ||
| 95 | |||
| 96 | #define NO_CPU 0xffffffff | ||
| 97 | |||
| 98 | static quanta_t pfair_time = 0; /* the "official" PFAIR clock */ | ||
| 99 | static quanta_t merge_time = 0; /* Updated after the release queue has been | ||
| 100 | * merged. Used by drop_all_references(). | ||
| 101 | */ | ||
| 102 | |||
| 103 | static rt_domain_t pfair; | ||
| 104 | |||
| 105 | /* The pfair_lock is used to serialize all scheduling events. | ||
| 106 | */ | ||
| 107 | #define pfair_lock pfair.ready_lock | ||
| 108 | |||
| 109 | /* Enable for lots of trace info. | ||
| 110 | * #define PFAIR_DEBUG | ||
| 111 | */ | ||
| 112 | |||
| 113 | #ifdef PFAIR_DEBUG | ||
| 114 | #define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args) | ||
| 115 | #define PTRACE(f, args...) TRACE(f, ## args) | ||
| 116 | #else | ||
| 117 | #define PTRACE_TASK(t, f, args...) | ||
| 118 | #define PTRACE(f, args...) | ||
| 119 | #endif | ||
| 120 | |||
| 121 | /* gcc will inline all of these accessor functions... */ | ||
| 122 | static struct subtask* cur_subtask(struct task_struct* t) | ||
| 123 | { | ||
| 124 | return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur; | ||
| 125 | } | ||
| 126 | |||
| 127 | static quanta_t cur_deadline(struct task_struct* t) | ||
| 128 | { | ||
| 129 | return cur_subtask(t)->deadline + tsk_pfair(t)->release; | ||
| 130 | } | ||
| 131 | |||
| 132 | static quanta_t cur_release(struct task_struct* t) | ||
| 133 | { | ||
| 134 | #ifdef EARLY_RELEASE | ||
| 135 | /* only the release of the first subtask counts when we early | ||
| 136 | * release */ | ||
| 137 | return tsk_pfair(t)->release; | ||
| 138 | #else | ||
| 139 | return cur_subtask(t)->release + tsk_pfair(t)->release; | ||
| 140 | #endif | ||
| 141 | } | ||
| 142 | |||
| 143 | static quanta_t cur_sub_release(struct task_struct* t) | ||
| 144 | { | ||
| 145 | return cur_subtask(t)->release + tsk_pfair(t)->release; | ||
| 146 | } | ||
| 147 | |||
| 148 | static quanta_t cur_overlap(struct task_struct* t) | ||
| 149 | { | ||
| 150 | return cur_subtask(t)->overlap; | ||
| 151 | } | ||
| 152 | |||
| 153 | static quanta_t cur_group_deadline(struct task_struct* t) | ||
| 154 | { | ||
| 155 | quanta_t gdl = cur_subtask(t)->group_deadline; | ||
| 156 | if (gdl) | ||
| 157 | return gdl + tsk_pfair(t)->release; | ||
| 158 | else | ||
| 159 | return gdl; | ||
| 160 | } | ||
| 161 | |||
| 162 | enum round { | ||
| 163 | FLOOR, | ||
| 164 | CEIL | ||
| 165 | }; | ||
| 166 | |||
| 167 | static quanta_t time2quanta(lt_t time, enum round round) | ||
| 168 | { | ||
| 169 | s64 quantum_length = ktime_to_ns(tick_period); | ||
| 170 | |||
| 171 | if (do_div(time, quantum_length) && round == CEIL) | ||
| 172 | time++; | ||
| 173 | return (quanta_t) time; | ||
| 174 | } | ||
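For example, assuming a 1 ms quantum (e.g. HZ=1000, so ktime_to_ns(tick_period) == 1000000), a 2.5 ms interval converts as follows:

    quanta_t c = time2quanta(2500000ULL, CEIL);     /* 3: execution costs round up */
    quanta_t f = time2quanta(2500000ULL, FLOOR);    /* 2: a FLOOR conversion truncates */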
| 175 | |||
| 176 | static int pfair_higher_prio(struct task_struct* first, | ||
| 177 | struct task_struct* second) | ||
| 178 | { | ||
| 179 | return /* first task must exist */ | ||
| 180 | first && ( | ||
| 181 | /* Does the second task exist and is it a real-time task? If | ||
| 182 | * not, the first task (which is a RT task) has higher | ||
| 183 | * priority. | ||
| 184 | */ | ||
| 185 | !second || !is_realtime(second) || | ||
| 186 | |||
| 187 | /* Is the (subtask) deadline of the first task earlier? | ||
| 188 | * Then it has higher priority. | ||
| 189 | */ | ||
| 190 | time_before(cur_deadline(first), cur_deadline(second)) || | ||
| 191 | |||
| 192 | /* Do we have a deadline tie? | ||
| 193 | * Then break by B-bit. | ||
| 194 | */ | ||
| 195 | (cur_deadline(first) == cur_deadline(second) && | ||
| 196 | cur_overlap(first) > cur_overlap(second)) || | ||
| 197 | |||
| 198 | /* Do we have a B-bit tie? | ||
| 199 | * Then break by group deadline. | ||
| 200 | */ | ||
| 201 | (cur_overlap(first) == cur_overlap(second) && | ||
| 202 | time_after(cur_group_deadline(first), | ||
| 203 | cur_group_deadline(second))) || | ||
| 204 | |||
| 205 | /* Do we have a group deadline tie? | ||
| 206 | * Then break by PID, which are unique. | ||
| 207 | */ | ||
| 208 | (cur_group_deadline(first) == | ||
| 209 | cur_group_deadline(second) && | ||
| 210 | first->pid < second->pid)); | ||
| 211 | } | ||
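A small worked illustration of the PD^2 tie-break order encoded above (values hypothetical): if tasks A and B share subtask deadline 12, the one whose b-bit is set wins; if the b-bits also match, the later group deadline wins; and if those match as well, the lower PID wins. The intended PD^2 ordering, written out strictly nested over plain values:

    struct pd2_keys { quanta_t dl, b, gdl; int pid; };  /* hypothetical helper type */

    static int pd2_first_wins(struct pd2_keys a, struct pd2_keys c)
    {
            if (a.dl != c.dl)
                    return time_before(a.dl, c.dl);  /* earlier deadline wins      */
            if (a.b != c.b)
                    return a.b > c.b;                /* set b-bit wins the tie     */
            if (a.gdl != c.gdl)
                    return time_after(a.gdl, c.gdl); /* later group deadline wins  */
            return a.pid < c.pid;                    /* unique PIDs break the tie  */
    }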
| 212 | |||
| 213 | int pfair_ready_order(struct heap_node* a, struct heap_node* b) | ||
| 214 | { | ||
| 215 | return pfair_higher_prio(heap2task(a), heap2task(b)); | ||
| 216 | } | ||
| 217 | |||
| 218 | /* return the proper release queue for time t */ | ||
| 219 | static struct heap* relq(quanta_t t) | ||
| 220 | { | ||
| 221 | struct heap* rq = &release_queue[t % PFAIR_MAX_PERIOD]; | ||
| 222 | return rq; | ||
| 223 | } | ||
| 224 | |||
| 225 | static void prepare_release(struct task_struct* t, quanta_t at) | ||
| 226 | { | ||
| 227 | tsk_pfair(t)->release = at; | ||
| 228 | tsk_pfair(t)->cur = 0; | ||
| 229 | } | ||
| 230 | |||
| 231 | static void __pfair_add_release(struct task_struct* t, struct heap* queue) | ||
| 232 | { | ||
| 233 | heap_insert(pfair_ready_order, queue, | ||
| 234 | tsk_rt(t)->heap_node); | ||
| 235 | } | ||
| 236 | |||
| 237 | static void pfair_add_release(struct task_struct* t) | ||
| 238 | { | ||
| 239 | BUG_ON(heap_node_in_heap(tsk_rt(t)->heap_node)); | ||
| 240 | __pfair_add_release(t, relq(cur_release(t))); | ||
| 241 | } | ||
| 242 | |||
| 243 | /* pull released tasks from the release queue */ | ||
| 244 | static void poll_releases(quanta_t time) | ||
| 245 | { | ||
| 246 | heap_union(pfair_ready_order, &pfair.ready_queue, relq(time)); | ||
| 247 | merge_time = time; | ||
| 248 | } | ||
| 249 | |||
| 250 | static void check_preempt(struct task_struct* t) | ||
| 251 | { | ||
| 252 | int cpu = NO_CPU; | ||
| 253 | if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on && | ||
| 254 | tsk_pfair(t)->present) { | ||
| 255 | /* the task can be scheduled and | ||
| 256 | * is not scheduled where it ought to be scheduled | ||
| 257 | */ | ||
| 258 | cpu = tsk_rt(t)->linked_on != NO_CPU ? | ||
| 259 | tsk_rt(t)->linked_on : | ||
| 260 | tsk_rt(t)->scheduled_on; | ||
| 261 | PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n", | ||
| 262 | tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on); | ||
| 263 | /* preempt */ | ||
| 264 | if (cpu == smp_processor_id()) | ||
| 265 | set_tsk_need_resched(current); | ||
| 266 | else { | ||
| 267 | smp_send_reschedule(cpu); | ||
| 268 | } | ||
| 269 | } | ||
| 270 | } | ||
| 271 | |||
| 272 | /* returns 1 if the task needs to go to the release queue */ | ||
| 273 | static int advance_subtask(quanta_t time, struct task_struct* t, int cpu) | ||
| 274 | { | ||
| 275 | struct pfair_param* p = tsk_pfair(t); | ||
| 276 | |||
| 277 | p->cur = (p->cur + 1) % p->quanta; | ||
| 278 | TRACE_TASK(t, "on %d advanced to subtask %lu\n", | ||
| 279 | cpu, | ||
| 280 | p->cur); | ||
| 281 | if (!p->cur) { | ||
| 282 | /* we start a new job */ | ||
| 283 | get_rt_flags(t) = RT_F_RUNNING; | ||
| 284 | prepare_for_next_period(t); | ||
| 285 | p->release += p->period; | ||
| 286 | } | ||
| 287 | return time_after(cur_release(t), time); | ||
| 288 | } | ||
| 289 | |||
| 290 | static void advance_subtasks(quanta_t time) | ||
| 291 | { | ||
| 292 | int cpu, missed; | ||
| 293 | struct task_struct* l; | ||
| 294 | struct pfair_param* p; | ||
| 295 | |||
| 296 | for_each_online_cpu(cpu) { | ||
| 297 | l = pstate[cpu]->linked; | ||
| 298 | missed = pstate[cpu]->linked != pstate[cpu]->local; | ||
| 299 | if (l) { | ||
| 300 | p = tsk_pfair(l); | ||
| 301 | p->last_quantum = time; | ||
| 302 | p->last_cpu = cpu; | ||
| 303 | if (advance_subtask(time, l, cpu)) { | ||
| 304 | pstate[cpu]->linked = NULL; | ||
| 305 | pfair_add_release(l); | ||
| 306 | } | ||
| 307 | } | ||
| 308 | } | ||
| 309 | } | ||
| 310 | |||
| 311 | static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu) | ||
| 312 | { | ||
| 313 | int cpu; | ||
| 314 | if (tsk_rt(t)->scheduled_on != NO_CPU) { | ||
| 315 | /* always observe scheduled_on linkage */ | ||
| 316 | default_cpu = tsk_rt(t)->scheduled_on; | ||
| 317 | PTRACE_TASK(t, "forced on %d (scheduled on)\n", default_cpu); | ||
| 318 | } else if (tsk_pfair(t)->last_quantum == time - 1) { | ||
| 319 | /* back2back quanta */ | ||
| 320 | /* Only observe last_quantum if no scheduled_on is in the way. | ||
| 321 | * This should only kick in if a CPU missed quanta, and that | ||
| 322 | * *should* only happen in QEMU. | ||
| 323 | */ | ||
| 324 | cpu = tsk_pfair(t)->last_cpu; | ||
| 325 | if (!pstate[cpu]->linked || | ||
| 326 | tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) { | ||
| 327 | default_cpu = cpu; | ||
| 328 | PTRACE_TASK(t, "forced on %d (linked on)\n", | ||
| 329 | default_cpu); | ||
| 330 | } else { | ||
| 331 | PTRACE_TASK(t, "DID NOT force on %d (linked on)\n", | ||
| 332 | default_cpu); | ||
| 333 | } | ||
| 334 | } | ||
| 335 | return default_cpu; | ||
| 336 | } | ||
| 337 | |||
| 338 | /* returns one if linking was redirected and this CPU still needs a task */ | ||
| 339 | static int pfair_link(quanta_t time, int cpu, | ||
| 340 | struct task_struct* t) | ||
| 341 | { | ||
| 342 | int target = target_cpu(time, t, cpu); | ||
| 343 | struct task_struct* prev = pstate[cpu]->linked; | ||
| 344 | struct task_struct* other; | ||
| 345 | |||
| 346 | PTRACE_TASK(t, "linked to %d for quantum %lu\n", target, time); | ||
| 347 | if (target != cpu) { | ||
| 348 | other = pstate[target]->linked; | ||
| 349 | pstate[target]->linked = t; | ||
| 350 | tsk_rt(t)->linked_on = target; | ||
| 351 | if (!other) | ||
| 352 | /* linked ok, but reschedule this CPU */ | ||
| 353 | return 1; | ||
| 354 | if (target < cpu) { | ||
| 355 | /* link other to cpu instead */ | ||
| 356 | tsk_rt(other)->linked_on = cpu; | ||
| 357 | pstate[cpu]->linked = other; | ||
| 358 | if (prev) { | ||
| 359 | /* prev got pushed back into the ready queue */ | ||
| 360 | tsk_rt(prev)->linked_on = NO_CPU; | ||
| 361 | __add_ready(&pfair, prev); | ||
| 362 | } | ||
| 363 | /* we are done with this cpu */ | ||
| 364 | return 0; | ||
| 365 | } else { | ||
| 366 | /* re-add other, its original CPU was not considered yet */ | ||
| 367 | tsk_rt(other)->linked_on = NO_CPU; | ||
| 368 | __add_ready(&pfair, other); | ||
| 369 | /* reschedule this CPU */ | ||
| 370 | return 1; | ||
| 371 | } | ||
| 372 | } else { | ||
| 373 | pstate[cpu]->linked = t; | ||
| 374 | tsk_rt(t)->linked_on = cpu; | ||
| 375 | if (prev) { | ||
| 376 | /* prev got pushed back into the ready queue */ | ||
| 377 | tsk_rt(prev)->linked_on = NO_CPU; | ||
| 378 | __add_ready(&pfair, prev); | ||
| 379 | } | ||
| 380 | /* we are done with this CPU */ | ||
| 381 | return 0; | ||
| 382 | } | ||
| 383 | } | ||
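In summary, a paraphrase of the four cases above:

    /* Outcomes of pfair_link(time, cpu, t):
     *  - target == cpu             : link t here, push prev back to the ready
     *                                queue, return 0 (this CPU is done);
     *  - target != cpu, slot empty : link t remotely, leave this CPU's link
     *                                untouched, return 1 (caller re-evaluates);
     *  - target < cpu              : t displaces the remote task, which is
     *                                linked here instead and prev is re-queued,
     *                                return 0 (this CPU is done);
     *  - target > cpu              : t displaces the remote task, which goes
     *                                back to the ready queue, return 1
     *                                (caller re-evaluates this CPU). */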
| 384 | |||
| 385 | static void schedule_subtasks(quanta_t time) | ||
| 386 | { | ||
| 387 | int cpu, retry; | ||
| 388 | |||
| 389 | for_each_online_cpu(cpu) { | ||
| 390 | retry = 1; | ||
| 391 | while (retry) { | ||
| 392 | if (pfair_higher_prio(__peek_ready(&pfair), | ||
| 393 | pstate[cpu]->linked)) | ||
| 394 | retry = pfair_link(time, cpu, | ||
| 395 | __take_ready(&pfair)); | ||
| 396 | else | ||
| 397 | retry = 0; | ||
| 398 | } | ||
| 399 | } | ||
| 400 | } | ||
| 401 | |||
| 402 | static void schedule_next_quantum(quanta_t time) | ||
| 403 | { | ||
| 404 | int cpu; | ||
| 405 | |||
| 406 | PTRACE("<<< Q %lu at %llu\n", | ||
| 407 | time, litmus_clock()); | ||
| 408 | |||
| 409 | /* called with interrupts disabled */ | ||
| 410 | spin_lock(&pfair_lock); | ||
| 411 | |||
| 412 | advance_subtasks(time); | ||
| 413 | poll_releases(time); | ||
| 414 | schedule_subtasks(time); | ||
| 415 | |||
| 416 | spin_unlock(&pfair_lock); | ||
| 417 | |||
| 418 | /* We are done. Advance time. */ | ||
| 419 | mb(); | ||
| 420 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
| 421 | pstate[cpu]->cur_tick = pfair_time; | ||
| 422 | PTRACE(">>> Q %lu at %llu\n", | ||
| 423 | time, litmus_clock()); | ||
| 424 | } | ||
| 425 | |||
| 426 | /* pfair_tick - this function is called for every local timer | ||
| 427 | * interrupt. | ||
| 428 | */ | ||
| 429 | static void pfair_tick(struct task_struct* t) | ||
| 430 | { | ||
| 431 | struct pfair_state* state = &__get_cpu_var(pfair_state); | ||
| 432 | quanta_t time, loc, cur; | ||
| 433 | |||
| 434 | /* Attempt to advance time. First CPU to get here | ||
| 435 | * will prepare the next quantum. | ||
| 436 | */ | ||
| 437 | time = cmpxchg(&pfair_time, | ||
| 438 | state->local_tick, /* expected */ | ||
| 439 | state->local_tick + 1 /* next */ | ||
| 440 | ); | ||
| 441 | if (time == state->local_tick) | ||
| 442 | /* exchange succeeded */ | ||
| 443 | schedule_next_quantum(time + 1); | ||
| 444 | |||
| 445 | /* Spin locally until time advances. */ | ||
| 446 | while (1) { | ||
| 447 | mb(); | ||
| 448 | cur = state->cur_tick; | ||
| 449 | loc = state->local_tick; | ||
| 450 | if (time_before(loc, cur)) { | ||
| 451 | if (loc + 1 != cur) { | ||
| 452 | TRACE("MISSED quantum! loc:%lu -> cur:%lu\n", | ||
| 453 | loc, cur); | ||
| 454 | state->missed_quanta++; | ||
| 455 | } | ||
| 456 | break; | ||
| 457 | } | ||
| 458 | cpu_relax(); | ||
| 459 | } | ||
| 460 | |||
| 461 | /* copy state info */ | ||
| 462 | state->local_tick = state->cur_tick; | ||
| 463 | state->local = state->linked; | ||
| 464 | if (state->local && tsk_pfair(state->local)->present && | ||
| 465 | state->local != current) | ||
| 466 | set_tsk_need_resched(current); | ||
| 467 | } | ||
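A compressed view of the per-quantum handshake implemented above (read from the code, not wording from the patch):

    /* 1. every CPU attempts cmpxchg(&pfair_time, local_tick, local_tick + 1);
     * 2. the single winner runs schedule_next_quantum(), which re-links all
     *    CPUs and only then publishes the new quantum via cur_tick (after mb());
     * 3. the other CPUs spin until they observe cur_tick advance, recording a
     *    missed_quanta if they fell more than one quantum behind;
     * 4. each CPU then snapshots linked into local and, if a different task
     *    should run, requests a local reschedule so pfair_schedule() can switch. */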
| 468 | |||
| 469 | static int safe_to_schedule(struct task_struct* t, int cpu) | ||
| 470 | { | ||
| 471 | int where = tsk_rt(t)->scheduled_on; | ||
| 472 | if (where != NO_CPU && where != cpu) { | ||
| 473 | TRACE_TASK(t, "BAD: can't be scheduled on %d, " | ||
| 474 | "scheduled already on %d.\n", cpu, where); | ||
| 475 | return 0; | ||
| 476 | } else | ||
| 477 | return tsk_pfair(t)->present && get_rt_flags(t) == RT_F_RUNNING; | ||
| 478 | } | ||
| 479 | |||
| 480 | static struct task_struct* pfair_schedule(struct task_struct * prev) | ||
| 481 | { | ||
| 482 | struct pfair_state* state = &__get_cpu_var(pfair_state); | ||
| 483 | int blocks; | ||
| 484 | struct task_struct* next = NULL; | ||
| 485 | |||
| 486 | spin_lock(&pfair_lock); | ||
| 487 | |||
| 488 | blocks = is_realtime(prev) && !is_running(prev); | ||
| 489 | |||
| 490 | if (blocks) | ||
| 491 | tsk_pfair(prev)->present = 0; | ||
| 492 | |||
| 493 | if (state->local && safe_to_schedule(state->local, state->cpu)) | ||
| 494 | next = state->local; | ||
| 495 | |||
| 496 | if (prev != next) { | ||
| 497 | tsk_rt(prev)->scheduled_on = NO_CPU; | ||
| 498 | if (next) | ||
| 499 | tsk_rt(next)->scheduled_on = state->cpu; | ||
| 500 | } | ||
| 501 | |||
| 502 | spin_unlock(&pfair_lock); | ||
| 503 | |||
| 504 | if (next) | ||
| 505 | TRACE_TASK(next, "scheduled rel=%lu at %lu\n", | ||
| 506 | tsk_pfair(next)->release, pfair_time); | ||
| 507 | else if (is_realtime(prev)) | ||
| 508 | TRACE("Becomes idle at %lu\n", pfair_time); | ||
| 509 | |||
| 510 | return next; | ||
| 511 | } | ||
| 512 | |||
| 513 | static void pfair_task_new(struct task_struct * t, int on_rq, int running) | ||
| 514 | { | ||
| 515 | unsigned long flags; | ||
| 516 | |||
| 517 | TRACE("pfair: task new %d state:%d\n", t->pid, t->state); | ||
| 518 | |||
| 519 | spin_lock_irqsave(&pfair_lock, flags); | ||
| 520 | if (running) | ||
| 521 | t->rt_param.scheduled_on = task_cpu(t); | ||
| 522 | else | ||
| 523 | t->rt_param.scheduled_on = NO_CPU; | ||
| 524 | |||
| 525 | prepare_release(t, pfair_time + 1); | ||
| 526 | tsk_pfair(t)->present = running; | ||
| 527 | pfair_add_release(t); | ||
| 528 | check_preempt(t); | ||
| 529 | |||
| 530 | spin_unlock_irqrestore(&pfair_lock, flags); | ||
| 531 | } | ||
| 532 | |||
| 533 | static void pfair_task_wake_up(struct task_struct *t) | ||
| 534 | { | ||
| 535 | unsigned long flags; | ||
| 536 | |||
| 537 | TRACE_TASK(t, "wakes at %lld, release=%lu, pfair_time:%lu\n", | ||
| 538 | litmus_clock(), cur_release(t), pfair_time); | ||
| 539 | |||
| 540 | spin_lock_irqsave(&pfair_lock, flags); | ||
| 541 | |||
| 542 | tsk_pfair(t)->present = 1; | ||
| 543 | |||
| 544 | /* It is a little unclear how to deal with Pfair | ||
| 545 | * tasks that block for a while and then wake. | ||
| 546 | * For now, we assume that such suspensions are included | ||
| 547 | * in the stated execution time of the task, and thus | ||
| 548 | * count as execution time for our purposes. Thus, if the | ||
| 549 | * task is currently linked somewhere, it may resume, otherwise | ||
| 550 | * it has to wait for its next quantum allocation. | ||
| 551 | */ | ||
| 552 | |||
| 553 | check_preempt(t); | ||
| 554 | |||
| 555 | spin_unlock_irqrestore(&pfair_lock, flags); | ||
| 556 | } | ||
| 557 | |||
| 558 | static void pfair_task_block(struct task_struct *t) | ||
| 559 | { | ||
| 560 | BUG_ON(!is_realtime(t)); | ||
| 561 | TRACE_TASK(t, "blocks at %lld, state:%d\n", | ||
| 562 | (lt_t) jiffies, t->state); | ||
| 563 | } | ||
| 564 | |||
| 565 | /* caller must hold pfair_lock */ | ||
| 566 | static void drop_all_references(struct task_struct *t) | ||
| 567 | { | ||
| 568 | int cpu; | ||
| 569 | struct pfair_state* s; | ||
| 570 | struct heap* q; | ||
| 571 | if (heap_node_in_heap(tsk_rt(t)->heap_node)) { | ||
| 572 | /* figure out what queue the node is in */ | ||
| 573 | if (time_before_eq(cur_release(t), merge_time)) | ||
| 574 | q = &pfair.ready_queue; | ||
| 575 | else | ||
| 576 | q = relq(cur_release(t)); | ||
| 577 | heap_delete(pfair_ready_order, q, | ||
| 578 | tsk_rt(t)->heap_node); | ||
| 579 | } | ||
| 580 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 581 | s = &per_cpu(pfair_state, cpu); | ||
| 582 | if (s->linked == t) | ||
| 583 | s->linked = NULL; | ||
| 584 | if (s->local == t) | ||
| 585 | s->local = NULL; | ||
| 586 | if (s->scheduled == t) | ||
| 587 | s->scheduled = NULL; | ||
| 588 | } | ||
| 589 | } | ||
| 590 | |||
| 591 | static void pfair_task_exit(struct task_struct * t) | ||
| 592 | { | ||
| 593 | unsigned long flags; | ||
| 594 | |||
| 595 | BUG_ON(!is_realtime(t)); | ||
| 596 | |||
| 597 | /* Remove the task from the release or ready queue, and ensure | ||
| 598 | * that it is not the scheduled task for ANY CPU. We | ||
| 599 | * do this blanket check because occasionally when | ||
| 600 | * tasks exit while blocked, the task_cpu of the task | ||
| 601 | * might not be the same as the CPU that the PFAIR scheduler | ||
| 602 | * has chosen for it. | ||
| 603 | */ | ||
| 604 | spin_lock_irqsave(&pfair_lock, flags); | ||
| 605 | |||
| 606 | TRACE_TASK(t, "RIP, state:%d\n", t->state); | ||
| 607 | drop_all_references(t); | ||
| 608 | |||
| 609 | spin_unlock_irqrestore(&pfair_lock, flags); | ||
| 610 | |||
| 611 | kfree(t->rt_param.pfair); | ||
| 612 | t->rt_param.pfair = NULL; | ||
| 613 | } | ||
| 614 | |||
| 615 | |||
| 616 | static void pfair_release_at(struct task_struct* task, lt_t start) | ||
| 617 | { | ||
| 618 | unsigned long flags; | ||
| 619 | lt_t now = litmus_clock(); | ||
| 620 | quanta_t release, delta; | ||
| 621 | |||
| 622 | BUG_ON(!is_realtime(task)); | ||
| 623 | |||
| 624 | spin_lock_irqsave(&pfair_lock, flags); | ||
| 625 | if (lt_before(now, start)) { | ||
| 626 | delta = time2quanta((long long) start - (long long) now, CEIL); | ||
| 627 | if (delta >= PFAIR_MAX_PERIOD) | ||
| 628 | delta = PFAIR_MAX_PERIOD - 1; | ||
| 629 | } else | ||
| 630 | delta = 10; /* release in 10 ticks */ | ||
| 631 | |||
| 632 | release = pfair_time + delta; | ||
| 633 | |||
| 634 | drop_all_references(task); | ||
| 635 | prepare_release(task, release); | ||
| 636 | pfair_add_release(task); | ||
| 637 | spin_unlock_irqrestore(&pfair_lock, flags); | ||
| 638 | } | ||
| 639 | |||
| 640 | static void init_subtask(struct subtask* sub, unsigned long i, | ||
| 641 | lt_t quanta, lt_t period) | ||
| 642 | { | ||
| 643 | /* since i is zero-based, the formulas are shifted by one */ | ||
| 644 | lt_t tmp; | ||
| 645 | |||
| 646 | /* release */ | ||
| 647 | tmp = period * i; | ||
| 648 | do_div(tmp, quanta); /* floor */ | ||
| 649 | sub->release = (quanta_t) tmp; | ||
| 650 | |||
| 651 | /* deadline */ | ||
| 652 | tmp = period * (i + 1); | ||
| 653 | if (do_div(tmp, quanta)) /* ceil */ | ||
| 654 | tmp++; | ||
| 655 | sub->deadline = (quanta_t) tmp; | ||
| 656 | |||
| 657 | /* next release */ | ||
| 658 | tmp = period * (i + 1); | ||
| 659 | do_div(tmp, quanta); /* floor */ | ||
| 660 | sub->overlap = sub->deadline - (quanta_t) tmp; | ||
| 661 | |||
| 662 | /* Group deadline. | ||
| 663 | * Based on the formula given in Uma's thesis. | ||
| 664 | */ | ||
| 665 | if (2 * quanta >= period) { | ||
| 666 | /* heavy */ | ||
| 667 | tmp = (sub->deadline - (i + 1)) * period; | ||
| 668 | if (do_div(tmp, (period - quanta))) /* ceil */ | ||
| 669 | tmp++; | ||
| 670 | sub->group_deadline = (quanta_t) tmp; | ||
| 671 | } else | ||
| 672 | sub->group_deadline = 0; | ||
| 673 | } | ||
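To sanity-check the formulas, here is what they produce for a hypothetical light task with quanta = 3 and period = 8 (2*3 < 8, so every group deadline is 0); the values were computed by hand from the expressions above:

    /*  i : release  deadline  b-bit  group deadline */
    static const struct subtask example_windows[3] = {
            { .release = 0, .deadline = 3, .overlap = 1, .group_deadline = 0 },
            { .release = 2, .deadline = 6, .overlap = 1, .group_deadline = 0 },
            { .release = 5, .deadline = 8, .overlap = 0, .group_deadline = 0 },
    };
    /* A heavy task with quanta = 3 and period = 4 would instead receive a
     * group deadline of 4 for each of its three subtasks. */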
| 674 | |||
| 675 | static void dump_subtasks(struct task_struct* t) | ||
| 676 | { | ||
| 677 | unsigned long i; | ||
| 678 | for (i = 0; i < t->rt_param.pfair->quanta; i++) | ||
| 679 | TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n", | ||
| 680 | i + 1, | ||
| 681 | t->rt_param.pfair->subtasks[i].release, | ||
| 682 | t->rt_param.pfair->subtasks[i].deadline, | ||
| 683 | t->rt_param.pfair->subtasks[i].overlap, | ||
| 684 | t->rt_param.pfair->subtasks[i].group_deadline); | ||
| 685 | } | ||
| 686 | |||
| 687 | static long pfair_admit_task(struct task_struct* t) | ||
| 688 | { | ||
| 689 | lt_t quanta; | ||
| 690 | lt_t period; | ||
| 691 | s64 quantum_length = ktime_to_ns(tick_period); | ||
| 692 | struct pfair_param* param; | ||
| 693 | unsigned long i; | ||
| 694 | |||
| 695 | /* Pfair is a tick-based method, so the time | ||
| 696 | * of interest is jiffies. Calculate tick-based | ||
| 697 | * times for everything. | ||
| 698 | * (Ceiling of exec cost, floor of period.) | ||
| 699 | */ | ||
| 700 | |||
| 701 | quanta = get_exec_cost(t); | ||
| 702 | period = get_rt_period(t); | ||
| 703 | |||
| 704 | quanta = time2quanta(get_exec_cost(t), CEIL); | ||
| 705 | |||
| 706 | if (do_div(period, quantum_length)) | ||
| 707 | printk(KERN_WARNING | ||
| 708 | "The period of %s/%d is not a multiple of %llu.\n", | ||
| 709 | t->comm, t->pid, (unsigned long long) quantum_length); | ||
| 710 | |||
| 711 | if (period >= PFAIR_MAX_PERIOD) { | ||
| 712 | printk(KERN_WARNING | ||
| 713 | "PFAIR: Rejecting task %s/%d; its period is too long.\n", | ||
| 714 | t->comm, t->pid); | ||
| 715 | return -EINVAL; | ||
| 716 | } | ||
| 717 | |||
| 718 | param = kmalloc(sizeof(struct pfair_param) + | ||
| 719 | quanta * sizeof(struct subtask), GFP_ATOMIC); | ||
| 720 | |||
| 721 | if (!param) | ||
| 722 | return -ENOMEM; | ||
| 723 | |||
| 724 | param->quanta = quanta; | ||
| 725 | param->cur = 0; | ||
| 726 | param->release = 0; | ||
| 727 | param->period = period; | ||
| 728 | |||
| 729 | for (i = 0; i < quanta; i++) | ||
| 730 | init_subtask(param->subtasks + i, i, quanta, period); | ||
| 731 | |||
| 732 | if (t->rt_param.pfair) | ||
| 733 | /* get rid of stale allocation */ | ||
| 734 | kfree(t->rt_param.pfair); | ||
| 735 | |||
| 736 | t->rt_param.pfair = param; | ||
| 737 | |||
| 738 | /* spew out some debug info */ | ||
| 739 | dump_subtasks(t); | ||
| 740 | |||
| 741 | return 0; | ||
| 742 | } | ||
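As a concrete (hypothetical) admission example, again assuming a 1 ms quantum: a task with a 2 ms execution cost and a 10 ms period passes the checks above and ends up with the following per-task state, i.e. it is entitled to 2 of every 10 quanta:

    param->quanta  = 2;     /* two subtask windows, subtasks[0..1] */
    param->cur     = 0;
    param->release = 0;
    param->period  = 10;    /* task weight 2/10                    */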
| 743 | |||
| 744 | /* Plugin object */ | ||
| 745 | static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = { | ||
| 746 | .plugin_name = "PFAIR", | ||
| 747 | .tick = pfair_tick, | ||
| 748 | .task_new = pfair_task_new, | ||
| 749 | .task_exit = pfair_task_exit, | ||
| 750 | .schedule = pfair_schedule, | ||
| 751 | .task_wake_up = pfair_task_wake_up, | ||
| 752 | .task_block = pfair_task_block, | ||
| 753 | .admit_task = pfair_admit_task, | ||
| 754 | .release_at = pfair_release_at, | ||
| 755 | .complete_job = complete_job | ||
| 756 | }; | ||
| 757 | |||
| 758 | static int __init init_pfair(void) | ||
| 759 | { | ||
| 760 | int cpu, i; | ||
| 761 | struct pfair_state *state; | ||
| 762 | |||
| 763 | /* initialize release queue */ | ||
| 764 | for (i = 0; i < PFAIR_MAX_PERIOD; i++) | ||
| 765 | heap_init(&release_queue[i]); | ||
| 766 | |||
| 767 | /* initialize CPU state */ | ||
| 768 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 769 | state = &per_cpu(pfair_state, cpu); | ||
| 770 | state->cpu = cpu; | ||
| 771 | state->cur_tick = 0; | ||
| 772 | state->local_tick = 0; | ||
| 773 | state->linked = NULL; | ||
| 774 | state->local = NULL; | ||
| 775 | state->scheduled = NULL; | ||
| 776 | state->missed_quanta = 0; | ||
| 777 | pstate[cpu] = state; | ||
| 778 | } | ||
| 779 | |||
| 780 | rt_domain_init(&pfair, pfair_ready_order, NULL, NULL); | ||
| 781 | return register_sched_plugin(&pfair_plugin); | ||
| 782 | } | ||
| 783 | |||
| 784 | module_init(init_pfair); | ||
| 785 | |||
