3 files changed, 1240 insertions, 0 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 38d9e433b345..babb43deffb5 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,19 @@ config PLUGIN_CEDF
          On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
          makes little sense since there aren't any shared caches.
+config PLUGIN_PFAIR
+        bool "PFAIR"
+        default y
+        help
+          Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
+          The PFAIR plugin requires high resolution timers (for staggered
+          quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
+          even if a processor is idle, as quanta could be missed otherwise).
+          Further, the PFAIR plugin uses the system tick and thus requires
+          HZ=1000 to achive reasonable granularity.
+          If unsure, say Yes.
 config RELEASE_MASTER
        bool "Release-master Support"
        depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
diff --git a/litmus/Makefile b/litmus/Makefile
index 7d637197d736..7970cd55e7fd 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -24,6 +24,7 @@ obj-y     = sched_plugin.o litmus.o \
            sched_pfp.o
 obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
+obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..3f82378f5ca8
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1226 @@
+/*
+ * kernel/sched_pfair.c
+ *
+ * Implementation of the PD^2 pfair scheduling algorithm. This
+ * implementation realizes "early releasing," i.e., it is work-conserving.
+ *
+ */
+#include <asm/div64.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/rt_domain.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+#include <litmus/bheap.h>
+/* to configure the cluster size */
+#include <litmus/litmus_proc.h>
+#include <litmus/clustered.h>
+static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
+struct subtask {
+        /* measured in quanta relative to job release */
+        quanta_t release;
+        quanta_t deadline;
+        quanta_t overlap; /* called "b bit" by PD^2 */
+        quanta_t group_deadline;
+};
+struct pfair_param   {
+        quanta_t        quanta;       /* number of subtasks */
+        quanta_t        cur;          /* index of current subtask */
+        quanta_t        release;      /* in quanta */
+        quanta_t        period;       /* in quanta */
+        quanta_t        last_quantum; /* when scheduled last */
+        int             last_cpu;     /* where scheduled last */
+        unsigned int    needs_requeue:1;
+        struct pfair_cluster* cluster; /* where this task is scheduled */
+        struct subtask subtasks[0];   /* allocate together with pfair_param */
+};
+#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
+struct pfair_state {
+        struct cluster_cpu topology;
+        struct hrtimer quantum_timer;
+        volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
+                                        * the time */
+        volatile quanta_t local_tick;  /* What tick is the local CPU currently
+                                        * executing? Updated only by the local
+                                        * CPU. In QEMU, this may lag behind the
+                                        * current tick. In a real system, with
+                                        * proper timers and aligned quanta,
+                                        * that should only be the case for a
+                                        * very short time after the time
+                                        * advanced. With staggered quanta, it
+                                        * will lag for the duration of the
+                                        * offset.
+                                        */
+        struct task_struct* linked;    /* the task that should be executing */
+        struct task_struct* local;     /* the local copy of linked          */
+        struct task_struct* scheduled; /* what is actually scheduled        */
+        struct list_head    out_of_budget; /* list of tasks that exhausted their allocation */
+        lt_t offset;                    /* stagger offset */
+        unsigned int missed_updates;
+        unsigned int missed_quanta;
+};
+struct pfair_cluster {
+        struct scheduling_cluster topology;
+        /* The "global" time in this cluster. */
+        quanta_t pfair_time; /* the "official" PFAIR clock */
+        /* The ready queue for this cluster. */
+        rt_domain_t pfair;
+        /* The set of jobs that should have their release enacted at the next
+         * quantum boundary.
+         */
+        struct bheap release_queue;
+        raw_spinlock_t release_lock;
+};
+static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
+{
+        return container_of(state->topology.cluster, struct pfair_cluster, topology);
+}
+static inline int cpu_id(struct pfair_state* state)
+{
+        return state->topology.id;
+}
+static inline struct pfair_state* from_cluster_list(struct list_head* pos)
+{
+        return list_entry(pos, struct pfair_state, topology.cluster_list);
+}
+static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
+{
+        return container_of(rt, struct pfair_cluster, pfair);
+}
+static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
+{
+        /* The ready_lock is used to serialize all scheduling events. */
+        return &cluster->pfair.ready_lock;
+}
+static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
+{
+        return cluster_lock(cpu_cluster(state));
+}
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+struct pfair_state* *pstate; /* short cut */
+static struct pfair_cluster* pfair_clusters;
+static int num_pfair_clusters;
+/* Enable for lots of trace info.
+ * #define PFAIR_DEBUG
+ */
+#ifdef PFAIR_DEBUG
+#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
+#define PTRACE(f, args...) TRACE(f, ## args)
+#else
+#define PTRACE_TASK(t, f, args...)
+#define PTRACE(f, args...)
+#endif
+/* gcc will inline all of these accessor functions... */
+static struct subtask* cur_subtask(struct task_struct* t)
+{
+        return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
+}
+static quanta_t cur_deadline(struct task_struct* t)
+{
+        return cur_subtask(t)->deadline +  tsk_pfair(t)->release;
+}
+static quanta_t cur_release(struct task_struct* t)
+{
+        /* This is early releasing: only the release of the first subtask
+         * counts. */
+        return tsk_pfair(t)->release;
+}
+static quanta_t cur_overlap(struct task_struct* t)
+{
+        return cur_subtask(t)->overlap;
+}
+static quanta_t cur_group_deadline(struct task_struct* t)
+{
+        quanta_t gdl = cur_subtask(t)->group_deadline;
+        if (gdl)
+                return gdl + tsk_pfair(t)->release;
+        else
+                return gdl;
+}
+static int pfair_higher_prio(struct task_struct* first,
+                             struct task_struct* second)
+{
+        return  /* first task must exist */
+                first && (
+                /* Does the second task exist and is it a real-time task?  If
+                 * not, the first task (which is a RT task) has higher
+                 * priority.
+                 */
+                !second || !is_realtime(second)  ||
+                /* Is the (subtask) deadline of the first task earlier?
+                 * Then it has higher priority.
+                 */
+                time_before(cur_deadline(first), cur_deadline(second)) ||
+                /* Do we have a deadline tie?
+                 * Then break by B-bit.
+                 */
+                (cur_deadline(first) == cur_deadline(second) &&
+                 (cur_overlap(first) > cur_overlap(second) ||
+                /* Do we have a B-bit tie?
+                 * Then break by group deadline.
+                 */
+                (cur_overlap(first) == cur_overlap(second) &&
+                 (time_after(cur_group_deadline(first),
+                             cur_group_deadline(second)) ||
+                /* Do we have a group deadline tie?
+                 * Then break by PID, which are unique.
+                 */
+                (cur_group_deadline(first) ==
+                 cur_group_deadline(second) &&
+                 first->pid < second->pid))))));
+}
+int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+        return pfair_higher_prio(bheap2task(a), bheap2task(b));
+}
+static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+        struct pfair_cluster* cluster = from_domain(rt);
+        unsigned long flags;
+        raw_spin_lock_irqsave(&cluster->release_lock, flags);
+        bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
+        raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
+}
+static void prepare_release(struct task_struct* t, quanta_t at)
+{
+        tsk_pfair(t)->release    = at;
+        tsk_pfair(t)->cur        = 0;
+}
+/* pull released tasks from the release queue */
+static void poll_releases(struct pfair_cluster* cluster)
+{
+        raw_spin_lock(&cluster->release_lock);
+        __merge_ready(&cluster->pfair, &cluster->release_queue);
+        raw_spin_unlock(&cluster->release_lock);
+}
+static void check_preempt(struct task_struct* t)
+{
+        int cpu = NO_CPU;
+        if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
+            is_present(t)) {
+                /* the task can be scheduled and
+                 * is not scheduled where it ought to be scheduled
+                 */
+                cpu = tsk_rt(t)->linked_on != NO_CPU ?
+                        tsk_rt(t)->linked_on         :
+                        tsk_rt(t)->scheduled_on;
+                PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
+                           tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
+                /* preempt */
+                litmus_reschedule(cpu);
+        }
+}
+/* caller must hold pfair.ready_lock */
+static void drop_all_references(struct task_struct *t)
+{
+        int cpu;
+        struct pfair_state* s;
+        struct pfair_cluster* cluster;
+        if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
+                /* It must be in the ready queue; drop references isn't called
+                 * when the job is in a release queue. */
+                cluster = tsk_pfair(t)->cluster;
+                bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
+                            tsk_rt(t)->heap_node);
+        }
+        for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+                s = &per_cpu(pfair_state, cpu);
+                if (s->linked == t)
+                        s->linked = NULL;
+                if (s->local  == t)
+                        s->local  = NULL;
+                if (s->scheduled  == t)
+                        s->scheduled = NULL;
+        }
+        /* make sure we don't have a stale linked_on field */
+        tsk_rt(t)->linked_on = NO_CPU;
+        /* make sure we're not queued for re-releasing */
+        if (in_list(&tsk_rt(t)->list))
+        {
+                TRACE_TASK(t, "removing from out_of_budget queue\n");
+                list_del(&tsk_rt(t)->list);
+        }
+}
+static void pfair_prepare_next_period(struct task_struct* t)
+{
+        struct pfair_param* p = tsk_pfair(t);
+        prepare_for_next_period(t);
+        tsk_rt(t)->completed = 0;
+        p->release = time2quanta(get_release(t), CEIL);
+}
+/* returns 1 if the task needs to go the release queue */
+static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
+{
+        struct pfair_param* p = tsk_pfair(t);
+        int to_relq;
+        p->cur = (p->cur + 1) % p->quanta;
+        if (!p->cur) {
+                if (is_present(t)) {
+                        /* The job overran; we start a new budget allocation. */
+                        TRACE_TASK(t, "overran budget, preparing next period\n");
+                        sched_trace_task_completion(t, 1);
+                        pfair_prepare_next_period(t);
+                } else {
+                        /* remove task from system until it wakes */
+                        drop_all_references(t);
+                        p->needs_requeue = 1;
+                        TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
+                                   cpu, p->cur);
+                        return 0;
+                }
+        }
+        to_relq = time_after(cur_release(t), time);
+        TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d "
+                "(cur_release:%lu time:%lu present:%d on_cpu=%d)\n",
+                cpu, p->cur, to_relq, cur_release(t), time,
+                tsk_rt(t)->present, tsk_rt(t)->scheduled_on);
+        return to_relq;
+}
+static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
+{
+        struct task_struct* l;
+        struct pfair_param* p;
+        struct list_head* pos;
+        struct pfair_state* cpu;
+        list_for_each(pos, &cluster->topology.cpus) {
+                cpu = from_cluster_list(pos);
+                l = cpu->linked;
+                cpu->missed_updates += cpu->linked != cpu->local;
+                if (l) {
+                        p = tsk_pfair(l);
+                        p->last_quantum = time;
+                        p->last_cpu     =  cpu_id(cpu);
+                        if (advance_subtask(time, l, cpu_id(cpu))) {
+                                cpu->linked = NULL;
+                                tsk_rt(l)->linked_on = NO_CPU;
+                                PTRACE_TASK(l, "should go to release queue. "
+                                            "scheduled_on=%d present=%d\n",
+                                            tsk_rt(l)->scheduled_on,
+                                            tsk_rt(l)->present);
+                                list_add(&tsk_rt(l)->list, &cpu->out_of_budget);
+                        }
+                }
+        }
+}
+static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
+{
+        int cpu;
+        if (tsk_rt(t)->scheduled_on != NO_CPU) {
+                /* always observe scheduled_on linkage */
+                default_cpu = tsk_rt(t)->scheduled_on;
+        } else if (tsk_pfair(t)->last_quantum == time - 1) {
+                /* back2back quanta */
+                /* Only observe last_quantum if no scheduled_on is in the way.
+                 * This should only kick in if a CPU missed quanta, and that
+                 * *should* only happen in QEMU.
+                 */
+                cpu = tsk_pfair(t)->last_cpu;
+                if (!pstate[cpu]->linked ||
+                    tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
+                        default_cpu = cpu;
+                }
+        }
+        return default_cpu;
+}
+/* returns one if linking was redirected */
+static int pfair_link(quanta_t time, int cpu,
+                      struct task_struct* t)
+{
+        int target = target_cpu(time, t, cpu);
+        struct task_struct* prev  = pstate[cpu]->linked;
+        struct task_struct* other;
+        struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
+        if (target != cpu) {
+                BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
+                other = pstate[target]->linked;
+                pstate[target]->linked = t;
+                tsk_rt(t)->linked_on   = target;
+                if (!other)
+                        /* linked ok, but reschedule this CPU */
+                        return 1;
+                if (target < cpu) {
+                        /* link other to cpu instead */
+                        tsk_rt(other)->linked_on = cpu;
+                        pstate[cpu]->linked      = other;
+                        if (prev) {
+                                /* prev got pushed back into the ready queue */
+                                tsk_rt(prev)->linked_on = NO_CPU;
+                                __add_ready(&cluster->pfair, prev);
+                        }
+                        /* we are done with this cpu */
+                        return 0;
+                } else {
+                        /* re-add other, it's original CPU was not considered yet */
+                        tsk_rt(other)->linked_on = NO_CPU;
+                        __add_ready(&cluster->pfair, other);
+                        /* reschedule this CPU */
+                        return 1;
+                }
+        } else {
+                pstate[cpu]->linked  = t;
+                tsk_rt(t)->linked_on = cpu;
+                if (prev) {
+                        /* prev got pushed back into the ready queue */
+                        tsk_rt(prev)->linked_on = NO_CPU;
+                        __add_ready(&cluster->pfair, prev);
+                }
+                /* we are done with this CPU */
+                return 0;
+        }
+}
+static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
+{
+        int retry;
+        struct list_head *pos;
+        struct pfair_state *cpu_state;
+        list_for_each(pos, &cluster->topology.cpus) {
+                cpu_state = from_cluster_list(pos);
+                retry = 1;
+#ifdef CONFIG_RELEASE_MASTER
+                /* skip release master */
+                if (cluster->pfair.release_master == cpu_id(cpu_state))
+                        continue;
+#endif
+                while (retry) {
+                        if (pfair_higher_prio(__peek_ready(&cluster->pfair),
+                                              cpu_state->linked))
+                                retry = pfair_link(time, cpu_id(cpu_state),
+                                                   __take_ready(&cluster->pfair));
+                        else
+                                retry = 0;
+                }
+        }
+}
+static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
+{
+        struct pfair_state *cpu;
+        struct list_head* pos;
+        /* called with interrupts disabled */
+        PTRACE("--- Q %lu at %llu PRE-SPIN\n",
+               time, litmus_clock());
+        raw_spin_lock(cluster_lock(cluster));
+        PTRACE("<<< Q %lu at %llu\n",
+               time, litmus_clock());
+        sched_trace_quantum_boundary();
+        advance_subtasks(cluster, time);
+        poll_releases(cluster);
+        schedule_subtasks(cluster, time);
+        list_for_each(pos, &cluster->topology.cpus) {
+                cpu = from_cluster_list(pos);
+                if (cpu->linked)
+                        PTRACE_TASK(cpu->linked,
+                                    " linked on %d.\n", cpu_id(cpu));
+                else
+                        PTRACE("(null) linked on %d.\n", cpu_id(cpu));
+        }
+        /* We are done. Advance time. */
+        mb();
+        list_for_each(pos, &cluster->topology.cpus) {
+                cpu = from_cluster_list(pos);
+                if (cpu->local_tick != cpu->cur_tick) {
+                        TRACE("BAD Quantum not acked on %d "
+                              "(l:%lu c:%lu p:%lu)\n",
+                              cpu_id(cpu),
+                              cpu->local_tick,
+                              cpu->cur_tick,
+                              cluster->pfair_time);
+                        cpu->missed_quanta++;
+                }
+                cpu->cur_tick = time;
+        }
+        PTRACE(">>> Q %lu at %llu\n",
+               time, litmus_clock());
+        raw_spin_unlock(cluster_lock(cluster));
+}
+static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
+{
+        quanta_t loc;
+        goto first; /* skip mb() on first iteration */
+        do {
+                cpu_relax();
+                mb();
+        first:  loc = state->cur_tick;
+                /* FIXME: what if loc > cur? */
+        } while (time_before(loc, q));
+        PTRACE("observed cur_tick:%lu >= q:%lu\n",
+               loc, q);
+}
+static quanta_t current_quantum(struct pfair_state* state)
+{
+        lt_t t = litmus_clock() - state->offset;
+        return time2quanta(t, FLOOR);
+}
+static void catchup_quanta(quanta_t from, quanta_t target,
+                           struct pfair_state* state)
+{
+        quanta_t cur = from, time;
+        TRACE("+++< BAD catching up quanta from %lu to %lu\n",
+              from, target);
+        while (time_before(cur, target)) {
+                wait_for_quantum(cur, state);
+                cur++;
+                time = cmpxchg(&cpu_cluster(state)->pfair_time,
+                               cur - 1,   /* expected */
+                               cur        /* next     */
+                        );
+                if (time == cur - 1)
+                        schedule_next_quantum(cpu_cluster(state), cur);
+        }
+        TRACE("+++> catching up done\n");
+}
+/* pfair_tick - this function is called for every local timer
+ *                         interrupt.
+ */
+static void pfair_tick(struct task_struct* t)
+{
+        struct pfair_state* state = this_cpu_ptr(&pfair_state);
+        quanta_t time, cur;
+        int retry = 10;
+        do {
+                cur  = current_quantum(state);
+                PTRACE("q %lu at %llu\n", cur, litmus_clock());
+                /* Attempt to advance time. First CPU to get here
+                 * will prepare the next quantum.
+                 */
+                time = cpu_cluster(state)->pfair_time;
+                if (time == cur - 1)
+                {
+                        /* looks good, see if we can advance the time */
+                        time = cmpxchg(&cpu_cluster(state)->pfair_time,
+                                       cur - 1,   /* expected */
+                                       cur        /* next     */
+                                );
+                }
+                if (time == cur - 1) {
+                        /* exchange succeeded */
+                        wait_for_quantum(cur - 1, state);
+                        schedule_next_quantum(cpu_cluster(state), cur);
+                        retry = 0;
+                } else if (time_before(time, cur - 1)) {
+                        /* the whole system missed a tick !? */
+                        catchup_quanta(time, cur, state);
+                        retry--;
+                } else if (time_after(time, cur)) {
+                        /* our timer lagging behind!? */
+                        TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
+                        retry--;
+                } else {
+                        /* Some other CPU already started scheduling
+                         * this quantum. Let it do its job and then update.
+                         */
+                        retry = 0;
+                }
+        } while (retry);
+        /* Spin locally until time advances. */
+        wait_for_quantum(cur, state);
+        /* copy assignment */
+        /* FIXME: what if we race with a future update? Corrupted state? */
+        state->local      = state->linked;
+        /* signal that we are done */
+        mb();
+        state->local_tick = state->cur_tick;
+        if (state->local != current
+            && (is_realtime(current) || is_present(state->local)))
+                litmus_reschedule_local();
+}
+static void process_out_of_budget_tasks(
+        struct pfair_state* state,
+        struct task_struct* prev,
+        unsigned int blocks)
+{
+        struct task_struct *t;
+        while (!list_empty(&state->out_of_budget))
+        {
+                t = list_first_entry(&state->out_of_budget,
+                                     struct task_struct, rt_param.list);
+                TRACE_TASK(t, "found on out_of_budget queue is_prev=%d\n", t == prev);
+                list_del(&tsk_rt(t)->list);
+                if (t != prev || !blocks)
+                {
+                        sched_trace_task_release(t);
+                        add_release(&cpu_cluster(state)->pfair, t);
+                        TRACE_TASK(t, "adding to release queue (budget exhausted)\n");
+                } else {
+                        TRACE_TASK(t, "not added to release queue (blocks=%d)\n", blocks);
+                        tsk_pfair(t)->needs_requeue = 1;
+                }
+                if (unlikely(state->local == t)) {
+                        TRACE_TASK(t, "still linked as ->local, cleaning up\n");
+                        state->local = NULL;
+                }
+        }
+}
+/* Custom scheduling tick: called on each quantum boundary. */
+static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
+{
+        TS_QUANTUM_BOUNDARY_START;
+        pfair_tick(current);
+        hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);
+        TS_QUANTUM_BOUNDARY_END;
+        return  HRTIMER_RESTART;
+}
+static int safe_to_schedule(struct task_struct* t, int cpu)
+{
+        int where = tsk_rt(t)->scheduled_on;
+        if (where != NO_CPU && where != cpu) {
+                TRACE_TASK(t, "BAD: can't be scheduled on %d, "
+                           "scheduled already on %d.\n", cpu, where);
+                return 0;
+        } else
+                return is_present(t) && !is_completed(t);
+}
+static struct task_struct* pfair_schedule(struct task_struct * prev)
+{
+        struct pfair_state* state = this_cpu_ptr(&pfair_state);
+        struct pfair_cluster* cluster = cpu_cluster(state);
+        int blocks, completion, out_of_time;
+        struct task_struct* next = NULL;
+#ifdef CONFIG_RELEASE_MASTER
+        /* Bail out early if we are the release master.
+         * The release master never schedules any real-time tasks.
+         */
+        if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
+                goto out;
+        }
+#endif
+        raw_spin_lock(cpu_lock(state));
+        blocks      = is_realtime(prev) && !is_current_running();
+        completion  = is_realtime(prev) && is_completed(prev);
+        out_of_time = is_realtime(prev) && time_after(cur_release(prev),
+                                                      state->local_tick);
+        if (is_realtime(prev))
+            PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
+                        blocks, completion, out_of_time);
+        if (completion && !out_of_time) {
+                sched_trace_task_completion(prev, 0);
+                pfair_prepare_next_period(prev);
+                prepare_release(prev, cur_release(prev));
+                drop_all_references(prev);
+                list_add(&tsk_rt(prev)->list, &state->out_of_budget);
+        }
+        process_out_of_budget_tasks(state, prev, blocks);
+        if (state->local && safe_to_schedule(state->local, cpu_id(state)))
+                next = state->local;
+        if (prev != next) {
+                tsk_rt(prev)->scheduled_on = NO_CPU;
+                if (next)
+                        tsk_rt(next)->scheduled_on = cpu_id(state);
+        }
+        sched_state_task_picked();
+        raw_spin_unlock(cpu_lock(state));
+        if (next)
+                TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
+                           tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
+        else if (is_realtime(prev))
+                TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
+#ifdef CONFIG_RELEASE_MASTER
+out:
+#endif
+        if (unlikely(!hrtimer_active(&state->quantum_timer))) {
+                TRACE("activating quantum timer start=%llu\n",
+                        hrtimer_get_expires(&state->quantum_timer));
+                __hrtimer_start_range_ns(&state->quantum_timer,
+                        hrtimer_get_expires(&state->quantum_timer),
+                        0, HRTIMER_MODE_ABS, 0);
+        }
+        return next;
+}
+static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
+{
+        unsigned long flags;
+        struct pfair_cluster* cluster;
+        TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
+        cluster = tsk_pfair(t)->cluster;
+        raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+        prepare_release(t, cluster->pfair_time + 1);
+        release_at(t, quanta2time(cur_release(t)));
+        t->rt_param.scheduled_on = NO_CPU;
+        t->rt_param.linked_on    = NO_CPU;
+        if (is_scheduled) {
+#ifdef CONFIG_RELEASE_MASTER
+                if (task_cpu(t) != cluster->pfair.release_master)
+#endif
+                        t->rt_param.scheduled_on = task_cpu(t);
+        }
+        if (on_rq || is_scheduled) {
+                tsk_rt(t)->present = 1;
+                __add_ready(&cluster->pfair, t);
+        } else {
+                tsk_rt(t)->present = 0;
+                tsk_pfair(t)->needs_requeue = 1;
+        }
+        check_preempt(t);
+        raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+}
+static void pfair_task_wake_up(struct task_struct *t)
+{
+        unsigned long flags;
+        lt_t now;
+        struct pfair_cluster* cluster;
+        struct pfair_state* state;
+        int sporadic_release = 0;
+        cluster = tsk_pfair(t)->cluster;
+        TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
+                   litmus_clock(), cur_release(t), cluster->pfair_time);
+        raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+        state = this_cpu_ptr(&pfair_state);
+        /* If a task blocks and wakes before its next job release,
+         * then it may resume if it is currently linked somewhere
+         * (as if it never blocked at all). Otherwise, we have a
+         * new sporadic job release.
+         */
+        now = litmus_clock();
+        if (is_tardy(t, now)) {
+                TRACE_TASK(t, "sporadic release!\n");
+                sporadic_release = 1;
+                release_at(t, now);
+                prepare_release(t, time2quanta(now, CEIL));
+                sched_trace_task_release(t);
+        }
+        /* only add to ready queue if the task isn't still linked somewhere */
+        if (tsk_pfair(t)->needs_requeue) {
+                tsk_pfair(t)->needs_requeue = 0;
+                TRACE_TASK(t, "requeueing required (released:%d)\n",
+                        !time_after(cur_release(t), state->local_tick));
+                tsk_rt(t)->completed = 0;
+                if (time_after(cur_release(t), state->local_tick)
+                    && !sporadic_release)
+                        add_release(&cluster->pfair, t);
+                else
+                        __add_ready(&cluster->pfair, t);
+        }
+        check_preempt(t);
+        raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+        TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
+}
+static void pfair_task_block(struct task_struct *t)
+{
+        BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "blocks at %llu, state:%d\n",
+                   litmus_clock(), t->state);
+}
+static void pfair_task_exit(struct task_struct * t)
+{
+        unsigned long flags;
+        struct pfair_cluster *cluster;
+        BUG_ON(!is_realtime(t));
+        cluster = tsk_pfair(t)->cluster;
+        /* Remote task from release or ready queue, and ensure
+         * that it is not the scheduled task for ANY CPU. We
+         * do this blanket check because occassionally when
+         * tasks exit while blocked, the task_cpu of the task
+         * might not be the same as the CPU that the PFAIR scheduler
+         * has chosen for it.
+         */
+        raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+        TRACE_TASK(t, "RIP, state:%d\n", t->state);
+        drop_all_references(t);
+        raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+        kfree(t->rt_param.pfair);
+        t->rt_param.pfair = NULL;
+}
+static void init_subtask(struct subtask* sub, unsigned long i,
+                         lt_t quanta, lt_t period)
+{
+        /* since i is zero-based, the formulas are shifted by one */
+        lt_t tmp;
+        /* release */
+        tmp = period * i;
+        do_div(tmp, quanta); /* floor */
+        sub->release = (quanta_t) tmp;
+        /* deadline */
+        tmp = period * (i + 1);
+        if (do_div(tmp, quanta)) /* ceil */
+                tmp++;
+        sub->deadline = (quanta_t) tmp;
+        /* next release */
+        tmp = period * (i + 1);
+        do_div(tmp, quanta); /* floor */
+        sub->overlap =  sub->deadline - (quanta_t) tmp;
+        /* Group deadline.
+         * Based on the formula given in Uma's thesis.
+         */
+        if (2 * quanta >= period) {
+                /* heavy */
+                tmp = (sub->deadline - (i + 1)) * period;
+                if (period > quanta &&
+                    do_div(tmp, (period - quanta))) /* ceil */
+                        tmp++;
+                sub->group_deadline = (quanta_t) tmp;
+        } else
+                sub->group_deadline = 0;
+}
+static void dump_subtasks(struct task_struct* t)
+{
+        unsigned long i;
+        for (i = 0; i < t->rt_param.pfair->quanta; i++)
+                TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
+                           i + 1,
+                           t->rt_param.pfair->subtasks[i].release,
+                           t->rt_param.pfair->subtasks[i].deadline,
+                           t->rt_param.pfair->subtasks[i].overlap,
+                           t->rt_param.pfair->subtasks[i].group_deadline);
+}
+static long pfair_admit_task(struct task_struct* t)
+{
+        lt_t quanta;
+        lt_t period;
+        s64  quantum_length = LITMUS_QUANTUM_LENGTH_NS;
+        struct pfair_param* param;
+        unsigned long i;
+        /* first check that the task is in the right cluster */
+        if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
+            cpu_cluster(pstate[task_cpu(t)]))
+                return -EINVAL;
+        if (get_rt_period(t) != get_rt_relative_deadline(t)) {
+                printk(KERN_INFO "%s: Admission rejected. "
+                        "Only implicit deadlines are currently supported.\n",
+                        litmus->plugin_name);
+                return -EINVAL;
+        }
+        /* Pfair is a tick-based scheduler, so the unit of time
+         * is one quantum. Calculate quantum-based parameters for everything.
+         * (Ceiling of exec cost, floor of period.)
+         */
+        quanta = get_exec_cost(t);
+        period = get_rt_period(t);
+        quanta = time2quanta(get_exec_cost(t), CEIL);
+        if (do_div(period, quantum_length))
+                printk(KERN_WARNING
+                       "The period of %s/%d is not a multiple of %llu.\n",
+                       t->comm, t->pid, (unsigned long long) quantum_length);
+        if (quanta == period) {
+                PTRACE_TASK(t, "Admitting weight 1.0 task. (%llu, %llu).\n", quanta, period);
+        }
+        param = kzalloc(sizeof(*param) +
+                        quanta * sizeof(struct subtask), GFP_ATOMIC);
+        if (!param)
+                return -ENOMEM;
+        param->quanta  = quanta;
+        param->period  = period;
+        param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
+        for (i = 0; i < quanta; i++)
+                init_subtask(param->subtasks + i, i, quanta, period);
+        if (t->rt_param.pfair)
+                /* get rid of stale allocation */
+                kfree(t->rt_param.pfair);
+        t->rt_param.pfair = param;
+        /* spew out some debug info */
+        dump_subtasks(t);
+        /* Disable generic budget enforcement (if enabled).
+         * The plugin provides its own (non-optional) enforcement
+         * of allocations at quantum granularity. */
+        tsk_rt(t)->task_params.budget_policy = NO_ENFORCEMENT;
+        return 0;
+}
+static void pfair_init_cluster(struct pfair_cluster* cluster)
+{
+        rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
+        bheap_init(&cluster->release_queue);
+        raw_spin_lock_init(&cluster->release_lock);
+        INIT_LIST_HEAD(&cluster->topology.cpus);
+}
+static void cleanup_clusters(void)
+{
+        int i;
+        if (num_pfair_clusters)
+                kfree(pfair_clusters);
+        pfair_clusters = NULL;
+        num_pfair_clusters = 0;
+        /* avoid stale pointers */
+        for (i = 0; i < num_online_cpus(); i++) {
+                pstate[i]->topology.cluster = NULL;
+                printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
+                       pstate[i]->missed_updates, pstate[i]->missed_quanta);
+        }
+}
+static struct domain_proc_info pfair_domain_proc_info;
+static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
+{
+        *ret = &pfair_domain_proc_info;
+        return 0;
+}
+static void pfair_setup_domain_proc(void)
+{
+        int i, cpu, domain;
+#ifdef CONFIG_RELEASE_MASTER
+        int release_master = atomic_read(&release_master_cpu);
+        /* skip over the domain with the release master if cluster size is 1 */
+        int cluster_size = num_online_cpus() / num_pfair_clusters;
+        int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
+                        release_master : NO_CPU;
+#else
+        int release_master = NO_CPU;
+        int skip_domain = NO_CPU;
+#endif
+        int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
+        int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
+        struct cd_mapping *map;
+        memset(&pfair_domain_proc_info, sizeof(pfair_domain_proc_info), 0);
+        init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
+        pfair_domain_proc_info.num_cpus = num_rt_cpus;
+        pfair_domain_proc_info.num_domains = num_rt_domains;
+        for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
+                if (cpu == release_master)
+                        continue;
+                map = &pfair_domain_proc_info.cpu_to_domains[i];
+                /* pointer math to figure out the domain index */
+                domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
+                map->id = cpu;
+                cpumask_set_cpu(domain, map->mask);
+                ++i;
+        }
+        for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
+                struct pfair_cluster *cluster;
+                struct list_head *pos;
+                if (domain == skip_domain)
+                        continue;
+                cluster = &pfair_clusters[domain];
+                map = &pfair_domain_proc_info.domain_to_cpus[i];
+                map->id = i;
+                list_for_each(pos, &cluster->topology.cpus) {
+                        cpu = cpu_id(from_cluster_list(pos));
+                        if (cpu != release_master)
+                                cpumask_set_cpu(cpu, map->mask);
+                }
+                ++i;
+        }
+}
+static long pfair_activate_plugin(void)
+{
+        int err, i;
+        struct pfair_state* state;
+        struct pfair_cluster* cluster;
+        quanta_t now, start;
+        int cluster_size;
+        struct cluster_cpu* cpus[NR_CPUS];
+        struct scheduling_cluster* clust[NR_CPUS];
+        lt_t quantum_timer_start;
+        cluster_size = get_cluster_size(pfair_cluster_level);
+        if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
+                return -EINVAL;
+        num_pfair_clusters = num_online_cpus() / cluster_size;
+        pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
+        if (!pfair_clusters) {
+                num_pfair_clusters = 0;
+                printk(KERN_ERR "Could not allocate Pfair clusters!\n");
+                return -ENOMEM;
+        }
+        state = this_cpu_ptr(&pfair_state);
+        now   = current_quantum(state);
+        start = now + 50;
+        quantum_timer_start = quanta2time(start);
+        TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
+                litmus_clock(),
+                now,
+                quantum_timer_start,
+                time2quanta(quantum_timer_start, CEIL));
+        for (i = 0; i < num_pfair_clusters; i++) {
+                cluster = &pfair_clusters[i];
+                pfair_init_cluster(cluster);
+                cluster->pfair_time = start;
+                clust[i] = &cluster->topology;
+#ifdef CONFIG_RELEASE_MASTER
+                cluster->pfair.release_master = atomic_read(&release_master_cpu);
+#endif
+        }
+        for_each_online_cpu(i) {
+                state = &per_cpu(pfair_state, i);
+                state->cur_tick   = start;
+                state->local_tick = start;
+                state->missed_quanta = 0;
+                state->missed_updates = 0;
+                state->offset     = cpu_stagger_offset(i);
+                hrtimer_set_expires(&state->quantum_timer,
+                        ns_to_ktime(quantum_timer_start + state->offset));
+                cpus[i] = &state->topology;
+                TRACE("cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
+                INIT_LIST_HEAD(&state->out_of_budget);
+                /* force rescheduling to start quantum timer */
+                litmus_reschedule(i);
+                WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
+                        KERN_ERR "WARNING: no high resolution timers available!?\n");
+        }
+        err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
+                                      cpus, num_online_cpus());
+        if (err < 0)
+                cleanup_clusters();
+        else
+                pfair_setup_domain_proc();
+        return err;
+}
+static long pfair_deactivate_plugin(void)
+{
+        int cpu;
+        struct pfair_state* state;
+        for_each_online_cpu(cpu) {
+                state = &per_cpu(pfair_state, cpu);
+                TRACE("stopping quantum timer on CPU%d\n", cpu);
+                hrtimer_cancel(&state->quantum_timer);
+        }
+        cleanup_clusters();
+        destroy_domain_proc_info(&pfair_domain_proc_info);
+        return 0;
+}
+/*      Plugin object   */
+static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
+        .plugin_name            = "PFAIR",
+        .task_new               = pfair_task_new,
+        .task_exit              = pfair_task_exit,
+        .schedule               = pfair_schedule,
+        .task_wake_up           = pfair_task_wake_up,
+        .task_block             = pfair_task_block,
+        .admit_task             = pfair_admit_task,
+        .complete_job           = complete_job,
+        .activate_plugin        = pfair_activate_plugin,
+        .deactivate_plugin      = pfair_deactivate_plugin,
+        .get_domain_proc_info   = pfair_get_domain_proc_info,
+};
+static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
+static int __init init_pfair(void)
+{
+        int cpu, err, fs;
+        struct pfair_state *state;
+        /*
+         * initialize short_cut for per-cpu pfair state;
+         * there may be a problem here if someone removes a cpu
+         * while we are doing this initialization... and if cpus
+         * are added / removed later... but we don't support CPU hotplug atm anyway.
+         */
+        pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
+        /* initialize CPU state */
+        for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
+                state = &per_cpu(pfair_state, cpu);
+                hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+                state->quantum_timer.function = on_quantum_boundary;
+                state->topology.id = cpu;
+                state->cur_tick   = 0;
+                state->local_tick = 0;
+                state->linked     = NULL;
+                state->local      = NULL;
+                state->scheduled  = NULL;
+                state->missed_quanta = 0;
+                state->offset     = cpu_stagger_offset(cpu);
+                pstate[cpu] = state;
+        }
+        pfair_clusters = NULL;
+        num_pfair_clusters = 0;
+        err = register_sched_plugin(&pfair_plugin);
+        if (!err) {
+                fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
+                if (!fs)
+                        cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
+                else
+                        printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
+        }
+        return err;
+}
+static void __exit clean_pfair(void)
+{
+        kfree(pstate);
+        if (cluster_file)
+                remove_proc_entry("cluster", pfair_dir);
+        if (pfair_dir)
+                remove_plugin_proc_dir(&pfair_plugin);
+}
+module_init(init_pfair);
+module_exit(clean_pfair);