From c2f4c165b208062d90f65a1c1a0c815261c6a81e Mon Sep 17 00:00:00 2001
From: Andrea Bastoni
Date: Wed, 27 Jan 2010 19:57:09 -0500
Subject: [ported from 2008.3] Add PFAIR plugin

---
 litmus/Makefile      |   3 +-
 litmus/sched_pfair.c | 896 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 898 insertions(+), 1 deletion(-)
 create mode 100644 litmus/sched_pfair.c

diff --git a/litmus/Makefile b/litmus/Makefile
index 70c9684c3b98..26e0fdb5c2c1 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -11,7 +11,8 @@ obj-y = sched_plugin.o litmus.o \
 	srp.o \
 	fmlp.o \
 	bheap.o \
-	sched_gsn_edf.o
+	sched_gsn_edf.o \
+	sched_pfair.o
 
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..2ea39223e7f0
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,896 @@
+/*
+ * kernel/sched_pfair.c
+ *
+ * Implementation of the (global) Pfair scheduling algorithm.
+ *
+ */
+
+#include <asm/div64.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/rt_domain.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/bheap.h>
+
+struct subtask {
+	/* measured in quanta relative to job release */
+	quanta_t release;
+	quanta_t deadline;
+	quanta_t overlap;	/* called "b bit" by PD^2 */
+	quanta_t group_deadline;
+};
+
+struct pfair_param {
+	quanta_t	quanta;		/* number of subtasks */
+	quanta_t	cur;		/* index of current subtask */
+
+	quanta_t	release;	/* in quanta */
+	quanta_t	period;		/* in quanta */
+
+	quanta_t	last_quantum;	/* when scheduled last */
+	int		last_cpu;	/* where scheduled last */
+
+	unsigned int	sporadic_release; /* On wakeup, new sporadic release? */
+
+	struct subtask subtasks[0];	/* allocate together with pfair_param */
+};
+
+#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
+
+struct pfair_state {
+	int cpu;
+	volatile quanta_t cur_tick;	/* updated by the CPU that is advancing
+					 * the time */
+	volatile quanta_t local_tick;	/* What tick is the local CPU currently
+					 * executing? Updated only by the local
+					 * CPU. In QEMU, this may lag behind the
+					 * current tick. In a real system, with
+					 * proper timers and aligned quanta,
+					 * that should only be the
+					 * case for a very short time after the
+					 * time advanced. With staggered quanta,
+					 * it will lag for the duration of the
+					 * offset.
+					 */
+
+	struct task_struct* linked;	/* the task that should be executing */
+	struct task_struct* local;	/* the local copy of linked          */
+	struct task_struct* scheduled;	/* what is actually scheduled        */
+
+	unsigned long missed_quanta;
+	lt_t offset;			/* stagger offset */
+};
+
+/* Currently, we limit the maximum period of any task to 2000 quanta.
+ * The reason is that it makes the implementation easier since we do not
+ * need to reallocate the release wheel on task arrivals.
+ * In the future
+ */
+#define PFAIR_MAX_PERIOD 2000
+
+/* This is the release queue wheel. It is indexed by pfair_time %
+ * PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR priority, so that it can be
+ * merged with the ready queue.
+ */
+static struct bheap release_queue[PFAIR_MAX_PERIOD];
+
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+struct pfair_state* *pstate; /* short cut */
+
+static quanta_t pfair_time = 0; /* the "official" PFAIR clock */
+static quanta_t merge_time = 0; /* Updated after the release queue has been
+				 * merged. Used by drop_all_references().
+				 */
+
+static rt_domain_t pfair;
+
+/* The pfair_lock is used to serialize all scheduling events.
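+ * It is an alias for pfair.ready_lock (see below), so the release wheel, the
+ * ready queue, and the per-CPU linked/local/scheduled fields are all updated
+ * under a single lock.
+ *
+ * Example of the wheel indexing described above: with PFAIR_MAX_PERIOD = 2000,
+ * a subtask whose release time is pfair_time = 4321 is queued in
+ * release_queue[4321 % 2000] = release_queue[321] and is merged into the
+ * ready queue when poll_releases() runs for quantum 4321.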
+ */ +#define pfair_lock pfair.ready_lock + +/* Enable for lots of trace info. + * #define PFAIR_DEBUG + */ + +#ifdef PFAIR_DEBUG +#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args) +#define PTRACE(f, args...) TRACE(f, ## args) +#else +#define PTRACE_TASK(t, f, args...) +#define PTRACE(f, args...) +#endif + +/* gcc will inline all of these accessor functions... */ +static struct subtask* cur_subtask(struct task_struct* t) +{ + return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur; +} + +static quanta_t cur_deadline(struct task_struct* t) +{ + return cur_subtask(t)->deadline + tsk_pfair(t)->release; +} + + +static quanta_t cur_sub_release(struct task_struct* t) +{ + return cur_subtask(t)->release + tsk_pfair(t)->release; +} + +static quanta_t cur_release(struct task_struct* t) +{ +#ifdef EARLY_RELEASE + /* only the release of the first subtask counts when we early + * release */ + return tsk_pfair(t)->release; +#else + return cur_sub_release(t); +#endif +} + +static quanta_t cur_overlap(struct task_struct* t) +{ + return cur_subtask(t)->overlap; +} + +static quanta_t cur_group_deadline(struct task_struct* t) +{ + quanta_t gdl = cur_subtask(t)->group_deadline; + if (gdl) + return gdl + tsk_pfair(t)->release; + else + return gdl; +} + + +static int pfair_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + return /* first task must exist */ + first && ( + /* Does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + !second || !is_realtime(second) || + + /* Is the (subtask) deadline of the first task earlier? + * Then it has higher priority. + */ + time_before(cur_deadline(first), cur_deadline(second)) || + + /* Do we have a deadline tie? + * Then break by B-bit. + */ + (cur_deadline(first) == cur_deadline(second) && + (cur_overlap(first) > cur_overlap(second) || + + /* Do we have a B-bit tie? + * Then break by group deadline. + */ + (cur_overlap(first) == cur_overlap(second) && + (time_after(cur_group_deadline(first), + cur_group_deadline(second)) || + + /* Do we have a group deadline tie? + * Then break by PID, which are unique. + */ + (cur_group_deadline(first) == + cur_group_deadline(second) && + first->pid < second->pid)))))); +} + +int pfair_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return pfair_higher_prio(bheap2task(a), bheap2task(b)); +} + +/* return the proper release queue for time t */ +static struct bheap* relq(quanta_t t) +{ + struct bheap* rq = &release_queue[t % PFAIR_MAX_PERIOD]; + return rq; +} + +static void prepare_release(struct task_struct* t, quanta_t at) +{ + tsk_pfair(t)->release = at; + tsk_pfair(t)->cur = 0; +} + +static void __pfair_add_release(struct task_struct* t, struct bheap* queue) +{ + bheap_insert(pfair_ready_order, queue, + tsk_rt(t)->heap_node); +} + +static void pfair_add_release(struct task_struct* t) +{ + BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node)); + __pfair_add_release(t, relq(cur_release(t))); +} + +/* pull released tasks from the release queue */ +static void poll_releases(quanta_t time) +{ + __merge_ready(&pfair, relq(time)); + merge_time = time; +} + +static void check_preempt(struct task_struct* t) +{ + int cpu = NO_CPU; + if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on && + tsk_rt(t)->present) { + /* the task can be scheduled and + * is not scheduled where it ought to be scheduled + */ + cpu = tsk_rt(t)->linked_on != NO_CPU ? 
+ tsk_rt(t)->linked_on : + tsk_rt(t)->scheduled_on; + PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n", + tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on); + /* preempt */ + if (cpu == smp_processor_id()) + set_tsk_need_resched(current); + else { + smp_send_reschedule(cpu); + } + } +} + +/* caller must hold pfair_lock */ +static void drop_all_references(struct task_struct *t) +{ + int cpu; + struct pfair_state* s; + struct bheap* q; + if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { + /* figure out what queue the node is in */ + if (time_before_eq(cur_release(t), merge_time)) + q = &pfair.ready_queue; + else + q = relq(cur_release(t)); + bheap_delete(pfair_ready_order, q, + tsk_rt(t)->heap_node); + } + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + s = &per_cpu(pfair_state, cpu); + if (s->linked == t) + s->linked = NULL; + if (s->local == t) + s->local = NULL; + if (s->scheduled == t) + s->scheduled = NULL; + } +} + +/* returns 1 if the task needs to go the release queue */ +static int advance_subtask(quanta_t time, struct task_struct* t, int cpu) +{ + struct pfair_param* p = tsk_pfair(t); + int to_relq; + p->cur = (p->cur + 1) % p->quanta; + if (!p->cur) { + sched_trace_task_completion(t, 1); + if (tsk_rt(t)->present) { + /* we start a new job */ + prepare_for_next_period(t); + sched_trace_task_release(t); + get_rt_flags(t) = RT_F_RUNNING; + p->release += p->period; + } else { + /* remove task from system until it wakes */ + drop_all_references(t); + tsk_pfair(t)->sporadic_release = 1; + TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n", + cpu, p->cur); + return 0; + } + } + to_relq = time_after(cur_release(t), time); + TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d\n", + cpu, p->cur, to_relq); + return to_relq; +} + +static void advance_subtasks(quanta_t time) +{ + int cpu, missed; + struct task_struct* l; + struct pfair_param* p; + + for_each_online_cpu(cpu) { + l = pstate[cpu]->linked; + missed = pstate[cpu]->linked != pstate[cpu]->local; + if (l) { + p = tsk_pfair(l); + p->last_quantum = time; + p->last_cpu = cpu; + if (advance_subtask(time, l, cpu)) { + pstate[cpu]->linked = NULL; + pfair_add_release(l); + } + } + } +} + +static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu) +{ + int cpu; + if (tsk_rt(t)->scheduled_on != NO_CPU) { + /* always observe scheduled_on linkage */ + default_cpu = tsk_rt(t)->scheduled_on; + } else if (tsk_pfair(t)->last_quantum == time - 1) { + /* back2back quanta */ + /* Only observe last_quantum if no scheduled_on is in the way. + * This should only kick in if a CPU missed quanta, and that + * *should* only happen in QEMU. 
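+		 *
+		 * Example: if t ran on CPU 2 during quantum 99 and we are now
+		 * linking for quantum 100, CPU 2 is preferred -- unless CPU 2
+		 * already has a task linked that is also scheduled there, in
+		 * which case the check below leaves default_cpu unchanged.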
+ */ + cpu = tsk_pfair(t)->last_cpu; + if (!pstate[cpu]->linked || + tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) { + default_cpu = cpu; + } + } + return default_cpu; +} + +/* returns one if linking was redirected */ +static int pfair_link(quanta_t time, int cpu, + struct task_struct* t) +{ + int target = target_cpu(time, t, cpu); + struct task_struct* prev = pstate[cpu]->linked; + struct task_struct* other; + + if (target != cpu) { + other = pstate[target]->linked; + pstate[target]->linked = t; + tsk_rt(t)->linked_on = target; + if (!other) + /* linked ok, but reschedule this CPU */ + return 1; + if (target < cpu) { + /* link other to cpu instead */ + tsk_rt(other)->linked_on = cpu; + pstate[cpu]->linked = other; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&pfair, prev); + } + /* we are done with this cpu */ + return 0; + } else { + /* re-add other, it's original CPU was not considered yet */ + tsk_rt(other)->linked_on = NO_CPU; + __add_ready(&pfair, other); + /* reschedule this CPU */ + return 1; + } + } else { + pstate[cpu]->linked = t; + tsk_rt(t)->linked_on = cpu; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&pfair, prev); + } + /* we are done with this CPU */ + return 0; + } +} + +static void schedule_subtasks(quanta_t time) +{ + int cpu, retry; + + for_each_online_cpu(cpu) { + retry = 1; + while (retry) { + if (pfair_higher_prio(__peek_ready(&pfair), + pstate[cpu]->linked)) + retry = pfair_link(time, cpu, + __take_ready(&pfair)); + else + retry = 0; + } + } +} + +static void schedule_next_quantum(quanta_t time) +{ + int cpu; + + /* called with interrupts disabled */ + PTRACE("--- Q %lu at %llu PRE-SPIN\n", + time, litmus_clock()); + spin_lock(&pfair_lock); + PTRACE("<<< Q %lu at %llu\n", + time, litmus_clock()); + + sched_trace_quantum_boundary(); + + advance_subtasks(time); + poll_releases(time); + schedule_subtasks(time); + + for (cpu = 0; cpu < num_online_cpus(); cpu++) + if (pstate[cpu]->linked) + PTRACE_TASK(pstate[cpu]->linked, + " linked on %d.\n", cpu); + else + PTRACE("(null) linked on %d.\n", cpu); + + /* We are done. Advance time. */ + mb(); + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + if (pstate[cpu]->local_tick != pstate[cpu]->cur_tick) { + TRACE("BAD Quantum not acked on %d " + "(l:%lu c:%lu p:%lu)\n", + cpu, + pstate[cpu]->local_tick, + pstate[cpu]->cur_tick, + pfair_time); + pstate[cpu]->missed_quanta++; + } + pstate[cpu]->cur_tick = time; + } + PTRACE(">>> Q %lu at %llu\n", + time, litmus_clock()); + spin_unlock(&pfair_lock); +} + +static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state) +{ + quanta_t loc; + + goto first; /* skip mb() on first iteration */ + do { + cpu_relax(); + mb(); + first: loc = state->cur_tick; + /* FIXME: what if loc > cur? 
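+		 * (Presumably benign: if loc has already advanced past q,
+		 * time_before(loc, q) is false, the loop exits immediately,
+		 * and the wait is simply skipped.)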
*/ + } while (time_before(loc, q)); + PTRACE("observed cur_tick:%lu >= q:%lu\n", + loc, q); +} + +static quanta_t current_quantum(struct pfair_state* state) +{ + lt_t t = litmus_clock() - state->offset; + return time2quanta(t, FLOOR); +} + +static void catchup_quanta(quanta_t from, quanta_t target, + struct pfair_state* state) +{ + quanta_t cur = from, time; + TRACE("+++< BAD catching up quanta from %lu to %lu\n", + from, target); + while (time_before(cur, target)) { + wait_for_quantum(cur, state); + cur++; + time = cmpxchg(&pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) + schedule_next_quantum(cur); + } + TRACE("+++> catching up done\n"); +} + +/* pfair_tick - this function is called for every local timer + * interrupt. + */ +static void pfair_tick(struct task_struct* t) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + quanta_t time, cur; + int retry = 10; + + do { + cur = current_quantum(state); + PTRACE("q %lu at %llu\n", cur, litmus_clock()); + + /* Attempt to advance time. First CPU to get here + * will prepare the next quantum. + */ + time = cmpxchg(&pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) { + /* exchange succeeded */ + wait_for_quantum(cur - 1, state); + schedule_next_quantum(cur); + retry = 0; + } else if (time_before(time, cur - 1)) { + /* the whole system missed a tick !? */ + catchup_quanta(time, cur, state); + retry--; + } else if (time_after(time, cur)) { + /* our timer lagging behind!? */ + TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur); + retry--; + } else { + /* Some other CPU already started scheduling + * this quantum. Let it do its job and then update. + */ + retry = 0; + } + } while (retry); + + /* Spin locally until time advances. */ + wait_for_quantum(cur, state); + + /* copy assignment */ + /* FIXME: what if we race with a future update? Corrupted state? 
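+	 * (The copy below runs without pfair_lock held, so a CPU that is
+	 * already preparing a later quantum may relink state->linked while it
+	 * is being read here.)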
*/ + state->local = state->linked; + /* signal that we are done */ + mb(); + state->local_tick = state->cur_tick; + + if (state->local != current + && (is_realtime(current) || is_present(state->local))) + set_tsk_need_resched(current); +} + +static int safe_to_schedule(struct task_struct* t, int cpu) +{ + int where = tsk_rt(t)->scheduled_on; + if (where != NO_CPU && where != cpu) { + TRACE_TASK(t, "BAD: can't be scheduled on %d, " + "scheduled already on %d.\n", cpu, where); + return 0; + } else + return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING; +} + +static struct task_struct* pfair_schedule(struct task_struct * prev) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + int blocks; + struct task_struct* next = NULL; + + spin_lock(&pfair_lock); + + blocks = is_realtime(prev) && !is_running(prev); + + if (state->local && safe_to_schedule(state->local, state->cpu)) + next = state->local; + + if (prev != next) { + tsk_rt(prev)->scheduled_on = NO_CPU; + if (next) + tsk_rt(next)->scheduled_on = state->cpu; + } + + spin_unlock(&pfair_lock); + + if (next) + TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n", + tsk_pfair(next)->release, pfair_time, litmus_clock()); + else if (is_realtime(prev)) + TRACE("Becomes idle at %lu (%llu)\n", pfair_time, litmus_clock()); + + return next; +} + +static void pfair_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + + TRACE("pfair: task new %d state:%d\n", t->pid, t->state); + + spin_lock_irqsave(&pfair_lock, flags); + if (running) + t->rt_param.scheduled_on = task_cpu(t); + else + t->rt_param.scheduled_on = NO_CPU; + + prepare_release(t, pfair_time + 1); + tsk_pfair(t)->sporadic_release = 0; + pfair_add_release(t); + check_preempt(t); + + spin_unlock_irqrestore(&pfair_lock, flags); +} + +static void pfair_task_wake_up(struct task_struct *t) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n", + litmus_clock(), cur_release(t), pfair_time); + + spin_lock_irqsave(&pfair_lock, flags); + + /* It is a little unclear how to deal with Pfair + * tasks that block for a while and then wake. For now, + * if a task blocks and wakes before its next job release, + * then it may resume if it is currently linked somewhere + * (as if it never blocked at all). Otherwise, we have a + * new sporadic job release. + */ + if (tsk_pfair(t)->sporadic_release) { + now = litmus_clock(); + release_at(t, now); + prepare_release(t, time2quanta(now, CEIL)); + sched_trace_task_release(t); + /* FIXME: race with pfair_time advancing */ + pfair_add_release(t); + tsk_pfair(t)->sporadic_release = 0; + } + + check_preempt(t); + + spin_unlock_irqrestore(&pfair_lock, flags); + TRACE_TASK(t, "wake up done at %llu\n", litmus_clock()); +} + +static void pfair_task_block(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "blocks at %llu, state:%d\n", + litmus_clock(), t->state); +} + +static void pfair_task_exit(struct task_struct * t) +{ + unsigned long flags; + + BUG_ON(!is_realtime(t)); + + /* Remote task from release or ready queue, and ensure + * that it is not the scheduled task for ANY CPU. We + * do this blanket check because occassionally when + * tasks exit while blocked, the task_cpu of the task + * might not be the same as the CPU that the PFAIR scheduler + * has chosen for it. 
+ */ + spin_lock_irqsave(&pfair_lock, flags); + + TRACE_TASK(t, "RIP, state:%d\n", t->state); + drop_all_references(t); + + spin_unlock_irqrestore(&pfair_lock, flags); + + kfree(t->rt_param.pfair); + t->rt_param.pfair = NULL; +} + + +static void pfair_release_at(struct task_struct* task, lt_t start) +{ + unsigned long flags; + quanta_t release; + + BUG_ON(!is_realtime(task)); + + spin_lock_irqsave(&pfair_lock, flags); + release_at(task, start); + release = time2quanta(start, CEIL); + + if (release - pfair_time >= PFAIR_MAX_PERIOD) + release = pfair_time + PFAIR_MAX_PERIOD; + + TRACE_TASK(task, "sys release at %lu\n", release); + + drop_all_references(task); + prepare_release(task, release); + pfair_add_release(task); + + /* Clear sporadic release flag, since this release subsumes any + * sporadic release on wake. + */ + tsk_pfair(task)->sporadic_release = 0; + + spin_unlock_irqrestore(&pfair_lock, flags); +} + +static void init_subtask(struct subtask* sub, unsigned long i, + lt_t quanta, lt_t period) +{ + /* since i is zero-based, the formulas are shifted by one */ + lt_t tmp; + + /* release */ + tmp = period * i; + do_div(tmp, quanta); /* floor */ + sub->release = (quanta_t) tmp; + + /* deadline */ + tmp = period * (i + 1); + if (do_div(tmp, quanta)) /* ceil */ + tmp++; + sub->deadline = (quanta_t) tmp; + + /* next release */ + tmp = period * (i + 1); + do_div(tmp, quanta); /* floor */ + sub->overlap = sub->deadline - (quanta_t) tmp; + + /* Group deadline. + * Based on the formula given in Uma's thesis. + */ + if (2 * quanta >= period) { + /* heavy */ + tmp = (sub->deadline - (i + 1)) * period; + if (period > quanta && + do_div(tmp, (period - quanta))) /* ceil */ + tmp++; + sub->group_deadline = (quanta_t) tmp; + } else + sub->group_deadline = 0; +} + +static void dump_subtasks(struct task_struct* t) +{ + unsigned long i; + for (i = 0; i < t->rt_param.pfair->quanta; i++) + TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n", + i + 1, + t->rt_param.pfair->subtasks[i].release, + t->rt_param.pfair->subtasks[i].deadline, + t->rt_param.pfair->subtasks[i].overlap, + t->rt_param.pfair->subtasks[i].group_deadline); +} + +static long pfair_admit_task(struct task_struct* t) +{ + lt_t quanta; + lt_t period; + s64 quantum_length = ktime_to_ns(tick_period); + struct pfair_param* param; + unsigned long i; + + /* Pfair is a tick-based method, so the time + * of interest is jiffies. Calculate tick-based + * times for everything. + * (Ceiling of exec cost, floor of period.) + */ + + quanta = get_exec_cost(t); + period = get_rt_period(t); + + quanta = time2quanta(get_exec_cost(t), CEIL); + + if (do_div(period, quantum_length)) + printk(KERN_WARNING + "The period of %s/%d is not a multiple of %llu.\n", + t->comm, t->pid, (unsigned long long) quantum_length); + + if (period >= PFAIR_MAX_PERIOD) { + printk(KERN_WARNING + "PFAIR: Rejecting task %s/%d; its period is too long.\n", + t->comm, t->pid); + return -EINVAL; + } + + if (quanta == period) { + /* special case: task has weight 1.0 */ + printk(KERN_INFO + "Admitting weight 1.0 task. 
(%s/%d, %llu, %llu).\n", + t->comm, t->pid, quanta, period); + quanta = 1; + period = 1; + } + + param = kmalloc(sizeof(*param) + + quanta * sizeof(struct subtask), GFP_ATOMIC); + + if (!param) + return -ENOMEM; + + param->quanta = quanta; + param->cur = 0; + param->release = 0; + param->period = period; + + for (i = 0; i < quanta; i++) + init_subtask(param->subtasks + i, i, quanta, period); + + if (t->rt_param.pfair) + /* get rid of stale allocation */ + kfree(t->rt_param.pfair); + + t->rt_param.pfair = param; + + /* spew out some debug info */ + dump_subtasks(t); + + return 0; +} + +static long pfair_activate_plugin(void) +{ + int cpu; + struct pfair_state* state; + + state = &__get_cpu_var(pfair_state); + pfair_time = current_quantum(state); + + TRACE("Activating PFAIR at q=%lu\n", pfair_time); + + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + state = &per_cpu(pfair_state, cpu); + state->cur_tick = pfair_time; + state->local_tick = pfair_time; + state->missed_quanta = 0; + state->offset = cpu_stagger_offset(cpu); + } + + return 0; +} + +/* Plugin object */ +static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PFAIR", + .tick = pfair_tick, + .task_new = pfair_task_new, + .task_exit = pfair_task_exit, + .schedule = pfair_schedule, + .task_wake_up = pfair_task_wake_up, + .task_block = pfair_task_block, + .admit_task = pfair_admit_task, + .release_at = pfair_release_at, + .complete_job = complete_job, + .activate_plugin = pfair_activate_plugin, +}; + +static int __init init_pfair(void) +{ + int cpu, i; + struct pfair_state *state; + + + /* + * initialize short_cut for per-cpu pfair state; + * there may be a problem here if someone removes a cpu + * while we are doing this initialization... and if cpus + * are added / removed later... is it a _real_ problem? + */ + pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL); + + /* initialize release queue */ + for (i = 0; i < PFAIR_MAX_PERIOD; i++) + bheap_init(&release_queue[i]); + + /* initialize CPU state */ + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + state = &per_cpu(pfair_state, cpu); + state->cpu = cpu; + state->cur_tick = 0; + state->local_tick = 0; + state->linked = NULL; + state->local = NULL; + state->scheduled = NULL; + state->missed_quanta = 0; + state->offset = cpu_stagger_offset(cpu); + pstate[cpu] = state; + } + + rt_domain_init(&pfair, pfair_ready_order, NULL, NULL); + return register_sched_plugin(&pfair_plugin); +} + +static void __exit clean_pfair(void) +{ + kfree(pstate); +} + +module_init(init_pfair); +module_exit(clean_pfair); -- cgit v1.2.2
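
A note on the subtask parameters: init_subtask() above computes the PD^2
per-subtask release, deadline, b-bit (overlap), and group deadline. The
following is a minimal user-space sketch that mirrors init_subtask(), assuming
ordinary 64-bit division in place of the kernel's do_div() and a local
quanta_t typedef; it prints the same fields that dump_subtasks() traces and
can be used to sanity-check the parameters for a given (quanta, period) pair.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t quanta_t;

struct subtask {
	quanta_t release;
	quanta_t deadline;
	quanta_t overlap;		/* the PD^2 "b bit" */
	quanta_t group_deadline;
};

/* Mirror of init_subtask(); i is the zero-based subtask index. */
static void init_subtask(struct subtask *sub, unsigned long i,
			 uint64_t quanta, uint64_t period)
{
	/* release: floor(period * i / quanta) */
	sub->release = (quanta_t) ((period * i) / quanta);

	/* deadline: ceil(period * (i + 1) / quanta) */
	sub->deadline = (quanta_t) ((period * (i + 1) + quanta - 1) / quanta);

	/* b bit: deadline minus the next subtask's release */
	sub->overlap = sub->deadline - (quanta_t) ((period * (i + 1)) / quanta);

	/* group deadline: only nonzero for "heavy" tasks (weight >= 1/2) */
	if (2 * quanta >= period) {
		uint64_t tmp = (sub->deadline - (i + 1)) * period;
		if (period > quanta)	/* ceil(tmp / (period - quanta)) */
			tmp = (tmp + (period - quanta) - 1) / (period - quanta);
		sub->group_deadline = (quanta_t) tmp;
	} else
		sub->group_deadline = 0;
}

int main(void)
{
	/* example task: execution cost of 3 quanta every 5 quanta */
	uint64_t quanta = 3, period = 5;
	struct subtask sub;
	unsigned long i;

	for (i = 0; i < quanta; i++) {
		init_subtask(&sub, i, quanta, period);
		printf("SUBTASK %lu: rel=%llu dl=%llu bbit=%llu gdl=%llu\n",
		       i + 1,
		       (unsigned long long) sub.release,
		       (unsigned long long) sub.deadline,
		       (unsigned long long) sub.overlap,
		       (unsigned long long) sub.group_deadline);
	}
	return 0;
}

For the (3, 5) example this prints releases 0, 1, 3, deadlines 2, 4, 5,
b-bits 1, 1, 0, and group deadlines 3, 5, 5 (all in quanta, relative to the
job release), which is the table dump_subtasks() would report for such a task.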