/*
 * kernel/sched_pfair.c
 *
 * Implementation of the PD^2 pfair scheduling algorithm. This
 * implementation realizes "early releasing," i.e., it is work-conserving.
 *
 */

#include <asm/div64.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/slab.h>

#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/preempt.h>
#include <litmus/rt_domain.h>
#include <litmus/sched_plugin.h>
#include <litmus/sched_trace.h>
#include <litmus/trace.h>

#include <litmus/bheap.h>

/* to configure the cluster size */
#include <litmus/litmus_proc.h>

#include <litmus/clustered.h>

static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;

struct subtask {
	/* measured in quanta relative to job release */
	quanta_t release;
        quanta_t deadline;
	quanta_t overlap; /* called "b bit" by PD^2 */
	quanta_t group_deadline;
};

struct pfair_param   {
	quanta_t	quanta;       /* number of subtasks */
	quanta_t	cur;          /* index of current subtask */

	quanta_t	release;      /* in quanta */
	quanta_t	period;       /* in quanta */

	quanta_t	last_quantum; /* when scheduled last */
	int		last_cpu;     /* where scheduled last */

	unsigned int	needs_requeue:1;

	struct pfair_cluster* cluster; /* where this task is scheduled */

	struct subtask subtasks[0];   /* allocate together with pfair_param */
};

#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)

struct pfair_state {
	struct cluster_cpu topology;

	struct hrtimer quantum_timer;

	volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
				        * the time */
	volatile quanta_t local_tick;  /* What tick is the local CPU currently
				        * executing? Updated only by the local
				        * CPU. In QEMU, this may lag behind the
				        * current tick. In a real system, with
				        * proper timers and aligned quanta,
				        * that should only be the case for a
				        * very short time after the time
				        * advanced. With staggered quanta, it
				        * will lag for the duration of the
				        * offset.
					*/

	struct task_struct* linked;    /* the task that should be executing */
	struct task_struct* local;     /* the local copy of linked          */
	struct task_struct* scheduled; /* what is actually scheduled        */

	struct list_head    out_of_budget; /* list of tasks that exhausted their allocation */

	lt_t offset;			/* stagger offset */
	unsigned int missed_updates;
	unsigned int missed_quanta;
};

struct pfair_cluster {
	struct scheduling_cluster topology;

	/* The "global" time in this cluster. */
	quanta_t pfair_time; /* the "official" PFAIR clock */

	/* The ready queue for this cluster. */
	rt_domain_t pfair;

	/* The set of jobs that should have their release enacted at the next
	 * quantum boundary.
	 */
	struct bheap release_queue;
	raw_spinlock_t release_lock;
};

static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
{
	return container_of(state->topology.cluster, struct pfair_cluster, topology);
}

static inline int cpu_id(struct pfair_state* state)
{
	return state->topology.id;
}

static inline struct pfair_state* from_cluster_list(struct list_head* pos)
{
	return list_entry(pos, struct pfair_state, topology.cluster_list);
}

static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
{
	return container_of(rt, struct pfair_cluster, pfair);
}

static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
{
	/* The ready_lock is used to serialize all scheduling events. */
	return &cluster->pfair.ready_lock;
}

static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
{
	return cluster_lock(cpu_cluster(state));
}

DEFINE_PER_CPU(struct pfair_state, pfair_state);
struct pfair_state* *pstate; /* short cut */

static struct pfair_cluster* pfair_clusters;
static int num_pfair_clusters;

/* Enable for lots of trace info.
 * #define PFAIR_DEBUG
 */

#ifdef PFAIR_DEBUG
#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
#define PTRACE(f, args...) TRACE(f, ## args)
#else
#define PTRACE_TASK(t, f, args...)
#define PTRACE(f, args...)
#endif

/* gcc will inline all of these accessor functions... */
static struct subtask* cur_subtask(struct task_struct* t)
{
	return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
}

static quanta_t cur_deadline(struct task_struct* t)
{
	return cur_subtask(t)->deadline +  tsk_pfair(t)->release;
}

static quanta_t cur_release(struct task_struct* t)
{
	/* This is early releasing: only the release of the first subtask
	 * counts. */
	return tsk_pfair(t)->release;
}

static quanta_t cur_overlap(struct task_struct* t)
{
	return cur_subtask(t)->overlap;
}

static quanta_t cur_group_deadline(struct task_struct* t)
{
	quanta_t gdl = cur_subtask(t)->group_deadline;
	if (gdl)
		return gdl + tsk_pfair(t)->release;
	else
		return gdl;
}


static int pfair_higher_prio(struct task_struct* first,
			     struct task_struct* second)
{
	return  /* first task must exist */
		first && (
		/* Does the second task exist and is it a real-time task?  If
		 * not, the first task (which is a RT task) has higher
		 * priority.
		 */
		!second || !is_realtime(second)  ||

		/* Is the (subtask) deadline of the first task earlier?
		 * Then it has higher priority.
		 */
		time_before(cur_deadline(first), cur_deadline(second)) ||

		/* Do we have a deadline tie?
		 * Then break by B-bit.
		 */
		(cur_deadline(first) == cur_deadline(second) &&
		 (cur_overlap(first) > cur_overlap(second) ||

		/* Do we have a B-bit tie?
		 * Then break by group deadline.
		 */
		(cur_overlap(first) == cur_overlap(second) &&
		 (time_after(cur_group_deadline(first),
			     cur_group_deadline(second)) ||

		/* Do we have a group deadline tie?
		 * Then break by PID, which are unique.
		 */
		(cur_group_deadline(first) ==
		 cur_group_deadline(second) &&
		 first->pid < second->pid))))));
}

int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
{
	return pfair_higher_prio(bheap2task(a), bheap2task(b));
}

static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
	struct pfair_cluster* cluster = from_domain(rt);
	unsigned long flags;

	raw_spin_lock_irqsave(&cluster->release_lock, flags);

	bheap_union(pfair_ready_order, &cluster->release_queue, tasks);

	raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
}

static void prepare_release(struct task_struct* t, quanta_t at)
{
	tsk_pfair(t)->release    = at;
	tsk_pfair(t)->cur        = 0;
}

/* pull released tasks from the release queue */
static void poll_releases(struct pfair_cluster* cluster)
{
	raw_spin_lock(&cluster->release_lock);
	__merge_ready(&cluster->pfair, &cluster->release_queue);
	raw_spin_unlock(&cluster->release_lock);
}

static void check_preempt(struct task_struct* t)
{
	int cpu = NO_CPU;
	if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
	    is_present(t)) {
		/* the task can be scheduled and
		 * is not scheduled where it ought to be scheduled
		 */
		cpu = tsk_rt(t)->linked_on != NO_CPU ?
			tsk_rt(t)->linked_on         :
			tsk_rt(t)->scheduled_on;
		PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
			   tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
		/* preempt */
		litmus_reschedule(cpu);
	}
}

/* caller must hold pfair.ready_lock */
static void drop_all_references(struct task_struct *t)
{
        int cpu;
        struct pfair_state* s;
	struct pfair_cluster* cluster;
        if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
                /* It must be in the ready queue; drop references isn't called
		 * when the job is in a release queue. */
		cluster = tsk_pfair(t)->cluster;
                bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
                            tsk_rt(t)->heap_node);
        }
        for (cpu = 0; cpu < num_online_cpus(); cpu++) {
                s = &per_cpu(pfair_state, cpu);
                if (s->linked == t)
                        s->linked = NULL;
                if (s->local  == t)
                        s->local  = NULL;
                if (s->scheduled  == t)
                        s->scheduled = NULL;
        }
	/* make sure we don't have a stale linked_on field */
	tsk_rt(t)->linked_on = NO_CPU;

	/* make sure we're not queued for re-releasing */
	if (in_list(&tsk_rt(t)->list))
	{
		TRACE_TASK(t, "removing from out_of_budget queue\n");
		list_del(&tsk_rt(t)->list);
	}
}

static void pfair_prepare_next_period(struct task_struct* t)
{
	struct pfair_param* p = tsk_pfair(t);

	prepare_for_next_period(t);
	tsk_rt(t)->completed = 0;
	p->release = time2quanta(get_release(t), CEIL);
}

/* returns 1 if the task needs to go the release queue */
static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
{
	struct pfair_param* p = tsk_pfair(t);
	int to_relq;
	p->cur = (p->cur + 1) % p->quanta;
	if (!p->cur) {
		if (is_present(t)) {
			/* The job overran; we start a new budget allocation. */
			TRACE_TASK(t, "overran budget, preparing next period\n");
			sched_trace_task_completion(t, 1);
			pfair_prepare_next_period(t);
		} else {
			/* remove task from system until it wakes */
			drop_all_references(t);
			p->needs_requeue = 1;
			TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
				   cpu, p->cur);
			return 0;
		}
	}
	to_relq = time_after(cur_release(t), time);
	TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d "
		"(cur_release:%lu time:%lu present:%d on_cpu=%d)\n",
		cpu, p->cur, to_relq, cur_release(t), time,
		tsk_rt(t)->present, tsk_rt(t)->scheduled_on);
	return to_relq;
}

static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
{
	struct task_struct* l;
	struct pfair_param* p;
	struct list_head* pos;
	struct pfair_state* cpu;

	list_for_each(pos, &cluster->topology.cpus) {
		cpu = from_cluster_list(pos);
		l = cpu->linked;
		cpu->missed_updates += cpu->linked != cpu->local;
		if (l) {
			p = tsk_pfair(l);
			p->last_quantum = time;
			p->last_cpu     =  cpu_id(cpu);
			if (advance_subtask(time, l, cpu_id(cpu))) {
				cpu->linked = NULL;
				tsk_rt(l)->linked_on = NO_CPU;
				PTRACE_TASK(l, "should go to release queue. "
					    "scheduled_on=%d present=%d\n",
					    tsk_rt(l)->scheduled_on,
					    tsk_rt(l)->present);
				list_add(&tsk_rt(l)->list, &cpu->out_of_budget);
			}
		}
	}
}

static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
{
	int cpu;
	if (tsk_rt(t)->scheduled_on != NO_CPU) {
		/* always observe scheduled_on linkage */
		default_cpu = tsk_rt(t)->scheduled_on;
	} else if (tsk_pfair(t)->last_quantum == time - 1) {
		/* back2back quanta */
		/* Only observe last_quantum if no scheduled_on is in the way.
		 * This should only kick in if a CPU missed quanta, and that
		 * *should* only happen in QEMU.
		 */
		cpu = tsk_pfair(t)->last_cpu;
		if (!pstate[cpu]->linked ||
		    tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
			default_cpu = cpu;
		}
	}
	return default_cpu;
}

/* returns one if linking was redirected */
static int pfair_link(quanta_t time, int cpu,
		      struct task_struct* t)
{
	int target = target_cpu(time, t, cpu);
	struct task_struct* prev  = pstate[cpu]->linked;
	struct task_struct* other;
	struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);

	if (target != cpu) {
		BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
		other = pstate[target]->linked;
		pstate[target]->linked = t;
		tsk_rt(t)->linked_on   = target;
		if (!other)
			/* linked ok, but reschedule this CPU */
			return 1;
		if (target < cpu) {
			/* link other to cpu instead */
			tsk_rt(other)->linked_on = cpu;
			pstate[cpu]->linked      = other;
			if (prev) {
				/* prev got pushed back into the ready queue */
				tsk_rt(prev)->linked_on = NO_CPU;
				__add_ready(&cluster->pfair, prev);
			}
			/* we are done with this cpu */
			return 0;
		} else {
			/* re-add other, it's original CPU was not considered yet */
			tsk_rt(other)->linked_on = NO_CPU;
			__add_ready(&cluster->pfair, other);
			/* reschedule this CPU */
			return 1;
		}
	} else {
		pstate[cpu]->linked  = t;
		tsk_rt(t)->linked_on = cpu;
		if (prev) {
			/* prev got pushed back into the ready queue */
			tsk_rt(prev)->linked_on = NO_CPU;
			__add_ready(&cluster->pfair, prev);
		}
		/* we are done with this CPU */
		return 0;
	}
}

static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
{
	int retry;
	struct list_head *pos;
	struct pfair_state *cpu_state;

	list_for_each(pos, &cluster->topology.cpus) {
		cpu_state = from_cluster_list(pos);
		retry = 1;
#ifdef CONFIG_RELEASE_MASTER
		/* skip release master */
		if (cluster->pfair.release_master == cpu_id(cpu_state))
			continue;
#endif
		while (retry) {
			if (pfair_higher_prio(__peek_ready(&cluster->pfair),
					      cpu_state->linked))
				retry = pfair_link(time, cpu_id(cpu_state),
						   __take_ready(&cluster->pfair));
			else
				retry = 0;
		}
	}
}

static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
{
	struct pfair_state *cpu;
	struct list_head* pos;

	/* called with interrupts disabled */
	PTRACE("--- Q %lu at %llu PRE-SPIN\n",
	       time, litmus_clock());
	raw_spin_lock(cluster_lock(cluster));
	PTRACE("<<< Q %lu at %llu\n",
	       time, litmus_clock());

	sched_trace_quantum_boundary();

	advance_subtasks(cluster, time);
	poll_releases(cluster);
	schedule_subtasks(cluster, time);

	list_for_each(pos, &cluster->topology.cpus) {
		cpu = from_cluster_list(pos);
		if (cpu->linked)
			PTRACE_TASK(cpu->linked,
				    " linked on %d.\n", cpu_id(cpu));
		else
			PTRACE("(null) linked on %d.\n", cpu_id(cpu));
	}
	/* We are done. Advance time. */
	mb();
	list_for_each(pos, &cluster->topology.cpus) {
		cpu = from_cluster_list(pos);
		if (cpu->local_tick != cpu->cur_tick) {
			TRACE("BAD Quantum not acked on %d "
			      "(l:%lu c:%lu p:%lu)\n",
			      cpu_id(cpu),
			      cpu->local_tick,
			      cpu->cur_tick,
			      cluster->pfair_time);
			cpu->missed_quanta++;
		}
		cpu->cur_tick = time;
	}
	PTRACE(">>> Q %lu at %llu\n",
	       time, litmus_clock());
	raw_spin_unlock(cluster_lock(cluster));
}

static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
{
	quanta_t loc;

	goto first; /* skip mb() on first iteration */
	do {
		cpu_relax();
		mb();
	first:	loc = state->cur_tick;
		/* FIXME: what if loc > cur? */
	} while (time_before(loc, q));
	PTRACE("observed cur_tick:%lu >= q:%lu\n",
	       loc, q);
}

static quanta_t current_quantum(struct pfair_state* state)
{
	lt_t t = litmus_clock() - state->offset;
	return time2quanta(t, FLOOR);
}

static void catchup_quanta(quanta_t from, quanta_t target,
			   struct pfair_state* state)
{
	quanta_t cur = from, time;
	TRACE("+++< BAD catching up quanta from %lu to %lu\n",
	      from, target);
	while (time_before(cur, target)) {
		wait_for_quantum(cur, state);
		cur++;
		time = cmpxchg(&cpu_cluster(state)->pfair_time,
			       cur - 1,   /* expected */
			       cur        /* next     */
			);
		if (time == cur - 1)
			schedule_next_quantum(cpu_cluster(state), cur);
	}
	TRACE("+++> catching up done\n");
}

/* pfair_tick - this function is called for every local timer
 *                         interrupt.
 */
static void pfair_tick(struct task_struct* t)
{
	struct pfair_state* state = this_cpu_ptr(&pfair_state);
	quanta_t time, cur;
	int retry = 10;

	do {
		cur  = current_quantum(state);
		PTRACE("q %lu at %llu\n", cur, litmus_clock());

		/* Attempt to advance time. First CPU to get here
		 * will prepare the next quantum.
		 */
		time = cpu_cluster(state)->pfair_time;
		if (time == cur - 1)
		{
			/* looks good, see if we can advance the time */
			time = cmpxchg(&cpu_cluster(state)->pfair_time,
				       cur - 1,   /* expected */
				       cur        /* next     */
				);
		}

		if (time == cur - 1) {
			/* exchange succeeded */
			wait_for_quantum(cur - 1, state);
			schedule_next_quantum(cpu_cluster(state), cur);
			retry = 0;
		} else if (time_before(time, cur - 1)) {
			/* the whole system missed a tick !? */
			catchup_quanta(time, cur, state);
			retry--;
		} else if (time_after(time, cur)) {
			/* our timer lagging behind!? */
			TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
			retry--;
		} else {
			/* Some other CPU already started scheduling
			 * this quantum. Let it do its job and then update.
			 */
			retry = 0;
		}
	} while (retry);

	/* Spin locally until time advances. */
	wait_for_quantum(cur, state);

	/* copy assignment */
	/* FIXME: what if we race with a future update? Corrupted state? */
	state->local      = state->linked;
	/* signal that we are done */
	mb();
	state->local_tick = state->cur_tick;

	if (state->local != current
	    && (is_realtime(current) || is_present(state->local)))
		litmus_reschedule_local();
}

static void process_out_of_budget_tasks(
	struct pfair_state* state,
	struct task_struct* prev,
	unsigned int blocks)
{
	struct task_struct *t;

	while (!list_empty(&state->out_of_budget))
	{

		t = list_first_entry(&state->out_of_budget,
		                     struct task_struct, rt_param.list);
		TRACE_TASK(t, "found on out_of_budget queue is_prev=%d\n", t == prev);
		list_del(&tsk_rt(t)->list);
		if (t != prev || !blocks)
		{
			sched_trace_task_release(t);
			add_release(&cpu_cluster(state)->pfair, t);
			TRACE_TASK(t, "adding to release queue (budget exhausted)\n");
		} else {
			TRACE_TASK(t, "not added to release queue (blocks=%d)\n", blocks);
			tsk_pfair(t)->needs_requeue = 1;
		}
		if (unlikely(state->local == t)) {
			TRACE_TASK(t, "still linked as ->local, cleaning up\n");
			state->local = NULL;
		}
	}
}

/* Custom scheduling tick: called on each quantum boundary. */
static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
{
	TS_QUANTUM_BOUNDARY_START;

	pfair_tick(current);
	hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);

	TS_QUANTUM_BOUNDARY_END;
	return  HRTIMER_RESTART;
}

static int safe_to_schedule(struct task_struct* t, int cpu)
{
	int where = tsk_rt(t)->scheduled_on;
	if (where != NO_CPU && where != cpu) {
		TRACE_TASK(t, "BAD: can't be scheduled on %d, "
			   "scheduled already on %d.\n", cpu, where);
		return 0;
	} else
		return is_present(t) && !is_completed(t);
}

static struct task_struct* pfair_schedule(struct task_struct * prev)
{
	struct pfair_state* state = this_cpu_ptr(&pfair_state);
	struct pfair_cluster* cluster = cpu_cluster(state);
	int blocks, completion, out_of_time;
	struct task_struct* next = NULL;

#ifdef CONFIG_RELEASE_MASTER
	/* Bail out early if we are the release master.
	 * The release master never schedules any real-time tasks.
	 */
	if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
		goto out;
	}
#endif

	raw_spin_lock(cpu_lock(state));

	blocks      = is_realtime(prev) && !is_current_running();
	completion  = is_realtime(prev) && is_completed(prev);
	out_of_time = is_realtime(prev) && time_after(cur_release(prev),
						      state->local_tick);

	if (is_realtime(prev))
	    PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
			blocks, completion, out_of_time);

	if (completion && !out_of_time) {
		sched_trace_task_completion(prev, 0);
		pfair_prepare_next_period(prev);
		prepare_release(prev, cur_release(prev));
		drop_all_references(prev);
		list_add(&tsk_rt(prev)->list, &state->out_of_budget);
	}

	process_out_of_budget_tasks(state, prev, blocks);

	if (state->local && safe_to_schedule(state->local, cpu_id(state)))
		next = state->local;

	if (prev != next) {
		tsk_rt(prev)->scheduled_on = NO_CPU;
		if (next)
			tsk_rt(next)->scheduled_on = cpu_id(state);
	}
	sched_state_task_picked();
	raw_spin_unlock(cpu_lock(state));

	if (next)
		TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
			   tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
	else if (is_realtime(prev))
		TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());

#ifdef CONFIG_RELEASE_MASTER
out:
#endif

	if (unlikely(!hrtimer_active(&state->quantum_timer))) {
		TRACE("activating quantum timer start=%llu\n",
			hrtimer_get_expires(&state->quantum_timer));
		__hrtimer_start_range_ns(&state->quantum_timer,
			hrtimer_get_expires(&state->quantum_timer),
			0, HRTIMER_MODE_ABS, 0);
	}

	return next;
}

static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
{
	unsigned long flags;
	struct pfair_cluster* cluster;

	TRACE("pfair: task new %d state:%d\n", t->pid, t->state);

	cluster = tsk_pfair(t)->cluster;

	raw_spin_lock_irqsave(cluster_lock(cluster), flags);

	prepare_release(t, cluster->pfair_time + 1);
	release_at(t, quanta2time(cur_release(t)));

	t->rt_param.scheduled_on = NO_CPU;
	t->rt_param.linked_on    = NO_CPU;

	if (is_scheduled) {
#ifdef CONFIG_RELEASE_MASTER
		if (task_cpu(t) != cluster->pfair.release_master)
#endif
			t->rt_param.scheduled_on = task_cpu(t);
	}

	if (on_rq || is_scheduled) {
		tsk_rt(t)->present = 1;
		__add_ready(&cluster->pfair, t);
	} else {
		tsk_rt(t)->present = 0;
		tsk_pfair(t)->needs_requeue = 1;
	}

	check_preempt(t);

	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
}

static void pfair_task_wake_up(struct task_struct *t)
{
	unsigned long flags;
	lt_t now;
	struct pfair_cluster* cluster;
	struct pfair_state* state;
	int sporadic_release = 0;

	cluster = tsk_pfair(t)->cluster;

	TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
		   litmus_clock(), cur_release(t), cluster->pfair_time);

	raw_spin_lock_irqsave(cluster_lock(cluster), flags);

	state = this_cpu_ptr(&pfair_state);

	/* If a task blocks and wakes before its next job release,
	 * then it may resume if it is currently linked somewhere
	 * (as if it never blocked at all). Otherwise, we have a
	 * new sporadic job release.
	 */
	now = litmus_clock();
	if (is_tardy(t, now)) {
		TRACE_TASK(t, "sporadic release!\n");
		sporadic_release = 1;
		release_at(t, now);
		prepare_release(t, time2quanta(now, CEIL));
		sched_trace_task_release(t);
	}

	/* only add to ready queue if the task isn't still linked somewhere */
	if (tsk_pfair(t)->needs_requeue) {
		tsk_pfair(t)->needs_requeue = 0;
		TRACE_TASK(t, "requeueing required (released:%d)\n",
			!time_after(cur_release(t), state->local_tick));
		tsk_rt(t)->completed = 0;
		if (time_after(cur_release(t), state->local_tick)
		    && !sporadic_release)
			add_release(&cluster->pfair, t);
		else
			__add_ready(&cluster->pfair, t);
	}

	check_preempt(t);

	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
	TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
}

static void pfair_task_block(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	TRACE_TASK(t, "blocks at %llu, state:%d\n",
		   litmus_clock(), t->state);
}

static void pfair_task_exit(struct task_struct * t)
{
	unsigned long flags;
	struct pfair_cluster *cluster;

	BUG_ON(!is_realtime(t));

	cluster = tsk_pfair(t)->cluster;

	/* Remote task from release or ready queue, and ensure
	 * that it is not the scheduled task for ANY CPU. We
	 * do this blanket check because occassionally when
	 * tasks exit while blocked, the task_cpu of the task
	 * might not be the same as the CPU that the PFAIR scheduler
	 * has chosen for it.
	 */
	raw_spin_lock_irqsave(cluster_lock(cluster), flags);

	TRACE_TASK(t, "RIP, state:%d\n", t->state);
	drop_all_references(t);

	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);

	kfree(t->rt_param.pfair);
	t->rt_param.pfair = NULL;
}

static void init_subtask(struct subtask* sub, unsigned long i,
			 lt_t quanta, lt_t period)
{
	/* since i is zero-based, the formulas are shifted by one */
	lt_t tmp;

	/* release */
	tmp = period * i;
	do_div(tmp, quanta); /* floor */
	sub->release = (quanta_t) tmp;

	/* deadline */
	tmp = period * (i + 1);
	if (do_div(tmp, quanta)) /* ceil */
		tmp++;
	sub->deadline = (quanta_t) tmp;

	/* next release */
	tmp = period * (i + 1);
	do_div(tmp, quanta); /* floor */
	sub->overlap =  sub->deadline - (quanta_t) tmp;

	/* Group deadline.
	 * Based on the formula given in Uma's thesis.
	 */
	if (2 * quanta >= period) {
		/* heavy */
		tmp = (sub->deadline - (i + 1)) * period;
		if (period > quanta &&
		    do_div(tmp, (period - quanta))) /* ceil */
			tmp++;
		sub->group_deadline = (quanta_t) tmp;
	} else
		sub->group_deadline = 0;
}

static void dump_subtasks(struct task_struct* t)
{
	unsigned long i;
	for (i = 0; i < t->rt_param.pfair->quanta; i++)
		TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
			   i + 1,
			   t->rt_param.pfair->subtasks[i].release,
			   t->rt_param.pfair->subtasks[i].deadline,
			   t->rt_param.pfair->subtasks[i].overlap,
			   t->rt_param.pfair->subtasks[i].group_deadline);
}

static long pfair_admit_task(struct task_struct* t)
{
	lt_t quanta;
	lt_t period;
	s64  quantum_length = LITMUS_QUANTUM_LENGTH_NS;
	struct pfair_param* param;
	unsigned long i;

	/* first check that the task is in the right cluster */
	if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
	    cpu_cluster(pstate[task_cpu(t)]))
		return -EINVAL;

	if (get_rt_period(t) != get_rt_relative_deadline(t)) {
		printk(KERN_INFO "%s: Admission rejected. "
			"Only implicit deadlines are currently supported.\n",
			litmus->plugin_name);
		return -EINVAL;
	}

	/* Pfair is a tick-based scheduler, so the unit of time
	 * is one quantum. Calculate quantum-based parameters for everything.
	 * (Ceiling of exec cost, floor of period.)
	 */

	quanta = get_exec_cost(t);
	period = get_rt_period(t);

	quanta = time2quanta(get_exec_cost(t), CEIL);

	if (do_div(period, quantum_length))
		printk(KERN_WARNING
		       "The period of %s/%d is not a multiple of %llu.\n",
		       t->comm, t->pid, (unsigned long long) quantum_length);

	if (quanta == period) {
		PTRACE_TASK(t, "Admitting weight 1.0 task. (%llu, %llu).\n", quanta, period);
	}

	param = kzalloc(sizeof(*param) +
			quanta * sizeof(struct subtask), GFP_ATOMIC);

	if (!param)
		return -ENOMEM;

	param->quanta  = quanta;
	param->period  = period;

	param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);

	for (i = 0; i < quanta; i++)
		init_subtask(param->subtasks + i, i, quanta, period);

	if (t->rt_param.pfair)
		/* get rid of stale allocation */
		kfree(t->rt_param.pfair);

	t->rt_param.pfair = param;

	/* spew out some debug info */
	dump_subtasks(t);

	/* Disable generic budget enforcement (if enabled).
	 * The plugin provides its own (non-optional) enforcement
	 * of allocations at quantum granularity. */
	tsk_rt(t)->task_params.budget_policy = NO_ENFORCEMENT;

	return 0;
}

static void pfair_init_cluster(struct pfair_cluster* cluster)
{
	rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
	bheap_init(&cluster->release_queue);
	raw_spin_lock_init(&cluster->release_lock);
	INIT_LIST_HEAD(&cluster->topology.cpus);
}

static void cleanup_clusters(void)
{
	int i;

	if (num_pfair_clusters)
		kfree(pfair_clusters);
	pfair_clusters = NULL;
	num_pfair_clusters = 0;

	/* avoid stale pointers */
	for (i = 0; i < num_online_cpus(); i++) {
		pstate[i]->topology.cluster = NULL;
		printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
		       pstate[i]->missed_updates, pstate[i]->missed_quanta);
	}
}

static struct domain_proc_info pfair_domain_proc_info;
static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
{
	*ret = &pfair_domain_proc_info;
	return 0;
}

static void pfair_setup_domain_proc(void)
{
	int i, cpu, domain;
#ifdef CONFIG_RELEASE_MASTER
	int release_master = atomic_read(&release_master_cpu);
	/* skip over the domain with the release master if cluster size is 1 */
	int cluster_size = num_online_cpus() / num_pfair_clusters;
	int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
			release_master : NO_CPU;
#else
	int release_master = NO_CPU;
	int skip_domain = NO_CPU;
#endif
	int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
	int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
	struct cd_mapping *map;

	memset(&pfair_domain_proc_info, sizeof(pfair_domain_proc_info), 0);
	init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
	pfair_domain_proc_info.num_cpus = num_rt_cpus;
	pfair_domain_proc_info.num_domains = num_rt_domains;

	for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
		if (cpu == release_master)
			continue;
		map = &pfair_domain_proc_info.cpu_to_domains[i];
		/* pointer math to figure out the domain index */
		domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
		map->id = cpu;
		cpumask_set_cpu(domain, map->mask);
		++i;
	}

	for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
		struct pfair_cluster *cluster;
		struct list_head *pos;

		if (domain == skip_domain)
			continue;

		cluster = &pfair_clusters[domain];
		map = &pfair_domain_proc_info.domain_to_cpus[i];
		map->id = i;

		list_for_each(pos, &cluster->topology.cpus) {
			cpu = cpu_id(from_cluster_list(pos));
			if (cpu != release_master)
				cpumask_set_cpu(cpu, map->mask);
		}
		++i;
	}
}

static long pfair_activate_plugin(void)
{
	int err, i;
	struct pfair_state* state;
	struct pfair_cluster* cluster;
	quanta_t now, start;
	int cluster_size;
	struct cluster_cpu* cpus[NR_CPUS];
	struct scheduling_cluster* clust[NR_CPUS];
	lt_t quantum_timer_start;

	cluster_size = get_cluster_size(pfair_cluster_level);

	if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
		return -EINVAL;

	num_pfair_clusters = num_online_cpus() / cluster_size;

	pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
	if (!pfair_clusters) {
		num_pfair_clusters = 0;
		printk(KERN_ERR "Could not allocate Pfair clusters!\n");
		return -ENOMEM;
	}

	state = this_cpu_ptr(&pfair_state);
	now   = current_quantum(state);
	start = now + 50;
	quantum_timer_start = quanta2time(start);
	TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
		litmus_clock(),
		now,
		quantum_timer_start,
		time2quanta(quantum_timer_start, CEIL));

	for (i = 0; i < num_pfair_clusters; i++) {
		cluster = &pfair_clusters[i];
		pfair_init_cluster(cluster);
		cluster->pfair_time = start;
		clust[i] = &cluster->topology;
#ifdef CONFIG_RELEASE_MASTER
		cluster->pfair.release_master = atomic_read(&release_master_cpu);
#endif
	}

	for_each_online_cpu(i) {
		state = &per_cpu(pfair_state, i);
		state->cur_tick   = start;
		state->local_tick = start;
		state->missed_quanta = 0;
		state->missed_updates = 0;
		state->offset     = cpu_stagger_offset(i);
		hrtimer_set_expires(&state->quantum_timer,
			ns_to_ktime(quantum_timer_start + state->offset));
		cpus[i] = &state->topology;
		TRACE("cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
		INIT_LIST_HEAD(&state->out_of_budget);
		/* force rescheduling to start quantum timer */
		litmus_reschedule(i);

		WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
			KERN_ERR "WARNING: no high resolution timers available!?\n");
	}

	err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
				      cpus, num_online_cpus());

	if (err < 0)
		cleanup_clusters();
	else
		pfair_setup_domain_proc();

	return err;
}

static long pfair_deactivate_plugin(void)
{
	int cpu;
	struct pfair_state* state;

	for_each_online_cpu(cpu) {
		state = &per_cpu(pfair_state, cpu);
		TRACE("stopping quantum timer on CPU%d\n", cpu);
		hrtimer_cancel(&state->quantum_timer);
	}
	cleanup_clusters();
	destroy_domain_proc_info(&pfair_domain_proc_info);
	return 0;
}

/*	Plugin object	*/
static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "PFAIR",
	.task_new		= pfair_task_new,
	.task_exit		= pfair_task_exit,
	.schedule		= pfair_schedule,
	.task_wake_up		= pfair_task_wake_up,
	.task_block		= pfair_task_block,
	.admit_task		= pfair_admit_task,
	.complete_job		= complete_job,
	.activate_plugin	= pfair_activate_plugin,
	.deactivate_plugin	= pfair_deactivate_plugin,
	.get_domain_proc_info	= pfair_get_domain_proc_info,
};


static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;

static int __init init_pfair(void)
{
	int cpu, err, fs;
	struct pfair_state *state;

	/*
	 * initialize short_cut for per-cpu pfair state;
	 * there may be a problem here if someone removes a cpu
	 * while we are doing this initialization... and if cpus
	 * are added / removed later... but we don't support CPU hotplug atm anyway.
	 */
	pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);

	/* initialize CPU state */
	for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
		state = &per_cpu(pfair_state, cpu);
		hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
		state->quantum_timer.function = on_quantum_boundary;
		state->topology.id = cpu;
		state->cur_tick   = 0;
		state->local_tick = 0;
		state->linked     = NULL;
		state->local      = NULL;
		state->scheduled  = NULL;
		state->missed_quanta = 0;
		state->offset     = cpu_stagger_offset(cpu);
		pstate[cpu] = state;
	}

	pfair_clusters = NULL;
	num_pfair_clusters = 0;

	err = register_sched_plugin(&pfair_plugin);
	if (!err) {
		fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
		if (!fs)
			cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
		else
			printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
	}

	return err;
}

static void __exit clean_pfair(void)
{
	kfree(pstate);

	if (cluster_file)
		remove_proc_entry("cluster", pfair_dir);
	if (pfair_dir)
		remove_plugin_proc_dir(&pfair_plugin);
}

module_init(init_pfair);
module_exit(clean_pfair);