/**
 * litmus/sched_mc.c
 *
 * Implementation of the Mixed Criticality scheduling algorithm.
 *
 * (Per Mollison, Erickson, Anderson, Baruah, Scoredos 2010)
 */

#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/poison.h>

#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/sched_plugin.h>
#include <litmus/edf_common.h>
#include <litmus/sched_trace.h>
#include <litmus/domain.h>
#include <litmus/bheap.h>

#include <litmus/sched_mc.h>

/**
 * crit_entry_t - State of a CPU within each criticality level system.
 * @level	Criticality level of this entry
 * @linked	Logically running task, ghost or regular
 * @domain	Domain from which to draw tasks
 * @usable	False if a higher criticality task is running
 * @timer	For ghost task budget enforcement
 * @node	Used to sort crit_entries by preemptability in global domains
 * @dirty	Set to 1 by init_crit_entry(); no other use is visible in this
 *		code. NOTE(review): purpose unclear - confirm before relying
 *		on it.
 *
 * The crit_cpu() macro relies on @level being equal to this entry's index
 * in its owning cpu_entry_t's crit_entries array.
 */
typedef struct {
	enum crit_level		level;
	struct task_struct*	linked;
	domain_t*		domain;
	int			usable;
	struct hrtimer		timer;
	struct bheap_node*	node;
	atomic_t		dirty;
} crit_entry_t;

/**
 * cpu_entry_t - State of a CPU for the entire MC system
 * @cpu		  CPU id
 * @scheduled	  Task that is physically running
 * @linked	  Task that should be running / is logically running
 * @lock	  For serialization
 * @crit_entries  Array of CPU state per criticality level
 * @redir	  (CONFIG_PLUGIN_MC_REDIRECT only) Tasks whose arrival was
 *		  deferred by low_prio_arrival() on this CPU; drained by
 *		  fix_global_levels()
 * @redir_lock	  (CONFIG_PLUGIN_MC_REDIRECT only) Protects @redir
 */
typedef struct {
	int			cpu;
	struct task_struct*	scheduled;
	struct task_struct*	linked;
	raw_spinlock_t		lock;
	crit_entry_t		crit_entries[NUM_CRIT_LEVELS];
#ifdef CONFIG_PLUGIN_MC_REDIRECT
	struct list_head	redir;
	raw_spinlock_t		redir_lock;
#endif
} cpu_entry_t;

/**
 * domain_data_t - Wrap domains with related CPU state
 * @domain	A domain for a criticality level
 * @heap	The preemptable heap of crit entries (for global domains);
 *		NULL for partitioned domains, which is what is_global() tests
 * @crit_entry	The crit entry for this domain (for partitioned domains);
 *		NULL for global domains
 *
 * A domain_t pointer is mapped back to its wrapper with domain_data().
 */
typedef struct {
	domain_t 	domain;
	struct bheap*	heap;
	crit_entry_t*	crit_entry;
} domain_data_t;

/* Per-CPU state indexed by CPU id; init_mc() fills in the online CPUs. */
static cpu_entry_t* cpus[NR_CPUS];
#ifdef CONFIG_RELEASE_MASTER
/* CPU chosen in mc_activate_plugin(); redirected arrivals are handled there. */
static int interrupt_cpu;
#endif

/* Recover the domain_data_t wrapper that embeds the domain_t @dom */
#define domain_data(dom)  (container_of(dom, domain_data_t, domain))
/* Global domains own a CPU heap; partitioned domains leave heap NULL */
#define is_global(dom)    (domain_data(dom)->heap)
#define is_global_task(t) (is_global(get_task_domain(t)))
/* True if the task's rt_param list node is currently part of a list.
 * The head must be compared by address: an initialised, empty
 * list_head points at itself.
 */
#define is_in_list(t)	  (tsk_rt(t)->list.next != &tsk_rt(t)->list)
/* A task that is global AND still physically scheduled must not be put
 * back into its domain (see job_arrival()).
 */
#define can_requeue(t)					\
	(!is_global_task(t) || (t)->rt_param.scheduled_on == NO_CPU)
/* Criticality level of the task linked to the CPU, or the last level
 * when nothing is linked.
 */
#define entry_level(e) \
	(((e)->linked) ? tsk_mc_crit((e)->linked) : NUM_CRIT_LEVELS - 1)
/* Map a crit entry back to its CPU: stepping back by ce->level elements
 * yields crit_entries[0], from which the enclosing cpu_entry_t follows.
 */
#define crit_cpu(ce) \
	(container_of((void*)((ce) - (ce)->level), cpu_entry_t, crit_entries))
/* Useful debug macros */
/* TS is the format for one task; TA(t) expands to its FOUR arguments
 * (name-or-"ghost", pid, job number, domain name) and tolerates NULL.
 */
#define TS "(%s/%d:%d:%s)"
#define TA(t) (t) ? (is_ghost(t)) ? "ghost" : (t)->comm : "NULL",	\
	      (t) ? (t)->pid : 1,					\
	      (t) ? (t)->rt_param.job_params.job_no : 1,		\
	      ((t) && get_task_domain(t)) ? get_task_domain(t)->name : ""
#define TRACE_ENTRY(e, fmt, args...)				\
	TRACE("P%d, linked=" TS " " fmt "\n", \
	      (e)->cpu, TA((e)->linked), ##args)
#define TRACE_CRIT_ENTRY(ce, fmt, args...)			\
	TRACE("%s P%d, linked=" TS " " fmt "\n",		\
	      (ce)->domain->name, crit_cpu(ce)->cpu, TA((ce)->linked), ##args)
/* Replace the generic TRACE_TASK with one that prints the MC task info */
#undef  TRACE_TASK
#define TRACE_TASK(t, fmt, args...)				\
	TRACE(TS " " fmt "\n", TA(t), ##args)

/*
 * Sort CPUs within a global domain by the domain's priority function.
 *
 * Returns non-zero when the crit entry in @a is a better preemption
 * target than the one in @b, so that the heap's top is the CPU which
 * should be preempted first:
 *  - unusable entries can never be preempted and sort behind usable ones,
 *  - among usable entries, one with nothing linked sorts first,
 *  - otherwise the entry whose linked task has lower priority sorts first.
 */
static int cpu_lower_prio(struct bheap_node *a, struct bheap_node *b)
{
	domain_t *domain;
	crit_entry_t *first, *second;
	struct task_struct *first_link, *second_link;

	first  = a->value;
	second = b->value;
	first_link  = first->linked;
	second_link = second->linked;
	if (!first->usable || !second->usable) {
		/* At least one entry is unusable here, so testing that both
		 * are usable would always yield 0 and leave unusable entries
		 * unordered relative to usable ones.
		 */
		return first->usable && !second->usable;
	} else if (!first_link || !second_link) {
		return second_link && !first_link;
	} else {
		domain = get_task_domain(first_link);
		BUG_ON(domain != get_task_domain(second_link));
		return domain->higher_prio(second_link, first_link);
	}
}

/*
 * Decide whether @dom holds a ready task that should displace @curr.
 * @curr must belong to @dom (or be NULL when nothing is running).
 * Returns non-zero if a preemption is required.
 */
static noinline int mc_preempt_needed(domain_t *dom, struct task_struct* curr)
{
	struct task_struct *candidate = dom->peek_ready(dom);

	/* Nothing is ready, so nothing can preempt */
	if (!candidate)
		return 0;
	/* Something is ready and nothing is running */
	if (!curr)
		return 1;

	BUG_ON(tsk_mc_crit(candidate) != tsk_mc_crit(curr));
	return get_task_domain(candidate)->higher_prio(candidate, curr);
}

/*
 * Return the crit entry of the next CPU which should be preempted, or
 * NULL if the domain's CPU heap is empty.
 */
static inline crit_entry_t* lowest_prio_cpu(domain_t *dom)
{
	struct bheap_node *top;

	top = bheap_peek(cpu_lower_prio, domain_data(dom)->heap);
	if (!top)
		return NULL;
	return top->value;
}

/**
 * update_ghost_time() - Time accounting for ghost tasks.
 * Must be called before a decision is made involving the task's budget.
 */
static void update_ghost_time(struct task_struct *p)
{
	u64 clock = litmus_clock();
	u64 delta = clock - p->se.exec_start;
	BUG_ON(!is_ghost(p));
	if (unlikely ((s64)delta < 0)) {
		delta = 0;
		TRACE_TASK(p, "WARNING: negative time delta");
	}
	if (tsk_mc_data(p)->mc_job.ghost_budget <= delta) {
		TRACE_TASK(p, "Ghost job could have ended");
		tsk_mc_data(p)->mc_job.ghost_budget = 0;
		p->se.exec_start = clock;
	} else {
		TRACE_TASK(p, "Ghost job updated, but didn't finish");
		tsk_mc_data(p)->mc_job.ghost_budget -= delta;
		p->se.exec_start = clock;
	}
}

/**
 * link_task_to_crit() - Logically run a task at a criticality level.
 * @ce		Crit entry to link to
 * @task	Task to link, or NULL to only unlink the current task
 *
 * Caller must hold @ce's domain's lock.
 *
 * Any previously linked task is unlinked first; for a ghost this stops
 * the budget timer and charges the time it was linked. If @task is a
 * ghost, the budget timer is armed for its remaining ghost budget. For
 * global domains the entry is re-sorted in the domain's CPU heap.
 */
static void link_task_to_crit(crit_entry_t *ce,
			      struct task_struct *task)
{
	lt_t expiry;
	struct bheap *cpu_heap;
	struct task_struct *old;

	TRACE_CRIT_ENTRY(ce, "Linking " TS, TA(task));
	BUG_ON(!ce->usable && task);
	BUG_ON(task && tsk_rt(task)->linked_on != NO_CPU);
	BUG_ON(task && is_global(ce->domain) &&
	       !bheap_node_in_heap(ce->node));

	/* Detach whatever was linked here before */
	old = ce->linked;
	if (old) {
		TRACE_TASK(old, "Unlinking");
		old->rt_param.linked_on = NO_CPU;
		if (is_ghost(old)) {
			hrtimer_try_to_cancel(&ce->timer);
			/* An unfinished ghost job still needs its time charged */
			if (tsk_mc_data(old)->mc_job.ghost_budget > 0)
				update_ghost_time(old);
		}
	}

	/* Attach the new task, if any */
	ce->linked = task;
	if (task) {
		task->rt_param.linked_on = crit_cpu(ce)->cpu;
		if (is_ghost(task)) {
			/* Restart accounting and arm the budget timer */
			task->se.exec_start = litmus_clock();
			expiry = litmus_clock() +
				tsk_mc_data(task)->mc_job.ghost_budget;
			__hrtimer_start_range_ns(&ce->timer,
						 ns_to_ktime(expiry),
						 0 /* delta */,
						 HRTIMER_MODE_ABS_PINNED,
						 0 /* no wakeup */);
		}
	}

	/* The linked task changed, so re-sort this CPU in the global heap */
	if (is_global(ce->domain)) {
		cpu_heap = domain_data(ce->domain)->heap;
		bheap_delete(cpu_lower_prio, cpu_heap, ce->node);
		bheap_insert(cpu_lower_prio, cpu_heap, ce->node);
	}
}

static void check_for_preempt(domain_t*);
/**
 * job_arrival() - Called when a task re-enters the system.
 * @task	Task to admit; must not be NULL
 *
 * Requeues @task into its domain and checks that domain for preemptions.
 * A global task that is still physically scheduled is NOT requeued here;
 * mc_schedule() re-admits it once it has been descheduled.
 *
 * Takes the task's domain lock. The header originally asks for no locks
 * to be held by the caller.
 */
static void job_arrival(struct task_struct *task)
{
	domain_t *dom;

	/* Assert before the first dereference of @task */
	BUG_ON(!task);
	TRACE_TASK(task, "Job arriving");

	dom = get_task_domain(task);
	if (can_requeue(task)) {
		raw_spin_lock(dom->lock);
		dom->requeue(dom, task);
		check_for_preempt(dom);
		raw_spin_unlock(dom->lock);
	} else {
		/* If a global task is scheduled on one cpu, it CANNOT
		 * be requeued into a global domain. Another cpu might
		 * dequeue the global task before it is descheduled,
		 * causing the system to crash when the task is scheduled
		 * in two places simultaneously.
		 */
		TRACE_TASK(task, "Delayed arrival of scheduled task");
	}
}

/**
 * low_prio_arrival() - If CONFIG_PLUGIN_MC_REDIRECT is enabled, will
 * redirect lower priority job_arrival work to the interrupt_cpu.
 * @task	Task re-entering the system
 *
 * With redirection enabled and when not running on interrupt_cpu, the
 * task is appended to this CPU's redirect list and interrupt_cpu is asked
 * to reschedule; fix_global_levels() performs the arrival there. Unless
 * CONFIG_PLUGIN_MC_REDIRECT_ALL is set, only global tasks are redirected.
 * In every other case the arrival is processed immediately.
 */
static void low_prio_arrival(struct task_struct *task)
{
#ifdef CONFIG_PLUGIN_MC_REDIRECT
	cpu_entry_t *entry;

#ifndef CONFIG_PLUGIN_MC_REDIRECT_ALL
	/* Only global tasks are worth redirecting */
	if (!is_global_task(task)) {
		job_arrival(task);
		return;
	}
#endif
	if (smp_processor_id() != interrupt_cpu) {
		entry = cpus[smp_processor_id()];
		raw_spin_lock(&entry->redir_lock);
		list_add(&tsk_rt(task)->list, &entry->redir);
		raw_spin_unlock(&entry->redir_lock);
		litmus_reschedule(interrupt_cpu);
		return;
	}
#endif
	job_arrival(task);
}

#ifdef CONFIG_PLUGIN_MC_REDIRECT
/**
 * fix_global_levels() - Execute redirected job arrivals on this cpu.
 *
 * Each online CPU's redirect list is detached under its redir_lock and
 * the arrivals are then processed with that lock released. job_arrival()
 * takes domain locks, and low_prio_arrival() may take a redir_lock while
 * a domain lock is held, so holding redir_lock across job_arrival()
 * would invert that order.
 */
static void fix_global_levels(void)
{
	int c;
	cpu_entry_t *e;
	struct list_head *pos, *safe;
	struct task_struct *t;
	LIST_HEAD(pending);

	TRACE("Fixing global levels\n");
	for_each_online_cpu(c) {
		e = cpus[c];

		/* Grab everything queued so far; later additions are picked
		 * up by the next invocation.
		 */
		raw_spin_lock(&e->redir_lock);
		list_splice_init(&e->redir, &pending);
		raw_spin_unlock(&e->redir_lock);

		list_for_each_safe(pos, safe, &pending) {
			t = list_entry(pos, struct task_struct, rt_param.list);
			TRACE_TASK(t, "Arriving yo");
			BUG_ON(is_queued(t));
			list_del_init(pos);
			job_arrival(t);
		}
	}
}
#endif

/**
 * link_task_to_cpu() - Logically run a task on a CPU.
 * @entry	CPU to link to
 * @task	Non-ghost task to link, or NULL to leave the CPU unlinked
 *
 * The task must first have been linked to one of the CPU's crit_entries.
 * Caller must hold the entry lock.
 *
 * Crit entries from the previously linked level through the newly linked
 * level (inclusive) are marked usable.
 */
static void link_task_to_cpu(cpu_entry_t *entry, struct task_struct *task)
{
	/* Level of the outgoing task; must be sampled before relinking */
	int level = entry_level(entry);
	int last;

	TRACE_TASK(task, "Linking to P%d", entry->cpu);
	BUG_ON(task && tsk_rt(task)->linked_on != entry->cpu);
	BUG_ON(task && is_ghost(task));

	if (task)
		set_rt_flags(task, RT_F_RUNNING);
	entry->linked = task;

	/* Level of the incoming task (or the last level if none) */
	last = entry_level(entry);
	while (level <= last) {
		TRACE_CRIT_ENTRY(&entry->crit_entries[level], "now usable");
		entry->crit_entries[level].usable = 1;
		level++;
	}
}

/**
 * preempt() - Preempt a logically running task with a higher priority one.
 * @dom	Domain from which to draw higher priority task
 * @ce	CPU criticality level to preempt
 *
 * Caller must hold the lock for @dom and @ce's CPU lock.
 *
 * The displaced task goes back into @dom when can_requeue() allows it.
 * A non-ghost replacement is also linked to the CPU and a reschedule of
 * the physically running task is requested.
 */
static void preempt(domain_t *dom, crit_entry_t *ce)
{
	struct task_struct *incoming = dom->take_ready(dom);
	struct task_struct *outgoing;
	cpu_entry_t *cpu = crit_cpu(ce);

	BUG_ON(!incoming);
	TRACE_CRIT_ENTRY(ce, "Preempted by " TS, TA(incoming));

	/* Per-domain preemption */
	outgoing = ce->linked;
	if (outgoing && can_requeue(outgoing))
		dom->requeue(dom, outgoing);
	link_task_to_crit(ce, incoming);

	/* Ghosts only run logically; real execution is left untouched */
	if (is_ghost(incoming))
		return;

	link_task_to_cpu(cpu, incoming);
	preempt_if_preemptable(cpu->scheduled, cpu->cpu);
}

/**
 * update_crit_levels() - Update criticality entries for the new cpu state.
 * This should be called after a new task has been linked to @entry.
 * The caller must hold the @entry->lock, but this method will release it.
 *
 * Every crit entry below the linked task's level is marked unusable and
 * its task unlinked; once the lock is dropped those tasks are handed to
 * low_prio_arrival().
 */
static void update_crit_levels(cpu_entry_t *entry)
{
	int lvl;
	crit_entry_t *ce;
	/* Only slots at index >= first_lower are ever written or read */
	struct task_struct *evicted[NUM_CRIT_LEVELS];
	const int first_lower = entry_level(entry) + 1;

	/* Remove lower priority tasks from the entry */
	for (lvl = first_lower; lvl < NUM_CRIT_LEVELS; lvl++) {
		ce = &entry->crit_entries[lvl];
		evicted[lvl] = ce->linked;
		ce->usable = 0;
		if (evicted[lvl])
			link_task_to_crit(ce, NULL);
	}

	/* Need to unlock so we can access domains */
	raw_spin_unlock(&entry->lock);

	/* Re-admit tasks to the system */
	for (lvl = first_lower; lvl < NUM_CRIT_LEVELS; lvl++) {
		ce = &entry->crit_entries[lvl];
		TRACE("Checking %s\n", ce->domain->name);
		if (evicted[lvl])
			low_prio_arrival(evicted[lvl]);
	}
}

/**
 * check_for_preempt() - Causes a preemption if higher-priority tasks are ready.
 * @dom	Domain whose ready tasks are checked against its CPU(s)
 *
 * Caller must hold domain lock.
 * Makes gigantic nasty assumption that there is 1 global criticality level,
 * and it is the last one in each list, so it doesn't call update_crit..
 *
 * For a global domain, the most preemptable CPU is preempted repeatedly
 * until the CPU at the top of the heap needs no preemption. For a
 * partitioned domain only the domain's own crit entry is examined, and a
 * successful preemption also demotes the lower levels of that CPU.
 */
static void check_for_preempt(domain_t *dom)
{
	/* Starts at 1 so the global loop below runs at least once */
	int preempted = 1;
	cpu_entry_t *entry;
	crit_entry_t *ce;

	if (is_global(dom)) {
		/* Loop until we find a non-preemptable CPU */
		while ((ce = lowest_prio_cpu(dom)) && preempted) {
			entry = crit_cpu(ce);
			preempted = 0;
			raw_spin_lock(&entry->lock);
			if (ce->usable && dom->preempt_needed(dom, ce->linked)){
				preempted = 1;
				/* preempt() relinks ce, which re-sorts the heap */
				preempt(dom, ce);
			}
			raw_spin_unlock(&entry->lock);
		}
	} else /* Partitioned */ {
		ce = domain_data(dom)->crit_entry;
		entry = crit_cpu(ce);
		raw_spin_lock(&entry->lock);
		if (ce->usable && dom->preempt_needed(dom, ce->linked)) {
			preempt(dom, ce);
			/* update_crit_levels() releases entry->lock itself */
			update_crit_levels(entry);
		} else {
			raw_spin_unlock(&entry->lock);
		}
	}
}

/**
 * remove_from_all() - Logically remove a task from all structures.
 * @task	Task to remove; must not be NULL
 *
 * Caller must hold no locks.
 *
 * A linked task is unlinked from its crit entry and, if it is also the
 * task linked to the CPU, from the CPU (after which the CPU's lower
 * levels are updated). A task that is not linked but still queued is
 * removed from its ready queue instead.
 */
static void remove_from_all(struct task_struct* task)
{
	int update = 0;
	cpu_entry_t *entry;
	crit_entry_t *ce;
	domain_t *dom;

	/* Assert before the first dereference of @task */
	BUG_ON(!task);
	TRACE_TASK(task, "Removing from everything");
	dom = get_task_domain(task);

	raw_spin_lock(dom->lock);

	if (task->rt_param.linked_on != NO_CPU) {
		entry = cpus[task->rt_param.linked_on];
		raw_spin_lock(&entry->lock);

		/* Unlink only if task is still linked post lock */
		ce = &entry->crit_entries[tsk_mc_crit(task)];
		if (task->rt_param.linked_on != NO_CPU) {
			/* The task is linked at its criticality level. Ghosts
			 * are never linked to the CPU itself, so the CPU-level
			 * link cannot be asserted on for every task.
			 */
			BUG_ON(ce->linked != task);
			link_task_to_crit(ce, NULL);
			if (!is_ghost(task) && entry->linked == task) {
				update = 1;
				link_task_to_cpu(entry, NULL);
			}
		}

		/* update_crit_levels() releases entry->lock itself */
		if (update)
			update_crit_levels(entry);
		else
			raw_spin_unlock(&entry->lock);
	} else if (is_queued(task)) {
		/* This is an interesting situation: t is scheduled,
		 * but was just recently unlinked. It cannot be
		 * linked anywhere else (because then it would have
		 * been relinked to this CPU), thus it must be in some
		 * queue. We must remove it from the list in this
		 * case.
		 */
		remove((rt_domain_t*)get_task_domain(task)->data, task);
	}
	BUG_ON(is_queued(task));
	raw_spin_unlock(dom->lock);
}

/**
 * job_completion() - Update task state and re-enter it into the system.
 * @task	Task whose job completed; must not be NULL
 * @forced	Passed through to sched_trace_task_completion()
 *
 * Converts tasks which have completed their execution early into ghost jobs.
 * Caller must hold no locks.
 */
static void job_completion(struct task_struct *task, int forced)
{
	/* Assert before @task is handed to anything that may dereference it */
	BUG_ON(!task);
	TRACE_TASK(task, "Completed");
	sched_trace_task_completion(task, forced);

	/* Logically stop the task execution */
	set_rt_flags(task, RT_F_SLEEP);
	remove_from_all(task);

	/* If it's not a ghost job, do ghost job conversion */
	if (!is_ghost(task)) {
		tsk_mc_data(task)->mc_job.ghost_budget = budget_remaining(task);
		tsk_mc_data(task)->mc_job.is_ghost = 1;
	}

	/* If the task is a ghost job with no budget, it either exhausted
	 * its ghost budget or there was no ghost budget after the job
	 * conversion. Revert back to a normal task and complete the period.
	 */
	if (tsk_mc_data(task)->mc_job.ghost_budget == 0) {
		tsk_mc_data(task)->mc_job.is_ghost = 0;
		prepare_for_next_period(task);
		if (is_released(task, litmus_clock()))
			sched_trace_task_release(task);
	}

	/* Requeue non-blocking tasks */
	if (is_running(task))
		job_arrival(task);
}

/**
 * mc_ghost_exhausted() - Complete logically running ghost task.
 */
static enum hrtimer_restart mc_ghost_exhausted(struct hrtimer *timer)
{
	unsigned long flags;
	struct task_struct *tmp = NULL;
	crit_entry_t *ce = container_of(timer, crit_entry_t, timer);;

	local_irq_save(flags);
	TRACE_CRIT_ENTRY(ce, "Ghost exhausted firing");

	/* Due to race conditions, we cannot just set the linked
	 * task's budget to 0 as it may no longer be the task
	 * for which this timer was armed. Instead, update the running
	 * task time and see if this causes exhaustion.
	 */
	raw_spin_lock(&crit_cpu(ce)->lock);
	if (ce->linked && is_ghost(ce->linked)) {
		update_ghost_time(ce->linked);
		if (tsk_mc_data(ce->linked)->mc_job.ghost_budget == 0) {
			tmp = ce->linked;
			link_task_to_crit(ce, NULL);
		}
	}
	raw_spin_unlock(&crit_cpu(ce)->lock);

	if (tmp)
		job_completion(tmp, 0);

	local_irq_restore(flags);
	return HRTIMER_NORESTART;
}

/**
 * mc_release_jobs() - Add heap of tasks to the system, check for preemptions.
 * @rt		Ready queue the released tasks are merged into
 * @tasks	Heap of released tasks; dereferenced without an emptiness
 *		check, so it must contain at least one task
 *
 * The domain is taken from the first task in @tasks.
 */
static void mc_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
	unsigned long flags;
	struct bheap_node *top = bheap_peek(rt->order, tasks);
	struct task_struct *head = top->value;
	domain_t *dom = get_task_domain(head);

	raw_spin_lock_irqsave(dom->lock, flags);

	TRACE_TASK(head, "Jobs released");
	__merge_ready(rt, tasks);
	check_for_preempt(dom);

	raw_spin_unlock_irqrestore(dom->lock, flags);
}

/**
 * mc_task_new() - Setup new mixed-criticality task.
 * @t		The new task
 * @on_rq	Unused here
 * @running	Non-zero if @t is currently executing on its CPU
 *
 * Assumes that there are no partitioned domains after level B.
 */
static void mc_task_new(struct task_struct *t, int on_rq, int running)
{
	unsigned long flags;
	cpu_entry_t* entry;
	enum crit_level level = tsk_mc_crit(t);

	local_irq_save(flags);
	TRACE("New mixed criticality task %d\n", t->pid);

	/* Assign domain: levels below C use the task's partition, the
	 * rest use the CPU the task is currently on.
	 */
	entry = (level < CRIT_LEVEL_C) ? cpus[get_partition(t)]
				       : cpus[task_cpu(t)];
	t->rt_param._domain = entry->crit_entries[level].domain;

	/* Setup job params: released now, as a regular (non-ghost) job */
	release_at(t, litmus_clock());
	tsk_mc_data(t)->mc_job.ghost_budget = 0;
	tsk_mc_data(t)->mc_job.is_ghost = 0;

	/* Record where the task is physically running, if anywhere */
	if (!running) {
		t->rt_param.scheduled_on = NO_CPU;
	} else {
		BUG_ON(entry->scheduled);
		entry->scheduled = t;
		tsk_rt(t)->scheduled_on = entry->cpu;
	}
	t->rt_param.linked_on = NO_CPU;

	job_arrival(t);

	local_irq_restore(flags);
}

/**
 * mc_task_wake_up() - Add task back into its domain, check for preemptions.
 * @task	Task which woke up
 *
 * A tardy task gets a fresh release at the current time. Ghost tasks are
 * not re-admitted here.
 */
static void mc_task_wake_up(struct task_struct *task)
{
	unsigned long flags;
	lt_t now = litmus_clock();

	local_irq_save(flags);
	TRACE_TASK(task, "Wakes up");

	if (is_tardy(task, now)) {
		/* Task missed its last release */
		release_at(task, now);
		sched_trace_task_release(task);
	}

	if (is_ghost(task))
		goto out;
	job_arrival(task);
out:
	local_irq_restore(flags);
}

/**
 * mc_task_block() - Remove task from state to prevent it being run anywhere.
 * @task	Task which is blocking
 *
 * Runs with local interrupts disabled for the duration of the removal.
 */
static void mc_task_block(struct task_struct *task)
{
	unsigned long flags;

	local_irq_save(flags);

	TRACE_TASK(task, "Block at %llu", litmus_clock());
	remove_from_all(task);

	local_irq_restore(flags);
}

/**
 * mc_task_exit() - Remove task from the system.
 */
static void mc_task_exit(struct task_struct *task)
{
	unsigned long flags;
	local_irq_save(flags);
	BUG_ON(!is_realtime(task));
        TRACE_TASK(task, "RIP");

	remove_from_all(task);
	if (tsk_rt(task)->scheduled_on != NO_CPU) {
		cpus[tsk_rt(task)->scheduled_on]->scheduled = NULL;
		tsk_rt(task)->scheduled_on = NO_CPU;
	}

	local_irq_restore(flags);
}

/**
 * mc_admit_task() - Check whether the task may join the plugin.
 * @task	Candidate task
 *
 * Returns 0 if the task is admitted and -EINVAL if it has no
 * mixed-criticality data or is a partitioned task without a partition.
 * Assumes there are no partitioned levels after level B.
 */
static long mc_admit_task(struct task_struct* task)
{
	long status = -EINVAL;

	if (!tsk_mc_data(task))	{
		printk(KERN_WARNING "Tried to admit task with no criticality "
			"level\n");
	} else if (tsk_mc_crit(task) < CRIT_LEVEL_C &&
		   get_partition(task) == NO_CPU) {
		printk(KERN_WARNING "Tried to admit partitioned task with no "
		       "partition\n");
	} else {
		printk(KERN_INFO "Admitted task with criticality level %d\n",
			tsk_mc_crit(task));
		status = 0;
	}
	return status;
}

/**
 * mc_schedule() - Return next task which should be scheduled.
 * @prev	Task that was running on this CPU before the call
 *
 * Processes what happened to the previously scheduled task (block,
 * completion, or preemption of a global task), then, if no task is
 * linked to this CPU, walks the criticality levels in order and links
 * the first ready non-ghost task found. Ghost tasks found on the way
 * are linked to their crit entry only.
 *
 * Returns the task linked to this CPU, or NULL if there is none.
 */
static struct task_struct* mc_schedule(struct task_struct * prev)
{
	unsigned long flags;
	domain_t *dom;
	crit_entry_t *ce;
	cpu_entry_t* entry = cpus[smp_processor_id()];
	int i, out_of_time, sleep, preempt, exists, blocks, global, lower;
	struct task_struct *dtask = NULL, *ready_task = NULL, *next = NULL;

	local_irq_save(flags);

	/* Sanity checking */
	BUG_ON(entry->scheduled && entry->scheduled != prev);
	BUG_ON(entry->scheduled && !is_realtime(prev));
	BUG_ON(is_realtime(prev) && !entry->scheduled);

	/* Determine state */
	exists      = entry->scheduled != NULL;
	blocks      = exists && !is_running(entry->scheduled);
	out_of_time = exists &&	budget_enforced(entry->scheduled) &&
				budget_exhausted(entry->scheduled);
	sleep	    = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
	global      = exists && is_global_task(entry->scheduled);
	preempt     = entry->scheduled != entry->linked;
	/* Guarded by exists: on an idle CPU with a freshly linked task,
	 * entry->scheduled is NULL and must not be dereferenced.
	 */
	lower       = exists && preempt && entry->linked &&
		tsk_mc_crit(entry->scheduled) < tsk_mc_crit(entry->linked);

	if (exists) {
		entry->scheduled->rt_param.scheduled_on = NO_CPU;
		TRACE_TASK(prev,
			   "blocks:%d out_of_time:%d sleep:%d preempt:%d "
			   "state:%d sig:%d global:%d",
			   blocks, out_of_time, sleep, preempt,
			   prev->state, signal_pending(prev), global);
	}

#ifdef CONFIG_PLUGIN_MC_REDIRECT
	/* Arrivals redirected by other CPUs are processed here */
	if (smp_processor_id() == interrupt_cpu)
		fix_global_levels();
#endif
	/* If a task blocks we have no choice but to reschedule */
	if (blocks)
		remove_from_all(entry->scheduled);
	/* Any task which exhausts its budget or sleeps waiting for its next
	 * period completes unless its execution has been forcibly stopped.
	 */
	if ((out_of_time || sleep) && !blocks && !preempt)
		job_completion(entry->scheduled, !sleep);
	/* Global scheduled tasks must wait for a deschedule before they
	 * can rejoin the global state. Rejoin them here.
	 */
	else if (global && preempt && !blocks) {
		if (lower)
			low_prio_arrival(entry->scheduled);
		else
			job_arrival(entry->scheduled);
	}

	/* Pick next task if none is linked */
	raw_spin_lock(&entry->lock);
	for (i = 0; i < NUM_CRIT_LEVELS && !entry->linked; i++) {
		ce = &entry->crit_entries[i];
		dom = ce->domain;

		/* Swap locks. We cannot acquire a domain lock while
		 * holding an entry lock or deadlocks will happen.
		 */
		raw_spin_unlock(&entry->lock);
		raw_spin_lock(dom->lock);
		raw_spin_lock(&entry->lock);

		/* State may have changed while the entry lock was dropped,
		 * so everything is re-checked under both locks.
		 */
		dtask = dom->peek_ready(dom);
		if (!entry->linked && ce->usable && !ce->linked && dtask) {
			dom->take_ready(dom);
			link_task_to_crit(ce, dtask);
			ready_task = (is_ghost(dtask)) ? NULL : dtask;

			/* Task found! */
			if (ready_task) {
				link_task_to_cpu(entry, ready_task);
				raw_spin_unlock(dom->lock);
				/* Releases entry->lock; retake it for the
				 * loop condition and the code below.
				 */
				update_crit_levels(entry);
				raw_spin_lock(&entry->lock);
				continue;
			}
		}
		raw_spin_unlock(dom->lock);
	}

	/* Schedule next task */
	next = entry->linked;
	entry->scheduled = next;
	if (entry->scheduled)
		entry->scheduled->rt_param.scheduled_on = entry->cpu;
	sched_state_task_picked();

	raw_spin_unlock(&entry->lock);
	local_irq_restore(flags);
	if (next)
		TRACE_TASK(next, "Scheduled at %llu", litmus_clock());
	else if (exists && !next)
		TRACE("Becomes idle at %llu\n", litmus_clock());
	return next;
}

/*
 * Plugin activation hook. With CONFIG_RELEASE_MASTER, interrupt_cpu is set
 * to the release master CPU, or to CPU 0 when no release master is
 * configured. Always returns 0.
 */
static long mc_activate_plugin(void)
{
#ifdef CONFIG_RELEASE_MASTER
	int master = atomic_read(&release_master_cpu);

	interrupt_cpu = (master == NO_CPU) ? 0 : master;
#endif
	return 0;
}

/* **************************************************************************
 * Initialization
 * ************************************************************************** */

/* Plugin descriptor registered by init_mc(). Every callback is defined in
 * this file except complete_job, which comes from elsewhere.
 */
static struct sched_plugin mc_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "MC",
	.task_new		= mc_task_new,
	.complete_job		= complete_job,
	.task_exit		= mc_task_exit,
	.schedule		= mc_schedule,
	.task_wake_up		= mc_task_wake_up,
	.task_block		= mc_task_block,
	.admit_task		= mc_admit_task,
	.activate_plugin	= mc_activate_plugin,
};

/* Initialize values here so that they are allocated with the module
 * and destroyed when the module is unloaded.
 */
/* Per-CPU scheduler state; cpus[] points into this */
DEFINE_PER_CPU(cpu_entry_t, _mc_cpus);
/* LVL-A: one partitioned domain and ready queue per CPU */
DEFINE_PER_CPU(domain_data_t, _mc_crit_a);
DEFINE_PER_CPU(rt_domain_t, _mc_crit_a_rt);
/* LVL-B: one partitioned domain and ready queue per CPU */
DEFINE_PER_CPU(domain_data_t, _mc_crit_b);
DEFINE_PER_CPU(rt_domain_t, _mc_crit_b_rt);
/* LVL-C: a single global domain, plus the heap (and its per-CPU nodes)
 * which orders CPUs by preemptability
 */
static domain_data_t _mc_crit_c;
static rt_domain_t _mc_crit_c_rt;
struct bheap _mc_heap_c;
struct bheap_node _mc_nodes_c[NR_CPUS];

/*
 * Put a crit entry into its initial state: nothing linked, usable, and
 * with its ghost budget timer set up to call mc_ghost_exhausted().
 * @node is the entry's heap node for global domains, NULL otherwise.
 */
static void init_crit_entry(crit_entry_t *ce, enum crit_level level,
			    domain_data_t *dom_data,
			    struct bheap_node *node)
{
	/* Identity and wiring */
	ce->level  = level;
	ce->domain = &dom_data->domain;
	ce->node   = node;

	/* Initial scheduling state */
	ce->linked = NULL;
	ce->usable = 1;
	atomic_set(&ce->dirty, 1);

	/* Ghost budget enforcement */
	hrtimer_init(&ce->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	ce->timer.function = mc_ghost_exhausted;
}

/*
 * Set up a partitioned domain: it has no CPU heap and is bound to the
 * single crit entry of @entry at @level.
 */
static void init_local_domain(cpu_entry_t *entry, domain_data_t *dom_data,
			      enum crit_level level)
{
	crit_entry_t *ce = &entry->crit_entries[level];

	dom_data->heap = NULL;
	dom_data->crit_entry = ce;
	init_crit_entry(ce, level, dom_data, NULL);
}

/*
 * Set up a global domain: it has no single crit entry. Instead the crit
 * entry at @level of every online CPU is initialised, given the heap node
 * nodes[cpu], and inserted into @heap.
 * cpus[] must already be populated for the online CPUs.
 */
static void init_global_domain(domain_data_t *dom_data, enum crit_level level,
			       struct bheap *heap, struct bheap_node *nodes)
{
	int cpu;
	crit_entry_t *ce;
	struct bheap_node *slot;

	dom_data->heap = heap;
	dom_data->crit_entry = NULL;
	bheap_init(heap);

	for_each_online_cpu(cpu) {
		slot = &nodes[cpu];
		ce = &cpus[cpu]->crit_entries[level];

		init_crit_entry(ce, level, dom_data, slot);
		bheap_node_init(&ce->node, ce);
		bheap_insert(cpu_lower_prio, heap, slot);
	}
}

/*
 * Initialise @dom on top of the ready queue @rt with EDF ordering and
 * priority comparison, using this plugin's release and preemption-check
 * callbacks.
 */
static inline void init_edf_domain(domain_t *dom, rt_domain_t *rt)
{
	pd_domain_init(dom, rt,
		       edf_ready_order,
		       NULL,
		       mc_release_jobs,
		       mc_preempt_needed,
		       edf_higher_prio);
}

/*
 * Module entry point: builds the per-CPU state and the level A/B
 * (partitioned, per CPU) and level C (global) domains, then registers the
 * plugin. Returns the result of register_sched_plugin().
 */
static int __init init_mc(void)
{
	int cpu;
	cpu_entry_t *entry;
	rt_domain_t *rt;
	domain_data_t *dom_data;
	/* One distinctly named variable per level.
	 * NOTE(review): presumably so lock debugging reports a distinct
	 * name per level - confirm.
	 */
	raw_spinlock_t *a_dom, *b_dom, *c_dom; /* For lock debugger */

	for_each_online_cpu(cpu) {
		entry = &per_cpu(_mc_cpus, cpu);
		cpus[cpu] = entry;

		/* CPU */
		entry->cpu = cpu;
		entry->scheduled = NULL;
		entry->linked = NULL;
		raw_spin_lock_init(&entry->lock);
#ifdef CONFIG_PLUGIN_MC_REDIRECT
		raw_spin_lock_init(&entry->redir_lock);
		INIT_LIST_HEAD(&entry->redir);
#endif

		/* CRIT_LEVEL_A */
		dom_data = &per_cpu(_mc_crit_a, cpu);
		rt = &per_cpu(_mc_crit_a_rt, cpu);
		init_local_domain(entry, dom_data, CRIT_LEVEL_A);
		init_edf_domain(&dom_data->domain, rt);
		/* The domain's lock pointer is read only after
		 * init_edf_domain() has run
		 */
		a_dom = dom_data->domain.lock;
		raw_spin_lock_init(a_dom);
		dom_data->domain.name = "LVL-A";

		/* CRIT_LEVEL_B */
		dom_data = &per_cpu(_mc_crit_b, cpu);
		rt = &per_cpu(_mc_crit_b_rt, cpu);
		init_local_domain(entry, dom_data, CRIT_LEVEL_B);
		init_edf_domain(&dom_data->domain, rt);
		b_dom = dom_data->domain.lock;
		raw_spin_lock_init(b_dom);
		dom_data->domain.name = "LVL-B";
	}

	/* CRIT_LEVEL_C */
	/* Must follow the loop above: init_global_domain() walks cpus[] */
	init_global_domain(&_mc_crit_c, CRIT_LEVEL_C,
			   &_mc_heap_c, _mc_nodes_c);
	init_edf_domain(&_mc_crit_c.domain, &_mc_crit_c_rt);
	c_dom = _mc_crit_c.domain.lock;
	raw_spin_lock_init(c_dom);
	_mc_crit_c.domain.name = "LVL-C";

	return register_sched_plugin(&mc_plugin);
}

module_init(init_mc);