/*
* litmus/sched_mc.c
* Implementation of the Mixed Criticality scheduling algorithm.
*
* (Per Mollison, Erickson, Anderson, Baruah, Scoredos 2010)
* TODO: optimize reschedule
*/
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/poison.h>
#include <linux/pid.h>
#include <litmus/litmus.h>
#include <litmus/trace.h>
#include <litmus/jobs.h>
#include <litmus/sched_plugin.h>
#include <litmus/edf_common.h>
#include <litmus/sched_trace.h>
#include <litmus/domain.h>
#include <litmus/bheap.h>
#include <litmus/event_group.h>
#include <litmus/budget.h>
#include <litmus/server.h>
#include <litmus/sched_mc.h>
#include <litmus/ce_domain.h>
#include <litmus/dgl.h>
#include <litmus/color.h>
#include <litmus/way_tracker.h>
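/* Per-CPU signal bits, set remotely or from group-lock callbacks and
 * consumed by the owning CPU in process_signals() during mc_schedule():
 * 'update' asks the CPU to re-evaluate its group-lock state, 'preempt'
 * asks it to check for a partitioned preemption.
 */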
struct mc_signal {
int update:1;
int preempt:1;
};
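/* Per-CPU scheduling state. @scheduled is the task actually running on the
 * CPU, @linked is the task the plugin wants running there, and
 * @will_schedule is the task picked by the last invocation of mc_schedule().
 * One crit_entry exists per criticality level; @lock protects the entry's
 * scheduling state and @signal_lock the signal bits.
 */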
struct cpu_entry {
int cpu;
struct crit_entry crit_entries[NUM_CRIT_LEVELS];
struct task_struct* scheduled;
struct task_struct* will_schedule;
struct task_struct* linked;
struct mc_signal signal;
raw_spinlock_t lock;
raw_spinlock_t signal_lock;
#ifdef CONFIG_PLUGIN_MC_REDIRECT
struct list_head redir;
raw_spinlock_t redir_lock;
#endif
#ifdef CONFIG_MERGE_TIMERS
struct event_group *event_group;
#endif
};
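/* Group lock (DGL) arbitrating cache colors/ways among level-B tasks when
 * the locking protocol is enabled (lock_cache); dgl_lock serializes all
 * accesses to it. interrupt_cpu caches the release master, if any.
 */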
static struct dgl group_lock;
static raw_spinlock_t dgl_lock;
DEFINE_PER_CPU(struct cpu_entry, cpus);
static int interrupt_cpu;
#define has_resources(t, c) (tsk_rt(t)->req == group_lock.acquired[c])
#define domain_data(dom) (container_of(dom, struct domain_data, domain))
#define is_global(dom) (domain_data(dom)->heap)
#define is_global_task(t) (is_global(get_task_domain(t)))
#define can_requeue(t) \
((t)->rt_param.linked_on == NO_CPU && /* Not linked anywhere */ \
!is_queued(t) && /* Not gonna be linked */ \
(!is_global_task(t) || (t)->rt_param.scheduled_on == NO_CPU))
#define entry_level(e) \
(((e)->linked) ? tsk_mc_crit((e)->linked) : NUM_CRIT_LEVELS - 1)
#define get_crit_entry_for(cpu, level) (&per_cpu(cpus, cpu).crit_entries[level])
#define crit_cpu(ce) \
(container_of((void*)((ce) - (ce)->level), struct cpu_entry, crit_entries))
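/* The lock ordering appears to be: domain lock -> cpu_entry lock -> dgl_lock,
 * with signal_lock innermost. pick_next_task() drops the entry lock before
 * taking a domain lock to respect this order.
 */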
static void clear_signal(struct mc_signal *signal)
{
signal->update = signal->preempt = 0;
}
/*
* Put in requests for resources needed by @t.
*/
static int acquire_resources(struct task_struct *t)
{
int cpu, acquired;
struct server *task_server;
struct cpu_entry *entry;
if (!lock_cache)
return 1;
BUG_ON(tsk_rt(t)->linked_on == NO_CPU);
raw_spin_lock(&dgl_lock);
cpu = tsk_rt(t)->linked_on;
task_server = &tsk_rt(t)->server;
	if (!cache_preempt && is_kernel_np(t)) {
		TRACE_MC_TASK(t, "Already contending for resources\n");
		acquired = has_resources(t, cpu);
		raw_spin_unlock(&dgl_lock);
		return acquired;
	}
if (!has_resources(t, cpu)) {
sched_trace_task_block(t);
server_state_change(task_server, SS_BLOCKED, 0);
TRACE_MC_TASK(t, "Blocked at %llu\n", litmus_clock());
add_group_req(&group_lock, tsk_rt(t)->req, cpu);
if (!cache_preempt)
make_np(t);
}
acquired = has_resources(t, cpu);
if (acquired) {
entry = &per_cpu(cpus, cpu);
entry->signal.update = 0;
}
raw_spin_unlock(&dgl_lock);
return acquired;
}
static void release_resources(struct task_struct *t)
{
struct server *task_server = &tsk_rt(t)->server;
if (!lock_cache)
return;
raw_spin_lock(&dgl_lock);
server_state_change(task_server, SS_REMOVED, 0);
if (cache_preempt || is_kernel_np(t)) {
TRACE_MC_TASK(t, "Releasing resources\n");
remove_group_req(&group_lock, tsk_rt(t)->req);
take_np(t);
} else if (!cache_preempt) {
TRACE_MC_TASK(t, "No resources to release!\n");
}
raw_spin_unlock(&dgl_lock);
}
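/* Rough life cycle of a level-B resource request (a sketch inferred from the
 * code above and the signal handling below, assuming lock_cache is enabled):
 *
 *   link_task_to_crit(ce, t)
 *     -> acquire_resources(t): add_group_req() files the color/way request;
 *        if it is not granted immediately the server goes SS_BLOCKED.
 *   ...the group lock later grants the request on CPU c...
 *   cpu_acquired(greq)
 *     -> cpu_update(): sets cpus[c].signal.update and reschedules CPU c.
 *   mc_schedule() on CPU c
 *     -> process_signals() -> process_update_signal():
 *        update_group_req(), unblock the server, start_crit().
 */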
static int dumb_acquire(struct task_struct *t)
{
struct server *server = &tsk_rt(t)->server;
server_state_change(server, SS_ACTIVE, 0);
return 1;
}
static void dumb_release(struct task_struct *t)
{
struct server *server = &tsk_rt(t)->server;
server_state_change(server, SS_REMOVED, 0);
}
#define fully_removed(s) ((s)->state == SS_REMOVED && !(s)->in_transit)
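/* Crit-entry servers cycle through SS_ACTIVE (may link and run tasks),
 * SS_BLOCKED (the linked task is waiting on the group lock) and SS_REMOVED
 * (masked by a higher-criticality task linked to the same CPU). The third
 * argument of server_state_change() appears to mark the change as in transit
 * until the entry's heap position is fixed up by fix_crit_position().
 */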
/*
* Sort CPUs within a global domain's heap.
*/
static int cpu_lower_prio(struct bheap_node *a, struct bheap_node *b)
{
struct domain *domain;
struct crit_entry *first, *second;
struct task_struct *first_link, *second_link;
first = a->value;
second = b->value;
first_link = first->server.linked;
second_link = second->server.linked;
if (fully_removed(&first->server) || fully_removed(&second->server)){
/* Removed entries go at the back of the heap */
return fully_removed(&second->server) &&
!fully_removed(&first->server);
} else if (!first_link || !second_link) {
/* Entry with nothing scheduled is lowest priority (front) */
return second_link && !first_link;
} else {
/* Sort by deadlines of tasks (later deadlines first) */
domain = get_task_domain(first_link);
return domain->higher_prio(second_link, first_link);
}
}
/*
* Return true if the domain has a higher priority ready task. The @curr
* task must belong to the domain.
*/
static int mc_preempt_needed(struct domain *dom, struct task_struct* curr)
{
struct task_struct *next = dom->peek_ready(dom);
if (!next || !curr) {
return next && !curr;
} else {
BUG_ON(tsk_mc_crit(next) != tsk_mc_crit(curr));
return !is_np(curr) &&
get_task_domain(next)->higher_prio(next, curr);
}
}
/*
* Update crit entry position in a global heap. Caller must hold
* @ce's domain lock.
*/
static void update_crit_position(struct crit_entry *ce)
{
struct bheap *heap;
if (is_global(ce->domain)) {
heap = domain_data(ce->domain)->heap;
BUG_ON(!heap);
BUG_ON(!bheap_node_in_heap(ce->node));
bheap_delete(cpu_lower_prio, heap, ce->node);
bheap_insert(cpu_lower_prio, heap, ce->node);
}
}
/*
* Update crit entry position in a global heap if it has been marked
* for update. Caller must hold @ce's domain lock.
*/
static void fix_crit_position(struct crit_entry *ce)
{
struct server *server = &ce->server;
if (is_global(ce->domain) && server->in_transit) {
server_state_change(server, server->state, 0);
update_crit_position(ce);
}
}
/*
 * Return the next CPU which should be preempted, or NULL if the domain has
 * no preemptable CPUs. Caller must hold the @dom lock.
*/
static struct crit_entry* lowest_prio_cpu(struct domain *dom)
{
struct bheap *heap = domain_data(dom)->heap;
struct bheap_node* hn;
struct crit_entry *ce, *res = NULL;
do {
hn = bheap_peek(cpu_lower_prio, heap);
ce = (hn) ? hn->value : NULL;
if (ce) {
if (ce->server.in_transit)
fix_crit_position(ce);
else if (ce->server.state == SS_ACTIVE)
res = ce;
else if (ce->server.state == SS_REMOVED)
ce = NULL;
}
} while (ce && !res);
return res;
}
/*
* Time accounting for ghost tasks.
* Must be called before a decision is made involving the task's budget.
*/
static void update_server_time(struct task_struct *p)
{
u64 clock = litmus_clock();
u64 delta = clock - p->rt_param.last_exec_time;
if (unlikely ((s64)delta < 0)) {
delta = 0;
}
if (budget_remaining(p) <= delta) {
tsk_rt(p)->job_params.exec_time = get_exec_cost(p);
} else {
tsk_rt(p)->job_params.exec_time += delta;
}
p->rt_param.last_exec_time = clock;
}
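/* A "ghost" job is one whose userspace work completed before its budget was
 * exhausted; its server keeps logically running (and consuming budget)
 * without occupying the CPU, which is presumably how leftover budget is
 * accounted for. Note that update_server_time() above clamps exec_time at
 * the job's total execution cost, so a late timer cannot make the budget
 * appear overrun.
 */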
/*
* Arm ghost timer. Will merge timers if the option is specified.
*/
static void start_crit(struct crit_entry *ce)
{
lt_t fire;
struct task_struct *task;
struct server *task_server;
BUG_ON(ce->server.state != SS_ACTIVE);
task = ce->server.linked;
task_server = &tsk_rt(task)->server;
if (is_ghost(task) && CRIT_LEVEL_A != tsk_mc_crit(task)) {
/* There is a level-A timer that will force a
* preemption, so we don't set this for level-A
* tasks. Otherwise reset the budget timer
*/
fire = litmus_clock() + budget_remaining(task);
#ifdef CONFIG_MERGE_TIMERS
add_event(crit_cpu(ce)->event_group, &ce->event, fire);
#else
__hrtimer_start_range_ns(&ce->timer,
ns_to_ktime(fire),
0 /* delta */,
HRTIMER_MODE_ABS_PINNED,
0 /* no wakeup */);
#endif
}
server_state_change(task_server, SS_ACTIVE, 0);
}
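/*
 * Stop logically running the task linked to @ce: account any remaining ghost
 * execution time, cancel the ghost budget timer, and mark the task's server
 * removed unless it is blocked on the group lock.
 */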
static void stop_crit(struct crit_entry *ce)
{
struct server *task_server = &tsk_rt(ce->server.linked)->server;
if (is_ghost(ce->server.linked)) {
if (!budget_exhausted(ce->server.linked)) {
/* Job isn't finished, so do accounting */
update_server_time(ce->server.linked);
}
#ifdef CONFIG_MERGE_TIMERS
cancel_event(&ce->event);
#else
hrtimer_try_to_cancel(&ce->timer);
#endif
}
if (task_server->state != SS_BLOCKED) {
server_state_change(task_server, SS_REMOVED, 0);
}
}
/**
 * link_task_to_crit() - Logically run a task at a criticality level.
 * Unlinks any previously linked task, releasing its resources, then attempts
 * to acquire resources for @task, blocking the crit entry's server if they
 * are unavailable. Caller must hold @ce's CPU lock.
*/
static void link_task_to_crit(struct crit_entry *ce,
struct task_struct *task)
{
struct server *ce_server = &ce->server;
TRACE_CRIT_ENTRY(ce, "Linking " TS "\n", TA(task));
BUG_ON(task && ce_server->state != SS_ACTIVE);
BUG_ON(task && tsk_rt(task)->linked_on != NO_CPU);
BUG_ON(task && is_global(ce->domain) &&
!bheap_node_in_heap(ce->node));
/* Unlink last task */
if (ce->server.linked) {
ce->domain->release_resources(ce->server.linked);
if (ce_server->state == SS_BLOCKED) {
server_state_change(ce_server, SS_ACTIVE, 0);
}
TRACE_MC_TASK(ce->server.linked, "Unlinking\n");
stop_crit(ce);
tsk_rt(ce->server.linked)->server.parent = 0;
tsk_rt(ce->server.linked)->server.cpu = NO_CPU;
ce->server.linked->rt_param.linked_on = NO_CPU;
}
/* Actually link task */
ce->server.linked = task;
if (task) {
/* Block if task cannot acquire resources */
task->rt_param.linked_on = crit_cpu(ce)->cpu;
tsk_rt(task)->server.parent = ce_sid(ce);
tsk_rt(ce->server.linked)->server.cpu = crit_cpu(ce)->cpu;
if (ce->domain->acquire_resources(task)) {
start_crit(ce);
} else {
server_state_change(ce_server, SS_BLOCKED, 0);
}
}
}
static void check_for_preempt(struct domain*);
/**
* job_arrival() - Called when a task re-enters the system.
* Caller must hold no locks.
*/
static void job_arrival(struct task_struct *task)
{
	struct domain *dom;
	BUG_ON(!task);
	dom = get_task_domain(task);
	TRACE_MC_TASK(task, "Job arriving\n");
raw_spin_lock(dom->lock);
if (can_requeue(task)) {
BUG_ON(task->rt_param.linked_on != NO_CPU);
dom->requeue(dom, task);
check_for_preempt(dom);
} else {
/* If a global task is scheduled on one cpu, it CANNOT
* be requeued into a global domain. Another cpu might
* dequeue the global task before it is descheduled,
* causing the system to crash when the task is scheduled
* in two places simultaneously.
*/
TRACE_MC_TASK(task, "Delayed arrival of scheduled task, "
"linked: %d, sched: %d, queued: %d\n",
tsk_rt(task)->linked_on, tsk_rt(task)->scheduled_on,
is_queued(task));
}
raw_spin_unlock(dom->lock);
}
/**
 * low_prio_arrival() - Requeue a lower-priority task. If
 * CONFIG_PLUGIN_MC_REDIRECT is enabled, global arrivals are redirected to
 * the interrupt_cpu instead of being handled locally.
*/
static void low_prio_arrival(struct task_struct *task)
{
struct cpu_entry *entry;
	/* Bail out if a racing arrival has already queued or linked the task */
if (!can_requeue(task)) return;
#ifdef CONFIG_PLUGIN_MC_REDIRECT
if (!is_global_task(task))
goto arrive;
if (smp_processor_id() != interrupt_cpu) {
entry = &__get_cpu_var(cpus);
raw_spin_lock(&entry->redir_lock);
TRACE_MC_TASK(task, "Adding to redirect queue\n");
list_add(&tsk_rt(task)->list, &entry->redir);
raw_spin_unlock(&entry->redir_lock);
litmus_reschedule(interrupt_cpu);
} else
#endif
{
arrive:
TRACE_MC_TASK(task, "On interrupt master, requeueing task\n");
job_arrival(task);
}
}
#ifdef CONFIG_PLUGIN_MC_REDIRECT
/**
* fix_global_levels() - Execute redirected job arrivals on this cpu.
*/
static void fix_global_levels(void)
{
int c;
struct cpu_entry *e;
struct list_head *pos, *safe;
struct task_struct *t;
STRACE("Fixing global levels\n");
for_each_online_cpu(c) {
e = &per_cpu(cpus, c);
raw_spin_lock(&e->redir_lock);
list_for_each_safe(pos, safe, &e->redir) {
t = list_entry(pos, struct task_struct, rt_param.list);
BUG_ON(!t);
TRACE_MC_TASK(t, "Dequeued redirected job\n");
list_del_init(pos);
job_arrival(t);
}
raw_spin_unlock(&e->redir_lock);
}
}
#endif
/**
* link_task_to_cpu() - Logically run a task on a CPU.
* The task must first have been linked to one of the CPU's crit_entries.
* Caller must hold the entry lock.
*/
static void link_task_to_cpu(struct cpu_entry *entry, struct task_struct *task)
{
int i = entry_level(entry);
struct crit_entry *ce;
struct server *server;
TRACE_MC_TASK(task, "Linking to P%d\n", entry->cpu);
BUG_ON(task && tsk_rt(task)->linked_on != entry->cpu);
BUG_ON(task && is_ghost(task));
if (entry->linked) {
server = &tsk_rt(entry->linked)->server;
sched_trace_server_switch_away(server->sid, *server->job,
entry->linked->pid,
get_user_job(entry->linked),
entry->cpu);
}
if (task) {
server = &tsk_rt(task)->server;
sched_trace_server_switch_to(server->sid, *server->job,
task->pid,
get_user_job(task),
entry->cpu);
}
entry->linked = task;
	/* Crit entries masked by the previously linked task are now usable */
for (; i < entry_level(entry) + 1; i++) {
ce = &entry->crit_entries[i];
server = &ce->server;
if (server->state == SS_REMOVED) {
TRACE_CRIT_ENTRY(ce, "Moving up to active\n");
server_state_change(server, SS_ACTIVE, 1);
}
}
}
static void preempt_cpu(struct cpu_entry *entry, struct task_struct *t)
{
link_task_to_cpu(entry, t);
litmus_reschedule(entry->cpu);
}
/**
* preempt_crit() - Preempt a logically running task with a higher priority one.
* @dom Domain from which to draw higher priority task
* @ce CPU criticality level to preempt
* @return Preempted task
*
* Caller must hold the lock for @dom and @ce's CPU lock.
*/
static struct task_struct* preempt_crit(struct domain *dom, struct crit_entry *ce)
{
struct task_struct *task = dom->take_ready(dom);
struct cpu_entry *entry = crit_cpu(ce);
struct task_struct *old = ce->server.linked;
BUG_ON(!task);
TRACE_CRIT_ENTRY(ce, "Preempted by " TS "\n", TA(task));
/* Per-domain preemption */
link_task_to_crit(ce, task);
/* if (old && can_requeue(old)) { */
/* dom->requeue(dom, old); */
/* } */
update_crit_position(ce);
/* Preempt actual execution if this is a running task.
* We know that our task is higher priority than what is currently
* running on this CPU as otherwise the crit_entry would have
* been disabled and a preemption could not have occurred
*/
if (!is_ghost(task) && SS_BLOCKED != ce->server.state) {
preempt_cpu(entry, task);
} else if (old && old == entry->linked) {
/* Preempted running task with ghost job. Nothing should run */
preempt_cpu(entry, NULL);
}
return old;
}
/**
* update_crit_levels() - Update criticality entries for the new cpu state.
* This should be called after a new task has been linked to @entry.
 * The caller must hold the @entry->lock, but this function will release it.
*/
static void update_crit_levels(struct cpu_entry *entry)
{
int i, global_preempted;
struct server *server;
struct crit_entry *ce;
struct task_struct *readmit[NUM_CRIT_LEVELS];
enum crit_level level = entry_level(entry);
/* Remove lower priority tasks from the entry */
for (i = level + 1; i < NUM_CRIT_LEVELS; i++) {
ce = &entry->crit_entries[i];
server = &ce->server;
global_preempted = ce->server.linked &&
/* This task is running on a cpu */
ce->server.linked->rt_param.scheduled_on == entry->cpu &&
/* But it was preempted */
ce->server.linked != entry->linked &&
/* And it is an eligible global task */
!is_ghost(ce->server.linked) && is_global(ce->domain);
/* Do not readmit global tasks which are preempted! These can't
* ever be re-admitted until they are descheduled for reasons
* explained in job_arrival.
*/
readmit[i] = (!global_preempted) ? ce->server.linked : NULL;
if (server->state != SS_REMOVED) {
if (ce->server.linked) {
link_task_to_crit(ce, NULL);
}
TRACE_CRIT_ENTRY(ce, "Removing lower crit\n");
server_state_change(server, SS_REMOVED, 1);
}
}
/* Need to unlock so we can access domains */
raw_spin_unlock(&entry->lock);
/* Re-admit tasks to the system */
for (i = level + 1; i < NUM_CRIT_LEVELS; i++) {
ce = &entry->crit_entries[i];
if (readmit[i]) {
low_prio_arrival(readmit[i]);
}
}
}
/*
 * Assumes a single, lowest-priority global criticality level. This avoids
* unnecessary calls to update_crit_levels.
*/
static void check_global_preempt(struct domain *dom)
{
int recheck;
struct crit_entry *ce;
struct cpu_entry *entry;
struct task_struct *preempted;
recheck = 1;
/* Loop until we find a non-preemptable CPU */
while (recheck && (ce = lowest_prio_cpu(dom))) {
entry = crit_cpu(ce);
recheck = 1;
preempted = NULL;
/* Cache next task */
dom->peek_ready(dom);
raw_spin_lock(&entry->lock);
if (ce->server.in_transit) {
/* CPU disabled while locking! */
fix_crit_position(ce);
} else if (mc_preempt_needed(dom, ce->server.linked)) {
/* Success! Check for more preemptions */
preempted = preempt_crit(dom, ce);
} else {
/* Failure! */
recheck = 0;
}
raw_spin_unlock(&entry->lock);
/* Only add preempted task after lock has been released */
if (preempted && can_requeue(preempted)) {
dom->requeue(dom, preempted);
}
}
}
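/*
 * Signal a partitioned preemption: rather than preempting here, set the
 * target CPU's preempt signal and reschedule it; the CPU performs the actual
 * preemption itself in process_signals(). Caller must hold @dom's lock and
 * the crit entry's CPU lock (see check_for_preempt()).
 */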
static void check_partitioned_preempt(struct domain *dom)
{
struct cpu_entry *entry;
struct crit_entry *ce;
ce = domain_data(dom)->crit_entry;
entry = crit_cpu(ce);
if (ce->server.state == SS_REMOVED ||
!mc_preempt_needed(dom, ce->server.linked)) {
return;
}
entry->signal.preempt = 1;
litmus_reschedule(entry->cpu);
}
/**
* check_for_preempt() - Causes a preemption if higher-priority tasks are ready.
* Caller must hold domain lock.
*/
static void check_for_preempt(struct domain *dom)
{
struct crit_entry *ce;
struct cpu_entry *entry;
if (is_global(dom)) {
check_global_preempt(dom);
} else {
ce = domain_data(dom)->crit_entry;
entry = crit_cpu(ce);
/* Cache next task */
dom->peek_ready(dom);
raw_spin_lock(&entry->lock);
check_partitioned_preempt(dom);
raw_spin_unlock(&entry->lock);
}
}
/**
* remove_from_all() - Logically remove a task from all structures.
* Caller must hold no locks.
*/
static void remove_from_all(struct task_struct* task)
{
int update = 0;
struct cpu_entry *entry;
struct crit_entry *ce;
	struct domain *dom;
	BUG_ON(!task);
	dom = get_task_domain(task);
	TRACE_MC_TASK(task, "Removing from everything\n");
raw_spin_lock(dom->lock);
/* Remove the task from any CPU state */
if (task->rt_param.linked_on != NO_CPU) {
TRACE_MC_TASK(task, "Linked to something\n");
entry = &per_cpu(cpus, task->rt_param.linked_on);
raw_spin_lock(&entry->lock);
/* Unlink only if task is still linked post lock */
ce = &entry->crit_entries[tsk_mc_crit(task)];
if (task->rt_param.linked_on != NO_CPU) {
BUG_ON(ce->server.linked != task);
if (entry->linked == task) {
update = 1;
link_task_to_cpu(entry, NULL);
}
link_task_to_crit(ce, NULL);
update_crit_position(ce);
} else {
TRACE_MC_TASK(task, "Unlinked before we got lock!\n");
}
raw_spin_unlock(&entry->lock);
} else {
TRACE_MC_TASK(task, "Not linked to anything\n");
}
/* Ensure the task isn't returned by its domain */
dom->remove(dom, task);
raw_spin_unlock(dom->lock);
}
/**
* job_completion() - Update task state and re-enter it into the system.
* Converts tasks which have completed their execution early into ghost jobs.
* Caller must hold no locks.
*/
static void job_completion(struct task_struct *task, int forced)
{
int release_server;
struct cpu_entry *entry;
struct crit_entry *ce;
TRACE_MC_TASK(task, "Completed\n");
if (!forced) {
/* Userspace signaled job completion */
sched_trace_task_completion(current, 0);
mb();
setup_user_release(current, get_user_deadline(current));
}
#ifndef CONFIG_PLUGIN_MC_LINUX_SLACK_STEALING
/* Release lowest-criticality task's servers with their userspace tasks,
* preventing them from turning into idle ghost tasks
*/
if (tsk_mc_crit(task) == NUM_CRIT_LEVELS - 1)
release_server = 1;
else
#endif
release_server = budget_exhausted(task);
if (release_server || forced) {
if (release_server)
sched_trace_server_completion(-task->pid,
get_rt_job(task));
/* Only unlink (and release resources) if the current server job
* must stop logically running
*/
remove_from_all(task);
}
if (lt_before(get_user_release(task), litmus_clock()) ||
(release_server && tsk_rt(task)->completed)){
TRACE_TASK(task, "Executable task going back to running\n");
tsk_rt(task)->completed = 0;
}
if (release_server || forced) {
/* TODO: Level A does this independently and should not */
if (release_server && CRIT_LEVEL_A != tsk_mc_crit(task)) {
prepare_for_next_period(task);
}
TRACE_TASK(task, "Is released: %d, now: %llu, rel: %llu\n",
is_released(task, litmus_clock()), litmus_clock(),
get_release(task));
/* Requeue non-blocking tasks */
if (is_running(task)) {
job_arrival(task);
}
} else if (is_ghost(task)) {
entry = &per_cpu(cpus, tsk_rt(task)->linked_on);
ce = &entry->crit_entries[tsk_mc_crit(task)];
raw_spin_lock(&entry->lock);
if (ce->server.linked == task) {
/* The task went ghost while it was linked to a CPU */
link_task_to_cpu(entry, NULL);
stop_crit(ce);
if (ce->server.state == SS_ACTIVE)
start_crit(ce);
}
raw_spin_unlock(&entry->lock);
}
}
/**
* mc_ghost_exhausted() - Complete logically running ghost task.
*/
#ifdef CONFIG_MERGE_TIMERS
static void mc_ghost_exhausted(struct rt_event *e)
{
struct crit_entry *ce = container_of(e, struct crit_entry, event);
#else
static enum hrtimer_restart mc_ghost_exhausted(struct hrtimer *timer)
{
struct crit_entry *ce = container_of(timer, struct crit_entry, timer);
#endif
struct task_struct *tmp = NULL;
struct cpu_entry *entry = crit_cpu(ce);
TRACE("Firing here at %llu\n", litmus_clock());
TRACE_CRIT_ENTRY(ce, "For this\n");
raw_spin_lock(&entry->lock);
if (is_ghost(ce->server.linked)) {
update_server_time(ce->server.linked);
if (budget_exhausted(ce->server.linked)) {
tmp = ce->server.linked;
}
} else {
litmus_reschedule(crit_cpu(ce)->cpu);
}
raw_spin_unlock(&entry->lock);
if (tmp)
job_completion(tmp, 1);
#ifndef CONFIG_MERGE_TIMERS
return HRTIMER_NORESTART;
#endif
}
/*
* The MC-CE common timer callback code for merged and non-merged timers.
* Returns the next time the timer should fire.
*/
static lt_t __ce_timer_function(struct ce_dom_data *ce_data)
{
struct crit_entry *ce = get_crit_entry_for(ce_data->cpu, CRIT_LEVEL_A);
struct domain *dom = ce->domain;
struct task_struct *old_link = NULL;
lt_t next_timer_abs;
TRACE("MC level-A timer callback for CPU %d\n", ce_data->cpu);
raw_spin_lock(dom->lock);
raw_spin_lock(&crit_cpu(ce)->lock);
if (ce->server.linked &&
ce->server.linked == ce_data->should_schedule)
{
old_link = ce->server.linked;
link_task_to_crit(ce, NULL);
mc_ce_job_completion(dom, old_link);
}
raw_spin_unlock(&crit_cpu(ce)->lock);
next_timer_abs = mc_ce_timer_callback_common(dom);
/* Job completion will check for preemptions by means of calling job
* arrival if the task is not blocked */
if (NULL != old_link) {
STRACE("old_link " TS " so will call job completion\n", TA(old_link));
raw_spin_unlock(dom->lock);
job_completion(old_link, 1);
} else {
STRACE("old_link was null, so will call check for preempt\n");
check_for_preempt(dom);
raw_spin_unlock(dom->lock);
}
return next_timer_abs;
}
#ifdef CONFIG_MERGE_TIMERS
static void ce_timer_function(struct rt_event *e)
{
struct ce_dom_data *ce_data =
container_of(e, struct ce_dom_data, event);
unsigned long flags;
lt_t next_timer_abs;
TS_LVLA_RELEASE_START;
local_irq_save(flags);
next_timer_abs = __ce_timer_function(ce_data);
add_event(per_cpu(cpus, ce_data->cpu).event_group, e, next_timer_abs);
local_irq_restore(flags);
TS_LVLA_RELEASE_END;
}
#else /* else to CONFIG_MERGE_TIMERS */
static enum hrtimer_restart ce_timer_function(struct hrtimer *timer)
{
struct ce_dom_data *ce_data =
container_of(timer, struct ce_dom_data, timer);
unsigned long flags;
lt_t next_timer_abs;
TS_LVLA_RELEASE_START;
local_irq_save(flags);
next_timer_abs = __ce_timer_function(ce_data);
hrtimer_set_expires(timer, ns_to_ktime(next_timer_abs));
local_irq_restore(flags);
TS_LVLA_RELEASE_END;
return HRTIMER_RESTART;
}
#endif /* CONFIG_MERGE_TIMERS */
/**
* mc_release_jobs() - Add heap of tasks to the system, check for preemptions.
*/
static void mc_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
unsigned long flags;
struct task_struct *first = bheap_peek(rt->order, tasks)->value;
struct domain *dom = get_task_domain(first);
raw_spin_lock_irqsave(dom->lock, flags);
TRACE(TS "Jobs released\n", TA(first));
__merge_ready(rt, tasks);
check_for_preempt(dom);
raw_spin_unlock_irqrestore(dom->lock, flags);
}
/**
 * mc_task_new() - Setup new mixed-criticality task.
* Assumes that there are no partitioned domains after level B.
*/
static void mc_task_new(struct task_struct *t, int on_rq, int running)
{
unsigned long flags;
int i;
struct cpu_entry* entry;
enum crit_level level = tsk_mc_crit(t);
struct dgl_group_req *req;
struct control_page *cp = tsk_rt(t)->ctrl_page;
struct color_ctrl_page *ccp = &tsk_rt(t)->color_ctrl_page;
local_irq_save(flags);
TRACE("New mixed criticality task %d\n", t->pid);
if (level == CRIT_LEVEL_A)
get_rt_relative_deadline(t) = get_exec_cost(t);
/* Assign domain */
if (level < CRIT_LEVEL_C)
entry = &per_cpu(cpus, get_partition(t));
else
entry = &per_cpu(cpus, task_cpu(t));
t->rt_param._domain = entry->crit_entries[level].domain;
tsk_rt(t)->flush = 0;
tsk_rt(t)->load = 0;
/* Userspace and kernelspace view of task state may differ.
* Model kernel state as a budget enforced container
*/
sched_trace_container_param(t->pid, t->comm);
sched_trace_server_param(-t->pid, t->pid,
get_exec_cost(t), get_rt_period(t));
server_init(&tsk_rt(t)->server, -t->pid,
&tsk_rt(t)->job_params.job_no,
NO_CPU);
tsk_rt(t)->task_params.budget_policy = PRECISE_ENFORCEMENT;
BUG_ON(!tsk_rt(t)->server.job);
/* Apply chunking */
if (level == CRIT_LEVEL_B && color_chunk &&
lt_after(get_exec_cost(t), color_chunk)) {
tsk_rt(t)->orig_cost = get_exec_cost(t);
}
/* Setup color request */
req = kmalloc(sizeof(*req), GFP_ATOMIC);
req->task = t;
tsk_rt(t)->req = req;
if (cp && ccp) {
TRACE_MC_TASK(t, "Initializing group request\n");
cp->colors_updated = 0;
dgl_group_req_init(&group_lock, req);
for (i = 0; ccp->pages[i]; ++i)
set_req(&group_lock, req, ccp->colors[i], ccp->pages[i]);
} else {
BUG_ON(CRIT_LEVEL_B == tsk_mc_crit(t));
}
/* Setup job params */
release_at(t, litmus_clock());
if (running) {
BUG_ON(entry->scheduled);
TRACE_MC_TASK(t, "Was already running\n");
entry->scheduled = t;
tsk_rt(t)->scheduled_on = entry->cpu;
tsk_rt(t)->last_exec_time = litmus_clock();
} else {
t->rt_param.scheduled_on = NO_CPU;
}
t->rt_param.linked_on = NO_CPU;
job_arrival(t);
local_irq_restore(flags);
}
/**
 * mc_task_wake_up() - Add task back into its domain and check for preemptions.
*/
static void mc_task_wake_up(struct task_struct *task)
{
unsigned long flags;
lt_t now = litmus_clock();
local_irq_save(flags);
TRACE(TS " wakes up\n", TA(task));
if (is_tardy(task, now)) {
/* Task missed its last release */
release_at(task, now);
sched_trace_task_release(task);
}
if (budget_exhausted(task))
/* Rare, but possible, race condition */
job_completion(task, 1);
else
job_arrival(task);
local_irq_restore(flags);
}
/**
* mc_task_block() - Remove task from state to prevent it being run anywhere.
*/
static void mc_task_block(struct task_struct *task)
{
unsigned long flags;
local_irq_save(flags);
TRACE(TS " blocks\n", TA(task));
remove_from_all(task);
local_irq_restore(flags);
}
/**
* mc_task_exit() - Remove task from the system.
*/
static void mc_task_exit(struct task_struct *task)
{
unsigned long flags;
local_irq_save(flags);
BUG_ON(!is_realtime(task));
TRACE(TS " RIP\n", TA(task));
if (tsk_mc_crit(task) == CRIT_LEVEL_B && lock_cache) {
color_sched_out_task(task);
}
remove_from_all(task);
if (tsk_rt(task)->scheduled_on != NO_CPU) {
per_cpu(cpus, tsk_rt(task)->scheduled_on).scheduled = NULL;
tsk_rt(task)->scheduled_on = NO_CPU;
}
	/* TODO: restore. This was getting triggered by race conditions even when
* no level-A task was executing */
/* if (CRIT_LEVEL_A == tsk_mc_crit(task)) */
/* mc_ce_task_exit_common(task); */
local_irq_restore(flags);
}
/**
* mc_admit_task() - Return true if the task is valid.
* Assumes there are no partitioned levels after level B.
*/
static long mc_admit_task(struct task_struct* task)
{
const enum crit_level crit = tsk_mc_crit(task);
long ret;
if (!tsk_mc_data(task)) {
printk(KERN_WARNING "Tried to admit task with no criticality "
"level\n");
ret = -EINVAL;
goto out;
}
if (crit < CRIT_LEVEL_C && get_partition(task) == NO_CPU) {
printk(KERN_WARNING "Tried to admit partitioned task with no "
"partition\n");
ret = -EINVAL;
goto out;
}
/* if (crit < CRIT_LEVEL_C && get_partition(task) == interrupt_cpu) { */
/* printk(KERN_WARNING "Tried to admit partitioned task on " */
/* "the interrupt master\n"); */
/* ret = -EINVAL; */
/* goto out; */
/* } */
if (crit == CRIT_LEVEL_A) {
ret = mc_ce_admit_task_common(task);
if (ret)
goto out;
}
printk(KERN_INFO "Admitted task with criticality level %d\n",
tsk_mc_crit(task));
ret = 0;
out:
return ret;
}
/*
 * Pick the highest-priority eligible task across criticality levels and link
 * it to @entry. Caller must hold the entry lock; the lock is temporarily
 * dropped and re-taken so that domain locks are always acquired first.
*/
void pick_next_task(struct cpu_entry *entry)
{
int i;
struct crit_entry *ce;
struct domain *dom;
struct task_struct *dtask, *ready_task;
struct server *server;
STRACE("Picking next task\n");
for (i = 0; i < NUM_CRIT_LEVELS && !entry->linked; i++) {
ce = &entry->crit_entries[i];
dom = ce->domain;
server = &ce->server;
/* Swap locks. We cannot acquire a domain lock while
* holding an entry lock or deadlocks will happen
*/
raw_spin_unlock(&entry->lock);
raw_spin_lock(dom->lock);
/* Do domain stuff before grabbing CPU locks */
dtask = dom->peek_ready(dom);
fix_crit_position(ce);
raw_spin_lock(&entry->lock);
ready_task = NULL;
if (!entry->linked && server->state == SS_ACTIVE) {
if (ce->server.linked) {
ready_task = ce->server.linked;
} else if (dtask) {
/* Need a new task */
dom->take_ready(dom);
ready_task = dtask;
link_task_to_crit(ce, dtask);
update_crit_position(ce);
}
}
if (ready_task && !is_ghost(ready_task) &&
server->state == SS_ACTIVE) {
link_task_to_cpu(entry, ready_task);
raw_spin_unlock(dom->lock);
update_crit_levels(entry);
raw_spin_lock(&entry->lock);
continue;
}
raw_spin_unlock(dom->lock);
}
}
static void process_update_signal(struct cpu_entry *entry)
{
int locked;
struct crit_entry *ce;
struct server *crit_server, *task_server;
struct task_struct *linked;
STRACE("Reading update signal\n");
ce = &entry->crit_entries[CRIT_LEVEL_B];
/* Complete task state transitions */
crit_server = &ce->server;
if (!crit_server->linked) {
return;
}
linked = crit_server->linked;
task_server = &tsk_rt(linked)->server;
if (!task_server->in_transit) {
return;
}
raw_spin_lock(&dgl_lock);
/* Update and save lock state */
update_group_req(&group_lock, tsk_rt(linked)->req);
locked = has_resources(linked, entry->cpu);
raw_spin_unlock(&dgl_lock);
if (locked && crit_server->state != SS_ACTIVE) {
TRACE_MC_TASK(linked, "Activated\n");
server_state_change(crit_server, SS_ACTIVE, 0);
start_crit(ce);
server_state_change(task_server, SS_ACTIVE, 0);
if (!is_ghost(linked)) {
link_task_to_cpu(entry, linked);
update_crit_levels(entry);
raw_spin_lock(&entry->lock);
}
} else if (!locked && crit_server->state != SS_BLOCKED) {
TRACE_MC_TASK(linked, "Blocked\n");
if (entry->linked == linked) {
link_task_to_cpu(entry, NULL);
}
server_state_change(task_server, SS_BLOCKED, 0);
stop_crit(ce);
server_state_change(crit_server, SS_BLOCKED, 0);
}
}
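/*
 * Act on this CPU's pending signals: a preempt signal triggers a check for a
 * partitioned level-B preemption, an update signal re-evaluates the group
 * lock state via process_update_signal(). Returns with @entry->lock held.
 */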
static void process_signals(struct cpu_entry *entry)
{
struct domain *dom;
struct crit_entry *ce;
struct mc_signal signal;
struct task_struct *preempted;
ce = &entry->crit_entries[CRIT_LEVEL_B];
dom = ce->domain;
/* Load signals */
raw_spin_lock(&entry->signal_lock);
signal = entry->signal;
clear_signal(&entry->signal);
raw_spin_unlock(&entry->signal_lock);
if (signal.preempt) {
raw_spin_lock(dom->lock);
/* A higher-priority task may exist */
STRACE("Reading preempt signal\n");
dom->peek_ready(dom);
raw_spin_lock(&entry->lock);
if (ce->server.state == SS_ACTIVE &&
mc_preempt_needed(ce->domain, ce->server.linked)) {
preempted = preempt_crit(ce->domain, ce);
raw_spin_unlock(dom->lock);
/* Can't requeue while we hold the entry lock, but
* can't release that lock until state of lower-crit
* servers is updated
*/
if (!is_ghost(ce->server.linked)) {
update_crit_levels(entry);
} else {
raw_spin_unlock(&entry->lock);
}
if (preempted) {
raw_spin_lock(dom->lock);
dom->requeue(dom, preempted);
raw_spin_unlock(dom->lock);
}
raw_spin_lock(&entry->lock);
} else {
raw_spin_unlock(dom->lock);
}
} else {
raw_spin_lock(&entry->lock);
}
if (signal.update) {
process_update_signal(entry);
}
}
/**
 * mc_schedule() - Return the next task which should be scheduled.
 * Handles completions due to budget exhaustion or job completion, processes
 * pending preempt/update signals, and then links the highest-priority
 * eligible task to this CPU.
*/
static struct task_struct* mc_schedule(struct task_struct* prev)
{
lt_t start, exec;
int out_of_time, sleep, preempt, exists, blocks, global, lower, work;
struct cpu_entry* entry = &__get_cpu_var(cpus);
struct task_struct *next = NULL;
	/* Litmus gave up because it could not access the stack of the task
	 * on the CPU that will_schedule was migrating from. Requeue it.
	 * This really only happens in VMs.
*/
if (entry->will_schedule && entry->will_schedule != prev) {
entry->will_schedule->rt_param.scheduled_on = NO_CPU;
low_prio_arrival(entry->will_schedule);
}
if (prev && tsk_rt(prev)->last_exec_time) {
exec = litmus_clock() - tsk_rt(prev)->last_exec_time;
tsk_rt(prev)->user_job.exec_time += exec;
}
if (prev && tsk_mc_crit(prev) == CRIT_LEVEL_B &&
is_realtime(prev) && get_rt_job(prev) > 1 && lock_cache) {
start = litmus_clock();
work = color_sched_out_task(prev);
tsk_rt(prev)->flush = litmus_clock() - start;
++tsk_rt(prev)->flush_work;
}
TS_LVLA_SCHED_START;
TS_LVLB_SCHED_START;
TS_LVLC_SCHED_START;
raw_spin_lock(&entry->lock);
BUG_ON(entry->scheduled && entry->scheduled != prev);
BUG_ON(entry->scheduled && !is_realtime(prev));
BUG_ON(prev && is_realtime(prev) && !entry->scheduled);
if (entry->scheduled != NULL) {
entry->scheduled->rt_param.scheduled_on = NO_CPU;
update_server_time(entry->scheduled);
}
/* Determine state */
exists = entry->scheduled != NULL;
blocks = exists && !is_running(entry->scheduled);
out_of_time = exists && budget_exhausted(entry->scheduled);
sleep = exists && tsk_rt(entry->scheduled)->completed;
global = exists && is_global_task(entry->scheduled);
preempt = entry->scheduled != entry->linked;
lower = exists && preempt && entry->linked &&
tsk_mc_crit(entry->scheduled) > tsk_mc_crit(entry->linked);
TRACE(TS " block:%d oot:%d sleep:%d preempt:%d, now: %llu\n",
TA(prev), blocks, out_of_time, sleep, preempt, litmus_clock());
raw_spin_unlock(&entry->lock);
#ifdef CONFIG_PLUGIN_MC_REDIRECT
if (smp_processor_id() == interrupt_cpu)
fix_global_levels();
#endif
/* If a task blocks we have no choice but to reschedule */
if (blocks)
remove_from_all(entry->scheduled);
/* Any task which exhausts its budget or sleeps waiting for its next
* period completes unless its execution has been forcibly stopped
*/
else if (out_of_time || sleep)/* && !preempt)*/
job_completion(entry->scheduled, !sleep || preempt);
/* Global scheduled tasks must wait for a deschedule before they
* can rejoin the global state. Rejoin them here
*/
else if (global && preempt) {
if (lower)
low_prio_arrival(entry->scheduled);
else
job_arrival(entry->scheduled);
}
/* TODO: move this down somehow */
sched_state_task_picked();
process_signals(entry);
/* Pick next task if none is linked */
if (!entry->linked)
pick_next_task(entry);
/* Schedule next task */
next = entry->linked;
if (next) {
next->rt_param.scheduled_on = entry->cpu;
}
entry->will_schedule = next;
raw_spin_unlock(&entry->lock);
if (next) {
switch (tsk_mc_crit(next)) {
case CRIT_LEVEL_A: TS_LVLA_SCHED_END(next); break;
case CRIT_LEVEL_B: TS_LVLB_SCHED_END(next); break;
case CRIT_LEVEL_C: TS_LVLC_SCHED_END(next); break;
}
}
if (next && tsk_mc_crit(next) == CRIT_LEVEL_B && lock_cache && get_rt_job(next) > 1) {
start = litmus_clock();
work = color_sched_in_task(next);
tsk_rt(next)->load = litmus_clock() - start;
tsk_rt(next)->load_work = work;
}
if (next) {
tsk_rt(next)->last_exec_time = litmus_clock();
TRACE_MC_TASK(next, "Picked this task\n");
} else {
STRACE("CPU %d idles at %llu\n", entry->cpu, litmus_clock());
}
return next;
}
void mc_finish_switch(struct task_struct *prev)
{
struct cpu_entry* entry = &__get_cpu_var(cpus);
entry->scheduled = is_realtime(current) ? current : NULL;
TRACE_TASK(prev, "Switched away from to " TS "\n",
TA(entry->scheduled));
}
long mc_deactivate_plugin(void)
{
return mc_ce_deactivate_plugin_common();
}
static unsigned long long deadline_prio(struct dgl *dgl, struct dgl_group_req *greq)
{
return get_deadline(greq->task);
}
static void cpu_update(struct dgl_group_req *greq)
{
struct cpu_entry *entry = &per_cpu(cpus, greq->cpu);
raw_spin_lock(&entry->signal_lock);
entry->signal.update = 1;
raw_spin_unlock(&entry->signal_lock);
litmus_reschedule(greq->cpu);
}
/*
* Setup and send signal to CPU for resource acquisition. To avoid touching
* CPU locks, all CPU state modifications are delayed until the signal is
* processed.
*/
static void cpu_acquired(struct dgl_group_req *greq)
{
struct server *server = &tsk_rt(greq->task)->server;
TRACE_MC_TASK(greq->task, "Acquired CPU %d\n", greq->cpu);
sched_trace_task_resume(greq->task);
server_state_change(server, SS_ACTIVE, 1);
cpu_update(greq);
}
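/*
 * Counterpart of cpu_acquired(): the group lock has revoked this request's
 * resources (only possible with cache_preempt), so block the task's server
 * and signal the owning CPU to update its state.
 */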
static void cpu_preempted(struct dgl_group_req *greq)
{
struct server *server = &tsk_rt(greq->task)->server;
TRACE_MC_TASK(greq->task, "Dropping CPU %d\n", greq->cpu);
sched_trace_task_block(greq->task);
server_state_change(server, SS_BLOCKED, 1);
cpu_update(greq);
}
/* **************************************************************************
* Initialization
* ************************************************************************** */
/* Initialize values here so that they are allocated with the module
* and destroyed when the module is unloaded.
*/
/* LVL-A */
DEFINE_PER_CPU(struct domain_data, _mc_crit_a);
DEFINE_PER_CPU(raw_spinlock_t, _mc_crit_a_lock);
DEFINE_PER_CPU(struct ce_dom_data, _mc_crit_a_ce_data);
/* LVL-B */
DEFINE_PER_CPU(struct domain_data, _mc_crit_b);
DEFINE_PER_CPU(rt_domain_t, _mc_crit_b_rt);
/* LVL-C */
static struct domain_data _mc_crit_c;
static rt_domain_t _mc_crit_c_rt;
struct bheap _mc_heap_c;
struct bheap_node _mc_nodes_c[NR_CPUS];
static long mc_activate_plugin(void)
{
struct domain_data *dom_data;
struct domain *dom;
struct domain_data *our_domains[NR_CPUS];
rt_domain_t *rt_dom;
int cpu, n = 0;
long ret;
reset_way_tracker();
interrupt_cpu = atomic_read(&release_master_cpu);
for_each_online_cpu(cpu) {
rt_dom = &per_cpu(_mc_crit_b_rt, cpu);
/* rt_dom->release_master = cpu; */
}
if (cache_preempt && !lock_cache) {
printk(KERN_ERR "LITMUS-MC: specified cache preemption without "
"enabling the locking protocol (lock_cache)\n");
ret = -EINVAL;
goto out;
}
dgl_init(&group_lock, color_cache_info.nr_colors,
color_cache_info.ways);
if (cache_preempt) {
group_lock.assign_priority = deadline_prio;
group_lock.cpu_preempted = cpu_preempted;
}
group_lock.cpu_acquired = cpu_acquired;
for_each_online_cpu(cpu) {
BUG_ON(NR_CPUS <= n);
dom = per_cpu(cpus, cpu).crit_entries[CRIT_LEVEL_A].domain;
dom_data = domain_data(dom);
our_domains[cpu] = dom_data;
#if defined(CONFIG_MERGE_TIMERS) && defined(CONFIG_PLUGIN_MC_RELEASE_MASTER)
per_cpu(cpus, cpu).event_group =
get_event_group_for(interrupt_cpu);
#elif defined(CONFIG_MERGE_TIMERS) && !defined(CONFIG_PLUGIN_MC_RELEASE_MASTER)
per_cpu(cpus, cpu).event_group = get_event_group_for(cpu);
#endif
n++;
}
ret = mc_ce_set_domains(n, our_domains);
if (ret)
goto out;
ret = mc_ce_activate_plugin_common();
out:
return ret;
}
static void mc_release_ts(lt_t time)
{
int cpu, cont_id = -1;
char name[TASK_COMM_LEN];
enum crit_level level;
struct cpu_entry *entry;
struct crit_entry *ce;
level = CRIT_LEVEL_A;
strcpy(name, "LVL-A");
for_each_online_cpu(cpu) {
/* if (cpu == interrupt_cpu) */
/* continue; */
entry = &per_cpu(cpus, cpu);
sched_trace_container_param(++cont_id, (const char*)&name);
ce = &entry->crit_entries[level];
sched_trace_server_param(ce_sid(ce), cont_id, 0, 0);
server_state_change(&ce->server, SS_ACTIVE, 0);
}
level = CRIT_LEVEL_B;
strcpy(name, "LVL-B");
for_each_online_cpu(cpu) {
/* if (cpu == interrupt_cpu) */
/* continue; */
entry = &per_cpu(cpus, cpu);
sched_trace_container_param(++cont_id, (const char*)&name);
ce = &entry->crit_entries[level];
sched_trace_server_param(ce_sid(ce), cont_id, 0, 0);
server_state_change(&ce->server, SS_ACTIVE, 0);
}
level = CRIT_LEVEL_C;
strcpy(name, "LVL-C");
sched_trace_container_param(++cont_id, (const char*)&name);
for_each_online_cpu(cpu) {
entry = &per_cpu(cpus, cpu);
ce = &entry->crit_entries[level];
sched_trace_server_param(ce_sid(ce), cont_id, 0, 0);
server_state_change(&ce->server, SS_ACTIVE, 0);
}
mc_ce_release_at_common(NULL, time);
}
static struct sched_plugin mc_plugin __cacheline_aligned_in_smp = {
.plugin_name = "MC",
.task_new = mc_task_new,
.complete_job = complete_job,
.task_exit = mc_task_exit,
.schedule = mc_schedule,
.task_wake_up = mc_task_wake_up,
.task_block = mc_task_block,
.admit_task = mc_admit_task,
.activate_plugin = mc_activate_plugin,
.release_at = release_at,
.deactivate_plugin = mc_deactivate_plugin,
.finish_switch = mc_finish_switch,
.release_ts = mc_release_ts,
};
static void init_crit_entry(struct cpu_entry *entry,
struct crit_entry *ce, enum crit_level level,
struct domain_data *dom_data,
struct bheap_node *node)
{
ce->level = level;
ce->server.linked = NULL;
ce->node = node;
ce->domain = &dom_data->domain;
server_init(&ce->server, ce_sid(ce), 0, entry->cpu);
ce->server.parent = -entry->cpu - 1;
#ifdef CONFIG_MERGE_TIMERS
init_event(&ce->event, level, mc_ghost_exhausted,
event_list_alloc(GFP_ATOMIC));
#else
hrtimer_init(&ce->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ce->timer.function = mc_ghost_exhausted;
#endif
}
static void init_local_domain(struct cpu_entry *entry, struct domain_data *dom_data,
enum crit_level level)
{
dom_data->heap = NULL;
dom_data->crit_entry = &entry->crit_entries[level];
init_crit_entry(entry, dom_data->crit_entry, level, dom_data, NULL);
}
static void init_global_domain(struct domain_data *dom_data, enum crit_level level,
struct bheap *heap, struct bheap_node *nodes)
{
int cpu;
struct cpu_entry *entry;
struct crit_entry *ce;
struct bheap_node *node;
dom_data->crit_entry = NULL;
dom_data->heap = heap;
bheap_init(heap);
for_each_online_cpu(cpu) {
entry = &per_cpu(cpus, cpu);
node = &nodes[cpu];
ce = &entry->crit_entries[level];
init_crit_entry(entry, ce, level, dom_data, node);
bheap_node_init(&ce->node, ce);
bheap_insert(cpu_lower_prio, heap, node);
}
}
static void init_edf_domain(struct domain *dom, rt_domain_t *rt,
enum crit_level prio, int is_partitioned, int cpu)
{
pd_domain_init(dom, rt, edf_ready_order, NULL,
mc_release_jobs, edf_higher_prio);
rt->level = prio;
#if defined(CONFIG_PLUGIN_MC_RELEASE_MASTER) && defined(CONFIG_MERGE_TIMERS)
/* All timers are on one CPU and release-master is using the event
* merging interface as well. */
BUG_ON(NO_CPU == interrupt_cpu);
rt->event_group = get_event_group_for(interrupt_cpu);
rt->prio = prio;
#elif defined(CONFIG_PLUGIN_MC_RELEASE_MASTER) && !defined(CONFIG_MERGE_TIMERS)
/* Using release master, but not merging timers. */
/* rt->release_master = interrupt_cpu; */
#elif !defined(CONFIG_PLUGIN_MC_RELEASE_MASTER) && defined(CONFIG_MERGE_TIMERS)
/* Merge the timers, but don't move them to the release master. */
if (is_partitioned) {
rt->event_group = get_event_group_for(cpu);
} else {
		/* Global timers will be added to the event group of whichever
		 * CPU the code is executing on when add_event() is called.
*/
rt->event_group = NULL;
}
rt->prio = prio;
#endif
}
static char* domain_name(const char *name, int cpu)
{
char *buf = kmalloc(LITMUS_LOCKDEP_NAME_MAX_LEN * sizeof(char), GFP_ATOMIC);
snprintf(buf, LITMUS_LOCKDEP_NAME_MAX_LEN, "%s%d", name, cpu);
return buf;
}
struct domain_data *ce_domain_for(int);
static int __init init_mc(void)
{
int cpu;
rt_domain_t *rt;
raw_spinlock_t *a_dom_lock, *b_dom_lock, *c_dom_lock; /* For lock debugger */
struct cpu_entry *entry;
struct domain_data *dom_data;
struct ce_dom_data *ce_data;
for_each_online_cpu(cpu) {
entry = &per_cpu(cpus, cpu);
/* CPU */
entry->cpu = cpu;
entry->scheduled = NULL;
entry->linked = NULL;
raw_spin_lock_init(&entry->lock);
raw_spin_lock_init(&entry->signal_lock);
clear_signal(&entry->signal);
#ifdef CONFIG_PLUGIN_MC_REDIRECT
raw_spin_lock_init(&entry->redir_lock);
INIT_LIST_HEAD(&entry->redir);
#endif
/* CRIT_LEVEL_A */
dom_data = &per_cpu(_mc_crit_a, cpu);
ce_data = &per_cpu(_mc_crit_a_ce_data, cpu);
a_dom_lock = &per_cpu(_mc_crit_a_lock, cpu);
dom_data->domain.acquire_resources = dumb_acquire;
dom_data->domain.release_resources = dumb_release;
raw_spin_lock_init(a_dom_lock);
ce_domain_init(&dom_data->domain,
a_dom_lock, ce_requeue, ce_peek_and_take_ready,
ce_peek_and_take_ready, ce_higher_prio, ce_data, cpu,
ce_timer_function);
init_local_domain(entry, dom_data, CRIT_LEVEL_A);
dom_data->domain.name = domain_name("LVL-A", cpu);
/* CRIT_LEVEL_B */
dom_data = &per_cpu(_mc_crit_b, cpu);
rt = &per_cpu(_mc_crit_b_rt, cpu);
init_local_domain(entry, dom_data, CRIT_LEVEL_B);
init_edf_domain(&dom_data->domain, rt, CRIT_LEVEL_B, 1, cpu);
dom_data->domain.acquire_resources = acquire_resources;
dom_data->domain.release_resources = release_resources;
b_dom_lock = dom_data->domain.lock;
raw_spin_lock_init(b_dom_lock);
dom_data->domain.name = domain_name("LVL-B", cpu);
}
/* CRIT_LEVEL_C */
init_global_domain(&_mc_crit_c, CRIT_LEVEL_C,
&_mc_heap_c, _mc_nodes_c);
init_edf_domain(&_mc_crit_c.domain, &_mc_crit_c_rt, CRIT_LEVEL_C,
0, NO_CPU);
_mc_crit_c.domain.acquire_resources = dumb_acquire;
_mc_crit_c.domain.release_resources = dumb_release;
c_dom_lock = _mc_crit_c.domain.lock;
raw_spin_lock_init(c_dom_lock);
_mc_crit_c.domain.name = "LVL-C";
/* GROUP LOCK */
raw_spin_lock_init(&dgl_lock);
return register_sched_plugin(&mc_plugin);
}
module_init(init_mc);