-rw-r--r--  include/litmus/litmus.h        |    2
-rw-r--r--  include/litmus/locking.h       |    1
-rw-r--r--  include/litmus/rt_param.h      |    2
-rw-r--r--  include/litmus/sched_plugin.h  |    5
-rw-r--r--  include/litmus/sched_trace.h   |   60
-rw-r--r--  include/trace/events/litmus.h  |  232
-rw-r--r--  include/trace/ftrace.h         |    5
-rw-r--r--  litmus/Makefile                |    5
-rw-r--r--  litmus/locking.c               |    6
-rw-r--r--  litmus/sched_gsn_edf.c         |    6
-rw-r--r--  litmus/sched_litmus.c          |    4
-rw-r--r--  litmus/sched_mc.c              | 1369
-rw-r--r--  litmus/sched_mc_ce.c           | 1052
-rw-r--r--  litmus/sched_plugin.c          |    6
-rw-r--r--  litmus/sched_psn_edf.c         |    3
-rw-r--r--  litmus/sync.c                  |    3
16 files changed, 2723 insertions, 38 deletions
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
index 0b071fd359f9..2776470bb897 100644
--- a/include/litmus/litmus.h
+++ b/include/litmus/litmus.h
@@ -44,6 +44,8 @@ void litmus_exit_task(struct task_struct *tsk);
44 44
45#define tsk_rt(t) (&(t)->rt_param) 45#define tsk_rt(t) (&(t)->rt_param)
46 46
47#define get_server_job(t) (tsk_rt(t)->job_params.fake_job_no)
48
47/* Realtime utility macros */ 49/* Realtime utility macros */
48#define get_rt_flags(t) (tsk_rt(t)->flags) 50#define get_rt_flags(t) (tsk_rt(t)->flags)
49#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f)) 51#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f))
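The new get_server_job() accessor exposes a per-task server job counter (job_params.fake_job_no, not visible in the rt_param.h hunk below, so presumably introduced elsewhere in this series) so that server-granularity events can be emitted for ordinary tasks. A minimal sketch of the intended use, mirroring how sched_mc.c later in this patch reports server budget exhaustion; the helper name is illustrative:

/* Sketch only; relies on the headers as patched in this series. */
#include <linux/sched.h>
#include <litmus/litmus.h>
#include <litmus/sched_trace.h>

static void report_server_exhaustion(struct task_struct *t)
{
	/* Task-backed servers are identified by the negated PID
	 * throughout this patch; the job number comes from the new
	 * accessor rather than job_params.job_no. */
	sched_trace_server_completion(-t->pid, get_server_job(t));
}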
diff --git a/include/litmus/locking.h b/include/litmus/locking.h
index 4d7b870cb443..41991d5af01b 100644
--- a/include/litmus/locking.h
+++ b/include/litmus/locking.h
@@ -9,6 +9,7 @@ struct litmus_lock_ops;
9struct litmus_lock { 9struct litmus_lock {
10 struct litmus_lock_ops *ops; 10 struct litmus_lock_ops *ops;
11 int type; 11 int type;
12 int id;
12}; 13};
13 14
14struct litmus_lock_ops { 15struct litmus_lock_ops {
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index d6d799174160..ba62e10d6f2c 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -89,7 +89,7 @@ struct rt_job {
89 lt_t exec_time; 89 lt_t exec_time;
90 90
91 /* Which job is this. This is used to let user space 91 /* Which job is this. This is used to let user space
 92 * specify which job to wait for, which is important if jobs 92 * specify which job to wait for, which is important if jobs
93 * overrun. If we just call sys_sleep_next_period() then we 93 * overrun. If we just call sys_sleep_next_period() then we
94 * will unintentionally miss jobs after an overrun. 94 * will unintentionally miss jobs after an overrun.
95 * 95 *
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
index 6e7cabdddae8..01786b57a4a9 100644
--- a/include/litmus/sched_plugin.h
+++ b/include/litmus/sched_plugin.h
@@ -67,6 +67,9 @@ typedef long (*admit_task_t)(struct task_struct* tsk);
67 67
68typedef void (*release_at_t)(struct task_struct *t, lt_t start); 68typedef void (*release_at_t)(struct task_struct *t, lt_t start);
69 69
70/* TODO remove me */
71typedef void (*release_ts_t)(lt_t time);
72
70struct sched_plugin { 73struct sched_plugin {
71 struct list_head list; 74 struct list_head list;
72 /* basic info */ 75 /* basic info */
@@ -93,6 +96,8 @@ struct sched_plugin {
93 task_block_t task_block; 96 task_block_t task_block;
94 task_exit_t task_exit; 97 task_exit_t task_exit;
95 98
99 release_ts_t release_ts;
100
96#ifdef CONFIG_LITMUS_LOCKING 101#ifdef CONFIG_LITMUS_LOCKING
97 /* locking protocols */ 102 /* locking protocols */
98 allocate_lock_t allocate_lock; 103 allocate_lock_t allocate_lock;
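The release_ts_t hook (flagged TODO above, so likely temporary) lets a plugin observe the synchronous release of the whole task set; litmus/sync.c and litmus/sched_plugin.c are also touched by this patch, which suggests that the call site and a default implementation live there. A hedged sketch of how such a hook would be invoked; the NULL check and the function name are assumptions, not shown in this diff:

/* Sketch: 'litmus' is the active plugin pointer already used elsewhere
 * in litmus/ (e.g. litmus->allocate_lock in litmus/locking.c). */
static void notify_synchronous_release(lt_t when)
{
	if (litmus->release_ts)
		litmus->release_ts(when);
}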
diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
index 82bde8241298..49972d75ef38 100644
--- a/include/litmus/sched_trace.h
+++ b/include/litmus/sched_trace.h
@@ -180,6 +180,13 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
180#define trace_litmus_task_resume(t) 180#define trace_litmus_task_resume(t)
181#define trace_litmus_sys_release(start) 181#define trace_litmus_sys_release(start)
182 182
183#define trace_litmus_container_param(cid, name)
184#define trace_litmus_server_param(sid, cid, wcet, time)
185#define trace_litmus_server_switch_to(sid, job, tid)
186#define trace_litmus_server_switch_away(sid, job, tid)
187#define trace_litmus_server_release(sid, job, release, deadline)
188#define trace_litmus_server_completion(sid, job)
189
183#endif 190#endif
184 191
185 192
@@ -226,18 +233,28 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
226 trace_litmus_task_completion(t, forced); \ 233 trace_litmus_task_completion(t, forced); \
227 } while (0) 234 } while (0)
228 235
229#define sched_trace_task_block(t) \ 236#define sched_trace_task_block(t, i) \
230 do { \ 237 do { \
231 SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, \ 238 SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, \
232 do_sched_trace_task_block, t); \ 239 do_sched_trace_task_block, t); \
233 trace_litmus_task_block(t); \ 240 trace_litmus_task_block(t, i); \
234 } while (0) 241 } while (0)
235 242
236#define sched_trace_task_resume(t) \ 243#define sched_trace_task_resume(t, i) \
237 do { \ 244 do { \
238 SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, \ 245 SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, \
239 do_sched_trace_task_resume, t); \ 246 do_sched_trace_task_resume, t); \
240 trace_litmus_task_resume(t); \ 247 trace_litmus_task_resume(t, i); \
248 } while (0)
249
250#define sched_trace_resource_acquire(t, i) \
251 do { \
252 trace_litmus_resource_acquire(t, i); \
253 } while (0)
254
255#define sched_trace_resource_released(t, i) \
256 do { \
 257 trace_litmus_resource_release(t, i); \
241 } while (0) 258 } while (0)
242 259
243#define sched_trace_action(t, action) \ 260#define sched_trace_action(t, action) \
@@ -252,6 +269,41 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
252 trace_litmus_sys_release(when); \ 269 trace_litmus_sys_release(when); \
253 } while (0) 270 } while (0)
254 271
272#define QT_START lt_t _qt_start = litmus_clock()
273#define QT_END \
274 sched_trace_log_message("%d P%d [%s@%s:%d]: Took %llu\n\n", \
275 TRACE_ARGS, litmus_clock() - _qt_start)
276
277#define sched_trace_container_param(cid, name) \
278 do { \
279 trace_litmus_container_param(cid, name); \
280 } while (0)
281
282#define sched_trace_server_param(sid, cid, wcet, period) \
283 do { \
284 trace_litmus_server_param(sid, cid, wcet, period); \
285 } while(0)
286
287#define sched_trace_server_switch_to(sid, job, tid) \
288 do { \
289 trace_litmus_server_switch_to(sid, job, tid); \
290 } while(0)
291
292#define sched_trace_server_switch_away(sid, job, tid) \
293 do { \
294 trace_litmus_server_switch_away(sid, job, tid); \
295 } while (0)
296
297#define sched_trace_server_release(sid, job, rel, dead) \
298 do { \
299 trace_litmus_server_release(sid, job, rel, dead); \
300 } while (0)
301
302#define sched_trace_server_completion(sid, job) \
303 do { \
304 trace_litmus_server_completion(sid, job); \
305 } while (0)
306
255#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */ 307#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
256 308
257#endif /* __KERNEL__ */ 309#endif /* __KERNEL__ */
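Taken together, the new macros describe a two-level hierarchy: a container groups servers, and a server executes task jobs. A condensed sketch of the call sequence a plugin is expected to emit, patterned on mc_task_new() and mc_release_ts() in litmus/sched_mc.c later in this patch; the container id, name, and timing values are placeholders:

/* Illustrative only; all ids below are made up. */
static void trace_task_server(struct task_struct *t, lt_t now)
{
	sched_trace_container_param(1, "LVL-X");		/* once per container */
	sched_trace_server_param(-t->pid, 1,
				 get_exec_cost(t), get_rt_period(t));	/* once per server */
	sched_trace_server_release(-t->pid, get_server_job(t),
				   now, now + get_rt_period(t));	/* per server job */
	sched_trace_server_switch_to(-t->pid, get_server_job(t), t->pid);
	/* ... the task runs inside its server ... */
	sched_trace_server_switch_away(-t->pid, get_server_job(t), t->pid);
	sched_trace_server_completion(-t->pid, get_server_job(t));
}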
diff --git a/include/trace/events/litmus.h b/include/trace/events/litmus.h
index 0fffcee02be0..b3a8f166e65f 100644
--- a/include/trace/events/litmus.h
+++ b/include/trace/events/litmus.h
@@ -11,10 +11,6 @@
11 11
12#include <litmus/litmus.h> 12#include <litmus/litmus.h>
13#include <litmus/rt_param.h> 13#include <litmus/rt_param.h>
14
15/*
16 * Tracing task admission
17 */
18TRACE_EVENT(litmus_task_param, 14TRACE_EVENT(litmus_task_param,
19 15
20 TP_PROTO(struct task_struct *t), 16 TP_PROTO(struct task_struct *t),
@@ -24,9 +20,9 @@ TRACE_EVENT(litmus_task_param,
24 TP_STRUCT__entry( 20 TP_STRUCT__entry(
25 __field( pid_t, pid ) 21 __field( pid_t, pid )
26 __field( unsigned int, job ) 22 __field( unsigned int, job )
27 __field( lt_t, wcet ) 23 __field( unsigned long long, wcet )
28 __field( lt_t, period ) 24 __field( unsigned long long, period )
29 __field( lt_t, phase ) 25 __field( unsigned long long, phase )
30 __field( int, partition ) 26 __field( int, partition )
31 ), 27 ),
32 28
@@ -56,8 +52,8 @@ TRACE_EVENT(litmus_task_release,
56 TP_STRUCT__entry( 52 TP_STRUCT__entry(
57 __field( pid_t, pid ) 53 __field( pid_t, pid )
58 __field( unsigned int, job ) 54 __field( unsigned int, job )
59 __field( lt_t, release ) 55 __field( unsigned long long, release )
60 __field( lt_t, deadline ) 56 __field( unsigned long long, deadline )
61 ), 57 ),
62 58
63 TP_fast_assign( 59 TP_fast_assign(
@@ -84,8 +80,8 @@ TRACE_EVENT(litmus_switch_to,
84 TP_STRUCT__entry( 80 TP_STRUCT__entry(
85 __field( pid_t, pid ) 81 __field( pid_t, pid )
86 __field( unsigned int, job ) 82 __field( unsigned int, job )
87 __field( lt_t, when ) 83 __field( unsigned long long, when )
88 __field( lt_t, exec_time ) 84 __field( unsigned long long, exec_time )
89 ), 85 ),
90 86
91 TP_fast_assign( 87 TP_fast_assign(
@@ -112,8 +108,8 @@ TRACE_EVENT(litmus_switch_away,
112 TP_STRUCT__entry( 108 TP_STRUCT__entry(
113 __field( pid_t, pid ) 109 __field( pid_t, pid )
114 __field( unsigned int, job ) 110 __field( unsigned int, job )
115 __field( lt_t, when ) 111 __field( unsigned long long, when )
116 __field( lt_t, exec_time ) 112 __field( unsigned long long, exec_time )
117 ), 113 ),
118 114
119 TP_fast_assign( 115 TP_fast_assign(
@@ -140,7 +136,7 @@ TRACE_EVENT(litmus_task_completion,
140 TP_STRUCT__entry( 136 TP_STRUCT__entry(
141 __field( pid_t, pid ) 137 __field( pid_t, pid )
142 __field( unsigned int, job ) 138 __field( unsigned int, job )
143 __field( lt_t, when ) 139 __field( unsigned long long, when )
144 __field( unsigned long, forced ) 140 __field( unsigned long, forced )
145 ), 141 ),
146 142
@@ -161,21 +157,71 @@ TRACE_EVENT(litmus_task_completion,
161 */ 157 */
162TRACE_EVENT(litmus_task_block, 158TRACE_EVENT(litmus_task_block,
163 159
164 TP_PROTO(struct task_struct *t), 160 TP_PROTO(struct task_struct *t, int lid),
165 161
166 TP_ARGS(t), 162 TP_ARGS(t, lid),
167 163
168 TP_STRUCT__entry( 164 TP_STRUCT__entry(
169 __field( pid_t, pid ) 165 __field( pid_t, pid )
170 __field( lt_t, when ) 166 __field( int, lid )
167 __field( unsigned long long, when )
171 ), 168 ),
172 169
173 TP_fast_assign( 170 TP_fast_assign(
174 __entry->pid = t ? t->pid : 0; 171 __entry->pid = t ? t->pid : 0;
172 __entry->lid = lid;
175 __entry->when = litmus_clock(); 173 __entry->when = litmus_clock();
176 ), 174 ),
177 175
178 TP_printk("(%u) blocks: %Lu\n", __entry->pid, __entry->when) 176 TP_printk("(%u) blocks on %d: %Lu\n", __entry->pid,
177 __entry->lid, __entry->when)
178);
179
180/*
181 * Lock events
182 */
183TRACE_EVENT(litmus_resource_acquire,
184
185 TP_PROTO(struct task_struct *t, int lid),
186
187 TP_ARGS(t, lid),
188
189 TP_STRUCT__entry(
190 __field( pid_t, pid )
191 __field( int, lid )
192 __field( unsigned long long, when )
193 ),
194
195 TP_fast_assign(
196 __entry->pid = t ? t->pid : 0;
197 __entry->lid = lid;
198 __entry->when = litmus_clock();
199 ),
200
201 TP_printk("(%u) acquires %d: %Lu\n", __entry->pid,
202 __entry->lid, __entry->when)
203);
204
205TRACE_EVENT(litmus_resource_release,
206
207 TP_PROTO(struct task_struct *t, int lid),
208
209 TP_ARGS(t, lid),
210
211 TP_STRUCT__entry(
212 __field( pid_t, pid )
213 __field( int, lid )
214 __field( unsigned long long, when )
215 ),
216
217 TP_fast_assign(
218 __entry->pid = t ? t->pid : 0;
219 __entry->lid = lid;
220 __entry->when = litmus_clock();
221 ),
222
223 TP_printk("(%u) releases %d: %Lu\n", __entry->pid,
224 __entry->lid, __entry->when)
179); 225);
180 226
181/* 227/*
@@ -183,24 +229,27 @@ TRACE_EVENT(litmus_task_block,
183 */ 229 */
184TRACE_EVENT(litmus_task_resume, 230TRACE_EVENT(litmus_task_resume,
185 231
186 TP_PROTO(struct task_struct *t), 232 TP_PROTO(struct task_struct *t, int lid),
187 233
188 TP_ARGS(t), 234 TP_ARGS(t, lid),
189 235
190 TP_STRUCT__entry( 236 TP_STRUCT__entry(
191 __field( pid_t, pid ) 237 __field( pid_t, pid )
238 __field( int, lid )
192 __field( unsigned int, job ) 239 __field( unsigned int, job )
193 __field( lt_t, when ) 240 __field( unsigned long long, when )
194 ), 241 ),
195 242
196 TP_fast_assign( 243 TP_fast_assign(
197 __entry->pid = t ? t->pid : 0; 244 __entry->pid = t ? t->pid : 0;
198 __entry->job = t ? t->rt_param.job_params.job_no : 0; 245 __entry->job = t ? t->rt_param.job_params.job_no : 0;
199 __entry->when = litmus_clock(); 246 __entry->when = litmus_clock();
247 __entry->lid = lid;
200 ), 248 ),
201 249
202 TP_printk("resume(job(%u, %u)): %Lu\n", 250 TP_printk("resume(job(%u, %u)) on %d: %Lu\n",
203 __entry->pid, __entry->job, __entry->when) 251 __entry->pid, __entry->job,
252 __entry->lid, __entry->when)
204); 253);
205 254
206/* 255/*
@@ -208,13 +257,13 @@ TRACE_EVENT(litmus_task_resume,
208 */ 257 */
209TRACE_EVENT(litmus_sys_release, 258TRACE_EVENT(litmus_sys_release,
210 259
211 TP_PROTO(lt_t *start), 260 TP_PROTO(unsigned long long *start),
212 261
213 TP_ARGS(start), 262 TP_ARGS(start),
214 263
215 TP_STRUCT__entry( 264 TP_STRUCT__entry(
216 __field( lt_t, rel ) 265 __field( unsigned long long, rel )
217 __field( lt_t, when ) 266 __field( unsigned long long, when )
218 ), 267 ),
219 268
220 TP_fast_assign( 269 TP_fast_assign(
@@ -225,6 +274,137 @@ TRACE_EVENT(litmus_sys_release,
225 TP_printk("SynRelease(%Lu) at %Lu\n", __entry->rel, __entry->when) 274 TP_printk("SynRelease(%Lu) at %Lu\n", __entry->rel, __entry->when)
226); 275);
227 276
277/*
278 * Containers
279 */
280TRACE_EVENT(litmus_container_param,
281
282 TP_PROTO(int cid, const char *name),
283
284 TP_ARGS(cid, name),
285
286 TP_STRUCT__entry(
287 __field( int, cid )
288 __array( char, name, TASK_COMM_LEN )
289 ),
290
291 TP_fast_assign(
292 memcpy(__entry->name, name, TASK_COMM_LEN);
293 __entry->cid = cid;
294 ),
295
296 TP_printk("container, name: %s, id: %d\n", __entry->name, __entry->cid)
297);
298
299TRACE_EVENT(litmus_server_param,
300
301 TP_PROTO(int sid, int cid, unsigned long long wcet, unsigned long long period),
302
303 TP_ARGS(sid, cid, wcet, period),
304
305 TP_STRUCT__entry(
306 __field( int, sid )
307 __field( int, cid )
308 __field( unsigned long long, wcet )
309 __field( unsigned long long, period )
310 ),
311
312 TP_fast_assign(
313 __entry->cid = cid;
314 __entry->sid = sid;
315 __entry->wcet = wcet;
316 __entry->period = period;
317 ),
318
 319 TP_printk("server(%llu, %llu), sid: %d, cont: %d\n",
320 __entry->wcet, __entry->period, __entry->sid, __entry->cid)
321);
322
323TRACE_EVENT(litmus_server_switch_to,
324
325 TP_PROTO(int sid, unsigned int job, int tid),
326
327 TP_ARGS(sid, job, tid),
328
329 TP_STRUCT__entry(
330 __field( int, sid)
331 __field( unsigned int, job)
332 __field( int, tid)
333 ),
334
335 TP_fast_assign(
336 __entry->sid = sid;
337 __entry->tid = tid;
338 __entry->job = job;
339 ),
340
341 TP_printk("switch_to(server(%d, %u)): %d\n", __entry->sid, __entry->job, __entry->tid)
342);
343
344TRACE_EVENT(litmus_server_switch_away,
345
346 TP_PROTO(int sid, unsigned int job, int tid),
347
348 TP_ARGS(sid, job, tid),
349
350 TP_STRUCT__entry(
351 __field( int, sid)
352 __field( unsigned int, job)
353 __field( int, tid)
354 ),
355
356 TP_fast_assign(
357 __entry->sid = sid;
 358 __entry->tid = tid; __entry->job = job;
359 ),
360
361 TP_printk("switch_away(server(%d, %u)): %d\n", __entry->sid, __entry->job, __entry->tid)
362);
363
364TRACE_EVENT(litmus_server_release,
365
366 TP_PROTO(int sid, unsigned int job,
367 unsigned long long release,
368 unsigned long long deadline),
369
370 TP_ARGS(sid, job, release, deadline),
371
372 TP_STRUCT__entry(
373 __field( int, sid)
374 __field( unsigned int, job)
375 __field( unsigned long long, release)
376 __field( unsigned long long, deadline)
377 ),
378
379 TP_fast_assign(
380 __entry->sid = sid;
381 __entry->job = job;
382 __entry->release = release;
383 __entry->deadline = deadline;
384 ),
385
386 TP_printk("release(server(%d, %u)), release: %llu, deadline: %llu\n", __entry->sid, __entry->job, __entry->release, __entry->deadline)
387);
388
389TRACE_EVENT(litmus_server_completion,
390
391 TP_PROTO(int sid, int job),
392
393 TP_ARGS(sid, job),
394
395 TP_STRUCT__entry(
396 __field( int, sid)
397 __field( unsigned int, job)
398 ),
399
400 TP_fast_assign(
401 __entry->sid = sid;
402 __entry->job = job;
403 ),
404
405 TP_printk("completion(server(%d, %d))\n", __entry->sid, __entry->job)
406);
407
228#endif /* _SCHED_TASK_TRACEPOINT_H */ 408#endif /* _SCHED_TASK_TRACEPOINT_H */
229 409
230/* Must stay outside the protection */ 410/* Must stay outside the protection */
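The litmus_resource_acquire/release events have wrapper macros in sched_trace.h but no caller in this patch. A hedged sketch of where a locking protocol could emit them, using the per-lock id that litmus/locking.c below assigns at lock creation; the function names are illustrative and assume litmus/locking.h and litmus/sched_trace.h as patched here:

/* Sketch: l is the protocol's struct litmus_lock, t the calling task. */
static void trace_lock_acquired(struct litmus_lock *l, struct task_struct *t)
{
	sched_trace_resource_acquire(t, l->id);
}

static void trace_lock_released(struct litmus_lock *l, struct task_struct *t)
{
	sched_trace_resource_released(t, l->id);
}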
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 533c49f48047..4d6f3474e8fa 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/ftrace_event.h> 19#include <linux/ftrace_event.h>
20#include <litmus/litmus.h>
20 21
21/* 22/*
22 * DECLARE_EVENT_CLASS can be used to add a generic function 23 * DECLARE_EVENT_CLASS can be used to add a generic function
@@ -54,7 +55,7 @@
54#define __string(item, src) __dynamic_array(char, item, -1) 55#define __string(item, src) __dynamic_array(char, item, -1)
55 56
56#undef TP_STRUCT__entry 57#undef TP_STRUCT__entry
57#define TP_STRUCT__entry(args...) args 58#define TP_STRUCT__entry(args...) args __field( unsigned long long, __rt_ts )
58 59
59#undef DECLARE_EVENT_CLASS 60#undef DECLARE_EVENT_CLASS
60#define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print) \ 61#define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print) \
@@ -507,7 +508,7 @@ static inline notrace int ftrace_get_offsets_##call( \
507 strcpy(__get_str(dst), src); 508 strcpy(__get_str(dst), src);
508 509
509#undef TP_fast_assign 510#undef TP_fast_assign
510#define TP_fast_assign(args...) args 511#define TP_fast_assign(args...) args; __entry->__rt_ts = litmus_clock();
511 512
512#undef TP_perf_assign 513#undef TP_perf_assign
513#define TP_perf_assign(args...) 514#define TP_perf_assign(args...)
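These two overrides append a LITMUS^RT timestamp to every ftrace event: TP_STRUCT__entry grows a trailing __rt_ts field, and TP_fast_assign stamps it with litmus_clock(). For an event carrying pid and job fields, the generated record then looks roughly like the sketch below; the struct name and the leading trace_entry header follow the usual ftrace macro expansion and are shown only for illustration:

/* Illustrative expansion, not generated verbatim by the macros. */
struct ftrace_raw_litmus_example {
	struct trace_entry	ent;		/* common ftrace header */
	pid_t			pid;		/* fields from TP_STRUCT__entry */
	unsigned int		job;
	unsigned long long	__rt_ts;	/* appended by this patch */
};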
diff --git a/litmus/Makefile b/litmus/Makefile
index 7338180f196f..3487dfe8df05 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -16,8 +16,9 @@ obj-y = sched_plugin.o litmus.o \
16 srp.o \ 16 srp.o \
17 bheap.o \ 17 bheap.o \
18 ctrldev.o \ 18 ctrldev.o \
19 sched_gsn_edf.o \ 19 domain.o \
20 sched_psn_edf.o 20 sched_psn_edf.o \
21 sched_gsn_edf.o
21 22
22obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o 23obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
23obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o 24obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
diff --git a/litmus/locking.c b/litmus/locking.c
index 0c1aa6aa40b7..447b8aaee8dc 100644
--- a/litmus/locking.c
+++ b/litmus/locking.c
@@ -28,14 +28,18 @@ static inline struct litmus_lock* get_lock(struct od_table_entry* entry)
28 return (struct litmus_lock*) entry->obj->obj; 28 return (struct litmus_lock*) entry->obj->obj;
29} 29}
30 30
31atomic_t lock_id = ATOMIC_INIT(0);
32
31static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg) 33static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg)
32{ 34{
33 struct litmus_lock* lock; 35 struct litmus_lock* lock;
34 int err; 36 int err;
35 37
36 err = litmus->allocate_lock(&lock, type, arg); 38 err = litmus->allocate_lock(&lock, type, arg);
37 if (err == 0) 39 if (err == 0) {
40 lock->id = atomic_add_return(1, &lock_id);
38 *obj_ref = lock; 41 *obj_ref = lock;
42 }
39 return err; 43 return err;
40} 44}
41 45
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
index 6ed504f4750e..f50b58c37b31 100644
--- a/litmus/sched_gsn_edf.c
+++ b/litmus/sched_gsn_edf.c
@@ -805,6 +805,8 @@ int gsnedf_fmlp_lock(struct litmus_lock* l)
805 /* release lock before sleeping */ 805 /* release lock before sleeping */
806 spin_unlock_irqrestore(&sem->wait.lock, flags); 806 spin_unlock_irqrestore(&sem->wait.lock, flags);
807 807
808 sched_trace_task_block(t, l->id);
809
808 /* We depend on the FIFO order. Thus, we don't need to recheck 810 /* We depend on the FIFO order. Thus, we don't need to recheck
809 * when we wake up; we are guaranteed to have the lock since 811 * when we wake up; we are guaranteed to have the lock since
810 * there is only one wake up per release. 812 * there is only one wake up per release.
@@ -812,7 +814,8 @@ int gsnedf_fmlp_lock(struct litmus_lock* l)
 812 814
 813 schedule(); 815 schedule();
 814 816
 815 TS_LOCK_RESUME; 817 TS_LOCK_RESUME;
 818 sched_trace_task_resume(t, l->id);
 816 819
 817 /* Since we hold the lock, no other task will change 820 /* Since we hold the lock, no other task will change
 818 * ->owner. We can thus check it without acquiring the spin 821 * ->owner. We can thus check it without acquiring the spin
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
index 5a15ce938984..659dc13d3fa9 100644
--- a/litmus/sched_litmus.c
+++ b/litmus/sched_litmus.c
@@ -160,7 +160,7 @@ static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
160 int flags) 160 int flags)
161{ 161{
162 if (flags & ENQUEUE_WAKEUP) { 162 if (flags & ENQUEUE_WAKEUP) {
163 sched_trace_task_resume(p); 163 sched_trace_task_resume(p, 0);
164 tsk_rt(p)->present = 1; 164 tsk_rt(p)->present = 1;
165 /* LITMUS^RT plugins need to update the state 165 /* LITMUS^RT plugins need to update the state
166 * _before_ making it available in global structures. 166 * _before_ making it available in global structures.
@@ -185,7 +185,7 @@ static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
185 if (flags & DEQUEUE_SLEEP) { 185 if (flags & DEQUEUE_SLEEP) {
186 litmus->task_block(p); 186 litmus->task_block(p);
187 tsk_rt(p)->present = 0; 187 tsk_rt(p)->present = 0;
188 sched_trace_task_block(p); 188 sched_trace_task_block(p, 0);
189 189
190 rq->litmus.nr_running--; 190 rq->litmus.nr_running--;
191 } else 191 } else
diff --git a/litmus/sched_mc.c b/litmus/sched_mc.c
new file mode 100644
index 000000000000..41f02ee3e6ca
--- /dev/null
+++ b/litmus/sched_mc.c
@@ -0,0 +1,1369 @@
1/**
2 * litmus/sched_mc.c
3 *
4 * Implementation of the Mixed Criticality scheduling algorithm.
5 *
6 * (Per Mollison, Erickson, Anderson, Baruah, Scoredos 2010)
7 *
8 * Absolute first: relative time spent doing different parts of release
9 * and scheduling overhead needs to be measured and graphed.
10 *
11 * Domain locks should be more fine-grained. There is no reason to hold the
12 * ready-queue lock when adding a task to the release-queue.
13 *
14 * The levels should be converted to linked-lists so that they are more
15 * adaptable and need not be identical on all processors.
16 *
17 * The interaction between remove_from_all and other concurrent operations
18 * should be re-examined. If a job_completion and a preemption happen
19 * simultaneously, a task could be requeued, removed, then requeued again.
20 *
21 * Level-C tasks should be able to swap CPUs a-la GSN-EDF. They should also
22 * try and swap with the last CPU they were on. This could be complicated for
23 * ghost tasks.
24 *
25 * Locking for timer-merging could be infinitely more fine-grained. A second
26 * hash could select a lock to use based on queue slot. This approach might
27 * also help with add_release in rt_domains.
28 *
29 * It should be possible to reserve a CPU for ftdumping.
30 *
31 * The real_deadline business seems sloppy.
32 *
33 * The amount of data in the header file should be cut down. The use of the
34 * header file in general needs to be re-examined.
35 *
36 * The plugin needs to be modified so that it doesn't freeze when it is
37 * deactivated in a VM.
38 *
39 * The locking in check_for_preempt is not fine-grained enough.
40 *
41 * The size of the structures could be smaller. Debugging info might be
42 * excessive as things currently stand.
43 *
44 * The macro can_requeue has been expanded too much. Anything beyond
45 * scheduled_on is a hack!
46 *
47 * Domain names (rt_domain) are still clumsy.
48 *
49 * Should BE be moved into the kernel? This will require benchmarking.
50 */
51
52#include <linux/spinlock.h>
53#include <linux/percpu.h>
54#include <linux/sched.h>
55#include <linux/hrtimer.h>
56#include <linux/slab.h>
57#include <linux/module.h>
58#include <linux/poison.h>
59#include <linux/pid.h>
60
61#include <litmus/litmus.h>
62#include <litmus/trace.h>
63#include <litmus/jobs.h>
64#include <litmus/sched_plugin.h>
65#include <litmus/edf_common.h>
66#include <litmus/sched_trace.h>
67#include <litmus/domain.h>
68#include <litmus/bheap.h>
69#include <litmus/event_group.h>
70#include <litmus/budget.h>
71
72#include <litmus/sched_mc.h>
73#include <litmus/ce_domain.h>
74
75/**
76 * struct cpu_entry - State of a CPU for the entire MC system
77 * @cpu CPU id
78 * @scheduled Task that is physically running
79 * @linked Task that should be running / is logically running
80 * @lock For serialization
81 * @crit_entries Array of CPU state per criticality level
82 * @redir List of redirected work for this CPU.
83 * @redir_lock Lock for @redir.
84 * @event_group Event group for timer merging.
85 */
86struct cpu_entry {
87 int cpu;
88 struct task_struct* scheduled;
89 struct task_struct* will_schedule;
90 struct task_struct* linked;
91 raw_spinlock_t lock;
92 struct crit_entry crit_entries[NUM_CRIT_LEVELS];
93#ifdef CONFIG_PLUGIN_MC_REDIRECT
94 struct list_head redir;
95 raw_spinlock_t redir_lock;
96#endif
97#ifdef CONFIG_MERGE_TIMERS
98 struct event_group *event_group;
99#endif
100};
101
102DEFINE_PER_CPU(struct cpu_entry, cpus);
103#ifdef CONFIG_RELEASE_MASTER
104static int interrupt_cpu;
105#endif
106
107#define domain_data(dom) (container_of(dom, struct domain_data, domain))
108#define is_global(dom) (domain_data(dom)->heap)
109#define is_global_task(t) (is_global(get_task_domain(t)))
110#define can_use(ce) \
 111 ((ce)->state == CS_ACTIVE || (ce)->state == CS_ACTIVATE)
112#define can_requeue(t) \
113 ((t)->rt_param.linked_on == NO_CPU && /* Not linked anywhere */ \
114 !is_queued(t) && /* Not gonna be linked */ \
115 (!is_global_task(t) || (t)->rt_param.scheduled_on == NO_CPU))
116#define entry_level(e) \
117 (((e)->linked) ? tsk_mc_crit((e)->linked) : NUM_CRIT_LEVELS - 1)
118#define crit_cpu(ce) \
119 (container_of((void*)((ce) - (ce)->level), struct cpu_entry, crit_entries))
120#define get_crit_entry_for(cpu, level) (&per_cpu(cpus, cpu).crit_entries[level])
121#define TRACE_ENTRY(e, fmt, args...) \
122 STRACE("P%d, linked=" TS " " fmt, e->cpu, TA(e->linked), ##args)
123#define TRACE_CRIT_ENTRY(ce, fmt, args...) \
124 STRACE("%s P%d, linked=" TS " " fmt, \
125 (ce)->domain->name, crit_cpu(ce)->cpu, TA((ce)->linked), ##args)
126
127static int sid(struct crit_entry *ce)
128{
129 int level = ce->level * num_online_cpus() + crit_cpu(ce)->cpu + 1;
 130 BUG_ON(level <= 0);
131 return -level;
132}
133
134/*
135 * Sort CPUs within a global domain's heap.
136 */
137static int cpu_lower_prio(struct bheap_node *a, struct bheap_node *b)
138{
139 struct domain *domain;
140 struct crit_entry *first, *second;
141 struct task_struct *first_link, *second_link;
142
143 first = a->value;
144 second = b->value;
145 first_link = first->linked;
146 second_link = second->linked;
147
148 if (first->state == CS_REMOVED || second->state == CS_REMOVED) {
149 /* Removed entries go at the back of the heap */
150 return first->state != CS_REMOVED &&
151 second->state != CS_REMOVED;
152 } else if (!first_link || !second_link) {
153 /* Entry with nothing scheduled is lowest priority */
154 return second_link && !first_link;
155 } else {
156 /* Sort by deadlines of tasks */
157 domain = get_task_domain(first_link);
158 return domain->higher_prio(second_link, first_link);
159 }
160}
161
162/*
163 * Return true if the domain has a higher priority ready task. The @curr
164 * task must belong to the domain.
165 */
166static int mc_preempt_needed(struct domain *dom, struct task_struct* curr)
167{
168 struct task_struct *next = dom->peek_ready(dom);
169 if (!next || !curr) {
170 return next && !curr;
171 } else {
172 BUG_ON(tsk_mc_crit(next) != tsk_mc_crit(curr));
173 return get_task_domain(next)->higher_prio(next, curr);
174 }
175}
176
177/*
178 * Update crit entry position in a global heap. Caller must hold
179 * @ce's domain lock.
180 */
181static inline void update_crit_position(struct crit_entry *ce)
182{
183 struct bheap *heap;
184 if (is_global(ce->domain)) {
185 heap = domain_data(ce->domain)->heap;
186 BUG_ON(!heap);
187 BUG_ON(!bheap_node_in_heap(ce->node));
188 bheap_delete(cpu_lower_prio, heap, ce->node);
189 bheap_insert(cpu_lower_prio, heap, ce->node);
190 }
191}
192
193/*
194 * Update crit entry position in a global heap if it has been marked
195 * for update. Caller must hold @ce's domain lock.
196 */
197static void fix_crit_position(struct crit_entry *ce)
198{
199 if (is_global(ce->domain)) {
200 if (CS_ACTIVATE == ce->state) {
201 ce->state = CS_ACTIVE;
202 update_crit_position(ce);
203 } else if (CS_REMOVE == ce->state) {
204 ce->state = CS_REMOVED;
205 update_crit_position(ce);
206 }
207 }
208}
209
210/*
 211 * Return next CPU which should be preempted or NULL if the domain has no
212 * preemptable CPUs. Caller must hold the @dom lock.
213 */
214static inline struct crit_entry* lowest_prio_cpu(struct domain *dom)
215{
216 struct bheap *heap = domain_data(dom)->heap;
217 struct bheap_node* hn;
218 struct crit_entry *ce, *res = NULL;
219 do {
220 hn = bheap_peek(cpu_lower_prio, heap);
221 ce = (hn) ? hn->value : NULL;
222 if (ce) {
223 if (ce->state == CS_ACTIVE)
224 res = ce;
225 else if (ce->state == CS_REMOVED)
226 ce = NULL;
227 else
228 fix_crit_position(ce);
229 }
230 } while (ce && !res);
231 return res;
232}
233
234/*
235 * Cancel ghost timer.
236 */
237static inline void cancel_ghost(struct crit_entry *ce)
238{
239#ifdef CONFIG_MERGE_TIMERS
240 cancel_event(&ce->event);
241#else
242 hrtimer_try_to_cancel(&ce->timer);
243#endif
244}
245
246/*
247 * Arm ghost timer. Will merge timers if the option is specified.
248 */
249static inline void arm_ghost(struct crit_entry *ce, lt_t fire)
250{
251#ifdef CONFIG_MERGE_TIMERS
252 add_event(crit_cpu(ce)->event_group, &ce->event, fire);
253#else
254 __hrtimer_start_range_ns(&ce->timer,
255 ns_to_ktime(fire),
256 0 /* delta */,
257 HRTIMER_MODE_ABS_PINNED,
258 0 /* no wakeup */);
259#endif
260}
261
262/*
263 * Time accounting for ghost tasks.
264 * Must be called before a decision is made involving the task's budget.
265 */
266static void update_ghost_time(struct task_struct *p)
267{
268 u64 clock = litmus_clock();
269 u64 delta = clock - p->se.exec_start;
270 BUG_ON(!is_ghost(p));
271 if (unlikely ((s64)delta < 0)) {
272 delta = 0;
273 TRACE_MC_TASK(p, "WARNING: negative time delta\n");
274 }
275 if (tsk_mc_data(p)->mc_job.ghost_budget <= delta) {
276 TRACE_MC_TASK(p, "Ghost job could have ended\n");
277 tsk_mc_data(p)->mc_job.ghost_budget = 0;
278 p->se.exec_start = clock;
279 } else {
280 TRACE_MC_TASK(p, "Ghost job updated, but didn't finish\n");
281 tsk_mc_data(p)->mc_job.ghost_budget -= delta;
282 p->se.exec_start = clock;
283 }
284}
285
286/**
287 * link_task_to_crit() - Logically run a task at a criticality level.
288 * Caller must hold @ce's CPU lock.
289 */
290static void link_task_to_crit(struct crit_entry *ce,
291 struct task_struct *task)
292{
293 lt_t when_to_fire;
294
295 TRACE_CRIT_ENTRY(ce, "Linking " TS "\n", TA(task));
296 BUG_ON(!can_use(ce) && task);
297 BUG_ON(task && tsk_rt(task)->linked_on != NO_CPU);
298 BUG_ON(task && is_global(ce->domain) &&
299 !bheap_node_in_heap(ce->node));
300
301 /* Unlink last task */
302 if (ce->linked) {
303 TRACE_MC_TASK(ce->linked, "Unlinking\n");
304 ce->linked->rt_param.linked_on = NO_CPU;
305 if (is_ghost(ce->linked)) {
306 cancel_ghost(ce);
307 if (tsk_mc_data(ce->linked)->mc_job.ghost_budget > 0) {
308 /* Job isn't finished, so do accounting */
309 update_ghost_time(ce->linked);
310 }
311 }
312 sched_trace_server_switch_away(sid(ce), 0, ce->linked->pid);
313 }
314
315 /* Actually link task */
316 ce->linked = task;
317 if (task) {
318 task->rt_param.linked_on = crit_cpu(ce)->cpu;
319 if (is_ghost(task) && CRIT_LEVEL_A != tsk_mc_crit(task)) {
320 /* There is a level-A timer that will force a
321 * preemption, so we don't set this for level-A
322 * tasks. Otherwise reset the budget timer.
323 */
324 task->se.exec_start = litmus_clock();
325 when_to_fire = task->se.exec_start +
326 tsk_mc_data(task)->mc_job.ghost_budget;
327 arm_ghost(ce, when_to_fire);
328
329 sched_trace_server_switch_to(sid(ce), 0, 0);
330 } else {
331 sched_trace_server_switch_to(sid(ce), 0, task->pid);
332 }
333 }
334}
335
336static void check_for_preempt(struct domain*);
337
338/**
339 * job_arrival() - Called when a task re-enters the system.
340 * Caller must hold no locks.
341 */
342static void job_arrival(struct task_struct *task)
343{
344 struct domain *dom = get_task_domain(task);
345
346 TRACE_MC_TASK(task, "Job arriving\n");
347 BUG_ON(!task);
348
349 raw_spin_lock(dom->lock);
350 if (can_requeue(task)) {
351 BUG_ON(task->rt_param.linked_on != NO_CPU);
352 dom->requeue(dom, task);
353 check_for_preempt(dom);
354 } else {
355 /* If a global task is scheduled on one cpu, it CANNOT
356 * be requeued into a global domain. Another cpu might
357 * dequeue the global task before it is descheduled,
358 * causing the system to crash when the task is scheduled
359 * in two places simultaneously.
360 */
361 TRACE_MC_TASK(task, "Delayed arrival of scheduled task\n");
362 }
363 raw_spin_unlock(dom->lock);
364}
365
366/**
367 * low_prio_arrival() - If CONFIG_PLUGIN_MC_REDIRECT is enabled, will
368 * redirect a lower priority job_arrival work to the interrupt_cpu.
369 */
370static void low_prio_arrival(struct task_struct *task)
371{
372 struct cpu_entry *entry;
373
374 /* Race conditions! */
375 if (!can_requeue(task)) return;
376
377#ifdef CONFIG_PLUGIN_MC_REDIRECT
378 if (!is_global_task(task))
379 goto arrive;
380 if (smp_processor_id() != interrupt_cpu) {
381 entry = &__get_cpu_var(cpus);
382 raw_spin_lock(&entry->redir_lock);
383 TRACE_MC_TASK(task, "Adding to redirect queue\n");
384 list_add(&tsk_rt(task)->list, &entry->redir);
385 raw_spin_unlock(&entry->redir_lock);
386 litmus_reschedule(interrupt_cpu);
387 } else
388#endif
389 {
390arrive:
391 job_arrival(task);
392 }
393}
394
395#ifdef CONFIG_PLUGIN_MC_REDIRECT
396/**
397 * fix_global_levels() - Execute redirected job arrivals on this cpu.
398 */
399static void fix_global_levels(void)
400{
401 int c;
402 struct cpu_entry *e;
403 struct list_head *pos, *safe;
404 struct task_struct *t;
405
406 STRACE("Fixing global levels\n");
407 for_each_online_cpu(c) {
408 e = &per_cpu(cpus, c);
409 raw_spin_lock(&e->redir_lock);
410 list_for_each_safe(pos, safe, &e->redir) {
411 t = list_entry(pos, struct task_struct, rt_param.list);
412 BUG_ON(!t);
413 TRACE_MC_TASK(t, "Dequeued redirected job\n");
414 list_del_init(pos);
415 job_arrival(t);
416 }
417 raw_spin_unlock(&e->redir_lock);
418 }
419}
420#endif
421
422/**
423 * link_task_to_cpu() - Logically run a task on a CPU.
424 * The task must first have been linked to one of the CPU's crit_entries.
425 * Caller must hold the entry lock.
426 */
427static void link_task_to_cpu(struct cpu_entry *entry, struct task_struct *task)
428{
429 int i = entry_level(entry);
430 struct crit_entry *ce;
431 TRACE_MC_TASK(task, "Linking to P%d\n", entry->cpu);
432 BUG_ON(task && tsk_rt(task)->linked_on != entry->cpu);
433 BUG_ON(task && is_ghost(task));
434
435 if (entry->linked) {
436 sched_trace_server_switch_away(-entry->linked->pid,
437 get_server_job(entry->linked),
438 entry->linked->pid);
439 }
440
441 if (task){
442 set_rt_flags(task, RT_F_RUNNING);
443 sched_trace_server_switch_to(-task->pid,
444 get_server_job(task),
445 task->pid);
446 }
447 entry->linked = task;
448
449 /* Higher criticality crit entries are now usable */
450 for (; i < entry_level(entry) + 1; i++) {
451 ce = &entry->crit_entries[i];
452 if (!can_use(ce)) {
453 ce->state = CS_ACTIVATE;
454 }
455 }
456}
457
458/**
459 * preempt() - Preempt a logically running task with a higher priority one.
460 * @dom Domain from which to draw higher priority task
461 * @ce CPU criticality level to preempt
462 *
463 * Caller must hold the lock for @dom and @ce's CPU lock.
464 */
465static void preempt(struct domain *dom, struct crit_entry *ce)
466{
467 struct task_struct *task = dom->take_ready(dom);
468 struct cpu_entry *entry = crit_cpu(ce);
469 struct task_struct *old = ce->linked;
470
471 BUG_ON(!task);
472 TRACE_CRIT_ENTRY(ce, "Preempted by " TS "\n", TA(task));
473
474 /* Per-domain preemption */
475 link_task_to_crit(ce, task);
476 if (old && can_requeue(old)) {
477 dom->requeue(dom, old);
478 }
479 update_crit_position(ce);
480
481 /* Preempt actual execution if this is a running task */
482 if (!is_ghost(task)) {
483 link_task_to_cpu(entry, task);
484 preempt_if_preemptable(entry->scheduled, entry->cpu);
485 } else if (old && old == entry->linked) {
486 /* Preempted a running task with a ghost job. Null needs to be
487 * running.
488 */
489 link_task_to_cpu(entry, NULL);
490 preempt_if_preemptable(entry->scheduled, entry->cpu);
491 }
492}
493
494/**
495 * update_crit_levels() - Update criticality entries for the new cpu state.
496 * This should be called after a new task has been linked to @entry.
497 * The caller must hold the @entry->lock, but this method will release it.
498 */
499static void update_crit_levels(struct cpu_entry *entry)
500{
501 int i, global_preempted;
502 struct crit_entry *ce;
503 struct task_struct *readmit[NUM_CRIT_LEVELS];
504 enum crit_level level = entry_level(entry);
505
506 /* Remove lower priority tasks from the entry */
507 for (i = level + 1; i < NUM_CRIT_LEVELS; i++) {
508 ce = &entry->crit_entries[i];
509
510 global_preempted = ce->linked &&
511 /* This task is running on a cpu */
512 ce->linked->rt_param.scheduled_on == entry->cpu &&
513 /* But it was preempted */
514 ce->linked != entry->linked &&
515 /* And it is an eligible global task */
516 !is_ghost(ce->linked) && is_global(ce->domain);
517
518 /* Do not readmit global tasks which are preempted! These can't
519 * ever be re-admitted until they are descheduled for reasons
520 * explained in job_arrival.
521 */
522 readmit[i] = (!global_preempted) ? ce->linked : NULL;
523
524 ce->state = CS_REMOVE;
525 if (ce->linked)
526 link_task_to_crit(ce, NULL);
527 }
528 /* Need to unlock so we can access domains */
529 raw_spin_unlock(&entry->lock);
530
531 /* Re-admit tasks to the system */
532 for (i = level + 1; i < NUM_CRIT_LEVELS; i++) {
533 ce = &entry->crit_entries[i];
534 if (readmit[i]) {
535 low_prio_arrival(readmit[i]);
536 }
537 }
538}
539
540/**
541 * check_for_preempt() - Causes a preemption if higher-priority tasks are ready.
542 * Caller must hold domain lock.
543 * Makes gigantic nasty assumption that there is 1 global criticality level,
544 * and it is the last one in each list, so it doesn't call update_crit..
545 */
546static void check_for_preempt(struct domain *dom)
547{
548 int recheck = 1;
549 struct cpu_entry *entry;
550 struct crit_entry *ce;
551
552 if (is_global(dom)) {
553 /* Loop until we find a non-preemptable CPU */
554 while ((ce = lowest_prio_cpu(dom)) && recheck) {
555 entry = crit_cpu(ce);
556 recheck = 1;
557
558 /* Cache next task */
559 dom->peek_ready(dom);
560
561 raw_spin_lock(&entry->lock);
562 if (!can_use(ce))
563 /* CPU disabled while locking! */
564 fix_crit_position(ce);
565 else if (dom->preempt_needed(dom, ce->linked))
566 /* Success! Check for more preemptions */
567 preempt(dom, ce);
568 else {
569 /* Failure! */
570 recheck = 0;
571 TRACE_CRIT_ENTRY(ce, "Stopped global check\n");
572 }
573 raw_spin_unlock(&entry->lock);
574 }
575 } else /* Partitioned */ {
576 ce = domain_data(dom)->crit_entry;
577 entry = crit_cpu(ce);
578
579 /* Cache next task */
580 dom->peek_ready(dom);
581
582 raw_spin_lock(&entry->lock);
583 if (can_use(ce) && dom->preempt_needed(dom, ce->linked)) {
584 preempt(dom, ce);
585 update_crit_levels(entry);
586 } else {
587 raw_spin_unlock(&entry->lock);
588 }
589 }
590}
591
592/**
593 * remove_from_all() - Logically remove a task from all structures.
594 * Caller must hold no locks.
595 */
596static void remove_from_all(struct task_struct* task)
597{
598 int update = 0;
599 struct cpu_entry *entry;
600 struct crit_entry *ce;
601 struct domain *dom = get_task_domain(task);
602
603 TRACE_MC_TASK(task, "Removing from everything\n");
604 BUG_ON(!task);
605
606 raw_spin_lock(dom->lock);
607
608 /* Remove the task from any CPU state */
609 if (task->rt_param.linked_on != NO_CPU) {
610 entry = &per_cpu(cpus, task->rt_param.linked_on);
611 raw_spin_lock(&entry->lock);
612
613 /* Unlink only if task is still linked post lock */
614 ce = &entry->crit_entries[tsk_mc_crit(task)];
615 if (task->rt_param.linked_on != NO_CPU) {
616 BUG_ON(ce->linked != task);
617 link_task_to_crit(ce, NULL);
618 update_crit_position(ce);
619 if (!is_ghost(task) && entry->linked == task) {
620 update = 1;
621 link_task_to_cpu(entry, NULL);
622 }
623 } else {
624 TRACE_MC_TASK(task, "Unlinked before we got lock!\n");
625 }
626 if (update)
627 update_crit_levels(entry);
628 else
629 raw_spin_unlock(&entry->lock);
630 } else {
631 TRACE_MC_TASK(task, "Not linked to anything\n");
632 }
633
634 /* Ensure the task isn't returned by its domain */
635 dom->remove(dom, task);
636
637 raw_spin_unlock(dom->lock);
638}
639
640/**
641 * job_completion() - Update task state and re-enter it into the system.
642 * Converts tasks which have completed their execution early into ghost jobs.
643 * Caller must hold no locks.
644 */
645static void job_completion(struct task_struct *task, int forced)
646{
647 int behind;
648 TRACE_MC_TASK(task, "Completed\n");
649
650 /* Logically stop the task execution */
651 set_rt_flags(task, RT_F_SLEEP);
652 remove_from_all(task);
653
654 /* Level-A tasks cannot ever get behind */
655 behind = tsk_mc_crit(task) != CRIT_LEVEL_A && behind_server(task);
656
657 if (!forced && !is_ghost(task)) {
658 /* Task voluntarily ceased execution. Move on to next period */
659 task_release(task);
660 sched_trace_task_completion(task, forced);
661
662 /* Convert to ghost job */
663 tsk_mc_data(task)->mc_job.ghost_budget = budget_remaining(task);
664 tsk_mc_data(task)->mc_job.is_ghost = 1;
665 }
666
667 /* If the task has no ghost budget, convert back from ghost.
668 * If the task is behind, undo ghost conversion so that it
669 * can catch up.
670 */
671 if (behind || tsk_mc_data(task)->mc_job.ghost_budget == 0) {
672 TRACE_MC_TASK(task, "Not a ghost task\n");
673 tsk_mc_data(task)->mc_job.is_ghost = 0;
674 tsk_mc_data(task)->mc_job.ghost_budget = 0;
675 }
676
677 /* If server has run out of budget, wait until next release */
678 if (budget_exhausted(task)) {
679 sched_trace_server_completion(-task->pid,
680 get_server_job(task));
681 server_release(task);
682 }
683
684 /* Requeue non-blocking tasks */
685 if (is_running(task))
686 job_arrival(task);
687}
688
689/**
690 * mc_ghost_exhausted() - Complete logically running ghost task.
691 */
692#ifdef CONFIG_MERGE_TIMERS
693static void mc_ghost_exhausted(struct rt_event *e)
694{
695 struct crit_entry *ce = container_of(e, struct crit_entry, event);
696#else
697static enum hrtimer_restart mc_ghost_exhausted(struct hrtimer *timer)
698{
699 struct crit_entry *ce = container_of(timer, struct crit_entry, timer);
700#endif
701
702 unsigned long flags;
703 struct task_struct *tmp = NULL;
704
705 local_irq_save(flags);
706 TRACE("Ghost exhausted\n");
707 TRACE_CRIT_ENTRY(ce, "Firing here\n");
708
709 /* Due to race conditions, we cannot just set the linked
710 * task's budget to 0 as it may no longer be the task
711 * for which this timer was armed. Instead, update the running
712 * task time and see if this causes exhaustion.
713 */
714 raw_spin_lock(&crit_cpu(ce)->lock);
715 if (ce->linked && is_ghost(ce->linked)) {
716 update_ghost_time(ce->linked);
717 if (tsk_mc_data(ce->linked)->mc_job.ghost_budget == 0) {
718 tmp = ce->linked;
719 }
720 }
721 raw_spin_unlock(&crit_cpu(ce)->lock);
722
723 if (tmp)
724 job_completion(tmp, 0);
725
726 local_irq_restore(flags);
727#ifndef CONFIG_MERGE_TIMERS
728 return HRTIMER_NORESTART;
729#endif
730}
731
732/*
733 * The MC-CE common timer callback code for merged and non-merged timers.
734 * Returns the next time the timer should fire.
735 */
736static lt_t __ce_timer_function(struct ce_dom_data *ce_data)
737{
738 struct crit_entry *ce = get_crit_entry_for(ce_data->cpu, CRIT_LEVEL_A);
739 struct domain *dom = ce->domain;
740 struct task_struct *old_link = NULL;
741 lt_t next_timer_abs;
742
743 TRACE("MC level-A timer callback for CPU %d\n", ce_data->cpu);
744
745 raw_spin_lock(dom->lock);
746
747 raw_spin_lock(&crit_cpu(ce)->lock);
748 if (ce->linked &&
749 ce->linked == ce_data->should_schedule &&
750 is_ghost(ce->linked))
751 {
752 old_link = ce->linked;
753 tsk_mc_data(ce->linked)->mc_job.ghost_budget = 0;
754 link_task_to_crit(ce, NULL);
755 }
756 raw_spin_unlock(&crit_cpu(ce)->lock);
757
758 next_timer_abs = mc_ce_timer_callback_common(dom);
759
760 /* Job completion will check for preemptions by means of calling job
761 * arrival if the task is not blocked */
762 if (NULL != old_link) {
763 STRACE("old_link " TS " so will call job completion\n", TA(old_link));
764 raw_spin_unlock(dom->lock);
765 job_completion(old_link, 0);
766 } else {
767 STRACE("old_link was null, so will call check for preempt\n");
768 raw_spin_unlock(dom->lock);
769 check_for_preempt(dom);
770 }
771 return next_timer_abs;
772}
773
774#ifdef CONFIG_MERGE_TIMERS
775static void ce_timer_function(struct rt_event *e)
776{
777 struct ce_dom_data *ce_data =
778 container_of(e, struct ce_dom_data, event);
779 unsigned long flags;
780 lt_t next_timer_abs;
781
782 TS_LVLA_RELEASE_START;
783
784 local_irq_save(flags);
785 next_timer_abs = __ce_timer_function(ce_data);
786 add_event(per_cpu(cpus, ce_data->cpu).event_group, e, next_timer_abs);
787 local_irq_restore(flags);
788
789 TS_LVLA_RELEASE_END;
790}
791#else /* else to CONFIG_MERGE_TIMERS */
792static enum hrtimer_restart ce_timer_function(struct hrtimer *timer)
793{
794 struct ce_dom_data *ce_data =
795 container_of(timer, struct ce_dom_data, timer);
796 unsigned long flags;
797 lt_t next_timer_abs;
798
799 TS_LVLA_RELEASE_START;
800
801 local_irq_save(flags);
802 next_timer_abs = __ce_timer_function(ce_data);
803 hrtimer_set_expires(timer, ns_to_ktime(next_timer_abs));
804 local_irq_restore(flags);
805
806 TS_LVLA_RELEASE_END;
807
808 return HRTIMER_RESTART;
809}
810#endif /* CONFIG_MERGE_TIMERS */
811
812
813/**
814 * mc_release_jobs() - Add heap of tasks to the system, check for preemptions.
815 */
816static void mc_release_jobs(rt_domain_t* rt, struct bheap* tasks)
817{
818 unsigned long flags;
819 struct task_struct *first = bheap_peek(rt->order, tasks)->value;
820 struct domain *dom = get_task_domain(first);
821
822 raw_spin_lock_irqsave(dom->lock, flags);
823 TRACE(TS "Jobs released\n", TA(first));
824 __merge_ready(rt, tasks);
825 check_for_preempt(dom);
826 raw_spin_unlock_irqrestore(dom->lock, flags);
827}
828
829/**
830 * ms_task_new() - Setup new mixed-criticality task.
831 * Assumes that there are no partitioned domains after level B.
832 */
833static void mc_task_new(struct task_struct *t, int on_rq, int running)
834{
835 unsigned long flags;
836 struct cpu_entry* entry;
837 enum crit_level level = tsk_mc_crit(t);
838 char name[TASK_COMM_LEN];
839 strcpy(name, "rtspin");
840
841 local_irq_save(flags);
842 TRACE("New mixed criticality task %d\n", t->pid);
843
844 /* Assign domain */
845 if (level < CRIT_LEVEL_C)
846 entry = &per_cpu(cpus, get_partition(t));
847 else
848 entry = &per_cpu(cpus, task_cpu(t));
849 t->rt_param._domain = entry->crit_entries[level].domain;
850
851 sched_trace_container_param(t->pid, name);
852 sched_trace_server_param(-t->pid, t->pid,
853 get_exec_cost(t), get_rt_period(t));
854
855 /* Setup job params */
856 release_at(t, litmus_clock());
857 tsk_mc_data(t)->mc_job.ghost_budget = 0;
858 tsk_mc_data(t)->mc_job.is_ghost = 0;
859 if (running) {
860 BUG_ON(entry->scheduled);
861 entry->scheduled = t;
862 tsk_rt(t)->scheduled_on = entry->cpu;
863 } else {
864 t->rt_param.scheduled_on = NO_CPU;
865 }
866 t->rt_param.linked_on = NO_CPU;
867
868
869 job_arrival(t);
870
871 local_irq_restore(flags);
872}
873
874/**
 875 * mc_task_wake_up() - Add task back into its domain and check for preemptions.
876 */
877static void mc_task_wake_up(struct task_struct *task)
878{
879 unsigned long flags;
880 lt_t now = litmus_clock();
881 local_irq_save(flags);
882
883 TRACE(TS " wakes up\n", TA(task));
884 if (is_tardy(task, now)) {
885 /* Task missed its last release */
886 release_at(task, now);
887 sched_trace_task_release(task);
888 }
889 if (!is_ghost(task))
890 job_arrival(task);
891
892 local_irq_restore(flags);
893}
894
895/**
896 * mc_task_block() - Remove task from state to prevent it being run anywhere.
897 */
898static void mc_task_block(struct task_struct *task)
899{
900 unsigned long flags;
901 local_irq_save(flags);
902 TRACE(TS " blocks\n", TA(task));
903 remove_from_all(task);
904 local_irq_restore(flags);
905}
906
907/**
908 * mc_task_exit() - Remove task from the system.
909 */
910static void mc_task_exit(struct task_struct *task)
911{
912 unsigned long flags;
913 local_irq_save(flags);
914 BUG_ON(!is_realtime(task));
915 TRACE(TS " RIP\n", TA(task));
916
917 remove_from_all(task);
918 if (tsk_rt(task)->scheduled_on != NO_CPU) {
919 per_cpu(cpus, tsk_rt(task)->scheduled_on).scheduled = NULL;
920 tsk_rt(task)->scheduled_on = NO_CPU;
921 }
922
923 if (CRIT_LEVEL_A == tsk_mc_crit(task))
924 mc_ce_task_exit_common(task);
925
926 local_irq_restore(flags);
927}
928
929/**
930 * mc_admit_task() - Return true if the task is valid.
931 * Assumes there are no partitioned levels after level B.
932 */
933static long mc_admit_task(struct task_struct* task)
934{
935 const enum crit_level crit = tsk_mc_crit(task);
936 long ret;
937 if (!tsk_mc_data(task)) {
938 printk(KERN_WARNING "Tried to admit task with no criticality "
939 "level\n");
940 ret = -EINVAL;
941 goto out;
942 }
943 if (crit < CRIT_LEVEL_C && get_partition(task) == NO_CPU) {
944 printk(KERN_WARNING "Tried to admit partitioned task with no "
945 "partition\n");
946 ret = -EINVAL;
947 goto out;
948 }
949 if (crit == CRIT_LEVEL_A) {
950 ret = mc_ce_admit_task_common(task);
951 if (ret)
952 goto out;
953 }
954 printk(KERN_INFO "Admitted task with criticality level %d\n",
955 tsk_mc_crit(task));
956 ret = 0;
957out:
958 return ret;
959}
960
961/**
962 * mc_schedule() - Return next task which should be scheduled.
963 */
964static struct task_struct* mc_schedule(struct task_struct* prev)
965{
966 unsigned long flags;
967 struct domain *dom;
968 struct crit_entry *ce;
969 struct cpu_entry* entry = &__get_cpu_var(cpus);
970 int i, out_of_time, sleep, preempt, exists, blocks, global, lower;
971 struct task_struct *dtask = NULL, *ready_task = NULL, *next = NULL;
972
973 local_irq_save(flags);
974
975 /* Litmus gave up because it couldn't access the stack of the CPU
976 * on which will_schedule was migrating from. Requeue it.
977 * This really only happens in VMs.
978 */
979 if (entry->will_schedule && entry->will_schedule != prev) {
980 entry->will_schedule->rt_param.scheduled_on = NO_CPU;
981 low_prio_arrival(entry->will_schedule);
982 }
983
984 raw_spin_lock(&entry->lock);
985
986 /* Sanity checking */
987 BUG_ON(entry->scheduled && entry->scheduled != prev);
988 BUG_ON(entry->scheduled && !is_realtime(prev));
989 BUG_ON(is_realtime(prev) && !entry->scheduled);
990
991 /* Determine state */
992 exists = entry->scheduled != NULL;
993 blocks = exists && !is_running(entry->scheduled);
994 out_of_time = exists && budget_enforced(entry->scheduled) &&
995 budget_exhausted(entry->scheduled);
996 sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
997 global = exists && is_global_task(entry->scheduled);
998 preempt = entry->scheduled != entry->linked;
999 lower = exists && preempt && entry->linked &&
1000 tsk_mc_crit(entry->scheduled) > tsk_mc_crit(entry->linked);
1001
1002 TRACE(TS " blocks:%d out_of_time:%d sleep:%d preempt:%d\n",
1003 TA(prev), blocks, out_of_time, sleep, preempt);
1004
1005 if (exists)
1006 prev->rt_param.scheduled_on = NO_CPU;
1007
1008 raw_spin_unlock(&entry->lock);
1009
1010
1011#ifdef CONFIG_PLUGIN_MC_REDIRECT
1012 if (smp_processor_id() == interrupt_cpu)
1013 fix_global_levels();
1014#endif
1015
1016 /* If a task blocks we have no choice but to reschedule */
1017 if (blocks)
1018 remove_from_all(entry->scheduled);
1019 /* Any task which exhausts its budget or sleeps waiting for its next
1020 * period completes unless its execution has been forcibly stopped.
1021 */
1022 if ((out_of_time || sleep) && !blocks)/* && !preempt)*/
1023 job_completion(entry->scheduled, !sleep);
1024 /* Global scheduled tasks must wait for a deschedule before they
1025 * can rejoin the global state. Rejoin them here.
1026 */
1027 else if (global && preempt && !blocks) {
1028 if (lower)
1029 low_prio_arrival(entry->scheduled);
1030 else
1031 job_arrival(entry->scheduled);
1032 }
1033
1034 /* Pick next task if none is linked */
1035 raw_spin_lock(&entry->lock);
1036 for (i = 0; i < NUM_CRIT_LEVELS && !entry->linked; i++) {
1037 ce = &entry->crit_entries[i];
1038 dom = ce->domain;
1039
1040 /* Swap locks. We cannot acquire a domain lock while
1041 * holding an entry lock or deadlocks will happen.
1042 */
1043 raw_spin_unlock(&entry->lock);
1044 raw_spin_lock(dom->lock);
1045
1046 /* Do domain stuff before grabbing CPU locks */
1047 dtask = dom->peek_ready(dom);
1048 fix_crit_position(ce);
1049
1050 raw_spin_lock(&entry->lock);
1051
1052 if (!entry->linked && !ce->linked && dtask && can_use(ce)) {
1053 dom->take_ready(dom);
1054 link_task_to_crit(ce, dtask);
1055 update_crit_position(ce);
1056 ready_task = (is_ghost(dtask)) ? NULL : dtask;
1057
1058 /* Task found! */
1059 if (ready_task) {
1060 link_task_to_cpu(entry, ready_task);
1061 raw_spin_unlock(dom->lock);
1062 update_crit_levels(entry);
1063 raw_spin_lock(&entry->lock);
1064 continue;
1065 }
1066 }
1067 raw_spin_unlock(dom->lock);
1068 }
1069
1070 /* Schedule next task */
1071 next = entry->linked;
1072 if (entry->linked)
1073 entry->linked->rt_param.scheduled_on = entry->cpu;
1074 entry->will_schedule = entry->linked;
1075 sched_state_task_picked();
1076
1077 raw_spin_unlock(&entry->lock);
1078 local_irq_restore(flags);
1079 if (next) {
1080 TRACE_MC_TASK(next, "Picked this task\n");
1081 } else if (exists && !next)
1082 TRACE_ENTRY(entry, "Becomes idle at %llu\n", litmus_clock());
1083 return next;
1084}
1085
1086void mc_finish_switch(struct task_struct *prev)
1087{
1088 struct cpu_entry* entry = &__get_cpu_var(cpus);
1089 entry->scheduled = is_realtime(current) ? current : NULL;
 1090 TRACE_TASK(prev, "Switched away; now scheduled: " TS "\n",
1091 TA(entry->scheduled));
1092}
1093
1094/*
1095 * This is the plugin's release at function, called by the release task-set
1096 * system call. Other places in the file use the generic LITMUS release_at(),
1097 * which is not this.
1098 */
1099void mc_release_at(struct task_struct *ts, lt_t start)
1100{
1101 /* hack so that we can have CE timers start at the right time */
1102 if (CRIT_LEVEL_A == tsk_mc_crit(ts))
1103 mc_ce_release_at_common(ts, start);
1104 else
1105 release_at(ts, start);
1106}
1107
1108long mc_deactivate_plugin(void)
1109{
1110 return mc_ce_deactivate_plugin_common();
1111}
1112
1113/* **************************************************************************
1114 * Initialization
1115 * ************************************************************************** */
1116
1117/* Initialize values here so that they are allocated with the module
1118 * and destroyed when the module is unloaded.
1119 */
1120
1121/* LVL-A */
1122DEFINE_PER_CPU(struct domain_data, _mc_crit_a);
1123DEFINE_PER_CPU(raw_spinlock_t, _mc_crit_a_lock);
1124DEFINE_PER_CPU(struct ce_dom_data, _mc_crit_a_ce_data);
1125/* LVL-B */
1126DEFINE_PER_CPU(struct domain_data, _mc_crit_b);
1127DEFINE_PER_CPU(rt_domain_t, _mc_crit_b_rt);
1128/* LVL-C */
1129static struct domain_data _mc_crit_c;
1130static rt_domain_t _mc_crit_c_rt;
1131struct bheap _mc_heap_c;
1132struct bheap_node _mc_nodes_c[NR_CPUS];
1133
1134static long mc_activate_plugin(void)
1135{
1136 struct domain_data *dom_data;
1137 struct domain *dom;
1138 struct domain_data *our_domains[NR_CPUS];
1139 int cpu, n = 0;
1140 long ret;
1141
1142#ifdef CONFIG_RELEASE_MASTER
1143 interrupt_cpu = atomic_read(&release_master_cpu);
1144#if defined(CONFIG_PLUGIN_MC_REDIRECT) || defined(CONFIG_PLUGIN_MC_RELEASE_MASTER)
1145 if (NO_CPU == interrupt_cpu) {
1146 printk(KERN_ERR "LITMUS-MC: need a release master\n");
1147 ret = -EINVAL;
1148 goto out;
1149 }
1150#endif
1151#endif
1152
1153 for_each_online_cpu(cpu) {
1154 BUG_ON(NR_CPUS <= n);
1155 dom = per_cpu(cpus, cpu).crit_entries[CRIT_LEVEL_A].domain;
1156 dom_data = domain_data(dom);
1157 our_domains[cpu] = dom_data;
1158#if defined(CONFIG_MERGE_TIMERS) && defined(CONFIG_PLUGIN_MC_RELEASE_MASTER)
1159 per_cpu(cpus, cpu).event_group =
1160 get_event_group_for(interrupt_cpu);
1161#elif defined(CONFIG_MERGE_TIMERS) && !defined(CONFIG_PLUGIN_MC_RELEASE_MASTER)
1162 per_cpu(cpus, cpu).event_group = get_event_group_for(cpu);
1163#endif
1164 n++;
1165 }
1166 ret = mc_ce_set_domains(n, our_domains);
1167 if (ret)
1168 goto out;
1169 ret = mc_ce_activate_plugin_common();
1170out:
1171 return ret;
1172}
1173
1174
1175static void mc_release_ts(lt_t time)
1176{
1177 int i, cpu, base_id = 0, cont_id = -1;
1178 char name[TASK_COMM_LEN];
1179 enum crit_level level;
1180 struct cpu_entry *entry;
1181 struct crit_entry *ce;
1182
1183 level = CRIT_LEVEL_A;
1184 strcpy(name, "LVL-A");
1185 for_each_online_cpu(cpu) {
1186 entry = &per_cpu(cpus, cpu);
1187 trace_litmus_container_param(++cont_id, &name);
1188 ce = &entry->crit_entries[level];
1189 sched_trace_server_param(sid(ce), cont_id, 0, 0);
1190 }
1191
1192 level = CRIT_LEVEL_B;
1193 strcpy(name, "LVL-B");
1194 for_each_online_cpu(cpu) {
1195 entry = &per_cpu(cpus, cpu);
1196 trace_litmus_container_param(++cont_id, &name);
1197 ce = &entry->crit_entries[level];
1198 sched_trace_server_param(sid(ce), cont_id, 0, 0);
1199 }
1200
1201 level = CRIT_LEVEL_C;
1202 strcpy(name, "LVL-C");
1203 trace_litmus_container_param(++cont_id, &name);
1204 for_each_online_cpu(cpu) {
1205 entry = &per_cpu(cpus, cpu);
1206 ce = &entry->crit_entries[level];
1207 sched_trace_server_param(sid(ce), cont_id, 0, 0);
1208 }
1209
1210
1211
1212}
1213
1214static struct sched_plugin mc_plugin __cacheline_aligned_in_smp = {
1215 .plugin_name = "MC",
1216 .task_new = mc_task_new,
1217 .complete_job = complete_job,
1218 .task_exit = mc_task_exit,
1219 .schedule = mc_schedule,
1220 .task_wake_up = mc_task_wake_up,
1221 .task_block = mc_task_block,
1222 .admit_task = mc_admit_task,
1223 .activate_plugin = mc_activate_plugin,
1224 .release_at = mc_release_at,
1225 .deactivate_plugin = mc_deactivate_plugin,
1226 .finish_switch = mc_finish_switch,
1227 .release_ts = mc_release_ts,
1228};
1229
1230static void init_crit_entry(struct crit_entry *ce, enum crit_level level,
1231 struct domain_data *dom_data,
1232 struct bheap_node *node)
1233{
1234 ce->level = level;
1235 ce->linked = NULL;
1236 ce->node = node;
1237 ce->domain = &dom_data->domain;
1238 ce->state = CS_ACTIVE;
1239#ifdef CONFIG_MERGE_TIMERS
1240 init_event(&ce->event, level, mc_ghost_exhausted,
1241 event_list_alloc(GFP_ATOMIC));
1242#else
1243 hrtimer_init(&ce->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1244 ce->timer.function = mc_ghost_exhausted;
1245#endif
1246
1247}
1248
1249static void init_local_domain(struct cpu_entry *entry, struct domain_data *dom_data,
1250 enum crit_level level)
1251{
1252 dom_data->heap = NULL;
1253 dom_data->crit_entry = &entry->crit_entries[level];
1254 init_crit_entry(dom_data->crit_entry, level, dom_data, NULL);
1255}
1256
1257static void init_global_domain(struct domain_data *dom_data, enum crit_level level,
1258 struct bheap *heap, struct bheap_node *nodes)
1259{
1260 int cpu;
1261 struct cpu_entry *entry;
1262 struct crit_entry *ce;
1263 struct bheap_node *node;
1264
1265 dom_data->crit_entry = NULL;
1266 dom_data->heap = heap;
1267 bheap_init(heap);
1268
1269 for_each_online_cpu(cpu) {
1270 entry = &per_cpu(cpus, cpu);
1271 node = &nodes[cpu];
1272 ce = &entry->crit_entries[level];
1273 init_crit_entry(ce, level, dom_data, node);
1274 bheap_node_init(&ce->node, ce);
1275 bheap_insert(cpu_lower_prio, heap, node);
1276 }
1277}
1278
1279static inline void init_edf_domain(struct domain *dom, rt_domain_t *rt,
1280 enum crit_level prio, int is_partitioned, int cpu)
1281{
1282 pd_domain_init(dom, rt, edf_ready_order, NULL,
1283 mc_release_jobs, mc_preempt_needed,
1284 edf_higher_prio);
1285 rt->level = prio;
1286#if defined(CONFIG_PLUGIN_MC_RELEASE_MASTER) && defined(CONFIG_MERGE_TIMERS)
1287 /* All timers are on one CPU and release-master is using the event
1288 * merging interface as well. */
1289 BUG_ON(NO_CPU == interrupt_cpu);
1290 rt->event_group = get_event_group_for(interrupt_cpu);
1291 rt->prio = prio;
1292#elif defined(CONFIG_PLUGIN_MC_RELEASE_MASTER) && !defined(CONFIG_MERGE_TIMERS)
1293 /* Using release master, but not merging timers. */
1294 rt->release_master = interrupt_cpu;
1295#elif !defined(CONFIG_PLUGIN_MC_RELEASE_MASTER) && defined(CONFIG_MERGE_TIMERS)
1296 /* Merge the timers, but don't move them to the release master. */
1297 if (is_partitioned) {
1298 rt->event_group = get_event_group_for(cpu);
1299 } else {
1300 /* Global timers are added to the event group of whichever CPU
1301 * the code is executing on when add_event() is called.
1302 */
1303 rt->event_group = NULL;
1304 }
1305 rt->prio = prio;
1306#endif
1307}
1308
1309struct domain_data *ce_domain_for(int);
1310static int __init init_mc(void)
1311{
1312 int cpu;
1313 struct cpu_entry *entry;
1314 struct domain_data *dom_data;
1315 rt_domain_t *rt;
1316 raw_spinlock_t *a_dom_lock, *b_dom_lock, *c_dom_lock; /* For lock debugger */
1317 struct ce_dom_data *ce_data;
1318
1319 for_each_online_cpu(cpu) {
1320 entry = &per_cpu(cpus, cpu);
1321
1322 /* CPU */
1323 entry->cpu = cpu;
1324 entry->scheduled = NULL;
1325 entry->linked = NULL;
1326
1327 raw_spin_lock_init(&entry->lock);
1328
1329#ifdef CONFIG_PLUGIN_MC_REDIRECT
1330 raw_spin_lock_init(&entry->redir_lock);
1331 INIT_LIST_HEAD(&entry->redir);
1332#endif
1333
1334 /* CRIT_LEVEL_A */
1335 dom_data = &per_cpu(_mc_crit_a, cpu);
1336 ce_data = &per_cpu(_mc_crit_a_ce_data, cpu);
1337 a_dom_lock = &per_cpu(_mc_crit_a_lock, cpu);
1338 raw_spin_lock_init(a_dom_lock);
1339 ce_domain_init(&dom_data->domain,
1340 a_dom_lock, ce_requeue, ce_peek_and_take_ready,
1341 ce_peek_and_take_ready, mc_preempt_needed,
1342 ce_higher_prio, ce_data, cpu,
1343 ce_timer_function);
1344 init_local_domain(entry, dom_data, CRIT_LEVEL_A);
1345 dom_data->domain.name = "LVL-A";
1346
1347 /* CRIT_LEVEL_B */
1348 dom_data = &per_cpu(_mc_crit_b, cpu);
1349 rt = &per_cpu(_mc_crit_b_rt, cpu);
1350 init_local_domain(entry, dom_data, CRIT_LEVEL_B);
1351 init_edf_domain(&dom_data->domain, rt, CRIT_LEVEL_B, 1, cpu);
1352 b_dom_lock = dom_data->domain.lock;
1353 raw_spin_lock_init(b_dom_lock);
1354 dom_data->domain.name = "LVL-B";
1355 }
1356
1357 /* CRIT_LEVEL_C */
1358 init_global_domain(&_mc_crit_c, CRIT_LEVEL_C,
1359 &_mc_heap_c, _mc_nodes_c);
1360 init_edf_domain(&_mc_crit_c.domain, &_mc_crit_c_rt, CRIT_LEVEL_C,
1361 0, NO_CPU);
1362 c_dom_lock = _mc_crit_c.domain.lock;
1363 raw_spin_lock_init(c_dom_lock);
1364 _mc_crit_c.domain.name = "LVL-C";
1365
1366 return register_sched_plugin(&mc_plugin);
1367}
1368
1369module_init(init_mc);
diff --git a/litmus/sched_mc_ce.c b/litmus/sched_mc_ce.c
new file mode 100644
index 000000000000..702b46da93d5
--- /dev/null
+++ b/litmus/sched_mc_ce.c
@@ -0,0 +1,1052 @@
1/**
2 * litmus/sched_mc_ce.c
3 *
4 * The Cyclic Executive (CE) scheduler used by the mixed criticality scheduling
5 * algorithm.
6 */
7
8#include <asm/atomic.h>
9#include <asm/uaccess.h>
10
11#include <linux/module.h>
12#include <linux/percpu.h>
13#include <linux/hrtimer.h>
14#include <linux/pid.h>
15#include <linux/sched.h>
16#include <linux/proc_fs.h>
17
18#include <litmus/litmus.h>
19#include <litmus/sched_plugin.h>
20#include <litmus/rt_domain.h>
21#include <litmus/rt_param.h>
22#include <litmus/litmus_proc.h>
23#include <litmus/sched_trace.h>
24#include <litmus/jobs.h>
25#include <litmus/sched_mc.h>
26#include <litmus/ce_domain.h>
27
28static struct sched_plugin mc_ce_plugin __cacheline_aligned_in_smp;
29
30#define using_linux_plugin() (litmus == &linux_sched_plugin)
31
32/* get a reference to struct domain for a CPU */
33#define get_domain_for(cpu) (&per_cpu(domains, cpu)->domain)
34
35#define get_pid_table(cpu) (&per_cpu(ce_pid_table, cpu))
36#define get_pid_entry(cpu, idx) (&(get_pid_table(cpu)->entries[idx]))
37
38static atomic_t start_time_set = ATOMIC_INIT(-1);
39static atomic64_t start_time = ATOMIC64_INIT(0);
40static struct proc_dir_entry *mc_ce_dir = NULL, *ce_file = NULL;
41
42/*
43 * Cache the budget along with the struct PID for a task so that we don't need
44 * to fetch its task_struct every time we check to see what should be
45 * scheduled.
46 */
47struct ce_pid_entry {
48 struct pid *pid;
49 lt_t budget;
50 /* accumulated (summed) budgets, including this one */
51 lt_t acc_time;
52 unsigned int expected_job;
53};
54
55/*
56 * Each CPU needs a mapping of level A ID (integer) to struct pid so that we
57 * can get its task struct.
58 */
59struct ce_pid_table {
60 struct ce_pid_entry entries[CONFIG_PLUGIN_MC_LEVEL_A_MAX_TASKS];
61 int num_pid_entries;
62 lt_t cycle_time;
63};
64
65DEFINE_PER_CPU(struct ce_pid_table, ce_pid_table);
66
67/*
68 * How we get the domain for a given CPU locally. Set with the
69 * mc_ce_set_domains function. Must be done before activating plugins. Be
70 * careful when using domains as a variable elsewhere in this file.
71 */
72
73DEFINE_PER_CPU(struct domain_data*, domains);
74
75/*
76 * The domains and other data used by the MC-CE plugin when it runs alone.
77 */
78DEFINE_PER_CPU(struct domain_data, _mc_ce_doms);
79DEFINE_PER_CPU(struct ce_dom_data, _mc_ce_dom_data);
80DEFINE_PER_CPU(raw_spinlock_t, _mc_ce_dom_locks);
81
82#ifdef CONFIG_PLUGIN_MC_RELEASE_MASTER
83static int interrupt_cpu;
84#endif
85
86long mc_ce_set_domains(const int n, struct domain_data *domains_in[])
87{
88 const int max = (NR_CPUS < n) ? NR_CPUS : n;
89 struct domain_data *new_dom = NULL;
90 int i, ret;
91 if (!using_linux_plugin()) {
92 printk(KERN_WARNING "can't set MC-CE domains when not using "
93 "Linux scheduler.\n");
94 ret = -EINVAL;
95 goto out;
96 }
97 for (i = 0; i < max; ++i) {
98 new_dom = domains_in[i];
99 per_cpu(domains, i) = new_dom;
100 }
101 ret = 0;
102out:
103 return ret;
104}
105
106unsigned int mc_ce_get_expected_job(const int cpu, const int idx)
107{
108 const struct ce_pid_table *pid_table = get_pid_table(cpu);
109 BUG_ON(0 > cpu);
110 BUG_ON(0 > idx);
111 BUG_ON(pid_table->num_pid_entries <= idx);
112 return pid_table->entries[idx].expected_job;
113}
114
115/*
116 * Get the offset into the cycle taking the start time into account.
117 */
118static inline lt_t get_cycle_offset(const lt_t when, const lt_t cycle_time)
119{
120 long long st = atomic64_read(&start_time);
121 lt_t offset = (when - st) % cycle_time;
122 TRACE("when: %llu cycle_time: %llu start_time: %lld offset %llu\n",
123 when, cycle_time, st, offset);
124 return offset;
125}
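
As a quick sanity check of the modulo arithmetic above, a standalone user-space sketch; the start time, cycle time and "now" values are made up:

#include <stdio.h>

typedef unsigned long long lt_t;	/* LITMUS time values are in ns */

int main(void)
{
	lt_t start_time = 1000000000ULL;	/* cycle origin, assumed     */
	lt_t cycle_time =   30000000ULL;	/* 30 ms major cycle         */
	lt_t when       = 1077000000ULL;	/* "now", 77 ms after origin */
	lt_t offset     = (when - start_time) % cycle_time;

	/* 77 ms past the origin is 17 ms into the third cycle */
	printf("offset = %llu ns\n", offset);	/* prints 17000000 */
	return 0;
}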
126
127/*
128 * The user-land job completion call sets the RT_F_SLEEP flag and then
129 * calls schedule(); this function runs when schedule() puts that task to sleep.
130 *
131 * Do not call prepare_for_next_period on Level-A tasks!
132 */
133static void mc_ce_job_completion(struct domain *dom, struct task_struct *ts)
134{
135 const int cpu = task_cpu(ts);
136 const int idx = tsk_mc_data(ts)->mc_task.lvl_a_id;
137 const struct ce_pid_entry *pid_entry = get_pid_entry(cpu, idx);
138 unsigned int just_finished;
139
140 TRACE_TASK(ts, "Completed\n");
141
142 /* sched_trace_task_completion(ts, 0); */
143 /* post-increment is important here */
144 just_finished = (tsk_rt(ts)->job_params.job_no)++;
145
146 /* Job completes in expected window: everything is normal.
147 * Job completes in an earlier window: BUG(), that's wrong.
148 * Job completes in a later window: The job is behind.
149 */
150 if (just_finished < pid_entry->expected_job) {
151 /* this job is already released because it's running behind */
152 set_rt_flags(ts, RT_F_RUNNING);
153 TRACE_TASK(ts, "appears behind: the expected job is %u but "
154 "job %u just completed\n",
155 pid_entry->expected_job, just_finished);
156 } else if (pid_entry->expected_job < just_finished) {
157 printk(KERN_CRIT "job %u completed but the expected job is only "
158 "%u, which is too early\n", just_finished,
159 pid_entry->expected_job);
160 BUG();
161 }
162}
163
164
165/*
166 * Return the index into the PID entries table of what to schedule next.
167 * Don't call if the table is empty. Assumes the caller has the domain lock.
168 * The offset parameter is the offset into the cycle.
169 *
170 * TODO Currently O(n) in the number of tasks on the CPU. Binary search?
171 */
172static int mc_ce_schedule_at(const struct domain *dom, lt_t offset)
173{
174 const struct ce_dom_data *ce_data = dom->data;
175 struct ce_pid_table *pid_table = get_pid_table(ce_data->cpu);
176 const struct ce_pid_entry *pid_entry = NULL;
177 int idx;
178
179 BUG_ON(pid_table->cycle_time < 1);
180 BUG_ON(pid_table->num_pid_entries < 1);
181
182 for (idx = 0; idx < pid_table->num_pid_entries; ++idx) {
183 pid_entry = &pid_table->entries[idx];
184 if (offset < pid_entry->acc_time) {
185 /* found task to schedule in this window */
186 break;
187 }
188 }
189 /* can only happen if cycle_time is not right */
190 BUG_ON(pid_entry->acc_time > pid_table->cycle_time);
191 TRACE("schedule at returning task %d for CPU %d\n", idx, ce_data->cpu);
192 return idx;
193}
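
Since acc_time is a non-decreasing prefix sum of the slot budgets, the TODO above could be addressed by a binary search for the first entry whose acc_time exceeds the offset. A hedged sketch against the same ce_pid_table layout (not part of this patch; assumes at least one entry and offset < cycle_time, as the BUG_ONs above already guarantee):

/* Hypothetical O(log n) replacement for the linear scan in
 * mc_ce_schedule_at(): return the first index with offset < acc_time. */
static int mc_ce_schedule_at_bsearch(const struct ce_pid_table *pid_table,
				     lt_t offset)
{
	int lo = 0, hi = pid_table->num_pid_entries - 1, mid, idx = hi;

	while (lo <= hi) {
		mid = lo + (hi - lo) / 2;
		if (offset < pid_table->entries[mid].acc_time) {
			idx = mid;	/* candidate window; keep looking left */
			hi = mid - 1;
		} else {
			lo = mid + 1;	/* offset lies past this window */
		}
	}
	return idx;
}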
194
195static struct task_struct *mc_ce_schedule(struct task_struct *prev)
196{
197 struct domain *dom = get_domain_for(smp_processor_id());
198 struct ce_dom_data *ce_data = dom->data;
199 struct task_struct *next = NULL;
200 int exists, sleep, should_sched_exists, should_sched_blocked,
201 should_sched_asleep;
202
203 raw_spin_lock(dom->lock);
204
205 /* sanity checking */
206 BUG_ON(ce_data->scheduled && ce_data->scheduled != prev);
207 BUG_ON(ce_data->scheduled && !is_realtime(prev));
208 BUG_ON(is_realtime(prev) && !ce_data->scheduled);
209
210 exists = NULL != ce_data->scheduled;
211 sleep = exists && RT_F_SLEEP == get_rt_flags(ce_data->scheduled);
212
213 TRACE("exists: %d, sleep: %d\n", exists, sleep);
214
215 if (sleep)
216 mc_ce_job_completion(dom, ce_data->scheduled);
217
218 /* these checks must go after the call to mc_ce_job_completion in case
219 * a late task needs to be scheduled again right away and it's the only
220 * task on a core
221 */
222 should_sched_exists = NULL != ce_data->should_schedule;
223 should_sched_blocked = should_sched_exists &&
224 !is_running(ce_data->should_schedule);
225 should_sched_asleep = should_sched_exists &&
226 RT_F_SLEEP == get_rt_flags(ce_data->should_schedule);
227
228 TRACE("should_sched_exists: %d, should_sched_blocked: %d, "
229 "should_sched_asleep: %d\n", should_sched_exists,
230 should_sched_blocked, should_sched_asleep);
231
232 if (should_sched_exists && !should_sched_blocked &&
233 !should_sched_asleep) {
234 /*
235 * schedule the task that should be executing in the cyclic
236 * schedule if it is not blocked and not sleeping
237 */
238 next = ce_data->should_schedule;
239 }
240 sched_state_task_picked();
241 raw_spin_unlock(dom->lock);
242 return next;
243}
244
245static void mc_ce_finish_switch(struct task_struct *prev)
246{
247 struct domain *dom = get_domain_for(smp_processor_id());
248 struct ce_dom_data *ce_data = dom->data;
249
250 TRACE("finish switch\n");
251
252 if (is_realtime(current) && CRIT_LEVEL_A == tsk_mc_crit(current))
253 ce_data->scheduled = current;
254 else
255 ce_data->scheduled = NULL;
256}
257
258/*
259 * The admit-task callback checks whether this task is permitted to enter the system.
260 * Here we look up the task's PID structure and save it in the proper slot on
261 * the CPU this task will run on.
262 */
263long mc_ce_admit_task_common(struct task_struct *ts)
264{
265 struct domain *dom = get_domain_for(get_partition(ts));
266 struct ce_dom_data *ce_data = dom->data;
267 struct mc_data *mcd = tsk_mc_data(ts);
268 struct pid *pid = NULL;
269 long retval = -EINVAL;
270 const int lvl_a_id = mcd->mc_task.lvl_a_id;
271 struct ce_pid_table *pid_table = get_pid_table(ce_data->cpu);
272
273 BUG_ON(get_partition(ts) != ce_data->cpu);
274
275 /* check the task has migrated to the right CPU (like in sched_cedf) */
276 if (task_cpu(ts) != get_partition(ts)) {
277 printk(KERN_INFO "litmus: %d admitted on CPU %d but wants %d\n",
278 ts->pid, task_cpu(ts), get_partition(ts));
279 goto out;
280 }
281
282 /* only level A tasks can be CE */
283 if (!mcd || CRIT_LEVEL_A != tsk_mc_crit(ts)) {
284 printk(KERN_INFO "litmus: non-MC or non level A task %d\n",
285 ts->pid);
286 goto out;
287 }
288
289 /* try and get the task's PID structure */
290 pid = get_task_pid(ts, PIDTYPE_PID);
291 if (IS_ERR_OR_NULL(pid)) {
292 printk(KERN_INFO "litmus: couldn't get pid struct for %d\n",
293 ts->pid);
294 goto out;
295 }
296
297 if (lvl_a_id >= pid_table->num_pid_entries) {
298 printk(KERN_INFO "litmus: level A id exceeds the expected "
299 "number of tasks %d for task %d on cpu %d\n",
300 pid_table->num_pid_entries, ts->pid,
301 get_partition(ts));
302 goto out_put_pid;
303 }
304 if (pid_table->entries[lvl_a_id].pid) {
305 printk(KERN_INFO "litmus: have saved pid info id: %d cpu: %d\n",
306 lvl_a_id, get_partition(ts));
307 goto out_put_pid;
308 }
309 if (get_exec_cost(ts) >= pid_table->entries[lvl_a_id].budget) {
310 printk(KERN_INFO "litmus: execution cost %llu is larger than "
311 "the budget %llu\n",
312 get_exec_cost(ts),
313 pid_table->entries[lvl_a_id].budget);
314 goto out_put_pid;
315 }
316 pid_table->entries[lvl_a_id].pid = pid;
317 retval = 0;
318 /* don't call put_pid if we are successful */
319 goto out;
320
321out_put_pid:
322 put_pid(pid);
323out:
324 return retval;
325}
326
327static long mc_ce_admit_task(struct task_struct *ts)
328{
329 struct domain *dom = get_domain_for(get_partition(ts));
330 unsigned long flags, retval;
331 raw_spin_lock_irqsave(dom->lock, flags);
332 retval = mc_ce_admit_task_common(ts);
333 raw_spin_unlock_irqrestore(dom->lock, flags);
334 return retval;
335}
336
337/*
338 * Called to set up a new real-time task (after the admit_task callback).
339 * At this point the task's struct PID is already hooked up on the destination
340 * CPU. The task may already be running.
341 */
342static void mc_ce_task_new(struct task_struct *ts, int on_rq, int running)
343{
344 const int cpu = task_cpu(ts);
345 struct domain *dom = get_domain_for(cpu);
346 struct ce_dom_data *ce_data = dom->data;
347 struct ce_pid_table *pid_table = get_pid_table(cpu);
348 struct pid *pid_should_be_running;
349 struct ce_pid_entry *pid_entry;
350 unsigned long flags;
351 int idx, should_be_running;
352 lt_t offset;
353
354 raw_spin_lock_irqsave(dom->lock, flags);
355 pid_entry = get_pid_entry(cpu, tsk_mc_data(ts)->mc_task.lvl_a_id);
356 /* initialize some task state */
357 set_rt_flags(ts, RT_F_RUNNING);
358
359 /* have to call mc_ce_schedule_at because the task only gets a PID
360 * entry after calling admit_task */
361 offset = get_cycle_offset(litmus_clock(), pid_table->cycle_time);
362 idx = mc_ce_schedule_at(dom, offset);
363 pid_should_be_running = get_pid_entry(cpu, idx)->pid;
364 rcu_read_lock();
365 should_be_running = (ts == pid_task(pid_should_be_running, PIDTYPE_PID));
366 rcu_read_unlock();
367 if (running) {
368 /* admit task checks that the task is not on the wrong CPU */
369 BUG_ON(task_cpu(ts) != get_partition(ts));
370 BUG_ON(ce_data->scheduled);
371 ce_data->scheduled = ts;
372
373 if (should_be_running)
374 ce_data->should_schedule = ts;
375 else
376 preempt_if_preemptable(ce_data->scheduled, ce_data->cpu);
377 } else if (!running && should_be_running) {
378 ce_data->should_schedule = ts;
379 preempt_if_preemptable(ce_data->scheduled, ce_data->cpu);
380 }
381 raw_spin_unlock_irqrestore(dom->lock, flags);
382}
383
384/*
385 * Called to re-introduce a task after blocking.
386 * Can potentially be called multiple times.
387 */
388static void mc_ce_task_wake_up(struct task_struct *ts)
389{
390 struct domain *dom = get_domain_for(get_partition(ts));
391 struct ce_dom_data *ce_data = dom->data;
392 unsigned long flags;
393
394 TRACE_TASK(ts, "wake up\n");
395
396 raw_spin_lock_irqsave(dom->lock, flags);
397 if (ts == ce_data->should_schedule && ts != ce_data->scheduled)
398 preempt_if_preemptable(ts, ce_data->cpu);
399 raw_spin_unlock_irqrestore(dom->lock, flags);
400}
401
402/*
403 * Called to notify the plugin of a blocking real-time task. Only called for
404 * real-time tasks and before schedule is called.
405 */
406static void mc_ce_task_block(struct task_struct *ts)
407{
408 /* nothing to do because it will be taken care of in schedule */
409 TRACE_TASK(ts, "blocked\n");
410}
411
412/*
413 * Called when a task switches from RT mode back to normal mode.
414 */
415void mc_ce_task_exit_common(struct task_struct *ts)
416{
417 struct domain *dom = get_domain_for(get_partition(ts));
418 struct ce_dom_data *ce_data = dom->data;
419 unsigned long flags;
420 struct pid *pid;
421 const int lvl_a_id = tsk_mc_data(ts)->mc_task.lvl_a_id;
422 struct ce_pid_table *pid_table = get_pid_table(ce_data->cpu);
423
424 BUG_ON(CRIT_LEVEL_A != tsk_mc_crit(ts));
425 BUG_ON(lvl_a_id >= pid_table->num_pid_entries);
426
427 raw_spin_lock_irqsave(dom->lock, flags);
428 pid = pid_table->entries[lvl_a_id].pid;
429 BUG_ON(!pid);
430 put_pid(pid);
431 pid_table->entries[lvl_a_id].pid = NULL;
432 if (ce_data->scheduled == ts)
433 ce_data->scheduled = NULL;
434 if (ce_data->should_schedule == ts)
435 ce_data->should_schedule = NULL;
436 raw_spin_unlock_irqrestore(dom->lock, flags);
437}
438
439/***********************************************************
440 * Timer stuff
441 **********************************************************/
442
443/*
444 * Returns the next absolute time that the timer should fire.
445 */
446lt_t mc_ce_timer_callback_common(struct domain *dom)
447{
448 /* relative and absolute times for cycles */
449 lt_t now, offset_rel, cycle_start_abs, next_timer_abs;
450 struct task_struct *should_schedule;
451 struct ce_pid_table *pid_table;
452 struct ce_pid_entry *pid_entry;
453 struct ce_dom_data *ce_data;
454 int idx, budget_overrun;
455
456 ce_data = dom->data;
457 pid_table = get_pid_table(ce_data->cpu);
458
459 /* Based off of the current time, figure out the offset into the cycle
460 * and the cycle's start time, and determine what should be scheduled.
461 */
462 now = litmus_clock();
463 offset_rel = get_cycle_offset(now, pid_table->cycle_time);
464 cycle_start_abs = now - offset_rel;
465 idx = mc_ce_schedule_at(dom, offset_rel);
466 pid_entry = get_pid_entry(ce_data->cpu, idx);
467 next_timer_abs = cycle_start_abs + pid_entry->acc_time;
468
469 STRACE("timer: now: %llu offset_rel: %llu cycle_start_abs: %llu "
470 "next_timer_abs: %llu\n", now, offset_rel,
471 cycle_start_abs, next_timer_abs);
472
473 /* get the task_struct (pid_task can accept a NULL) */
474 rcu_read_lock();
475 should_schedule = pid_task(pid_entry->pid, PIDTYPE_PID);
476 rcu_read_unlock();
477 ce_data->should_schedule = should_schedule;
478
479 if (should_schedule && 0 == atomic_read(&start_time_set)) {
480 /*
481 * If jobs are not overrunning their budgets, the expected
482 * job number should match the task's current job number.
483 */
484 pid_entry->expected_job++;
485 budget_overrun = pid_entry->expected_job !=
486 tsk_rt(should_schedule)->job_params.job_no;
487 if (budget_overrun)
488 TRACE_MC_TASK(should_schedule,
489 "timer expected job number: %u "
490 "but current job: %u\n",
491 pid_entry->expected_job,
492 tsk_rt(should_schedule)->job_params.job_no);
493 }
494
495 if (ce_data->should_schedule) {
496 tsk_rt(should_schedule)->job_params.deadline =
497 cycle_start_abs + pid_entry->acc_time;
498 tsk_rt(should_schedule)->job_params.release =
499 tsk_rt(should_schedule)->job_params.deadline -
500 pid_entry->budget;
501 tsk_rt(should_schedule)->job_params.exec_time = 0;
502 /* sched_trace_task_release(should_schedule); */
503 set_rt_flags(ce_data->should_schedule, RT_F_RUNNING);
504 }
505 return next_timer_abs;
506}
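
To make the cycle arithmetic above concrete, a standalone user-space sketch with an assumed three-slot table (budgets of 5, 10 and 15 ms, so acc_time = {5, 15, 30} ms and a 30 ms cycle):

#include <stdio.h>

typedef unsigned long long lt_t;

int main(void)
{
	lt_t acc_time[] = { 5000000ULL, 15000000ULL, 30000000ULL };
	lt_t cycle_time = 30000000ULL;
	lt_t start_time = 0, now = 97000000ULL;			/* 97 ms, assumed */
	lt_t offset_rel = (now - start_time) % cycle_time;	/*  7 ms */
	lt_t cycle_start_abs = now - offset_rel;		/* 90 ms */
	int idx = 0;

	while (offset_rel >= acc_time[idx])	/* same rule as mc_ce_schedule_at */
		idx++;

	/* slot 1 owns offsets [5, 15) ms; its window ends at 90 + 15 = 105 ms */
	printf("idx = %d, next_timer_abs = %llu ns\n",
	       idx, cycle_start_abs + acc_time[idx]);
	return 0;
}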
507
508/*
509 * What to do when a timer fires. The timer should only be armed if the number
510 * of PID entries is positive.
511 */
512#ifdef CONFIG_MERGE_TIMERS
513static void mc_ce_timer_callback(struct rt_event *e)
514#else
515static enum hrtimer_restart mc_ce_timer_callback(struct hrtimer *timer)
516#endif
517{
518 struct ce_dom_data *ce_data;
519 unsigned long flags;
520 struct domain *dom;
521 lt_t next_timer_abs;
522#ifdef CONFIG_MERGE_TIMERS
523 struct event_group *event_group;
524 ce_data = container_of(e, struct ce_dom_data, event);
525 /* use the same CPU the callback is executing on by passing NO_CPU */
526 event_group = get_event_group_for(NO_CPU);
527#else /* CONFIG_MERGE_TIMERS */
528 ce_data = container_of(timer, struct ce_dom_data, timer);
529#endif
530 dom = get_domain_for(ce_data->cpu);
531
532 TRACE("timer callback on CPU %d (before lock)\n", ce_data->cpu);
533
534 raw_spin_lock_irqsave(dom->lock, flags);
535 next_timer_abs = mc_ce_timer_callback_common(dom);
536
537 /* setup an event or timer for the next release in the CE schedule */
538#ifdef CONFIG_MERGE_TIMERS
539 add_event(event_group, e, next_timer_abs);
540#else
541 hrtimer_set_expires(timer, ns_to_ktime(next_timer_abs));
542#endif
543
544 if (ce_data->scheduled != ce_data->should_schedule)
545 preempt_if_preemptable(ce_data->scheduled, ce_data->cpu);
546
547 raw_spin_unlock_irqrestore(dom->lock, flags);
548
549#ifndef CONFIG_MERGE_TIMERS
550 return HRTIMER_RESTART;
551#endif
552}
553
554/*
555 * Cancel timers on all CPUs. Returns 1 if any were active.
556 */
557static int cancel_all_timers(void)
558{
559 struct ce_dom_data *ce_data;
560 struct domain *dom;
561 int cpu, ret = 0;
562#ifndef CONFIG_MERGE_TIMERS
563 int cancel_res;
564#endif
565
566 TRACE("cancel all timers\n");
567
568 for_each_online_cpu(cpu) {
569 dom = get_domain_for(cpu);
570 ce_data = dom->data;
571 ce_data->should_schedule = NULL;
572#ifdef CONFIG_MERGE_TIMERS
573 cancel_event(&ce_data->event);
574#else
575 cancel_res = hrtimer_cancel(&ce_data->timer);
576 atomic_set(&ce_data->timer_info.state,
577 HRTIMER_START_ON_INACTIVE);
578 ret = ret || cancel_res;
579#endif
580 }
581 return ret;
582}
583
584/*
585 * Arm all timers so that they start at the new value of start time.
586 * Any CPU without CE PID entries won't have a timer armed.
587 * All timers should be canceled before calling this.
588 */
589static void arm_all_timers(void)
590{
591 struct domain *dom;
592 struct ce_dom_data *ce_data;
593 struct ce_pid_table *pid_table;
594 int cpu, idx, cpu_for_timer;
595 const lt_t start = atomic64_read(&start_time);
596
597 TRACE("arm all timers\n");
598
599 for_each_online_cpu(cpu) {
600 dom = get_domain_for(cpu);
601 ce_data = dom->data;
602 pid_table = get_pid_table(cpu);
603 if (0 == pid_table->num_pid_entries)
604 continue;
605 for (idx = 0; idx < pid_table->num_pid_entries; idx++) {
606 pid_table->entries[idx].expected_job = 0;
607 }
608#ifdef CONFIG_PLUGIN_MC_RELEASE_MASTER
609 cpu_for_timer = interrupt_cpu;
610#else
611 cpu_for_timer = cpu;
612#endif
613
614#ifdef CONFIG_MERGE_TIMERS
615 add_event(get_event_group_for(cpu_for_timer),
616 &ce_data->event, start);
617#else
618 hrtimer_start_on(cpu_for_timer, &ce_data->timer_info,
619 &ce_data->timer, ns_to_ktime(start),
620 HRTIMER_MODE_ABS_PINNED);
621#endif
622 }
623}
624
625/*
626 * There are no real releases in the CE, but the task release syscall still
627 * calls this. We reset our notion of the CE cycle start so that the
628 * schedule lines up with the release time.
629 */
630void mc_ce_release_at_common(struct task_struct *ts, lt_t start)
631{
632 TRACE_TASK(ts, "release at\n");
633 if (atomic_inc_and_test(&start_time_set)) {
634 /* in this case, we won the race */
635 cancel_all_timers();
636 atomic64_set(&start_time, start);
637 arm_all_timers();
638 } else
639 atomic_dec(&start_time_set);
640}
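
The start_time_set counter is initialized to -1, so atomic_inc_and_test() returns true only for the first caller; every other caller undoes its increment. The same idiom in isolation (a sketch, not part of the patch):

#include <asm/atomic.h>

static atomic_t example_flag = ATOMIC_INIT(-1);

/* Returns 1 only for the first caller; later callers back their
 * increment out so the counter stays at 0 until it is reset to -1. */
static int example_try_first(void)
{
	if (atomic_inc_and_test(&example_flag))
		return 1;
	atomic_dec(&example_flag);
	return 0;
}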
641
642long mc_ce_activate_plugin_common(void)
643{
644 struct ce_dom_data *ce_data;
645 struct domain *dom;
646 long ret;
647 int cpu;
648
649#ifdef CONFIG_PLUGIN_MC_RELEASE_MASTER
650 interrupt_cpu = atomic_read(&release_master_cpu);
651 if (NO_CPU == interrupt_cpu) {
652 printk(KERN_ERR "LITMUS: MC-CE needs a release master\n");
653 ret = -EINVAL;
654 goto out;
655 }
656#endif
657
658 for_each_online_cpu(cpu) {
659 dom = get_domain_for(cpu);
660 ce_data = dom->data;
661 ce_data->scheduled = NULL;
662 ce_data->should_schedule = NULL;
663 }
664
665 atomic_set(&start_time_set, -1);
666 atomic64_set(&start_time, litmus_clock());
667 /* may not want to arm timers on activation, just after release */
668 arm_all_timers();
669 ret = 0;
670out:
671 return ret;
672}
673
674static long mc_ce_activate_plugin(void)
675{
676 struct domain_data *our_domains[NR_CPUS];
677 int cpu, n = 0;
678 long ret;
679
680 for_each_online_cpu(cpu) {
681 BUG_ON(NR_CPUS <= n);
682 our_domains[cpu] = &per_cpu(_mc_ce_doms, cpu);
683 n++;
684 }
685 ret = mc_ce_set_domains(n, our_domains);
686 if (ret)
687 goto out;
688 ret = mc_ce_activate_plugin_common();
689out:
690 return ret;
691}
692
693static void clear_pid_entries(void)
694{
695 struct ce_pid_table *pid_table = NULL;
696 int cpu, entry;
697
698 for_each_online_cpu(cpu) {
699 pid_table = get_pid_table(cpu);
700 pid_table->num_pid_entries = 0;
701 pid_table->cycle_time = 0;
702 for (entry = 0; entry < CONFIG_PLUGIN_MC_LEVEL_A_MAX_TASKS;
703 ++entry) {
704 if (NULL != pid_table->entries[entry].pid) {
705 put_pid(pid_table->entries[entry].pid);
706 pid_table->entries[entry].pid = NULL;
707 }
708 pid_table->entries[entry].budget = 0;
709 pid_table->entries[entry].acc_time = 0;
710 pid_table->entries[entry].expected_job = 0;
711 }
712 }
713}
714
715long mc_ce_deactivate_plugin_common(void)
716{
717 int cpu;
718 cancel_all_timers();
719 for_each_online_cpu(cpu) {
720 per_cpu(domains, cpu) = NULL;
721 }
722 return 0;
723}
724
725/* Plugin object */
726static struct sched_plugin mc_ce_plugin __cacheline_aligned_in_smp = {
727 .plugin_name = "MC-CE",
728 .admit_task = mc_ce_admit_task,
729 .task_new = mc_ce_task_new,
730 .complete_job = complete_job,
731 .release_at = mc_ce_release_at_common,
732 .task_exit = mc_ce_task_exit_common,
733 .schedule = mc_ce_schedule,
734 .finish_switch = mc_ce_finish_switch,
735 .task_wake_up = mc_ce_task_wake_up,
736 .task_block = mc_ce_task_block,
737 .activate_plugin = mc_ce_activate_plugin,
738 .deactivate_plugin = mc_ce_deactivate_plugin_common,
739};
740
741static int setup_proc(void);
742static int __init init_sched_mc_ce(void)
743{
744 raw_spinlock_t *ce_lock;
745 struct domain_data *dom_data;
746 struct domain *dom;
747 int cpu, err;
748
749 for_each_online_cpu(cpu) {
750 per_cpu(domains, cpu) = NULL;
751 ce_lock = &per_cpu(_mc_ce_dom_locks, cpu);
752 raw_spin_lock_init(ce_lock);
753 dom_data = &per_cpu(_mc_ce_doms, cpu);
754 dom = &dom_data->domain;
755 ce_domain_init(dom, ce_lock, NULL, NULL, NULL, NULL, NULL,
756 &per_cpu(_mc_ce_dom_data, cpu), cpu,
757 mc_ce_timer_callback);
758 }
759 clear_pid_entries();
760 err = setup_proc();
761 if (!err)
762 err = register_sched_plugin(&mc_ce_plugin);
763 return err;
764}
765
766#define BUF_SIZE PAGE_SIZE
767static int write_into_proc(char *proc_buf, const int proc_size, char *fmt, ...)
768{
769 static char buf[BUF_SIZE];
770 int n;
771 va_list args;
772
773 /* When writing to procfs, we don't care about the trailing null that
774 * is not included in the count returned by vsnprintf.
775 */
776 va_start(args, fmt);
777 n = vsnprintf(buf, BUF_SIZE, fmt, args);
778 va_end(args);
779 if (BUF_SIZE <= n || proc_size <= n) {
780 /* too big for the formatting buffer or for proc (excluding the null byte) */
781 n = -EINVAL;
782 goto out;
783 }
784 memcpy(proc_buf, buf, n);
785out:
786 return n;
787}
788#undef BUF_SIZE
789
790/*
791 * Writes a PID entry to the procfs.
792 *
793 * @page buffer to write into.
794 * @count bytes available in the buffer
795 */
796#define PID_SPACE 15
797#define TASK_INFO_BUF (PID_SPACE + TASK_COMM_LEN)
798static int write_pid_entry(char *page, const int count, const int cpu,
799 const int task, struct ce_pid_entry *pid_entry)
800{
801 static char task_info[TASK_INFO_BUF];
802 struct task_struct *ts;
803 int n = 0, err, ti_n;
804 char *ti_b;
805
806 if (pid_entry->pid) {
807 rcu_read_lock();
808 ts = pid_task(pid_entry->pid, PIDTYPE_PID);
809 rcu_read_unlock();
810
811 /* get some information about the task */
812 if (ts) {
813 ti_b = task_info;
814 ti_n = snprintf(ti_b, PID_SPACE, "%d", ts->pid);
815 if (PID_SPACE <= ti_n)
816 ti_n = PID_SPACE - 1;
817 ti_b += ti_n;
818 *ti_b = ' '; /* nuke the null byte */
819 ti_b++;
820 get_task_comm(ti_b, ts);
821 } else {
822 strncpy(task_info, "pid_task() failed :(",
823 TASK_INFO_BUF);
824 }
825
826 } else
827 strncpy(task_info, "no", TASK_INFO_BUF);
828 task_info[TASK_INFO_BUF - 1] = '\0'; /* just to be sure */
829
830 err = write_into_proc(page + n, count - n, "# task: %s\n", task_info);
831 if (err < 0) {
832 n = -ENOSPC;
833 goto out;
834 }
835 n += err;
836 err = write_into_proc(page + n, count - n, "%d, %d, %llu\n",
837 cpu, task, pid_entry->budget);
838 if (err < 0) {
839 n = -ENOSPC;
840 goto out;
841 }
842 n += err;
843out:
844 return n;
845}
846#undef PID_SPACE
847#undef TASK_INFO_BUF
848
849/*
850 * Called when the user-land reads from proc.
851 */
852static int proc_read_ce_file(char *page, char **start, off_t off, int count,
853 int *eof, void *data)
854{
855 int n = 0, err, cpu, t;
856 struct ce_pid_table *pid_table;
857
858 if (off > 0) {
859 printk(KERN_INFO "litmus: MC-CE called read with off > 0\n");
860 goto out;
861 }
862
863 for_each_online_cpu(cpu) {
864 pid_table = get_pid_table(cpu);
865 for (t = 0; t < pid_table->num_pid_entries; ++t) {
866 err = write_pid_entry(page + n, count - n,
867 cpu, t, get_pid_entry(cpu, t));
868 if (err < 0) {
869 n = -ENOSPC;
870 goto out;
871 }
872 n += err;
873 }
874 }
875out:
876 *eof = 1;
877 return n;
878}
879
880/*
881 * Skip a commented line.
882 */
883static int skip_comment(const char *buf, const unsigned long max)
884{
885 unsigned long i = 0;
886 const char *c = buf;
887 if (0 == max || !c || *c != '#')
888 return 0;
889 ++c; ++i;
890 for (; i < max; ++i) {
891 if (*c == '\n') {
892 ++c; ++i;
893 break;
894 }
895 ++c;
896 }
897 return i;
898}
899
900/* a budget of 5 milliseconds is probably reasonable */
901#define BUDGET_THRESHOLD 5000000ULL
902static int setup_pid_entry(const int cpu, const int task, const lt_t budget)
903{
904 struct ce_pid_table *pid_table = get_pid_table(cpu);
905 struct ce_pid_entry *new_entry = NULL;
906 int err = 0;
907
908 /* check the inputs */
909 if (cpu < 0 || NR_CPUS <= cpu || task < 0 ||
910 CONFIG_PLUGIN_MC_LEVEL_A_MAX_TASKS <= task ||
911 budget < 1) {
912 printk(KERN_INFO "litmus: bad cpu, task ID, or budget sent to "
913 "MC-CE proc\n");
914 err = -EINVAL;
915 goto out;
916 }
917 /* check for small budgets */
918 printk(KERN_WARNING "litmus: you gave a small budget for an "
919 "MC-CE task; that might be an issue.\n");
920 "MC-CE task; that might be an issue.\n");
921 }
922 /* check that we have space for a new entry */
923 if (CONFIG_PLUGIN_MC_LEVEL_A_MAX_TASKS <= pid_table->num_pid_entries) {
924 printk(KERN_INFO "litmus: too many MC-CE tasks for cpu "
925 "%d\n", cpu);
926 err = -EINVAL;
927 goto out;
928 }
929 /* add the new entry */
930 new_entry = get_pid_entry(cpu, pid_table->num_pid_entries);
931 BUG_ON(NULL != new_entry->pid);
932 new_entry->budget = budget;
933 new_entry->acc_time = pid_table->cycle_time + budget;
934 /* update the domain entry */
935 pid_table->cycle_time += budget;
936 pid_table->num_pid_entries++;
937out:
938 return err;
939}
940#undef BUDGET_THRESHOLD
941
942/*
943 * Called when the user-land writes to proc.
944 *
945 * Error checking is quite minimal. Format is:
946 * <cpu>, <level A task id>, <budget>
947 */
948#define PROCFS_MAX_SIZE PAGE_SIZE
949static int proc_write_ce_file(struct file *file, const char __user *buffer,
950 unsigned long count, void *data)
951{
952 static char kbuf[PROCFS_MAX_SIZE];
953 char *c = kbuf, *c_skipped;
954 int cpu, task, cnt = 0, chars_read, converted, err;
955 lt_t budget;
956
957 if (!using_linux_plugin()) {
958 printk(KERN_INFO "litmus: can only edit MC-CE proc under Linux "
959 "plugin\n");
960 cnt = -EINVAL;
961 goto out;
962 }
963
964 if (count > PROCFS_MAX_SIZE) {
965 printk(KERN_INFO "litmus: MC-CE procfs got too many bytes "
966 "from user-space.\n");
967 cnt = -EINVAL;
968 goto out;
969 }
970
971 if (copy_from_user(kbuf, buffer, count)) {
972 printk(KERN_INFO "litmus: couldn't copy from user %s\n",
973 __FUNCTION__);
974 cnt = -EFAULT;
975 goto out;
976 }
977 clear_pid_entries();
978 while (cnt < count) {
979 c_skipped = skip_spaces(c);
980 if (c_skipped != c) {
981 chars_read = c_skipped - c;
982 cnt += chars_read;
983 c += chars_read;
984 continue;
985 }
986 if (*c == '#') {
987 chars_read = skip_comment(c, count - cnt);
988 cnt += chars_read;
989 c += chars_read;
990 continue;
991 }
992 converted = sscanf(c, "%d, %d, %llu%n", &cpu, &task, &budget,
993 &chars_read);
994 if (3 != converted) {
995 printk(KERN_INFO "litmus: MC-CE procfs expected three "
996 "arguments, but got %d.\n", converted);
997 cnt = -EINVAL;
998 goto out;
999 }
1000 cnt += chars_read;
1001 c += chars_read;
1002 err = setup_pid_entry(cpu, task, budget);
1003 if (err) {
1004 cnt = -EINVAL;
1005 goto out;
1006 }
1007 }
1008out:
1009 return cnt;
1010}
1011#undef PROCFS_MAX_SIZE
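
A hedged user-space sketch of feeding this interface; the proc path is an assumption based on make_plugin_proc_dir() and CE_FILE_PROC_NAME, and the budget values are arbitrary (the write must happen while the Linux plugin is active, as checked above):

#include <stdio.h>

int main(void)
{
	/* assumed path: /proc/litmus/plugins/MC-CE/ce_file */
	FILE *f = fopen("/proc/litmus/plugins/MC-CE/ce_file", "w");

	if (!f) {
		perror("open ce_file");
		return 1;
	}
	/* format: <cpu>, <level A task id>, <budget in ns>; '#' starts a comment */
	fprintf(f, "# CPU 0: two level-A slots\n");
	fprintf(f, "0, 0, 5000000\n");
	fprintf(f, "0, 1, 10000000\n");
	fclose(f);
	return 0;
}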
1012
1013#define CE_FILE_PROC_NAME "ce_file"
1014static void tear_down_proc(void)
1015{
1016 if (ce_file)
1017 remove_proc_entry(CE_FILE_PROC_NAME, mc_ce_dir);
1018 if (mc_ce_dir)
1019 remove_plugin_proc_dir(&mc_ce_plugin);
1020}
1021
1022static int setup_proc(void)
1023{
1024 int err;
1025 err = make_plugin_proc_dir(&mc_ce_plugin, &mc_ce_dir);
1026 if (err) {
1027 printk(KERN_ERR "could not create MC-CE procfs dir.\n");
1028 goto out;
1029 }
1030 ce_file = create_proc_entry(CE_FILE_PROC_NAME, 0644, mc_ce_dir);
1031 if (!ce_file) {
1032 printk(KERN_ERR "could not create MC-CE procfs file.\n");
1033 err = -EIO;
1034 goto out_remove_proc;
1035 }
1036 ce_file->read_proc = proc_read_ce_file;
1037 ce_file->write_proc = proc_write_ce_file;
1038 goto out;
1039out_remove_proc:
1040 tear_down_proc();
1041out:
1042 return err;
1043}
1044#undef CE_FILE_PROC_NAME
1045
1046static void clean_sched_mc_ce(void)
1047{
1048 tear_down_proc();
1049}
1050
1051module_init(init_sched_mc_ce);
1052module_exit(clean_sched_mc_ce);
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
index 00a1900d6457..123c7516fb76 100644
--- a/litmus/sched_plugin.c
+++ b/litmus/sched_plugin.c
@@ -95,6 +95,10 @@ static void litmus_dummy_task_exit(struct task_struct *task)
95{ 95{
96} 96}
97 97
98static void litmus_dummy_release_ts(lt_t time)
99{
100}
101
98static long litmus_dummy_complete_job(void) 102static long litmus_dummy_complete_job(void)
99{ 103{
100 return -ENOSYS; 104 return -ENOSYS;
@@ -136,6 +140,7 @@ struct sched_plugin linux_sched_plugin = {
136 .finish_switch = litmus_dummy_finish_switch, 140 .finish_switch = litmus_dummy_finish_switch,
137 .activate_plugin = litmus_dummy_activate_plugin, 141 .activate_plugin = litmus_dummy_activate_plugin,
138 .deactivate_plugin = litmus_dummy_deactivate_plugin, 142 .deactivate_plugin = litmus_dummy_deactivate_plugin,
143 .release_ts = litmus_dummy_release_ts,
139#ifdef CONFIG_LITMUS_LOCKING 144#ifdef CONFIG_LITMUS_LOCKING
140 .allocate_lock = litmus_dummy_allocate_lock, 145 .allocate_lock = litmus_dummy_allocate_lock,
141#endif 146#endif
@@ -174,6 +179,7 @@ int register_sched_plugin(struct sched_plugin* plugin)
174 CHECK(complete_job); 179 CHECK(complete_job);
175 CHECK(activate_plugin); 180 CHECK(activate_plugin);
176 CHECK(deactivate_plugin); 181 CHECK(deactivate_plugin);
182 CHECK(release_ts);
177#ifdef CONFIG_LITMUS_LOCKING 183#ifdef CONFIG_LITMUS_LOCKING
178 CHECK(allocate_lock); 184 CHECK(allocate_lock);
179#endif 185#endif
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
index 8e4a22dd8d6a..eaaec38f43da 100644
--- a/litmus/sched_psn_edf.c
+++ b/litmus/sched_psn_edf.c
@@ -284,6 +284,9 @@ static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
284 TRACE_TASK(t, "psn edf: task new, cpu = %d\n", 284 TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
285 t->rt_param.task_params.cpu); 285 t->rt_param.task_params.cpu);
286 286
287 trace_litmus_server_param(0 - t->pid, -1 - get_partition(t),
288 get_exec_time(t), get_rt_period(t));
289
287 /* setup job parameters */ 290 /* setup job parameters */
288 release_at(t, litmus_clock()); 291 release_at(t, litmus_clock());
289 292
diff --git a/litmus/sync.c b/litmus/sync.c
index bf75fde5450b..f3c9262f7022 100644
--- a/litmus/sync.c
+++ b/litmus/sync.c
@@ -73,6 +73,9 @@ static long do_release_ts(lt_t start)
73 73
74 complete_n(&ts_release, task_count); 74 complete_n(&ts_release, task_count);
75 75
76 /* TODO: remove this hack */
77 litmus->release_ts(start);
78
76 return task_count; 79 return task_count;
77} 80}
78 81