-rw-r--r--  include/litmus/affinity.h | 52
-rw-r--r--  include/litmus/bheap.h | 77
-rw-r--r--  include/litmus/binheap.h | 205
-rw-r--r--  include/litmus/budget.h | 38
-rw-r--r--  include/litmus/ceiling.h | 36
-rw-r--r--  include/litmus/clustered.h | 46
-rw-r--r--  include/litmus/ctrlpage.h | 105
-rw-r--r--  include/litmus/debug_trace.h | 57
-rw-r--r--  include/litmus/edf_common.h | 25
-rw-r--r--  include/litmus/fdso.h | 78
-rw-r--r--  include/litmus/feather_buffer.h | 118
-rw-r--r--  include/litmus/feather_trace.h | 50
-rw-r--r--  include/litmus/fp_common.h | 183
-rw-r--r--  include/litmus/fpmath.h | 147
-rw-r--r--  include/litmus/ftdev.h | 58
-rw-r--r--  include/litmus/jobs.h | 13
-rw-r--r--  include/litmus/litmus.h | 224
-rw-r--r--  include/litmus/litmus_proc.h | 63
-rw-r--r--  include/litmus/locking.h | 28
-rw-r--r--  include/litmus/np.h | 121
-rw-r--r--  include/litmus/preempt.h | 191
-rw-r--r--  include/litmus/reservations/alloc.h | 15
-rw-r--r--  include/litmus/reservations/budget-notifier.h | 50
-rw-r--r--  include/litmus/reservations/polling.h | 19
-rw-r--r--  include/litmus/reservations/reservation.h | 224
-rw-r--r--  include/litmus/reservations/table-driven.h | 23
-rw-r--r--  include/litmus/rt_domain.h | 182
-rw-r--r--  include/litmus/rt_param.h | 290
-rw-r--r--  include/litmus/sched_plugin.h | 180
-rw-r--r--  include/litmus/sched_trace.h | 267
-rw-r--r--  include/litmus/srp.h | 28
-rw-r--r--  include/litmus/trace.h | 161
-rw-r--r--  include/litmus/trace_irq.h | 14
-rw-r--r--  include/litmus/wait.h | 57
-rw-r--r--  litmus/Kconfig | 384
-rw-r--r--  litmus/Makefile | 36
-rw-r--r--  litmus/bheap.c | 316
-rw-r--r--  litmus/binheap.c | 387
-rw-r--r--  litmus/budget.c | 168
-rw-r--r--  litmus/clustered.c | 119
-rw-r--r--  litmus/ctrldev.c | 264
-rw-r--r--  litmus/edf_common.c | 201
-rw-r--r--  litmus/fdso.c | 308
-rw-r--r--  litmus/fp_common.c | 137
-rw-r--r--  litmus/ft_event.c | 43
-rw-r--r--  litmus/ftdev.c | 457
-rw-r--r--  litmus/jobs.c | 164
-rw-r--r--  litmus/litmus.c | 773
-rw-r--r--  litmus/litmus_proc.c | 574
-rw-r--r--  litmus/locking.c | 189
-rw-r--r--  litmus/preempt.c | 144
-rw-r--r--  litmus/reservations/Makefile | 3
-rw-r--r--  litmus/reservations/alloc.c | 143
-rw-r--r--  litmus/reservations/budget-notifier.c | 26
-rw-r--r--  litmus/reservations/core.c | 393
-rw-r--r--  litmus/reservations/polling.c | 256
-rw-r--r--  litmus/reservations/table-driven.c | 269
-rw-r--r--  litmus/rt_domain.c | 351
-rw-r--r--  litmus/sched_cedf.c | 890
-rw-r--r--  litmus/sched_gsn_edf.c | 1070
-rw-r--r--  litmus/sched_pfair.c | 1231
-rw-r--r--  litmus/sched_pfp.c | 2048
-rw-r--r--  litmus/sched_plugin.c | 290
-rw-r--r--  litmus/sched_pres.c | 612
-rw-r--r--  litmus/sched_psn_edf.c | 688
-rw-r--r--  litmus/sched_task_trace.c | 258
-rw-r--r--  litmus/sched_trace.c | 251
-rw-r--r--  litmus/srp.c | 310
-rw-r--r--  litmus/sync.c | 153
-rw-r--r--  litmus/trace.c | 575
-rw-r--r--  litmus/uncachedev.c | 102
71 files changed, 18008 insertions, 0 deletions
diff --git a/include/litmus/affinity.h b/include/litmus/affinity.h
new file mode 100644
index 000000000000..4d7c618c8175
--- /dev/null
+++ b/include/litmus/affinity.h
@@ -0,0 +1,52 @@
1#ifndef __LITMUS_AFFINITY_H
2#define __LITMUS_AFFINITY_H
3
4#include <linux/cpumask.h>
5
6/* Works like:
7void get_nearest_available_cpu(
8 cpu_entry_t **nearest,
9 cpu_entry_t *start,
10 cpu_entry_t *entries,
11 int release_master,
12 cpumask_var_t cpus_to_test)
13
14Set release_master = NO_CPU for no Release Master.
15
16We use a macro here to exploit the fact that C-EDF and G-EDF
17have similar structures for their cpu_entry_t structs, even though
18they do not share a common base-struct. The macro allows us to
19avoid code duplication.
20
21 */
22#define get_nearest_available_cpu(nearest, start, entries, release_master, cpus_to_test) \
23{ \
24 (nearest) = NULL; \
25 if (!(start)->linked && likely((start)->cpu != (release_master))) { \
26 (nearest) = (start); \
27 } else { \
28 int __cpu; \
29 \
30 /* FIXME: get rid of the iteration with a bitmask + AND */ \
31 for_each_cpu(__cpu, cpus_to_test) { \
32 if (likely(__cpu != release_master)) { \
33 cpu_entry_t *__entry = &per_cpu((entries), __cpu); \
34 if (cpus_share_cache((start)->cpu, __entry->cpu) \
35 && !__entry->linked) { \
36 (nearest) = __entry; \
37 break; \
38 } \
39 } \
40 } \
41 } \
42 \
43 if ((nearest)) { \
44 TRACE("P%d is closest available CPU to P%d\n", \
45 (nearest)->cpu, (start)->cpu); \
46 } else { \
47 TRACE("Could not find an available CPU close to P%d\n", \
48 (start)->cpu); \
49 } \
50}
51
52#endif
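
For illustration (not part of the patch): a minimal sketch of how a plugin might invoke this macro. The cpu_entry_t stand-in, the per-CPU variable example_cpu_entries, and the find_nearby_idle_cpu() helper are assumptions for the example; a real plugin would use its own cpu_entry_t and per-CPU entries.

#include <linux/percpu.h>
#include <linux/sched.h>
#include <litmus/litmus.h>       /* NO_CPU */
#include <litmus/debug_trace.h>  /* TRACE(), used by the macro */
#include <litmus/affinity.h>

/* minimal stand-in with the two fields the macro accesses */
typedef struct {
	int cpu;
	struct task_struct *linked;
} cpu_entry_t;

static DEFINE_PER_CPU(cpu_entry_t, example_cpu_entries);

static cpu_entry_t *find_nearby_idle_cpu(cpu_entry_t *last)
{
	cpu_entry_t *affinity = NULL;

	get_nearest_available_cpu(affinity, last, example_cpu_entries,
				  NO_CPU, cpu_online_mask);
	/* affinity is 'last' itself if it is idle, otherwise an idle entry
	 * that shares a cache with last->cpu, or NULL if none was found. */
	return affinity;
}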
diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h
new file mode 100644
index 000000000000..cf4864a498d8
--- /dev/null
+++ b/include/litmus/bheap.h
@@ -0,0 +1,77 @@
1/* bheap.h -- Binomial Heaps
2 *
3 * (c) 2008, 2009 Bjoern Brandenburg
4 */
5
6#ifndef BHEAP_H
7#define BHEAP_H
8
9#define NOT_IN_HEAP UINT_MAX
10
11struct bheap_node {
12 struct bheap_node* parent;
13 struct bheap_node* next;
14 struct bheap_node* child;
15
16 unsigned int degree;
17 void* value;
18 struct bheap_node** ref;
19};
20
21struct bheap {
22 struct bheap_node* head;
23 /* We cache the minimum of the heap.
24 * This speeds up repeated peek operations.
25 */
26 struct bheap_node* min;
27};
28
29typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b);
30
31void bheap_init(struct bheap* heap);
32void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value);
33
34static inline int bheap_node_in_heap(struct bheap_node* h)
35{
36 return h->degree != NOT_IN_HEAP;
37}
38
39static inline int bheap_empty(struct bheap* heap)
40{
41 return heap->head == NULL && heap->min == NULL;
42}
43
44/* insert (and reinitialize) a node into the heap */
45void bheap_insert(bheap_prio_t higher_prio,
46 struct bheap* heap,
47 struct bheap_node* node);
48
49/* merge addition into target */
50void bheap_union(bheap_prio_t higher_prio,
51 struct bheap* target,
52 struct bheap* addition);
53
54struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
55 struct bheap* heap);
56
57struct bheap_node* bheap_take(bheap_prio_t higher_prio,
58 struct bheap* heap);
59
60void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap);
61int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node);
62
63void bheap_delete(bheap_prio_t higher_prio,
64 struct bheap* heap,
65 struct bheap_node* node);
66
67/* allocate from memcache */
68struct bheap_node* bheap_node_alloc(int gfp_flags);
69void bheap_node_free(struct bheap_node* hn);
70
71/* allocate a heap node for value and insert into the heap */
72int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
73 void* value, int gfp_flags);
74
75void* bheap_take_del(bheap_prio_t higher_prio,
76 struct bheap* heap);
77#endif
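
For illustration (not part of the patch): a minimal sketch of a min-heap ordered by deadline. The struct item, its fields, and the helper functions are assumptions for the example; in LITMUS^RT the heap values are typically task_structs.

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <litmus/bheap.h>

struct item {
	unsigned long long deadline;
	struct bheap_node *hn;
};

/* "higher priority" here means "earlier deadline" (min-heap) */
static int item_earlier(struct bheap_node *a, struct bheap_node *b)
{
	struct item *x = a->value, *y = b->value;
	return x->deadline < y->deadline;
}

/* assumes bheap_init(heap) was called once beforehand */
static int add_item(struct bheap *heap, struct item *it)
{
	it->hn = bheap_node_alloc(GFP_ATOMIC);
	if (!it->hn)
		return -ENOMEM;
	bheap_node_init(&it->hn, it);
	bheap_insert(item_earlier, heap, it->hn);
	return 0;
}

static struct item *pop_earliest(struct bheap *heap)
{
	struct bheap_node *hn = bheap_take(item_earlier, heap);
	return hn ? hn->value : NULL;
}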
diff --git a/include/litmus/binheap.h b/include/litmus/binheap.h
new file mode 100644
index 000000000000..1cf364701da8
--- /dev/null
+++ b/include/litmus/binheap.h
@@ -0,0 +1,205 @@
1#ifndef LITMUS_BINARY_HEAP_H
2#define LITMUS_BINARY_HEAP_H
3
4#include <linux/kernel.h>
5
6/**
7 * Simple binary heap with add, arbitrary delete, delete_root, and top
8 * operations.
9 *
10 * Style meant to conform with list.h.
11 *
12 * Motivation: Linux's prio_heap.h is of fixed size. Litmus's binomial
13 * heap may be overkill (and perhaps not general enough) for some applications.
14 *
15 * Note: In order to make node swaps fast, a node inserted with a data pointer
16 * may not always hold said data pointer. This is similar to the binomial heap
17 * implementation. This does make node deletion tricky since we have to
18 * (1) locate the node that holds the data pointer to delete, and (2) the
19 * node that was originally inserted with said data pointer. These have to be
20 * coalesced into a single node before removal (see usage of
21 * __binheap_safe_swap()). We have to track node references to accomplish this.
22 */
23
24struct binheap_node {
25 void *data;
26 struct binheap_node *parent;
27 struct binheap_node *left;
28 struct binheap_node *right;
29
30 /* pointer to binheap_node that holds *data for which this binheap_node
31 * was originally inserted. (*data "owns" this node)
32 */
33 struct binheap_node *ref;
34 struct binheap_node **ref_ptr;
35};
36
37/**
38 * Signature of comparator function. Assumed to be 'less-than' (min-heap).
39 * Pass in 'greater-than' for max-heap.
40 *
41 * TODO: Consider macro-based implementation that allows comparator to be
42 * inlined (similar to Linux red/black tree) for greater efficiency.
43 */
44typedef int (*binheap_order_t)(struct binheap_node *a,
45 struct binheap_node *b);
46
47
48struct binheap {
49 struct binheap_node *root;
50
51 /* pointer to node to take next inserted child */
52 struct binheap_node *next;
53
54 /* pointer to last node in complete binary tree */
55 struct binheap_node *last;
56
57 /* comparator function pointer */
58 binheap_order_t compare;
59};
60
61
62/* Initialized heap nodes not in a heap have parent
63 * set to BINHEAP_POISON.
64 */
65#define BINHEAP_POISON ((void*)(0xdeadbeef))
66
67
68/**
69 * binheap_entry - get the struct for this heap node.
70 * Only valid when called upon heap nodes other than the root handle.
71 * @ptr: the heap node.
72 * @type: the type of struct pointed to by binheap_node::data.
73 * @member: unused.
74 */
75#define binheap_entry(ptr, type, member) \
76((type *)((ptr)->data))
77
78/**
79 * binheap_node_container - get the struct that contains this node.
80 * Only valid when called upon heap nodes other than the root handle.
81 * @ptr: the heap node.
82 * @type: the type of struct the node is embedded in.
83 * @member: the name of the binheap_struct within the (type) struct.
84 */
85#define binheap_node_container(ptr, type, member) \
86container_of((ptr), type, member)
87
88/**
89 * binheap_top_entry - get the struct for the node at the top of the heap.
90 * Only valid when called upon the heap handle node.
91 * @ptr: the special heap-handle node.
92 * @type: the type of the struct the head is embedded in.
93 * @member: the name of the binheap_struct within the (type) struct.
94 */
95#define binheap_top_entry(ptr, type, member) \
96binheap_entry((ptr)->root, type, member)
97
98/**
99 * binheap_delete_root - remove the root element from the heap.
100 * @handle: handle to the heap.
101 * @type: the type of the struct the head is embedded in.
102 * @member: the name of the binheap_struct within the (type) struct.
103 */
104#define binheap_delete_root(handle, type, member) \
105__binheap_delete_root((handle), &((type *)((handle)->root->data))->member)
106
107/**
108 * binheap_delete - remove an arbitrary element from the heap.
109 * @to_delete: pointer to node to be removed.
110 * @handle: handle to the heap.
111 */
112#define binheap_delete(to_delete, handle) \
113__binheap_delete((to_delete), (handle))
114
115/**
116 * binheap_add - insert an element to the heap
117 * @new_node: node to add.
118 * @handle: handle to the heap.
119 * @type: the type of the struct the head is embedded in.
120 * @member: the name of the binheap_struct within the (type) struct.
121 */
122#define binheap_add(new_node, handle, type, member) \
123__binheap_add((new_node), (handle), container_of((new_node), type, member))
124
125/**
126 * binheap_decrease - re-eval the position of a node (based upon its
127 * original data pointer).
128 * @handle: handle to the heap.
129 * @orig_node: node that was associated with the data pointer
130 * (whose value has changed) when said pointer was
131 * added to the heap.
132 */
133#define binheap_decrease(orig_node, handle) \
134__binheap_decrease((orig_node), (handle))
135
136#define BINHEAP_NODE_INIT() { NULL, BINHEAP_POISON, NULL, NULL, NULL, NULL }
137
138#define BINHEAP_NODE(name) \
139 struct binheap_node name = BINHEAP_NODE_INIT()
140
141
142static inline void INIT_BINHEAP_NODE(struct binheap_node *n)
143{
144 n->data = NULL;
145 n->parent = BINHEAP_POISON;
146 n->left = NULL;
147 n->right = NULL;
148 n->ref = NULL;
149 n->ref_ptr = NULL;
150}
151
152static inline void INIT_BINHEAP_HANDLE(struct binheap *handle,
153 binheap_order_t compare)
154{
155 handle->root = NULL;
156 handle->next = NULL;
157 handle->last = NULL;
158 handle->compare = compare;
159}
160
161/* Returns true if binheap is empty. */
162static inline int binheap_empty(struct binheap *handle)
163{
164 return(handle->root == NULL);
165}
166
167/* Returns true if binheap node is in a heap. */
168static inline int binheap_is_in_heap(struct binheap_node *node)
169{
170 return (node->parent != BINHEAP_POISON);
171}
172
173/* Returns true if binheap node is in given heap. */
174int binheap_is_in_this_heap(struct binheap_node *node, struct binheap* heap);
175
176/* Add a node to a heap */
177void __binheap_add(struct binheap_node *new_node,
178 struct binheap *handle,
179 void *data);
180
181/**
182 * Removes the root node from the heap. The node is removed after coalescing
183 * the binheap_node with its original data pointer at the root of the tree.
184 *
185 * The 'last' node in the tree is then swapped up to the root and bubbled
186 * down.
187 */
188void __binheap_delete_root(struct binheap *handle,
189 struct binheap_node *container);
190
191/**
192 * Delete an arbitrary node. Bubble node to delete up to the root,
193 * and then delete it at the root.
194 */
195void __binheap_delete(struct binheap_node *node_to_delete,
196 struct binheap *handle);
197
198/**
199 * Bubble up a node whose pointer has decreased in value.
200 */
201void __binheap_decrease(struct binheap_node *orig_node,
202 struct binheap *handle);
203
204
205#endif
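
For illustration (not part of the patch): a minimal sketch of embedding a binheap_node in a hypothetical struct job and using the handle as a min-heap ordered by deadline. All names in the sketch are assumptions.

#include <linux/kernel.h>
#include <litmus/binheap.h>

struct job {
	unsigned long long deadline;
	struct binheap_node heap_node;
};

static int job_earlier(struct binheap_node *a, struct binheap_node *b)
{
	struct job *x = binheap_entry(a, struct job, heap_node);
	struct job *y = binheap_entry(b, struct job, heap_node);
	return x->deadline < y->deadline;
}

/* assumes INIT_BINHEAP_HANDLE(heap, job_earlier) was called once beforehand */
static void add_job(struct binheap *heap, struct job *j)
{
	INIT_BINHEAP_NODE(&j->heap_node);
	binheap_add(&j->heap_node, heap, struct job, heap_node);
}

static unsigned long long pop_earliest_deadline(struct binheap *heap)
{
	struct job *top;

	if (binheap_empty(heap))
		return 0;
	top = binheap_top_entry(heap, struct job, heap_node);
	binheap_delete_root(heap, struct job, heap_node);
	return top->deadline;
}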
diff --git a/include/litmus/budget.h b/include/litmus/budget.h
new file mode 100644
index 000000000000..60eb814fc82b
--- /dev/null
+++ b/include/litmus/budget.h
@@ -0,0 +1,38 @@
1#ifndef _LITMUS_BUDGET_H_
2#define _LITMUS_BUDGET_H_
3
4/* Update the per-processor enforcement timer (arm/reprogram/cancel) for
5 * the next task. */
6void update_enforcement_timer(struct task_struct* t);
7
8inline static int budget_exhausted(struct task_struct* t)
9{
10 return get_exec_time(t) >= get_exec_cost(t);
11}
12
13inline static lt_t budget_remaining(struct task_struct* t)
14{
15 if (!budget_exhausted(t))
16 return get_exec_cost(t) - get_exec_time(t);
17 else
18 /* avoid overflow */
19 return 0;
20}
21
22#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
23
24#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
25 == PRECISE_ENFORCEMENT)
26
27static inline int requeue_preempted_job(struct task_struct* t)
28{
29 /* Add task to ready queue only if not subject to budget enforcement or
30 * if the job has budget remaining. t may be NULL.
31 */
32 return t && !is_completed(t) &&
33 (!budget_exhausted(t) || !budget_enforced(t));
34}
35
36void litmus_current_budget(lt_t *used_so_far, lt_t *remaining);
37
38#endif
diff --git a/include/litmus/ceiling.h b/include/litmus/ceiling.h
new file mode 100644
index 000000000000..f3d3889315f7
--- /dev/null
+++ b/include/litmus/ceiling.h
@@ -0,0 +1,36 @@
1#ifndef _LITMUS_CEILING_H_
2#define _LITMUS_CEILING_H_
3
4#ifdef CONFIG_LITMUS_LOCKING
5
6void __srp_ceiling_block(struct task_struct *cur);
7
8DECLARE_PER_CPU(int, srp_objects_in_use);
9
10/* assumes preemptions off */
11static inline void srp_ceiling_block(void)
12{
13 struct task_struct *tsk = current;
14
15 /* Only applies to real-time tasks. */
16 if (!is_realtime(tsk))
17 return;
18
19 /* Bail out early if there aren't any SRP resources around. */
20 if (likely(!raw_cpu_read(srp_objects_in_use)))
21 return;
22
23 /* Avoid recursive ceiling blocking. */
24 if (unlikely(tsk->rt_param.srp_non_recurse))
25 return;
26
27 /* must take slow path */
28 __srp_ceiling_block(tsk);
29}
30
31#else
32#define srp_ceiling_block() /* nothing */
33#endif
34
35
36#endif \ No newline at end of file
diff --git a/include/litmus/clustered.h b/include/litmus/clustered.h
new file mode 100644
index 000000000000..fc7f0f87966e
--- /dev/null
+++ b/include/litmus/clustered.h
@@ -0,0 +1,46 @@
1#ifndef CLUSTERED_H
2#define CLUSTERED_H
3
4/* Which cache level should be used to group CPUs into clusters?
5 * GLOBAL_CLUSTER means that all CPUs form a single cluster (just like under
6 * global scheduling).
7 */
8enum cache_level {
9 GLOBAL_CLUSTER = 0,
10 L1_CLUSTER = 1,
11 L2_CLUSTER = 2,
12 L3_CLUSTER = 3
13};
14
15int parse_cache_level(const char *str, enum cache_level *level);
16const char* cache_level_name(enum cache_level level);
17
18/* expose a cache level in a /proc dir */
19struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
20 enum cache_level* level);
21
22
23
24struct scheduling_cluster {
25 unsigned int id;
26 /* list of CPUs that are part of this cluster */
27 struct list_head cpus;
28};
29
30struct cluster_cpu {
31 unsigned int id; /* which CPU is this? */
32 struct list_head cluster_list; /* List of the CPUs in this cluster. */
33 struct scheduling_cluster* cluster; /* The cluster that this CPU belongs to. */
34};
35
36int get_cluster_size(enum cache_level level);
37
38int assign_cpus_to_clusters(enum cache_level level,
39 struct scheduling_cluster* clusters[],
40 unsigned int num_clusters,
41 struct cluster_cpu* cpus[],
42 unsigned int num_cpus);
43
44int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, unsigned int index);
45
46#endif
diff --git a/include/litmus/ctrlpage.h b/include/litmus/ctrlpage.h
new file mode 100644
index 000000000000..f7b03e1aedd6
--- /dev/null
+++ b/include/litmus/ctrlpage.h
@@ -0,0 +1,105 @@
1#ifndef _LITMUS_CTRLPAGE_H_
2#define _LITMUS_CTRLPAGE_H_
3
4#include <litmus/rt_param.h>
5
6union np_flag {
7 uint32_t raw;
8 struct {
9 /* Is the task currently in a non-preemptive section? */
10 uint32_t flag:31;
11 /* Should the task call into the scheduler? */
12 uint32_t preempt:1;
13 } np;
14};
15
16/* The definition of the data that is shared between the kernel and real-time
17 * tasks via a shared page (see litmus/ctrldev.c).
18 *
19 * WARNING: User space can write to this, so don't trust
20 * the correctness of the fields!
21 *
22 * This serves two purposes: to enable efficient signaling
23 * of non-preemptive sections (user->kernel) and
24 * delayed preemptions (kernel->user), and to export
25 * some real-time relevant statistics such as preemption and
26 * migration data to user space. We can't use a device to export
27 * statistics because we want to avoid system call overhead when
28 * determining preemption/migration overheads.
29 */
30struct control_page {
31 /* This flag is used by userspace to communicate non-preemptive
32 * sections. */
33 volatile __attribute__ ((aligned (8))) union np_flag sched;
34
35 /* Incremented by the kernel each time an IRQ is handled. */
36 volatile __attribute__ ((aligned (8))) uint64_t irq_count;
37
38 /* Locking overhead tracing: userspace records here the time stamp
39 * and IRQ counter prior to starting the system call. */
40 uint64_t ts_syscall_start; /* Feather-Trace cycles */
41 uint64_t irq_syscall_start; /* Snapshot of irq_count when the syscall
42 * started. */
43
44 lt_t deadline; /* Deadline for the currently executing job */
45 lt_t release; /* Release time of current job */
46 uint64_t job_index; /* Job sequence number of current job */
47
48 /* to be extended */
49};
50
51/* Expected offsets within the control page. */
52
53#define LITMUS_CP_OFFSET_SCHED 0
54#define LITMUS_CP_OFFSET_IRQ_COUNT 8
55#define LITMUS_CP_OFFSET_TS_SC_START 16
56#define LITMUS_CP_OFFSET_IRQ_SC_START 24
57#define LITMUS_CP_OFFSET_DEADLINE 32
58#define LITMUS_CP_OFFSET_RELEASE 40
59#define LITMUS_CP_OFFSET_JOB_INDEX 48
60
61/* System call emulation via ioctl() */
62
63typedef enum {
64 LRT_null_call = 2006,
65 LRT_set_rt_task_param,
66 LRT_get_rt_task_param,
67 LRT_reservation_create,
68 LRT_complete_job,
69 LRT_od_open,
70 LRT_od_close,
71 LRT_litmus_lock,
72 LRT_litmus_unlock,
73 LRT_wait_for_job_release,
74 LRT_wait_for_ts_release,
75 LRT_release_ts,
76 LRT_get_current_budget,
77} litmus_syscall_id_t;
78
79union litmus_syscall_args {
80 struct {
81 pid_t pid;
82 struct rt_task __user *param;
83 } get_set_task_param;
84
85 struct {
86 uint32_t type;
87 void __user *config;
88 } reservation_create;
89
90 struct {
91 uint32_t fd;
92 uint32_t obj_type;
93 uint32_t obj_id;
94 void __user *config;
95 } od_open;
96
97 struct {
98 lt_t __user *expended;
99 lt_t __user *remaining;
100 } get_current_budget;
101};
102
103
104#endif
105
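
For illustration (not part of the patch): the LITMUS_CP_OFFSET_* constants describe the layout that user space relies on, so they can be checked against the actual struct at compile time. The helper name below is hypothetical.

#include <linux/bug.h>
#include <linux/stddef.h>
#include <litmus/ctrlpage.h>

static inline void check_control_page_layout(void)
{
	BUILD_BUG_ON(offsetof(struct control_page, sched)
		     != LITMUS_CP_OFFSET_SCHED);
	BUILD_BUG_ON(offsetof(struct control_page, irq_count)
		     != LITMUS_CP_OFFSET_IRQ_COUNT);
	BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start)
		     != LITMUS_CP_OFFSET_TS_SC_START);
	BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start)
		     != LITMUS_CP_OFFSET_IRQ_SC_START);
	BUILD_BUG_ON(offsetof(struct control_page, deadline)
		     != LITMUS_CP_OFFSET_DEADLINE);
	BUILD_BUG_ON(offsetof(struct control_page, release)
		     != LITMUS_CP_OFFSET_RELEASE);
	BUILD_BUG_ON(offsetof(struct control_page, job_index)
		     != LITMUS_CP_OFFSET_JOB_INDEX);
}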
diff --git a/include/litmus/debug_trace.h b/include/litmus/debug_trace.h
new file mode 100644
index 000000000000..f87f25a5f40e
--- /dev/null
+++ b/include/litmus/debug_trace.h
@@ -0,0 +1,57 @@
1#ifndef LITMUS_DEBUG_TRACE_H
2#define LITMUS_DEBUG_TRACE_H
3
4#ifdef CONFIG_SCHED_DEBUG_TRACE
5void sched_trace_log_message(const char* fmt, ...);
6void dump_trace_buffer(int max);
7#else
8
9#define sched_trace_log_message(fmt, ...)
10
11#endif
12
13extern atomic_t __log_seq_no;
14
15#ifdef CONFIG_SCHED_DEBUG_TRACE_CALLER
16#define LITMUS_TRACE_PREFIX "%d P%d [%s@%s:%d]: "
17#define LITMUS_TRACE_ARGS atomic_add_return(1, &__log_seq_no), \
18 raw_smp_processor_id(), \
19 __FUNCTION__, __FILE__, __LINE__
20#else
21#define LITMUS_TRACE_PREFIX "%d P%d: "
22#define LITMUS_TRACE_ARGS atomic_add_return(1, &__log_seq_no), \
23 raw_smp_processor_id()
24#endif
25
26#define LITMUS_TRACE(fmt, args...) \
27 sched_trace_log_message(LITMUS_TRACE_PREFIX fmt, \
28 LITMUS_TRACE_ARGS, ## args)
29
30#define LITMUS_TRACE_TASK(t, fmt, args...) \
31 LITMUS_TRACE("(%s/%d:%d) " fmt, \
32 t ? (t)->comm : "null", \
33 t ? (t)->pid : 0, \
34 t ? (t)->rt_param.job_params.job_no : 0, \
35 ##args)
36
37#define LITMUS_TRACE_CUR(fmt, args...) \
38 LITMUS_TRACE_TASK(current, fmt, ## args)
39
40#define LITMUS_TRACE_WARN_ON(cond) \
41 if (unlikely(cond)) \
42 LITMUS_TRACE("WARNING: '%s' [%s@%s:%d]\n", \
43 #cond, __FUNCTION__, __FILE__, __LINE__)
44
45#endif
46
47#ifndef LITMUS_DEBUG_TRACE_DONT_POLLUTE_NAMESPACE
48#ifndef LITMUS_DEBUG_TRACE_H_UNQUALIFIED_NAMES
49
50#define LITMUS_DEBUG_TRACE_H_UNQUALIFIED_NAMES
51#define TRACE(fmt, args...) LITMUS_TRACE(fmt, ## args)
52#define TRACE_TASK(t, fmt, args...) LITMUS_TRACE_TASK(t, fmt, ## args)
53#define TRACE_CUR(fmt, args...) LITMUS_TRACE_CUR(fmt, ## args)
54#define TRACE_WARN_ON(cond) LITMUS_TRACE_WARN_ON(cond)
55
56#endif
57#endif
diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
new file mode 100644
index 000000000000..bbaf22ea7f12
--- /dev/null
+++ b/include/litmus/edf_common.h
@@ -0,0 +1,25 @@
1/*
2 * EDF common data structures and utility functions shared by all EDF
3 * based scheduler plugins
4 */
5
6/* CLEANUP: Add comments and make it less messy.
7 *
8 */
9
10#ifndef __UNC_EDF_COMMON_H__
11#define __UNC_EDF_COMMON_H__
12
13#include <litmus/rt_domain.h>
14
15void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
16 release_jobs_t release);
17
18int edf_higher_prio(struct task_struct* first,
19 struct task_struct* second);
20
21int edf_ready_order(struct bheap_node* a, struct bheap_node* b);
22
23int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
24
25#endif
diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
new file mode 100644
index 000000000000..fd9b30dbfb34
--- /dev/null
+++ b/include/litmus/fdso.h
@@ -0,0 +1,78 @@
1/* fdso.h - file descriptor attached shared objects
2 *
3 * (c) 2007 B. Brandenburg, LITMUS^RT project
4 */
5
6#ifndef _LINUX_FDSO_H_
7#define _LINUX_FDSO_H_
8
9#include <linux/list.h>
10#include <asm/atomic.h>
11
12#include <linux/fs.h>
13#include <linux/slab.h>
14
15#define MAX_OBJECT_DESCRIPTORS 85
16
17typedef enum {
18 MIN_OBJ_TYPE = 0,
19
20 FMLP_SEM = 0,
21 SRP_SEM = 1,
22
23 MPCP_SEM = 2,
24 MPCP_VS_SEM = 3,
25 DPCP_SEM = 4,
26 PCP_SEM = 5,
27
28 DFLP_SEM = 6,
29
30 MAX_OBJ_TYPE = 6
31} obj_type_t;
32
33struct inode_obj_id {
34 struct list_head list;
35 atomic_t count;
36 struct inode* inode;
37
38 obj_type_t type;
39 void* obj;
40 unsigned int id;
41};
42
43struct fdso_ops;
44
45struct od_table_entry {
46 unsigned int used;
47
48 struct inode_obj_id* obj;
49 const struct fdso_ops* class;
50};
51
52struct fdso_ops {
53 int (*create)(void** obj_ref, obj_type_t type, void* __user);
54 void (*destroy)(obj_type_t type, void*);
55 int (*open) (struct od_table_entry*, void* __user);
56 int (*close) (struct od_table_entry*);
57};
58
59/* translate a userspace supplied od into the raw table entry
60 * returns NULL if od is invalid
61 */
62struct od_table_entry* get_entry_for_od(int od);
63
64/* translate a userspace supplied od into the associated object
65 * returns NULL if od is invalid
66 */
67static inline void* od_lookup(int od, obj_type_t type)
68{
69 struct od_table_entry* e = get_entry_for_od(od);
70 return e && e->obj->type == type ? e->obj->obj : NULL;
71}
72
73#define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM))
74#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
75#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID))
76
77
78#endif
diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
new file mode 100644
index 000000000000..7857cd2c1938
--- /dev/null
+++ b/include/litmus/feather_buffer.h
@@ -0,0 +1,118 @@
1#ifndef _FEATHER_BUFFER_H_
2#define _FEATHER_BUFFER_H_
3
4/* requires UINT_MAX and memcpy */
5
6#define SLOT_FREE 0
7#define SLOT_BUSY 1
8#define SLOT_READY 2
9
10struct ft_buffer {
11 unsigned int slot_count;
12 unsigned int slot_size;
13
14 atomic_t free_count;
15 atomic_t write_idx;
16 unsigned int read_idx;
17
18 char* slots;
19 void* buffer_mem;
20 atomic_t failed_writes;
21};
22
23static inline int init_ft_buffer(struct ft_buffer* buf,
24 unsigned int slot_count,
25 unsigned int slot_size,
26 char* slots,
27 void* buffer_mem)
28{
29 int i = 0;
30 if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
31 /* The slot count must divide UINT_MAX + 1 so that when it
32 * wraps around the index correctly points to 0.
33 */
34 return 0;
35 } else {
36 buf->slot_count = slot_count;
37 buf->slot_size = slot_size;
38 buf->slots = slots;
39 buf->buffer_mem = buffer_mem;
40 atomic_set(&buf->free_count, slot_count);
41 atomic_set(&buf->write_idx, 0);
42 buf->read_idx = 0;
43 atomic_set(&buf->failed_writes, 0);
44 for (i = 0; i < slot_count; i++)
45 buf->slots[i] = SLOT_FREE;
46 return 1;
47 }
48}
49
50static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
51{
52 int free = atomic_fetch_dec(&buf->free_count);
53 unsigned int idx;
54 if (free <= 0) {
55 atomic_fetch_inc(&buf->free_count);
56 *ptr = 0;
57 atomic_fetch_inc(&buf->failed_writes);
58 return 0;
59 } else {
60 idx = atomic_fetch_inc(&buf->write_idx) % buf->slot_count;
61 buf->slots[idx] = SLOT_BUSY;
62 *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
63 return 1;
64 }
65}
66
67/* For single writer scenarios, with fewer atomic ops. */
68static inline int ft_buffer_start_single_write(struct ft_buffer* buf, void **ptr)
69{
70 unsigned int idx;
71
72 if (buf->free_count.counter <= 0) {
73 *ptr = 0;
74 /* single writer: no atomicity needed */
75 buf->failed_writes.counter++;
76 return 0;
77 } else {
78 /* free_count is positive, and can only increase since we are
79 * (by assumption) the only writer accessing the buffer.
80 */
81
82 idx = buf->write_idx.counter++ % buf->slot_count;
83 buf->slots[idx] = SLOT_BUSY;
84 *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
85
86 atomic_dec(&buf->free_count);
87 return 1;
88 }
89}
90
91static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
92{
93 unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
94 buf->slots[idx] = SLOT_READY;
95}
96
97
98/* exclusive reader access is assumed */
99static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
100{
101 unsigned int idx;
102 if (atomic_read(&buf->free_count) == buf->slot_count)
103 /* nothing available */
104 return 0;
105 idx = buf->read_idx % buf->slot_count;
106 if (buf->slots[idx] == SLOT_READY) {
107 memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
108 buf->slot_size);
109 buf->slots[idx] = SLOT_FREE;
110 buf->read_idx++;
111 atomic_fetch_inc(&buf->free_count);
112 return 1;
113 } else
114 return 0;
115}
116
117
118#endif
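
For illustration (not part of the patch): a minimal single-producer/single-consumer round trip. The record type, buffer sizes, and function names are assumptions for the example.

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <litmus/feather_buffer.h>

struct sample {
	unsigned long long timestamp;
	unsigned int event;
};

/* slot count must divide UINT_MAX + 1, i.e., be a power of two */
#define NSLOTS 256

static char slot_states[NSLOTS];
static struct sample slot_mem[NSLOTS];
static struct ft_buffer sample_buf;

static int setup_sample_buf(void)
{
	return init_ft_buffer(&sample_buf, NSLOTS, sizeof(struct sample),
			      slot_states, slot_mem) ? 0 : -EINVAL;
}

static void record_sample(unsigned long long ts, unsigned int ev)
{
	struct sample *s;

	if (ft_buffer_start_write(&sample_buf, (void **) &s)) {
		s->timestamp = ts;
		s->event = ev;
		ft_buffer_finish_write(&sample_buf, s);
	}
	/* else: buffer full; the drop is counted in failed_writes */
}

static int read_sample(struct sample *dest)
{
	return ft_buffer_read(&sample_buf, dest); /* 1 on success, 0 if empty */
}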
diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
new file mode 100644
index 000000000000..1a7f41ea6a79
--- /dev/null
+++ b/include/litmus/feather_trace.h
@@ -0,0 +1,50 @@
1#ifndef _FEATHER_TRACE_H_
2#define _FEATHER_TRACE_H_
3
4#include <linux/atomic.h>
5
6int ft_enable_event(unsigned long id);
7int ft_disable_event(unsigned long id);
8int ft_is_event_enabled(unsigned long id);
9int ft_disable_all_events(void);
10
11/* Don't use rewriting implementation if the kernel is relocatable.
12 */
13#if defined(CONFIG_ARCH_HAS_FEATHER_TRACE) && !defined(CONFIG_RELOCATABLE)
14
15#include <asm/feather_trace.h>
16
17#else /* !__ARCH_HAS_FEATHER_TRACE */
18
19/* provide default implementation */
20#include <linux/timex.h> /* for get_cycles() */
21
22static inline unsigned long long ft_timestamp(void)
23{
24 return get_cycles();
25}
26
27#define feather_callback
28
29#define MAX_EVENTS 1024
30
31extern int ft_events[MAX_EVENTS];
32
33#define ft_event(id, callback) \
34 if (ft_events[id]) callback();
35
36#define ft_event0(id, callback) \
37 if (ft_events[id]) callback(id);
38
39#define ft_event1(id, callback, param) \
40 if (ft_events[id]) callback(id, param);
41
42#define ft_event2(id, callback, param, param2) \
43 if (ft_events[id]) callback(id, param, param2);
44
45#define ft_event3(id, callback, p, p2, p3) \
46 if (ft_events[id]) callback(id, p, p2, p3);
47
48#endif /* __ARCH_HAS_FEATHER_TRACE */
49
50#endif
diff --git a/include/litmus/fp_common.h b/include/litmus/fp_common.h
new file mode 100644
index 000000000000..71c0d0142fc4
--- /dev/null
+++ b/include/litmus/fp_common.h
@@ -0,0 +1,183 @@
1/* Fixed-priority scheduler support.
2 */
3
4#ifndef __FP_COMMON_H__
5#define __FP_COMMON_H__
6
7#include <litmus/rt_domain.h>
8
9#include <asm/bitops.h>
10
11
12void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
13 release_jobs_t release);
14
15int fp_higher_prio(struct task_struct* first,
16 struct task_struct* second);
17
18int fp_ready_order(struct bheap_node* a, struct bheap_node* b);
19
20#define FP_PRIO_BIT_WORDS (LITMUS_MAX_PRIORITY / BITS_PER_LONG)
21
22#if (LITMUS_MAX_PRIORITY % BITS_PER_LONG)
23#error LITMUS_MAX_PRIORITY must be a multiple of BITS_PER_LONG
24#endif
25
26/* bitmask-indexed priority queue */
27struct fp_prio_queue {
28 unsigned long bitmask[FP_PRIO_BIT_WORDS];
29 struct bheap queue[LITMUS_MAX_PRIORITY];
30};
31
32void fp_prio_queue_init(struct fp_prio_queue* q);
33
34static inline void fpq_set(struct fp_prio_queue* q, unsigned int index)
35{
36 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
37 __set_bit(index % BITS_PER_LONG, word);
38}
39
40static inline void fpq_clear(struct fp_prio_queue* q, unsigned int index)
41{
42 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
43 __clear_bit(index % BITS_PER_LONG, word);
44}
45
46static inline unsigned int fpq_find(struct fp_prio_queue* q)
47{
48 int i;
49
50 /* loop optimizer should unroll this */
51 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
52 if (q->bitmask[i])
53 return __ffs(q->bitmask[i]) + i * BITS_PER_LONG;
54
55 return LITMUS_MAX_PRIORITY; /* nothing found */
56}
57
58static inline void fp_prio_add(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
59{
60 BUG_ON(index >= LITMUS_MAX_PRIORITY);
61 BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
62
63 fpq_set(q, index);
64 bheap_insert(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
65}
66
67static inline void fp_prio_remove(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
68{
69 BUG_ON(!is_queued(t));
70
71 bheap_delete(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
72 if (likely(bheap_empty(&q->queue[index])))
73 fpq_clear(q, index);
74}
75
76static inline struct task_struct* fp_prio_peek(struct fp_prio_queue* q)
77{
78 unsigned int idx = fpq_find(q);
79 struct bheap_node* hn;
80
81 if (idx < LITMUS_MAX_PRIORITY) {
82 hn = bheap_peek(fp_ready_order, &q->queue[idx]);
83 return bheap2task(hn);
84 } else
85 return NULL;
86}
87
88static inline struct task_struct* fp_prio_take(struct fp_prio_queue* q)
89{
90 unsigned int idx = fpq_find(q);
91 struct bheap_node* hn;
92
93 if (idx < LITMUS_MAX_PRIORITY) {
94 hn = bheap_take(fp_ready_order, &q->queue[idx]);
95 if (likely(bheap_empty(&q->queue[idx])))
96 fpq_clear(q, idx);
97 return bheap2task(hn);
98 } else
99 return NULL;
100}
101
102int fp_preemption_needed(struct fp_prio_queue* q, struct task_struct *t);
103
104
105/* ******* list-based version ******** */
106
107/* bitmask-indexed priority queue */
108struct fp_ready_list {
109 unsigned long bitmask[FP_PRIO_BIT_WORDS];
110 struct list_head queue[LITMUS_MAX_PRIORITY];
111};
112
113void fp_ready_list_init(struct fp_ready_list* q);
114
115static inline void fp_rl_set(struct fp_ready_list* q, unsigned int index)
116{
117 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
118 __set_bit(index % BITS_PER_LONG, word);
119}
120
121static inline void fp_rl_clear(struct fp_ready_list* q, unsigned int index)
122{
123 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
124 __clear_bit(index % BITS_PER_LONG, word);
125}
126
127static inline unsigned int fp_rl_find(struct fp_ready_list* q)
128{
129 int i;
130
131 /* loop optimizer should unroll this */
132 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
133 if (q->bitmask[i])
134 return __ffs(q->bitmask[i]) + i * BITS_PER_LONG;
135
136 return LITMUS_MAX_PRIORITY; /* nothing found */
137}
138
139static inline void fp_ready_list_add(
140 struct fp_ready_list* q, struct list_head* lh, unsigned int index)
141{
142 BUG_ON(index >= LITMUS_MAX_PRIORITY);
143 BUG_ON(in_list(lh));
144
145 fp_rl_set(q, index);
146 list_add_tail(lh, &q->queue[index]);
147}
148
149static inline void fp_ready_list_remove(
150 struct fp_ready_list* q, struct list_head* lh, unsigned int index)
151{
152 BUG_ON(!in_list(lh));
153
154 list_del(lh);
155 if (likely(list_empty(q->queue + index)))
156 fp_rl_clear(q, index);
157}
158
159static inline struct list_head* fp_ready_list_peek(struct fp_ready_list* q)
160{
161 unsigned int idx = fp_rl_find(q);
162
163 if (idx < LITMUS_MAX_PRIORITY) {
164 return q->queue[idx].next;
165 } else
166 return NULL;
167}
168
169static inline struct list_head* fp_ready_list_take(struct fp_ready_list* q)
170{
171 unsigned int idx = fp_rl_find(q);
172 struct list_head* lh;
173
174 if (idx < LITMUS_MAX_PRIORITY) {
175 lh = q->queue[idx].next;
176 fp_ready_list_remove(q, lh, idx);
177 return lh;
178 } else
179 return NULL;
180}
181
182
183#endif
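
For illustration (not part of the patch): a minimal sketch using the list-based variant to queue work items at fixed priorities (lower index = higher priority, index < LITMUS_MAX_PRIORITY). The struct request and helper names are assumptions.

#include <linux/kernel.h>
#include <linux/list.h>
#include <litmus/litmus.h>	/* in_list(), used by the inline helpers */
#include <litmus/fp_common.h>

struct request {
	struct list_head link;
	unsigned int prio;	/* 0 = highest, must be < LITMUS_MAX_PRIORITY */
};

/* assumes fp_ready_list_init(q) was called once beforehand */
static void queue_request(struct fp_ready_list *q, struct request *req)
{
	INIT_LIST_HEAD(&req->link);
	fp_ready_list_add(q, &req->link, req->prio);
}

static struct request *next_request(struct fp_ready_list *q)
{
	struct list_head *lh = fp_ready_list_take(q);
	return lh ? container_of(lh, struct request, link) : NULL;
}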
diff --git a/include/litmus/fpmath.h b/include/litmus/fpmath.h
new file mode 100644
index 000000000000..642de98542c8
--- /dev/null
+++ b/include/litmus/fpmath.h
@@ -0,0 +1,147 @@
1#ifndef __FP_MATH_H__
2#define __FP_MATH_H__
3
4#include <linux/math64.h>
5
6#ifndef __KERNEL__
7#include <stdint.h>
8#define abs(x) (((x) < 0) ? -(x) : (x))
9#endif
10
11// Use 64-bit because we want to track things at the nanosecond scale.
12// This can lead to very large numbers.
13typedef int64_t fpbuf_t;
14typedef struct
15{
16 fpbuf_t val;
17} fp_t;
18
19#define FP_SHIFT 10
20#define ROUND_BIT (FP_SHIFT - 1)
21
22#define _fp(x) ((fp_t) {x})
23
24#ifdef __KERNEL__
25static const fp_t LITMUS_FP_ZERO = {.val = 0};
26static const fp_t LITMUS_FP_ONE = {.val = (1 << FP_SHIFT)};
27#endif
28
29static inline fp_t FP(fpbuf_t x)
30{
31 return _fp(((fpbuf_t) x) << FP_SHIFT);
32}
33
34/* divide two integers to obtain a fixed point value */
35static inline fp_t _frac(fpbuf_t a, fpbuf_t b)
36{
37 return _fp(div64_s64(FP(a).val, (b)));
38}
39
40static inline fpbuf_t _point(fp_t x)
41{
42 return (x.val % (1 << FP_SHIFT));
43
44}
45
46#define fp2str(x) x.val
47/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */
48#define _FP_ "%ld/1024"
49
50static inline fpbuf_t _floor(fp_t x)
51{
52 return x.val >> FP_SHIFT;
53}
54
55/* FIXME: negative rounding */
56static inline fpbuf_t _round(fp_t x)
57{
58 return _floor(x) + ((x.val >> ROUND_BIT) & 1);
59}
60
61/* multiply two fixed point values */
62static inline fp_t _mul(fp_t a, fp_t b)
63{
64 return _fp((a.val * b.val) >> FP_SHIFT);
65}
66
67static inline fp_t _div(fp_t a, fp_t b)
68{
69#if !defined(__KERNEL__) && !defined(unlikely)
70#define unlikely(x) (x)
71#define DO_UNDEF_UNLIKELY
72#endif
73 /* try not to overflow */
74 if (unlikely( a.val > (2l << ((sizeof(fpbuf_t)*8) - FP_SHIFT)) ))
75 return _fp((a.val / b.val) << FP_SHIFT);
76 else
77 return _fp((a.val << FP_SHIFT) / b.val);
78#ifdef DO_UNDEF_UNLIKELY
79#undef unlikely
80#undef DO_UNDEF_UNLIKELY
81#endif
82}
83
84static inline fp_t _add(fp_t a, fp_t b)
85{
86 return _fp(a.val + b.val);
87}
88
89static inline fp_t _sub(fp_t a, fp_t b)
90{
91 return _fp(a.val - b.val);
92}
93
94static inline fp_t _neg(fp_t x)
95{
96 return _fp(-x.val);
97}
98
99static inline fp_t _abs(fp_t x)
100{
101 return _fp(abs(x.val));
102}
103
104/* works the same as casting float/double to integer */
105static inline fpbuf_t _fp_to_integer(fp_t x)
106{
107 return _floor(_abs(x)) * ((x.val > 0) ? 1 : -1);
108}
109
110static inline fp_t _integer_to_fp(fpbuf_t x)
111{
112 return _frac(x,1);
113}
114
115static inline int _leq(fp_t a, fp_t b)
116{
117 return a.val <= b.val;
118}
119
120static inline int _geq(fp_t a, fp_t b)
121{
122 return a.val >= b.val;
123}
124
125static inline int _lt(fp_t a, fp_t b)
126{
127 return a.val < b.val;
128}
129
130static inline int _gt(fp_t a, fp_t b)
131{
132 return a.val > b.val;
133}
134
135static inline int _eq(fp_t a, fp_t b)
136{
137 return a.val == b.val;
138}
139
140static inline fp_t _max(fp_t a, fp_t b)
141{
142 if (a.val < b.val)
143 return b;
144 else
145 return a;
146}
147#endif
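
For illustration (not part of the patch): with FP_SHIFT = 10, values are scaled by 1024, so _frac(3, 10) is stored as 307 (about 0.2998), _mul(_frac(3, 10), FP(4)) is 1228 (about 1.199), and its _floor() is 1. A hypothetical helper built on these primitives:

#include <litmus/fpmath.h>

/* e.g., task_utilization(300, 1000).val == 307, i.e., roughly 0.3 */
static inline fp_t task_utilization(fpbuf_t exec_cost_ns, fpbuf_t period_ns)
{
	return _frac(exec_cost_ns, period_ns);
}

/* simple utilization-bound style check: sum of utilizations <= #CPUs */
static inline int utilization_fits(fp_t total_util, unsigned int num_cpus)
{
	return _leq(total_util, FP(num_cpus));
}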
diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h
new file mode 100644
index 000000000000..a566b0b6ae05
--- /dev/null
+++ b/include/litmus/ftdev.h
@@ -0,0 +1,58 @@
1#ifndef _LITMUS_FTDEV_H_
2#define _LITMUS_FTDEV_H_
3
4#include <litmus/feather_trace.h>
5#include <litmus/feather_buffer.h>
6#include <linux/mutex.h>
7#include <linux/cdev.h>
8
9#define FTDEV_ENABLE_CMD 0
10#define FTDEV_DISABLE_CMD 1
11#define FTDEV_CALIBRATE 0x1410
12
13struct ftdev;
14
15/* return 0 if buffer can be opened, otherwise -$REASON */
16typedef int (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no);
17/* return 0 on success, otherwise -$REASON */
18typedef int (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no);
19typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no);
20typedef long (*ftdev_calibrate_t)(struct ftdev* dev, unsigned int buf_no, unsigned long user_arg);
21/* Let devices handle writes from userspace. No synchronization provided. */
22typedef ssize_t (*ftdev_write_t)(struct ft_buffer* buf, size_t len, const char __user *from);
23
24struct ftdev_event;
25
26struct ftdev_minor {
27 struct ft_buffer* buf;
28 unsigned int readers;
29 struct mutex lock;
30 /* FIXME: filter for authorized events */
31 struct ftdev_event* events;
32 struct device* device;
33 struct ftdev* ftdev;
34};
35
36struct ftdev {
37 dev_t major;
38 struct cdev cdev;
39 struct class* class;
40 const char* name;
41 struct ftdev_minor* minor;
42 unsigned int minor_cnt;
43 ftdev_alloc_t alloc;
44 ftdev_free_t free;
45 ftdev_can_open_t can_open;
46 ftdev_write_t write;
47 ftdev_calibrate_t calibrate;
48};
49
50struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size);
51void free_ft_buffer(struct ft_buffer* buf);
52
53int ftdev_init( struct ftdev* ftdev, struct module* owner,
54 const int minor_cnt, const char* name);
55void ftdev_exit(struct ftdev* ftdev);
56int register_ftdev(struct ftdev* ftdev);
57
58#endif
diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
new file mode 100644
index 000000000000..7033393148df
--- /dev/null
+++ b/include/litmus/jobs.h
@@ -0,0 +1,13 @@
1#ifndef __LITMUS_JOBS_H__
2#define __LITMUS_JOBS_H__
3
4void prepare_for_next_period(struct task_struct *t);
5void release_at(struct task_struct *t, lt_t start);
6
7void inferred_sporadic_job_release_at(struct task_struct *t, lt_t when);
8
9long default_wait_for_release_at(lt_t release_time);
10long complete_job(void);
11long complete_job_oneshot(void);
12
13#endif
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
new file mode 100644
index 000000000000..f550367ddd4b
--- /dev/null
+++ b/include/litmus/litmus.h
@@ -0,0 +1,224 @@
1/*
2 * Constant definitions related to
3 * scheduling policy.
4 */
5
6#ifndef _LINUX_LITMUS_H_
7#define _LINUX_LITMUS_H_
8
9#include <litmus/ctrlpage.h>
10
11#ifdef CONFIG_RELEASE_MASTER
12extern atomic_t release_master_cpu;
13#endif
14
15/* in_list - is a given list_head queued on some list?
16 */
17static inline int in_list(struct list_head* list)
18{
19 return !( /* case 1: deleted */
20 (list->next == LIST_POISON1 &&
21 list->prev == LIST_POISON2)
22 ||
23 /* case 2: initialized */
24 (list->next == list &&
25 list->prev == list)
26 );
27}
28
29struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
30
31#define NO_CPU 0xffffffff
32
33void litmus_fork(struct task_struct *tsk);
34void litmus_exec(void);
35/* clean up real-time state of a task */
36void litmus_clear_state(struct task_struct *dead_tsk);
37void exit_litmus(struct task_struct *dead_tsk);
38
39/* Prevent the plugin from being switched-out from underneath a code
40 * path. Might sleep, so may be called only from non-atomic context. */
41void litmus_plugin_switch_disable(void);
42void litmus_plugin_switch_enable(void);
43
44long litmus_admit_task(struct task_struct *tsk);
45void litmus_exit_task(struct task_struct *tsk);
46void litmus_dealloc(struct task_struct *tsk);
47void litmus_do_exit(struct task_struct *tsk);
48int litmus_be_migrate_to(int cpu);
49
50#define is_realtime(t) ((t)->policy == SCHED_LITMUS)
51#define rt_transition_pending(t) \
52 ((t)->rt_param.transition_pending)
53
54#define tsk_rt(t) (&(t)->rt_param)
55
56/* Realtime utility macros */
57#ifdef CONFIG_LITMUS_LOCKING
58#define is_priority_boosted(t) (tsk_rt(t)->priority_boosted)
59#define get_boost_start(t) (tsk_rt(t)->boost_start_time)
60#else
61#define is_priority_boosted(t) 0
62#define get_boost_start(t) 0
63#endif
64
65
66/* task_params macros */
67#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost)
68#define get_rt_period(t) (tsk_rt(t)->task_params.period)
69#define get_rt_relative_deadline(t) (tsk_rt(t)->task_params.relative_deadline)
70#define get_rt_phase(t) (tsk_rt(t)->task_params.phase)
71#define get_partition(t) (tsk_rt(t)->task_params.cpu)
72#define get_priority(t) (tsk_rt(t)->task_params.priority)
73#define get_class(t) (tsk_rt(t)->task_params.cls)
74#define get_release_policy(t) (tsk_rt(t)->task_params.release_policy)
75
76/* job_param macros */
77#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time)
78#define get_deadline(t) (tsk_rt(t)->job_params.deadline)
79#define get_release(t) (tsk_rt(t)->job_params.release)
80#define get_lateness(t) (tsk_rt(t)->job_params.lateness)
81
82/* release policy macros */
83#define is_periodic(t) (get_release_policy(t) == TASK_PERIODIC)
84#define is_sporadic(t) (get_release_policy(t) == TASK_SPORADIC)
85#ifdef CONFIG_ALLOW_EARLY_RELEASE
86#define is_early_releasing(t) (get_release_policy(t) == TASK_EARLY)
87#else
88#define is_early_releasing(t) (0)
89#endif
90
91#define is_hrt(t) \
92 (tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
93#define is_srt(t) \
94 (tsk_rt(t)->task_params.cls == RT_CLASS_SOFT)
95#define is_be(t) \
96 (tsk_rt(t)->task_params.cls == RT_CLASS_BEST_EFFORT)
97
98/* Our notion of time within LITMUS: kernel monotonic time. */
99static inline lt_t litmus_clock(void)
100{
101 return ktime_to_ns(ktime_get());
102}
103
104/* A macro to convert from nanoseconds to ktime_t. */
105#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t)
106
107#define is_released(t, now) \
108 (lt_before_eq(get_release(t), now))
109#define is_tardy(t, now) \
110 (lt_before_eq(tsk_rt(t)->job_params.deadline, now))
111
112/* real-time comparison macros */
113#define earlier_deadline(a, b) (lt_before(\
114 (a)->rt_param.job_params.deadline,\
115 (b)->rt_param.job_params.deadline))
116#define earlier_release(a, b) (lt_before(\
117 (a)->rt_param.job_params.release,\
118 (b)->rt_param.job_params.release))
119
120void preempt_if_preemptable(struct task_struct* t, int on_cpu);
121
122#define bheap2task(hn) ((struct task_struct*) hn->value)
123
124static inline int is_present(struct task_struct* t)
125{
126 return t && tsk_rt(t)->present;
127}
128
129static inline int is_completed(struct task_struct* t)
130{
131 return t && tsk_rt(t)->completed;
132}
133
134
135/* Used to convert ns-specified execution costs and periods into
136 * integral quanta equivalents.
137 */
138#define LITMUS_QUANTUM_LENGTH_NS (CONFIG_LITMUS_QUANTUM_LENGTH_US * 1000ULL)
139
140/* make the unit explicit */
141typedef unsigned long quanta_t;
142
143enum round {
144 FLOOR,
145 CEIL
146};
147
148static inline quanta_t time2quanta(lt_t time, enum round round)
149{
150 s64 quantum_length = LITMUS_QUANTUM_LENGTH_NS;
151
152 if (do_div(time, quantum_length) && round == CEIL)
153 time++;
154 return (quanta_t) time;
155}
156
157static inline lt_t quanta2time(quanta_t quanta)
158{
159 return quanta * LITMUS_QUANTUM_LENGTH_NS;
160}
161
162/* By how much is cpu staggered behind CPU 0? */
163u64 cpu_stagger_offset(int cpu);
164
165static inline struct control_page* get_control_page(struct task_struct *t)
166{
167 return tsk_rt(t)->ctrl_page;
168}
169
170static inline int has_control_page(struct task_struct* t)
171{
172 return tsk_rt(t)->ctrl_page != NULL;
173}
174
175
176#ifdef CONFIG_SCHED_OVERHEAD_TRACE
177
178#define TS_SYSCALL_IN_START \
179 if (has_control_page(current)) { \
180 __TS_SYSCALL_IN_START(&get_control_page(current)->ts_syscall_start); \
181 }
182
183#define TS_SYSCALL_IN_END \
184 if (has_control_page(current)) { \
185 unsigned long flags; \
186 uint64_t irqs; \
187 local_irq_save(flags); \
188 irqs = get_control_page(current)->irq_count - \
189 get_control_page(current)->irq_syscall_start; \
190 __TS_SYSCALL_IN_END(&irqs); \
191 local_irq_restore(flags); \
192 }
193
194#else
195
196#define TS_SYSCALL_IN_START
197#define TS_SYSCALL_IN_END
198
199#endif
200
201#ifdef CONFIG_SMP
202
203/*
204 * struct hrtimer_start_on_info - timer info on remote cpu
205 * @timer: timer to be triggered on remote cpu
206 * @time: time event
207 * @mode: timer mode
208 * @csd: smp_call_function parameter to call hrtimer_pull on remote cpu
209 */
210struct hrtimer_start_on_info {
211 struct hrtimer *timer;
212 ktime_t time;
213 enum hrtimer_mode mode;
214 struct call_single_data csd;
215};
216
217void hrtimer_pull(void *csd_info);
218extern void hrtimer_start_on(int cpu, struct hrtimer_start_on_info *info,
219 struct hrtimer *timer, ktime_t time,
220 const enum hrtimer_mode mode);
221
222#endif
223
224#endif
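
For illustration (not part of the patch): assuming CONFIG_LITMUS_QUANTUM_LENGTH_US is 1000 (a 1 ms quantum, LITMUS_QUANTUM_LENGTH_NS == 1,000,000), a 2.5 ms budget converts to 2 quanta with FLOOR and 3 quanta with CEIL. The helper name is hypothetical.

#include <litmus/litmus.h>

static inline quanta_t budget_in_quanta(lt_t budget_ns)
{
	/* e.g., budget_in_quanta(2500000) == 3 with a 1 ms quantum */
	return time2quanta(budget_ns, CEIL);
}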
diff --git a/include/litmus/litmus_proc.h b/include/litmus/litmus_proc.h
new file mode 100644
index 000000000000..a5db24c03ec0
--- /dev/null
+++ b/include/litmus/litmus_proc.h
@@ -0,0 +1,63 @@
1#include <litmus/sched_plugin.h>
2#include <linux/proc_fs.h>
3
4int __init init_litmus_proc(void);
5void exit_litmus_proc(void);
6
7struct cd_mapping
8{
9 int id;
10 cpumask_var_t mask;
11 struct proc_dir_entry *proc_file;
12};
13
14struct domain_proc_info
15{
16 int num_cpus;
17 int num_domains;
18
19 struct cd_mapping *cpu_to_domains;
20 struct cd_mapping *domain_to_cpus;
21};
22
23/*
24 * On success, returns 0 and sets the pointer to the location of the new
25 * proc dir entry, otherwise returns an error code and sets pde to NULL.
26 */
27long make_plugin_proc_dir(struct sched_plugin* plugin,
28 struct proc_dir_entry** pde);
29
30/*
31 * Plugins should deallocate all child proc directory entries before
32 * calling this, to avoid memory leaks.
33 */
34void remove_plugin_proc_dir(struct sched_plugin* plugin);
35
36/*
37 * Setup the CPU <-> sched domain mappings in proc
38 */
39long activate_domain_proc(struct domain_proc_info* map);
40
41/*
42 * Remove the CPU <-> sched domain mappings from proc
43 */
44long deactivate_domain_proc(void);
45
46/*
47 * Alloc memory for the mapping
48 * Note: Does not set up proc files. Use make_sched_domain_maps for that.
49 */
50long init_domain_proc_info(struct domain_proc_info* map,
51 int num_cpus, int num_domains);
52
53/*
54 * Free memory of the mapping
55 * Note: Does not clean up proc files. Use deactivate_domain_proc for that.
56 */
57void destroy_domain_proc_info(struct domain_proc_info* map);
58
59/* Copy at most size-1 bytes from ubuf into kbuf, null-terminate buf, and
60 * remove a '\n' if present. Returns the number of bytes that were read or
61 * -EFAULT. */
62int copy_and_chomp(char *kbuf, unsigned long ksize,
63 __user const char* ubuf, unsigned long ulength);
diff --git a/include/litmus/locking.h b/include/litmus/locking.h
new file mode 100644
index 000000000000..4d7b870cb443
--- /dev/null
+++ b/include/litmus/locking.h
@@ -0,0 +1,28 @@
1#ifndef LITMUS_LOCKING_H
2#define LITMUS_LOCKING_H
3
4struct litmus_lock_ops;
5
6/* Generic base struct for LITMUS^RT userspace semaphores.
7 * This structure should be embedded in protocol-specific semaphores.
8 */
9struct litmus_lock {
10 struct litmus_lock_ops *ops;
11 int type;
12};
13
14struct litmus_lock_ops {
15 /* Current task tries to obtain / drop a reference to a lock.
16 * Optional methods, allowed by default. */
17 int (*open)(struct litmus_lock*, void* __user);
18 int (*close)(struct litmus_lock*);
19
20 /* Current tries to lock/unlock this lock (mandatory methods). */
21 int (*lock)(struct litmus_lock*);
22 int (*unlock)(struct litmus_lock*);
23
24 /* The lock is no longer being referenced (mandatory method). */
25 void (*deallocate)(struct litmus_lock*);
26};
27
28#endif
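
For illustration (not part of the patch): a hypothetical protocol-specific semaphore embedding the generic base struct, as suggested above. The field names are assumptions; a real protocol would also provide a matching litmus_lock_ops instance.

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <litmus/locking.h>

struct example_sem {
	struct litmus_lock litmus_lock;	/* generic base; ops/type set at creation */
	spinlock_t wait_lock;
	struct list_head waiters;	/* tasks blocked on this semaphore */
	struct task_struct *owner;	/* current lock holder, if any */
};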
diff --git a/include/litmus/np.h b/include/litmus/np.h
new file mode 100644
index 000000000000..dbe2b695f74a
--- /dev/null
+++ b/include/litmus/np.h
@@ -0,0 +1,121 @@
1#ifndef _LITMUS_NP_H_
2#define _LITMUS_NP_H_
3
4/* Definitions related to non-preemptive sections signaled via the control
5 * page
6 */
7
8#ifdef CONFIG_NP_SECTION
9
10static inline int is_kernel_np(struct task_struct *t)
11{
12 return tsk_rt(t)->kernel_np;
13}
14
15static inline int is_user_np(struct task_struct *t)
16{
17 return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->sched.np.flag : 0;
18}
19
20static inline void request_exit_np(struct task_struct *t)
21{
22 if (is_user_np(t)) {
23 /* Set the flag that tells user space to call
24 * into the kernel at the end of a critical section. */
25 if (likely(tsk_rt(t)->ctrl_page)) {
26 TRACE_TASK(t, "setting delayed_preemption flag\n");
27 tsk_rt(t)->ctrl_page->sched.np.preempt = 1;
28 }
29 }
30}
31
32static inline void make_np(struct task_struct *t)
33{
34 tsk_rt(t)->kernel_np++;
35}
36
37/* Caller should check if preemption is necessary when
38 * the function return 0.
39 */
40static inline int take_np(struct task_struct *t)
41{
42 return --tsk_rt(t)->kernel_np;
43}
44
45/* returns 0 if remote CPU needs an IPI to preempt, 1 if no IPI is required */
46static inline int request_exit_np_atomic(struct task_struct *t)
47{
48 union np_flag old, new;
49
50 if (tsk_rt(t)->ctrl_page) {
51 old.raw = tsk_rt(t)->ctrl_page->sched.raw;
52 if (old.np.flag == 0) {
53 /* no longer non-preemptive */
54 return 0;
55 } else if (old.np.preempt) {
56 /* already set, nothing for us to do */
57 return 1;
58 } else {
59 /* non preemptive and flag not set */
60 new.raw = old.raw;
61 new.np.preempt = 1;
62 /* if we get old back, then we atomically set the flag */
63 return cmpxchg(&tsk_rt(t)->ctrl_page->sched.raw, old.raw, new.raw) == old.raw;
64 /* If we raced with a concurrent change, then so be
65 * it. Deliver it by IPI. We don't want an unbounded
66 * retry loop here since tasks might exploit that to
67 * keep the kernel busy indefinitely. */
68 }
69 } else
70 return 0;
71}
72
73#else
74
75static inline int is_kernel_np(struct task_struct* t)
76{
77 return 0;
78}
79
80static inline int is_user_np(struct task_struct* t)
81{
82 return 0;
83}
84
85static inline void request_exit_np(struct task_struct *t)
86{
87 /* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */
88 BUG();
89}
90
91static inline int request_exit_np_atomic(struct task_struct *t)
92{
93 return 0;
94}
95
96#endif
97
98static inline void clear_exit_np(struct task_struct *t)
99{
100 if (likely(tsk_rt(t)->ctrl_page))
101 tsk_rt(t)->ctrl_page->sched.np.preempt = 0;
102}
103
104static inline int is_np(struct task_struct *t)
105{
106#ifdef CONFIG_SCHED_DEBUG_TRACE
107 int kernel, user;
108 kernel = is_kernel_np(t);
109 user = is_user_np(t);
110 if (kernel || user)
111 TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n",
112
113 kernel, user);
114 return kernel || user;
115#else
116 return unlikely(is_kernel_np(t) || is_user_np(t));
117#endif
118}
119
120#endif
121
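
For illustration (not part of the patch): a rough sketch of the userspace side of this protocol, assuming the task has mapped the control page (struct control_page from ctrlpage.h) into its address space. The ctrl_page parameter and the litmus_yield() wrapper are assumptions for the example.

void litmus_yield(void);	/* hypothetical wrapper, e.g., around sched_yield() */

static void enter_np_section(struct control_page *ctrl_page)
{
	ctrl_page->sched.np.flag++;	/* kernel now sees the task as non-preemptive */
	__sync_synchronize();		/* publish the flag before the critical section */
}

static void exit_np_section(struct control_page *ctrl_page)
{
	__sync_synchronize();
	if (--ctrl_page->sched.np.flag == 0 && ctrl_page->sched.np.preempt) {
		/* the kernel deferred a preemption while we were non-preemptive;
		 * call into the scheduler now */
		litmus_yield();
	}
}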
diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h
new file mode 100644
index 000000000000..ffb602772896
--- /dev/null
+++ b/include/litmus/preempt.h
@@ -0,0 +1,191 @@
1#ifndef LITMUS_PREEMPT_H
2#define LITMUS_PREEMPT_H
3
4#include <linux/types.h>
5#include <linux/cache.h>
6#include <linux/percpu.h>
7#include <asm/atomic.h>
8
9DECLARE_PER_CPU(bool, litmus_preemption_in_progress);
10
11/* is_current_running() is a legacy macro (and a hack) that is used to make
12 * the plugin logic, which still stems from the 2.6.20 era, work with current
13 * kernels.
14 *
15 * It used to honor the flag in the preempt_count variable that was
16 * set when scheduling is in progress. This doesn't exist anymore in recent
17 * Linux versions. Instead, Linux has moved to passing a 'preempt' flag to
18 * __schedule(). In particular, Linux ignores prev->state != TASK_RUNNING and
19 * does *not* process self-suspensions if an interrupt (i.e., a preemption)
20 * races with a task that is about to call schedule() anyway.
21 *
22 * The value of the 'preempt' flag in __schedule() is crucial
23 * information for some of the LITMUS^RT plugins, which must re-add
24 * soon-to-block tasks to the ready queue if the rest of the system doesn't
25 * process the preemption yet. Unfortunately, the flag is not passed to
26 * pick_next_task(). Hence, as a hack, we communicate it out of band via the
27 * global, per-core variable litmus_preemption_in_progress, which is set by
28 * the scheduler in __schedule() and read by the plugins via the
29 * is_current_running() macro.
30 */
31#define is_current_running() \
32 ((current)->state == TASK_RUNNING || \
33 this_cpu_read(litmus_preemption_in_progress))
34
35DECLARE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
36
37#ifdef CONFIG_PREEMPT_STATE_TRACE
38/* this file is included widely --- be careful not to pollute the namespace
39 * with the TRACE() symbol */
40#define LITMUS_DEBUG_TRACE_DONT_POLLUTE_NAMESPACE
41#include <litmus/debug_trace.h>
42#undef LITMUS_DEBUG_TRACE_DONT_POLLUTE_NAMESPACE
43const char* sched_state_name(int s);
44#define TRACE_STATE(fmt, args...) LITMUS_TRACE("SCHED_STATE " fmt, args)
45#else
46#define TRACE_STATE(fmt, args...) /* ignore */
47#endif
48
49#define VERIFY_SCHED_STATE(x) \
50 do { int __s = get_sched_state(); \
51 if ((__s & (x)) == 0) \
52 TRACE_STATE("INVALID s=0x%x (%s) not " \
53 "in 0x%x (%s) [%s]\n", \
54 __s, sched_state_name(__s), \
55 (x), #x, __FUNCTION__); \
56 } while (0);
57
58#define TRACE_SCHED_STATE_CHANGE(x, y, cpu) \
59 TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n", \
60 cpu, (x), sched_state_name(x), \
61 (y), sched_state_name(y))
62
63
64typedef enum scheduling_state {
65 TASK_SCHEDULED = (1 << 0), /* The currently scheduled task is the one that
66 * should be scheduled, and the processor does not
67 * plan to invoke schedule(). */
68 SHOULD_SCHEDULE = (1 << 1), /* A remote processor has determined that the
69 * processor should reschedule, but this has not
70 * been communicated yet (IPI still pending). */
71 WILL_SCHEDULE = (1 << 2), /* The processor has noticed that it has to
72 * reschedule and will do so shortly. */
73 TASK_PICKED = (1 << 3), /* The processor is currently executing schedule(),
74 * has selected a new task to schedule, but has not
75 * yet performed the actual context switch. */
76 PICKED_WRONG_TASK = (1 << 4), /* The processor has not yet performed the context
77 * switch, but a remote processor has already
78 * determined that a higher-priority task became
79 * eligible after the task was picked. */
80} sched_state_t;
81
82static inline sched_state_t get_sched_state_on(int cpu)
83{
84 return atomic_read(&per_cpu(resched_state, cpu));
85}
86
87static inline sched_state_t get_sched_state(void)
88{
89 return atomic_read(this_cpu_ptr(&resched_state));
90}
91
92static inline int is_in_sched_state(int possible_states)
93{
94 return get_sched_state() & possible_states;
95}
96
97static inline int cpu_is_in_sched_state(int cpu, int possible_states)
98{
99 return get_sched_state_on(cpu) & possible_states;
100}
101
102static inline void set_sched_state(sched_state_t s)
103{
104 TRACE_SCHED_STATE_CHANGE(get_sched_state(), s, smp_processor_id());
105 atomic_set(this_cpu_ptr(&resched_state), s);
106}
107
108static inline int sched_state_transition(sched_state_t from, sched_state_t to)
109{
110 sched_state_t old_state;
111
112 old_state = atomic_cmpxchg(this_cpu_ptr(&resched_state), from, to);
113 if (old_state == from) {
114 TRACE_SCHED_STATE_CHANGE(from, to, smp_processor_id());
115 return 1;
116 } else
117 return 0;
118}
119
120static inline int sched_state_transition_on(int cpu,
121 sched_state_t from,
122 sched_state_t to)
123{
124 sched_state_t old_state;
125
126 old_state = atomic_cmpxchg(&per_cpu(resched_state, cpu), from, to);
127 if (old_state == from) {
128 TRACE_SCHED_STATE_CHANGE(from, to, cpu);
129 return 1;
130 } else
131 return 0;
132}
133
134/* Plugins must call this function after they have decided which job to
135 * schedule next. IMPORTANT: this function must be called while still holding
136 * the lock that is used to serialize scheduling decisions.
137 *
138 * (Ideally, we would like to use runqueue locks for this purpose, but that
139 * would lead to deadlocks with the migration code.)
140 */
141static inline void sched_state_task_picked(void)
142{
143 VERIFY_SCHED_STATE(WILL_SCHEDULE);
144
 145	/* WILL_SCHEDULE has only a local transition => simple store is ok */
146 set_sched_state(TASK_PICKED);
147}
148
149static inline void sched_state_entered_schedule(void)
150{
151 /* Update state for the case that we entered schedule() not due to
152 * set_tsk_need_resched() */
153 set_sched_state(WILL_SCHEDULE);
154}
155
156/* Called by schedule() to check if the scheduling decision is still valid
 157 * after a context switch. Returns 1 if the CPU needs to reschedule. */
158static inline int sched_state_validate_switch(void)
159{
160 int decision_ok = 0;
161
162 VERIFY_SCHED_STATE(PICKED_WRONG_TASK | TASK_PICKED | WILL_SCHEDULE);
163
164 if (is_in_sched_state(TASK_PICKED)) {
165 /* Might be good; let's try to transition out of this
166 * state. This must be done atomically since remote processors
167 * may try to change the state, too. */
168 decision_ok = sched_state_transition(TASK_PICKED, TASK_SCHEDULED);
169 }
170
171 if (!decision_ok)
172 TRACE_STATE("validation failed (%s)\n",
173 sched_state_name(get_sched_state()));
174
175 return !decision_ok;
176}
177
178/* State transition events. See litmus/preempt.c for details. */
179void sched_state_will_schedule(struct task_struct* tsk);
180void sched_state_ipi(void);
181/* Cause a CPU (remote or local) to reschedule. */
182void litmus_reschedule(int cpu);
183void litmus_reschedule_local(void);
184
185#ifdef CONFIG_DEBUG_KERNEL
186void sched_state_plugin_check(void);
187#else
188#define sched_state_plugin_check() /* no check */
189#endif
190
191#endif
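
A minimal sketch (not taken from any in-tree plugin) of how a plugin's schedule() callback is expected to interact with the state machine above; demo_lock and demo_pick_next() are hypothetical placeholders for the plugin's scheduling lock and selection logic:

static DEFINE_RAW_SPINLOCK(demo_lock);            /* serializes scheduling decisions */
static struct task_struct* demo_pick_next(void);  /* plugin-specific, hypothetical */

static struct task_struct* demo_schedule(struct task_struct *prev)
{
	struct task_struct *next;

	raw_spin_lock(&demo_lock);
	next = demo_pick_next();
	/* must be invoked while still holding the scheduling lock */
	sched_state_task_picked();
	raw_spin_unlock(&demo_lock);

	return next;
}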
diff --git a/include/litmus/reservations/alloc.h b/include/litmus/reservations/alloc.h
new file mode 100644
index 000000000000..b3471288c9f1
--- /dev/null
+++ b/include/litmus/reservations/alloc.h
@@ -0,0 +1,15 @@
1#ifndef LITMUS_RESERVATIONS_ALLOC_H
2#define LITMUS_RESERVATIONS_ALLOC_H
3
4#include <litmus/reservations/reservation.h>
5
6long alloc_polling_reservation(
7 int res_type,
8 struct reservation_config *config,
9 struct reservation **_res);
10
11long alloc_table_driven_reservation(
12 struct reservation_config *config,
13 struct reservation **_res);
14
15#endif \ No newline at end of file
diff --git a/include/litmus/reservations/budget-notifier.h b/include/litmus/reservations/budget-notifier.h
new file mode 100644
index 000000000000..d831fa9d5153
--- /dev/null
+++ b/include/litmus/reservations/budget-notifier.h
@@ -0,0 +1,50 @@
1#ifndef LITMUS_BUDGET_NOTIFIER_H
2#define LITMUS_BUDGET_NOTIFIER_H
3
4#include <linux/list.h>
5#include <linux/spinlock.h>
6
7struct budget_notifier;
8
9typedef void (*budget_callback_t) (
10 struct budget_notifier *bn
11);
12
13struct budget_notifier {
14 struct list_head list;
15 budget_callback_t budget_exhausted;
16 budget_callback_t budget_replenished;
17};
18
19struct budget_notifier_list {
20 struct list_head list;
21 raw_spinlock_t lock;
22};
23
24void budget_notifier_list_init(struct budget_notifier_list* bnl);
25
26static inline void budget_notifier_add(
27 struct budget_notifier_list *bnl,
28 struct budget_notifier *bn)
29{
30 unsigned long flags;
31
32 raw_spin_lock_irqsave(&bnl->lock, flags);
33 list_add(&bn->list, &bnl->list);
34 raw_spin_unlock_irqrestore(&bnl->lock, flags);
35}
36
37static inline void budget_notifier_remove(
38 struct budget_notifier_list *bnl,
39 struct budget_notifier *bn)
40{
41 unsigned long flags;
42
43 raw_spin_lock_irqsave(&bnl->lock, flags);
44 list_del(&bn->list);
45 raw_spin_unlock_irqrestore(&bnl->lock, flags);
46}
47
48void budget_notifiers_fire(struct budget_notifier_list *bnl, bool replenished);
49
50#endif
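
A hedged usage sketch: registering depletion/replenishment callbacks on a notifier list; the my_* names are placeholders, and the budget_notifier_list is assumed to belong to a reservation owned by the caller:

static void my_exhausted(struct budget_notifier *bn)
{
	/* react to budget depletion */
}

static void my_replenished(struct budget_notifier *bn)
{
	/* react to budget replenishment */
}

static struct budget_notifier my_bn = {
	.budget_exhausted   = my_exhausted,
	.budget_replenished = my_replenished,
};

static void register_demo_notifier(struct budget_notifier_list *bnl)
{
	/* takes bnl->lock internally */
	budget_notifier_add(bnl, &my_bn);
}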
diff --git a/include/litmus/reservations/polling.h b/include/litmus/reservations/polling.h
new file mode 100644
index 000000000000..230e12b1088a
--- /dev/null
+++ b/include/litmus/reservations/polling.h
@@ -0,0 +1,19 @@
1#ifndef LITMUS_POLLING_RESERVATIONS_H
2#define LITMUS_POLLING_RESERVATIONS_H
3
4#include <litmus/reservations/reservation.h>
5
6struct polling_reservation {
7 /* extend basic reservation */
8 struct reservation res;
9
10 lt_t max_budget;
11 lt_t period;
12 lt_t deadline;
13 lt_t offset;
14};
15
16void polling_reservation_init(struct polling_reservation *pres, int use_edf_prio,
17 int use_periodic_polling, lt_t budget, lt_t period, lt_t deadline, lt_t offset);
18
19#endif
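
A sketch of initializing a periodic polling reservation with EDF priority; the 10 ms budget / 100 ms period values are made up and assume the usual interpretation of lt_t as nanoseconds:

static struct polling_reservation demo_pres;

static void demo_setup_polling(void)
{
	polling_reservation_init(&demo_pres,
		1,                   /* use_edf_prio         */
		1,                   /* use_periodic_polling */
		10 * 1000000ULL,     /* budget: 10 ms        */
		100 * 1000000ULL,    /* period: 100 ms       */
		100 * 1000000ULL,    /* deadline == period   */
		0);                  /* offset               */
}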
diff --git a/include/litmus/reservations/reservation.h b/include/litmus/reservations/reservation.h
new file mode 100644
index 000000000000..1752dac4e698
--- /dev/null
+++ b/include/litmus/reservations/reservation.h
@@ -0,0 +1,224 @@
1#ifndef LITMUS_RESERVATION_H
2#define LITMUS_RESERVATION_H
3
4#include <linux/list.h>
5#include <linux/hrtimer.h>
6
7#include <litmus/debug_trace.h>
8#include <litmus/reservations/budget-notifier.h>
9
10struct reservation_client;
11struct reservation_environment;
12struct reservation;
13
14typedef enum {
15 /* reservation has no clients, is not consuming budget */
16 RESERVATION_INACTIVE = 0,
17
18 /* reservation has clients, consumes budget when scheduled */
19 RESERVATION_ACTIVE,
20
21 /* reservation has no clients, but may be consuming budget */
22 RESERVATION_ACTIVE_IDLE,
23
24 /* Reservation has no budget and waits for
25 * replenishment. May or may not have clients. */
26 RESERVATION_DEPLETED,
27} reservation_state_t;
28
29
30/* ************************************************************************** */
31
32/* Select which task to dispatch. If NULL is returned, it means there is nothing
33 * to schedule right now and background work can be scheduled. */
34typedef struct task_struct * (*dispatch_t) (
35 struct reservation_client *client
36);
37
38/* Something that can be managed in a reservation and that can yield
39 * a process for dispatching. Contains a pointer to the reservation
40 * to which it "belongs". */
41struct reservation_client {
42 struct list_head list;
43 struct reservation* reservation;
44 dispatch_t dispatch;
45};
46
47
48/* ************************************************************************** */
49
50/* Called by reservations to request state change. */
51typedef void (*reservation_change_state_t) (
52 struct reservation_environment* env,
53 struct reservation *res,
54 reservation_state_t new_state
55);
56
57/* Called by reservations to request replenishment while not DEPLETED.
58 * Useful for soft reservations that remain ACTIVE with lower priority. */
59typedef void (*request_replenishment_t)(
60 struct reservation_environment* env,
61 struct reservation *res
62);
63
 64/* The framework within which reservations operate. */
65struct reservation_environment {
66 lt_t time_zero;
67 lt_t current_time;
68
69 /* services invoked by reservations */
70 reservation_change_state_t change_state;
71 request_replenishment_t request_replenishment;
72};
73
74/* ************************************************************************** */
75
76/* A new client is added or an existing client resumes. */
77typedef void (*client_arrives_t) (
78 struct reservation *reservation,
79 struct reservation_client *client
80);
81
82/* A client suspends or terminates. */
83typedef void (*client_departs_t) (
84 struct reservation *reservation,
85 struct reservation_client *client,
86 int did_signal_job_completion
87);
88
89/* A previously requested replenishment has occurred. */
90typedef void (*on_replenishment_timer_t) (
91 struct reservation *reservation
92);
93
94/* Update the reservation's budget to reflect execution or idling. */
95typedef void (*drain_budget_t) (
96 struct reservation *reservation,
97 lt_t how_much
98);
99
100/* Select a ready task from one of the clients for scheduling. */
101typedef struct task_struct* (*dispatch_client_t) (
102 struct reservation *reservation,
103 lt_t *time_slice /* May be used to force rescheduling after
104 some amount of time. 0 => no limit */
105);
106
107/* Destructor: called before scheduler is deactivated. */
108typedef void (*shutdown_t)(struct reservation *reservation);
109
110struct reservation_ops {
111 dispatch_client_t dispatch_client;
112
113 client_arrives_t client_arrives;
114 client_departs_t client_departs;
115
116 on_replenishment_timer_t replenish;
117 drain_budget_t drain_budget;
118
119 shutdown_t shutdown;
120};
121
122#define RESERVATION_BACKGROUND_PRIORITY ULLONG_MAX
123
124struct reservation {
125 /* used to queue in environment */
126 struct list_head list;
127 struct list_head replenish_list;
128
129 reservation_state_t state;
130 unsigned int id;
131 unsigned int kind;
132
133 /* exact meaning defined by impl. */
134 lt_t priority;
135 lt_t cur_budget;
136 lt_t next_replenishment;
137
138 /* budget stats */
139 lt_t budget_consumed; /* how much budget consumed in this allocation cycle? */
140 lt_t budget_consumed_total;
141
142 /* list of registered budget callbacks */
143 struct budget_notifier_list budget_notifiers;
144
145 /* for memory reclamation purposes */
146 struct list_head all_list;
147
148 /* interaction with framework */
149 struct reservation_environment *env;
150 struct reservation_ops *ops;
151
152 struct list_head clients;
153};
154
155void reservation_init(struct reservation *res);
156
157/* Default implementations */
158
159/* simply select the first client in the list, set *for_at_most to zero */
160struct task_struct* default_dispatch_client(
161 struct reservation *res,
162 lt_t *for_at_most
163);
164
165/* drain budget at linear rate, enter DEPLETED state when budget used up */
166void common_drain_budget(struct reservation *res, lt_t how_much);
167
168/* "connector" reservation client to hook up tasks with reservations */
169struct task_client {
170 struct reservation_client client;
171 struct task_struct *task;
172};
173
174void task_client_init(struct task_client *tc, struct task_struct *task,
175 struct reservation *reservation);
176
177#define SUP_RESCHEDULE_NOW (0)
178#define SUP_NO_SCHEDULER_UPDATE (ULLONG_MAX)
179
180/* A simple uniprocessor (SUP) flat (i.e., non-hierarchical) reservation
181 * environment.
182 */
183struct sup_reservation_environment {
184 struct reservation_environment env;
185
186 /* ordered by priority */
187 struct list_head active_reservations;
188
189 /* ordered by next_replenishment */
190 struct list_head depleted_reservations;
191
192 /* unordered */
193 struct list_head inactive_reservations;
194
195 /* list of all reservations */
196 struct list_head all_reservations;
197
198 /* - SUP_RESCHEDULE_NOW means call sup_dispatch() now
199 * - SUP_NO_SCHEDULER_UPDATE means nothing to do
200 * any other value means program a timer for the given time
201 */
202 lt_t next_scheduler_update;
203 /* set to true if a call to sup_dispatch() is imminent */
204 bool will_schedule;
205};
206
207/* Contract:
208 * - before calling into sup_ code, or any reservation methods,
209 * update the time with sup_update_time(); and
210 * - after calling into sup_ code, or any reservation methods,
211 * check next_scheduler_update and program timer or trigger
212 * scheduler invocation accordingly.
213 */
214
215void sup_init(struct sup_reservation_environment* sup_env);
216void sup_add_new_reservation(struct sup_reservation_environment* sup_env,
217 struct reservation* new_res);
218void sup_update_time(struct sup_reservation_environment* sup_env, lt_t now);
219struct task_struct* sup_dispatch(struct sup_reservation_environment* sup_env);
220
221struct reservation* sup_find_by_id(struct sup_reservation_environment* sup_env,
222 unsigned int id);
223
224#endif
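
To illustrate the contract above, a plugin's dispatch path might look roughly like the following sketch; demo_program_timer() is a hypothetical helper that arms an hrtimer for the given time, and litmus_reschedule_local() is assumed to be visible via <litmus/preempt.h>:

static void demo_program_timer(lt_t when);   /* hypothetical */

static struct task_struct* demo_sup_dispatch(
	struct sup_reservation_environment *sup_env, lt_t now)
{
	struct task_struct *next;

	sup_update_time(sup_env, now);   /* 1) advance time first        */
	next = sup_dispatch(sup_env);    /* 2) may be NULL => background */

	/* 3) honor next_scheduler_update */
	if (sup_env->next_scheduler_update == SUP_RESCHEDULE_NOW)
		litmus_reschedule_local();
	else if (sup_env->next_scheduler_update != SUP_NO_SCHEDULER_UPDATE)
		demo_program_timer(sup_env->next_scheduler_update);

	return next;
}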
diff --git a/include/litmus/reservations/table-driven.h b/include/litmus/reservations/table-driven.h
new file mode 100644
index 000000000000..b6302a2f200d
--- /dev/null
+++ b/include/litmus/reservations/table-driven.h
@@ -0,0 +1,23 @@
1#ifndef LITMUS_RESERVATIONS_TABLE_DRIVEN_H
2#define LITMUS_RESERVATIONS_TABLE_DRIVEN_H
3
4#include <litmus/reservations/reservation.h>
5
6struct table_driven_reservation {
7 /* extend basic reservation */
8 struct reservation res;
9
10 lt_t major_cycle;
11 unsigned int next_interval;
12 unsigned int num_intervals;
13 struct lt_interval *intervals;
14
15 /* info about current scheduling slot */
16 struct lt_interval cur_interval;
17 lt_t major_cycle_start;
18};
19
20void table_driven_reservation_init(struct table_driven_reservation *tdres,
21 lt_t major_cycle, struct lt_interval *intervals, unsigned int num_intervals);
22
23#endif
diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
new file mode 100644
index 000000000000..ac249292e866
--- /dev/null
+++ b/include/litmus/rt_domain.h
@@ -0,0 +1,182 @@
1/* CLEANUP: Add comments and make it less messy.
2 *
3 */
4
5#ifndef __UNC_RT_DOMAIN_H__
6#define __UNC_RT_DOMAIN_H__
7
8#include <litmus/bheap.h>
9
10#define RELEASE_QUEUE_SLOTS 127 /* prime */
11
12struct _rt_domain;
13
14typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
15typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks);
16
17struct release_queue {
18 /* each slot maintains a list of release heaps sorted
19 * by release time */
20 struct list_head slot[RELEASE_QUEUE_SLOTS];
21};
22
23typedef struct _rt_domain {
24 /* runnable rt tasks are in here */
25 raw_spinlock_t ready_lock;
26 struct bheap ready_queue;
27
28 /* real-time tasks waiting for release are in here */
29 raw_spinlock_t release_lock;
30 struct release_queue release_queue;
31
32#ifdef CONFIG_RELEASE_MASTER
33 int release_master;
34#endif
35
36 /* for moving tasks to the release queue */
37 raw_spinlock_t tobe_lock;
38 struct list_head tobe_released;
39
40 /* how do we check if we need to kick another CPU? */
41 check_resched_needed_t check_resched;
42
43 /* how do we release jobs? */
44 release_jobs_t release_jobs;
45
46 /* how are tasks ordered in the ready queue? */
47 bheap_prio_t order;
48} rt_domain_t;
49
50struct release_heap {
51 /* list_head for per-time-slot list */
52 struct list_head list;
53 lt_t release_time;
54 /* all tasks to be released at release_time */
55 struct bheap heap;
56 /* used to trigger the release */
57 struct hrtimer timer;
58
59#ifdef CONFIG_RELEASE_MASTER
60 /* used to delegate releases */
61 struct hrtimer_start_on_info info;
62#endif
63 /* required for the timer callback */
64 rt_domain_t* dom;
65};
66
67
68static inline struct task_struct* __next_ready(rt_domain_t* rt)
69{
70 struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue);
71 if (hn)
72 return bheap2task(hn);
73 else
74 return NULL;
75}
76
77void rt_domain_init(rt_domain_t *rt, bheap_prio_t order,
78 check_resched_needed_t check,
 79			release_jobs_t release);
80
81void __add_ready(rt_domain_t* rt, struct task_struct *new);
82void __merge_ready(rt_domain_t* rt, struct bheap *tasks);
83void __add_release(rt_domain_t* rt, struct task_struct *task);
84
85static inline struct task_struct* __take_ready(rt_domain_t* rt)
86{
87 struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue);
88 if (hn)
89 return bheap2task(hn);
90 else
91 return NULL;
92}
93
94static inline struct task_struct* __peek_ready(rt_domain_t* rt)
95{
96 struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue);
97 if (hn)
98 return bheap2task(hn);
99 else
100 return NULL;
101}
102
103static inline int is_queued(struct task_struct *t)
104{
105 BUG_ON(!tsk_rt(t)->heap_node);
106 return bheap_node_in_heap(tsk_rt(t)->heap_node);
107}
108
109static inline void remove(rt_domain_t* rt, struct task_struct *t)
110{
111 bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node);
112}
113
114static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
115{
116 unsigned long flags;
117 /* first we need the write lock for rt_ready_queue */
118 raw_spin_lock_irqsave(&rt->ready_lock, flags);
119 __add_ready(rt, new);
120 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
121}
122
123static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks)
124{
125 unsigned long flags;
126 raw_spin_lock_irqsave(&rt->ready_lock, flags);
127 __merge_ready(rt, tasks);
128 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
129}
130
131static inline struct task_struct* take_ready(rt_domain_t* rt)
132{
133 unsigned long flags;
134 struct task_struct* ret;
135 /* first we need the write lock for rt_ready_queue */
136 raw_spin_lock_irqsave(&rt->ready_lock, flags);
137 ret = __take_ready(rt);
138 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
139 return ret;
140}
141
142
143static inline void add_release(rt_domain_t* rt, struct task_struct *task)
144{
145 unsigned long flags;
146 raw_spin_lock_irqsave(&rt->tobe_lock, flags);
147 __add_release(rt, task);
148 raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
149}
150
151#ifdef CONFIG_RELEASE_MASTER
152void __add_release_on(rt_domain_t* rt, struct task_struct *task,
153 int target_cpu);
154
155static inline void add_release_on(rt_domain_t* rt,
156 struct task_struct *task,
157 int target_cpu)
158{
159 unsigned long flags;
160 raw_spin_lock_irqsave(&rt->tobe_lock, flags);
161 __add_release_on(rt, task, target_cpu);
162 raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
163}
164#endif
165
166static inline int __jobs_pending(rt_domain_t* rt)
167{
168 return !bheap_empty(&rt->ready_queue);
169}
170
171static inline int jobs_pending(rt_domain_t* rt)
172{
173 unsigned long flags;
174 int ret;
175 /* first we need the write lock for rt_ready_queue */
176 raw_spin_lock_irqsave(&rt->ready_lock, flags);
177 ret = !bheap_empty(&rt->ready_queue);
178 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
179 return ret;
180}
181
182#endif
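
A minimal usage sketch, assuming bheap_prio_t's usual two-node comparison signature; the demo_* callbacks are placeholders (a real EDF plugin would pass its own ready-queue order, resched check, and release handler):

static int demo_order(struct bheap_node *a, struct bheap_node *b);
static int demo_check_resched(rt_domain_t *rt);
static void demo_release_jobs(rt_domain_t *rt, struct bheap *tasks);

static rt_domain_t demo_domain;

static void demo_domain_setup(void)
{
	rt_domain_init(&demo_domain, demo_order,
		       demo_check_resched, demo_release_jobs);
}

static struct task_struct* demo_next_job(void)
{
	/* takes and releases the ready lock internally */
	return take_ready(&demo_domain);
}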
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
new file mode 100644
index 000000000000..9b291343714f
--- /dev/null
+++ b/include/litmus/rt_param.h
@@ -0,0 +1,290 @@
1/*
2 * Definition of the scheduler plugin interface.
3 *
4 */
5#ifndef _LINUX_RT_PARAM_H_
6#define _LINUX_RT_PARAM_H_
7
8/* Litmus time type. */
9typedef unsigned long long lt_t;
10
11static inline int lt_after(lt_t a, lt_t b)
12{
13 return ((long long) b) - ((long long) a) < 0;
14}
15#define lt_before(a, b) lt_after(b, a)
16
17static inline int lt_after_eq(lt_t a, lt_t b)
18{
19 return ((long long) a) - ((long long) b) >= 0;
20}
21#define lt_before_eq(a, b) lt_after_eq(b, a)
22
23/* different types of clients */
24typedef enum {
25 RT_CLASS_HARD,
26 RT_CLASS_SOFT,
27 RT_CLASS_BEST_EFFORT
28} task_class_t;
29
30typedef enum {
31 NO_ENFORCEMENT, /* job may overrun unhindered */
32 QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */
33 PRECISE_ENFORCEMENT /* budgets are enforced with hrtimers */
34} budget_policy_t;
35
36/* Release behaviors for jobs. PERIODIC and EARLY jobs
37 must end by calling sys_complete_job() (or equivalent)
38 to set up their next release and deadline. */
39typedef enum {
40 /* Jobs are released sporadically (provided job precedence
41 constraints are met). */
42 TASK_SPORADIC,
43
44 /* Jobs are released periodically (provided job precedence
45 constraints are met). */
46 TASK_PERIODIC,
47
48 /* Jobs are released immediately after meeting precedence
49 constraints. Beware this can peg your CPUs if used in
50 the wrong applications. Only supported by EDF schedulers. */
51 TASK_EARLY
52} release_policy_t;
53
 54/* We use the priority interpretation "lower index == higher priority",
 55 * which is common in fixed-priority schedulability analysis papers.
 56 * So, a numerically lower priority value implies higher scheduling priority,
 57 * with priority 1 being the highest priority. Priority 0 is reserved for
 58 * priority boosting. LITMUS_MAX_PRIORITY denotes the size of the priority
 59 * value range.
60 */
61
62#define LITMUS_MAX_PRIORITY 512
63#define LITMUS_HIGHEST_PRIORITY 1
64#define LITMUS_LOWEST_PRIORITY (LITMUS_MAX_PRIORITY - 1)
65#define LITMUS_NO_PRIORITY UINT_MAX
66
67/* Provide generic comparison macros for userspace,
68 * in case that we change this later. */
69#define litmus_higher_fixed_prio(a, b) (a < b)
70#define litmus_lower_fixed_prio(a, b) (a > b)
71#define litmus_is_valid_fixed_prio(p) \
72 ((p) >= LITMUS_HIGHEST_PRIORITY && \
73 (p) <= LITMUS_LOWEST_PRIORITY)
74
75/* reservation support */
76
77typedef enum {
78 PERIODIC_POLLING = 10,
79 SPORADIC_POLLING,
80 TABLE_DRIVEN,
81} reservation_type_t;
82
83struct lt_interval {
84 lt_t start;
85 lt_t end;
86};
87
88#ifndef __KERNEL__
89#define __user
90#endif
91
92struct reservation_config {
93 unsigned int id;
94 lt_t priority;
95 int cpu;
96
97 union {
98 struct {
99 lt_t period;
100 lt_t budget;
101 lt_t relative_deadline;
102 lt_t offset;
103 } polling_params;
104
105 struct {
106 lt_t major_cycle_length;
107 unsigned int num_intervals;
108 struct lt_interval __user *intervals;
109 } table_driven_params;
110 };
111};
112
113/* regular sporadic task support */
114
115struct rt_task {
116 lt_t exec_cost;
117 lt_t period;
118 lt_t relative_deadline;
119 lt_t phase;
120 unsigned int cpu;
121 unsigned int priority;
122 task_class_t cls;
123 budget_policy_t budget_policy; /* ignored by pfair */
124 release_policy_t release_policy;
125};
126
127/* don't export internal data structures to user space (liblitmus) */
128#ifdef __KERNEL__
129
130struct _rt_domain;
131struct bheap_node;
132struct release_heap;
133
134struct rt_job {
 135	/* Time instant at which the job was or will be released. */
136 lt_t release;
137
138 /* What is the current deadline? */
139 lt_t deadline;
140
141 /* How much service has this job received so far? */
142 lt_t exec_time;
143
 144	/* By how much did the prior job miss its deadline?
145 * Value differs from tardiness in that lateness may
146 * be negative (when job finishes before its deadline).
147 */
148 long long lateness;
149
150 /* Which job is this. This is used to let user space
151 * specify which job to wait for, which is important if jobs
152 * overrun. If we just call sys_sleep_next_period() then we
153 * will unintentionally miss jobs after an overrun.
154 *
155 * Increase this sequence number when a job is released.
156 */
157 unsigned int job_no;
158
159#ifdef CONFIG_SCHED_TASK_TRACE
160 /* Keep track of the last time the job suspended.
161 * -> used for tracing sporadic tasks. */
162 lt_t last_suspension;
163#endif
164};
165
166struct pfair_param;
167
168/* RT task parameters for scheduling extensions
169 * These parameters are inherited during clone and therefore must
170 * be explicitly set up before the task set is launched.
171 */
172struct rt_param {
173 /* do we need to check for srp blocking? */
174 unsigned int srp_non_recurse:1;
175
176 /* is the task present? (true if it can be scheduled) */
177 unsigned int present:1;
178
179 /* has the task completed? */
180 unsigned int completed:1;
181
182#ifdef CONFIG_LITMUS_LOCKING
183 /* Is the task being priority-boosted by a locking protocol? */
184 unsigned int priority_boosted:1;
185 /* If so, when did this start? */
186 lt_t boost_start_time;
187
188 /* How many LITMUS^RT locks does the task currently hold/wait for? */
189 unsigned int num_locks_held;
190 /* How many PCP/SRP locks does the task currently hold/wait for? */
191 unsigned int num_local_locks_held;
192#endif
193
194 /* user controlled parameters */
195 struct rt_task task_params;
196
197 /* timing parameters */
198 struct rt_job job_params;
199
200
201 /* Special handling for periodic tasks executing
202 * clock_nanosleep(CLOCK_MONOTONIC, ...).
203 */
204 lt_t nanosleep_wakeup;
205 unsigned int doing_abs_nanosleep:1;
206
207 /* Should the next job be released at some time other than
208 * just period time units after the last release?
209 */
210 unsigned int sporadic_release:1;
211 lt_t sporadic_release_time;
212
 213	/* Task representing the current "inherited" task
 214	 * priority, as assigned by inherit_priority and
 215	 * return_priority in the scheduler plugins.
 216	 * May point to self if PI does not result in
 217	 * an increased task priority.
218 */
219 struct task_struct* inh_task;
220
221#ifdef CONFIG_NP_SECTION
222 /* For the FMLP under PSN-EDF, it is required to make the task
223 * non-preemptive from kernel space. In order not to interfere with
224 * user space, this counter indicates the kernel space np setting.
225 * kernel_np > 0 => task is non-preemptive
226 */
227 unsigned int kernel_np;
228#endif
229
230 /* This field can be used by plugins to store where the task
231 * is currently scheduled. It is the responsibility of the
232 * plugin to avoid race conditions.
233 *
 234	 * This is used by GSN-EDF and PFAIR.
235 */
236 volatile int scheduled_on;
237
238 /* Is the stack of the task currently in use? This is updated by
239 * the LITMUS core.
240 *
241 * Be careful to avoid deadlocks!
242 */
243 volatile int stack_in_use;
244
245 /* This field can be used by plugins to store where the task
246 * is currently linked. It is the responsibility of the plugin
247 * to avoid race conditions.
248 *
249 * Used by GSN-EDF.
250 */
251 volatile int linked_on;
252
253 /* PFAIR/PD^2 state. Allocated on demand. */
254 union {
255 void *plugin_state;
256 struct pfair_param *pfair;
257 };
258
259 /* Fields saved before BE->RT transition.
260 */
261 int old_policy;
262 int old_prio;
263
264 /* ready queue for this task */
265 struct _rt_domain* domain;
266
267 /* heap element for this task
268 *
269 * Warning: Don't statically allocate this node. The heap
270 * implementation swaps these between tasks, thus after
271 * dequeuing from a heap you may end up with a different node
 272	 * than the one you had when enqueuing the task. For the same
273 * reason, don't obtain and store references to this node
274 * other than this pointer (which is updated by the heap
275 * implementation).
276 */
277 struct bheap_node* heap_node;
278 struct release_heap* rel_heap;
279
280 /* Used by rt_domain to queue task in release list.
281 */
282 struct list_head list;
283
284 /* Pointer to the page shared between userspace and kernel. */
285 struct control_page * ctrl_page;
286};
287
288#endif
289
290#endif
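
For reference, a sketch of how struct rt_task might be filled in for a 10 ms / 100 ms implicit-deadline soft real-time task; the values are made up and assume the usual interpretation of lt_t as nanoseconds:

static struct rt_task demo_params = {
	.exec_cost         = 10 * 1000000ULL,   /* 10 ms WCET        */
	.period            = 100 * 1000000ULL,  /* 100 ms period     */
	.relative_deadline = 100 * 1000000ULL,  /* implicit deadline */
	.phase             = 0,
	.cpu               = 0,
	.priority          = LITMUS_LOWEST_PRIORITY,
	.cls               = RT_CLASS_SOFT,
	.budget_policy     = NO_ENFORCEMENT,
	.release_policy    = TASK_SPORADIC,
};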
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
new file mode 100644
index 000000000000..0923f26b745a
--- /dev/null
+++ b/include/litmus/sched_plugin.h
@@ -0,0 +1,180 @@
1/*
2 * Definition of the scheduler plugin interface.
3 *
4 */
5#ifndef _LINUX_SCHED_PLUGIN_H_
6#define _LINUX_SCHED_PLUGIN_H_
7
8#include <linux/sched.h>
9
10#ifdef CONFIG_LITMUS_LOCKING
11#include <litmus/locking.h>
12#endif
13
14/************************ setup/tear down ********************/
15
16typedef long (*activate_plugin_t) (void);
17typedef long (*deactivate_plugin_t) (void);
18
19struct domain_proc_info;
20typedef long (*get_domain_proc_info_t) (struct domain_proc_info **info);
21
22
23/********************* scheduler invocation ******************/
24/* The main scheduling function, called to select the next task to dispatch. */
25typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
26/* Clean up after the task switch has occured.
27 * This function is called after every (even non-rt) task switch.
28 */
29typedef void (*finish_switch_t)(struct task_struct *prev);
30
31
32/* When waiting for the stack of the task selected by the plugin
33 * to become available, this callback is invoked to give the
34 * plugin a chance to cancel the wait. If the plugin returns false,
35 * the scheduler is invoked again. */
36typedef bool (*should_wait_for_stack_t)(struct task_struct *next);
37
38/* After a pull migration (which involves dropping scheduler locks),
39 * the plugin is given the chance to validate that the task is still
40 * the right one. If the plugin returns false, the scheduler
41 * will be invoked again. */
42typedef bool (*post_migration_validate_t)(struct task_struct *next);
43
44/* After dropping the lock to facilitate a pull migration, the task
45 * state may have changed. In this case, the core notifies the plugin
46 * with this callback and then invokes the scheduler again. */
47typedef void (*next_became_invalid_t)(struct task_struct *next);
48
49/********************* task state changes ********************/
50
51/* Called to setup a new real-time task.
52 * Release the first job, enqueue, etc.
53 * Task may already be running.
54 */
55typedef void (*task_new_t) (struct task_struct *task,
56 int on_rq,
57 int running);
58
59/* Called when userspace seeks to set new task parameters for a task
60 * that is already in real-time mode (i.e., is_realtime(task)).
61 */
62typedef long (*task_change_params_t) (struct task_struct *task,
63 struct rt_task *new_params);
64
65/* Called to re-introduce a task after blocking.
66 * Can potentially be called multiple times.
67 */
68typedef void (*task_wake_up_t) (struct task_struct *task);
 69/* Called to notify the plugin of a blocking real-time task;
 70 * it will only be called for real-time tasks and before schedule() is called. */
71typedef void (*task_block_t) (struct task_struct *task);
72/* Called when a real-time task exits or changes to a different scheduling
73 * class.
74 * Free any allocated resources
75 */
76typedef void (*task_exit_t) (struct task_struct *);
77
78/* task_exit() is called with interrupts disabled and runqueue locks held, and
 79 * thus cannot block or spin. task_cleanup() is called sometime later
80 * without any locks being held.
81 */
82typedef void (*task_cleanup_t) (struct task_struct *);
83
84#ifdef CONFIG_LITMUS_LOCKING
85/* Called when the current task attempts to create a new lock of a given
86 * protocol type. */
87typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type,
88 void* __user config);
89#endif
90
91
92/********************* sys call backends ********************/
93/* This function causes the caller to sleep until the next release */
94typedef long (*complete_job_t) (void);
95
96typedef long (*admit_task_t)(struct task_struct* tsk);
97
98/* return false to indicate that the plugin does not support forking */
99typedef bool (*fork_task_t)(struct task_struct* tsk);
100
101typedef long (*wait_for_release_at_t)(lt_t release_time);
102
103/* Informs the plugin when a synchronous release takes place. */
104typedef void (*synchronous_release_at_t)(lt_t time_zero);
105
106/* How much budget has the current task consumed so far, and how much
107 * has it left? The default implementation ties into the per-task
108 * budget enforcement code. Plugins can override this to report
109 * reservation-specific values. */
110typedef void (*current_budget_t)(lt_t *used_so_far, lt_t *remaining);
111
112/* Reservation creation/removal backends. Meaning of reservation_type and
113 * reservation_id are entirely plugin-specific. */
114typedef long (*reservation_create_t)(int reservation_type, void* __user config);
115typedef long (*reservation_destroy_t)(unsigned int reservation_id, int cpu);
116
117/************************ misc routines ***********************/
118
119
120struct sched_plugin {
121 struct list_head list;
122 /* basic info */
123 char *plugin_name;
124
125 /* setup */
126 activate_plugin_t activate_plugin;
127 deactivate_plugin_t deactivate_plugin;
128 get_domain_proc_info_t get_domain_proc_info;
129
130 /* scheduler invocation */
131 schedule_t schedule;
132 finish_switch_t finish_switch;
133
134 /* control over pull migrations */
135 should_wait_for_stack_t should_wait_for_stack;
136 next_became_invalid_t next_became_invalid;
137 post_migration_validate_t post_migration_validate;
138
139 /* syscall backend */
140 complete_job_t complete_job;
141 wait_for_release_at_t wait_for_release_at;
142 synchronous_release_at_t synchronous_release_at;
143
144 /* task state changes */
145 admit_task_t admit_task;
146 fork_task_t fork_task;
147
148 task_new_t task_new;
149 task_wake_up_t task_wake_up;
150 task_block_t task_block;
151
152 /* optional: support task parameter changes at runtime */
153 task_change_params_t task_change_params;
154
155 task_exit_t task_exit;
156 task_cleanup_t task_cleanup;
157
158 current_budget_t current_budget;
159
160 /* Reservation support */
161 reservation_create_t reservation_create;
162 reservation_destroy_t reservation_destroy;
163
164#ifdef CONFIG_LITMUS_LOCKING
165 /* locking protocols */
166 allocate_lock_t allocate_lock;
167#endif
168} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
169
170
171extern struct sched_plugin *litmus;
172
173int register_sched_plugin(struct sched_plugin* plugin);
174struct sched_plugin* find_sched_plugin(const char* name);
175void print_sched_plugins(struct seq_file *m);
176
177
178extern struct sched_plugin linux_sched_plugin;
179
180#endif
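
A skeleton sketch of plugin registration; the skel_* callbacks are placeholders whose prototypes follow the typedefs above, the usual <linux/init.h>/<linux/module.h> includes are assumed, and a real plugin would implement substantially more of the interface:

static struct task_struct* skel_schedule(struct task_struct *prev);
static long skel_admit_task(struct task_struct *tsk);
static void skel_task_new(struct task_struct *t, int on_rq, int running);
static void skel_task_wake_up(struct task_struct *t);
static void skel_task_block(struct task_struct *t);
static void skel_task_exit(struct task_struct *t);
static long skel_complete_job(void);

static struct sched_plugin skel_plugin = {
	.plugin_name  = "SKEL",
	.schedule     = skel_schedule,
	.admit_task   = skel_admit_task,
	.task_new     = skel_task_new,
	.task_wake_up = skel_task_wake_up,
	.task_block   = skel_task_block,
	.task_exit    = skel_task_exit,
	.complete_job = skel_complete_job,
};

static int __init init_skel_plugin(void)
{
	return register_sched_plugin(&skel_plugin);
}
module_init(init_skel_plugin);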
diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
new file mode 100644
index 000000000000..0532424fbee6
--- /dev/null
+++ b/include/litmus/sched_trace.h
@@ -0,0 +1,267 @@
1/*
2 * sched_trace.h -- record scheduler events to a byte stream for offline analysis.
3 */
4#ifndef _LINUX_SCHED_TRACE_H_
5#define _LINUX_SCHED_TRACE_H_
6
7/* all times in nanoseconds */
8
9struct st_trace_header {
10 u8 type; /* Of what type is this record? */
11 u8 cpu; /* On which CPU was it recorded? */
12 u16 pid; /* PID of the task. */
13 u32 job; /* The job sequence number. */
14};
15
16#define ST_NAME_LEN 16
17struct st_name_data {
18 char cmd[ST_NAME_LEN];/* The name of the executable of this process. */
19};
20
21struct st_param_data { /* regular params */
22 u32 wcet;
23 u32 period;
24 u32 phase;
25 u8 partition;
26 u8 class;
27 u8 __unused[2];
28};
29
 30struct st_release_data { /* A job was/is going to be released. */
31 u64 release; /* What's the release time? */
32 u64 deadline; /* By when must it finish? */
33};
34
 35struct st_assigned_data { /* A job was assigned to a CPU. */
36 u64 when;
37 u8 target; /* Where should it execute? */
38 u8 __unused[7];
39};
40
41struct st_switch_to_data { /* A process was switched to on a given CPU. */
42 u64 when; /* When did this occur? */
43 u32 exec_time; /* Time the current job has executed. */
44 u8 __unused[4];
45
46};
47
48struct st_switch_away_data { /* A process was switched away from on a given CPU. */
49 u64 when;
50 u64 exec_time;
51};
52
53struct st_completion_data { /* A job completed. */
54 u64 when;
55 u64 forced:1; /* Set to 1 if job overran and kernel advanced to the
56 * next task automatically; set to 0 otherwise.
57 */
58 u64 exec_time:63; /* Actual execution time of job. */
59};
60
61struct st_block_data { /* A task blocks. */
62 u64 when;
63 u64 __unused;
64};
65
66struct st_resume_data { /* A task resumes. */
67 u64 when;
68 u64 __unused;
69};
70
71struct st_action_data {
72 u64 when;
73 u8 action;
74 u8 __unused[7];
75};
76
77struct st_sys_release_data {
78 u64 when;
79 u64 release;
80};
81
82#define DATA(x) struct st_ ## x ## _data x;
83
84typedef enum {
85 ST_NAME = 1, /* Start at one, so that we can spot
86 * uninitialized records. */
87 ST_PARAM,
88 ST_RELEASE,
89 ST_ASSIGNED,
90 ST_SWITCH_TO,
91 ST_SWITCH_AWAY,
92 ST_COMPLETION,
93 ST_BLOCK,
94 ST_RESUME,
95 ST_ACTION,
96 ST_SYS_RELEASE
97} st_event_record_type_t;
98
99struct st_event_record {
100 struct st_trace_header hdr;
101 union {
102 u64 raw[2];
103
104 DATA(name);
105 DATA(param);
106 DATA(release);
107 DATA(assigned);
108 DATA(switch_to);
109 DATA(switch_away);
110 DATA(completion);
111 DATA(block);
112 DATA(resume);
113 DATA(action);
114 DATA(sys_release);
115 } data;
116};
117
118#undef DATA
119
120#ifdef __KERNEL__
121
122#include <linux/sched.h>
123#include <litmus/feather_trace.h>
124
125#ifdef CONFIG_SCHED_TASK_TRACE
126
127#define SCHED_TRACE(id, callback, task) \
128 ft_event1(id, callback, task)
129#define SCHED_TRACE2(id, callback, task, xtra) \
130 ft_event2(id, callback, task, xtra)
131
132/* provide prototypes; needed on sparc64 */
133#ifndef NO_TASK_TRACE_DECLS
134feather_callback void do_sched_trace_task_name(unsigned long id,
135 struct task_struct* task);
136feather_callback void do_sched_trace_task_param(unsigned long id,
137 struct task_struct* task);
138feather_callback void do_sched_trace_task_release(unsigned long id,
139 struct task_struct* task);
140feather_callback void do_sched_trace_task_switch_to(unsigned long id,
141 struct task_struct* task);
142feather_callback void do_sched_trace_task_switch_away(unsigned long id,
143 struct task_struct* task);
144feather_callback void do_sched_trace_task_completion(unsigned long id,
145 struct task_struct* task,
146 unsigned long forced);
147feather_callback void do_sched_trace_last_suspension_as_completion(
148 unsigned long id,
149 struct task_struct* task);
150feather_callback void do_sched_trace_task_block(unsigned long id,
151 struct task_struct* task);
152feather_callback void do_sched_trace_task_resume(unsigned long id,
153 struct task_struct* task);
154feather_callback void do_sched_trace_action(unsigned long id,
155 struct task_struct* task,
156 unsigned long action);
157feather_callback void do_sched_trace_sys_release(unsigned long id,
158 lt_t* start);
159
160#endif
161
162#else
163
164#define SCHED_TRACE(id, callback, task) /* no tracing */
165#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */
166
167#endif
168
169#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
170
171#include <trace/events/litmus.h>
172
173#else
174
175/* Override trace macros to actually do nothing */
176#define trace_litmus_task_param(t)
177#define trace_litmus_task_release(t)
178#define trace_litmus_switch_to(t)
179#define trace_litmus_switch_away(prev)
180#define trace_litmus_task_completion(t, forced)
181#define trace_litmus_task_block(t)
182#define trace_litmus_task_resume(t)
183#define trace_litmus_sys_release(start)
184
185#endif
186
187
188#define SCHED_TRACE_BASE_ID 500
189
190
191#define sched_trace_task_name(t) \
192 SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, \
193 do_sched_trace_task_name, t)
194
195#define sched_trace_task_param(t) \
196 do { \
197 SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, \
198 do_sched_trace_task_param, t); \
199 trace_litmus_task_param(t); \
200 } while (0)
201
202#define sched_trace_task_release(t) \
203 do { \
204 SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, \
205 do_sched_trace_task_release, t); \
206 trace_litmus_task_release(t); \
207 } while (0)
208
209#define sched_trace_task_switch_to(t) \
210 do { \
211 SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, \
212 do_sched_trace_task_switch_to, t); \
213 trace_litmus_switch_to(t); \
214 } while (0)
215
216#define sched_trace_task_switch_away(t) \
217 do { \
218 SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, \
219 do_sched_trace_task_switch_away, t); \
220 trace_litmus_switch_away(t); \
221 } while (0)
222
223#define sched_trace_task_completion(t, forced) \
224 do { \
225 SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, \
226 do_sched_trace_task_completion, t, \
227 (unsigned long) forced); \
228 trace_litmus_task_completion(t, forced); \
229 } while (0)
230
231#define sched_trace_task_block(t) \
232 do { \
233 SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, \
234 do_sched_trace_task_block, t); \
235 trace_litmus_task_block(t); \
236 } while (0)
237
238#define sched_trace_task_resume(t) \
239 do { \
240 SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, \
241 do_sched_trace_task_resume, t); \
242 trace_litmus_task_resume(t); \
243 } while (0)
244
245#define sched_trace_action(t, action) \
246 SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, \
247 do_sched_trace_action, t, (unsigned long) action);
248
 249/* 'when' is a pointer, so it does not need an explicit cast to unsigned long */
250#define sched_trace_sys_release(when) \
251 do { \
252 SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, \
253 do_sched_trace_sys_release, when); \
254 trace_litmus_sys_release(when); \
255 } while (0)
256
257#define sched_trace_last_suspension_as_completion(t) \
258 do { \
259 SCHED_TRACE(SCHED_TRACE_BASE_ID + 11, \
260 do_sched_trace_last_suspension_as_completion, t); \
261 } while (0)
262
263#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
264
265#endif /* __KERNEL__ */
266
267#endif
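
As a small sketch, a plugin's release and completion paths would typically be instrumented as below; the macros compile to no-ops unless the corresponding tracing options are enabled, and the demo_* bodies are placeholders:

static void demo_on_release(struct task_struct *t)
{
	/* ... plugin-specific release handling ... */
	sched_trace_task_release(t);
}

static void demo_on_completion(struct task_struct *t, int forced)
{
	sched_trace_task_completion(t, forced);
	/* ... set up the next job ... */
}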
diff --git a/include/litmus/srp.h b/include/litmus/srp.h
new file mode 100644
index 000000000000..c9a4552b2bf3
--- /dev/null
+++ b/include/litmus/srp.h
@@ -0,0 +1,28 @@
1#ifndef LITMUS_SRP_H
2#define LITMUS_SRP_H
3
4struct srp_semaphore;
5
6struct srp_priority {
7 struct list_head list;
8 unsigned int priority;
9 pid_t pid;
10};
11#define list2prio(l) list_entry(l, struct srp_priority, list)
12
13/* struct for uniprocessor SRP "semaphore" */
14struct srp_semaphore {
15 struct litmus_lock litmus_lock;
16 struct srp_priority ceiling;
17 struct task_struct* owner;
18 int cpu; /* cpu associated with this "semaphore" and resource */
19};
20
21/* map a task to its SRP preemption level priority */
22typedef unsigned int (*srp_prioritization_t)(struct task_struct* t);
23/* Must be updated by each plugin that uses SRP.*/
24extern srp_prioritization_t get_srp_prio;
25
26struct srp_semaphore* allocate_srp_semaphore(void);
27
28#endif
diff --git a/include/litmus/trace.h b/include/litmus/trace.h
new file mode 100644
index 000000000000..2646136e3881
--- /dev/null
+++ b/include/litmus/trace.h
@@ -0,0 +1,161 @@
1#ifndef _SYS_TRACE_H_
2#define _SYS_TRACE_H_
3
4#ifdef CONFIG_SCHED_OVERHEAD_TRACE
5
6
7#include <litmus/feather_trace.h>
8#include <litmus/feather_buffer.h>
9
10
11/*********************** TIMESTAMPS ************************/
12
13enum task_type_marker {
14 TSK_BE,
15 TSK_RT,
16 TSK_UNKNOWN
17};
18
19struct timestamp {
20 uint64_t timestamp:48;
21 uint64_t pid:16;
22 uint32_t seq_no;
23 uint8_t cpu;
24 uint8_t event;
25 uint8_t task_type:2;
26 uint8_t irq_flag:1;
27 uint8_t irq_count:5;
28};
29
30/* tracing callbacks */
31feather_callback void msg_sent_to(unsigned long event, unsigned long to);
32feather_callback void msg_received_local(unsigned long event);
33
34feather_callback void msg_sent_local(unsigned long event);
35feather_callback void msg_received_from(unsigned long event, unsigned long from);
36
37#define MSG_TIMESTAMP_SENT(id, to) \
38 ft_event1(id, msg_sent_to, (unsigned long) (to));
39
40#define MSG_TIMESTAMP_RECEIVED(id) \
41 ft_event0(id, msg_received_local);
42
43#define MSG_TIMESTAMP_SENT_LOCAL(id) \
44 ft_event0(id, msg_sent_local);
45
46#define MSG_TIMESTAMP_RECEIVED_FROM(id, from) \
47 ft_event1(id, msg_received_from, (unsigned long) (from))
48
49feather_callback void save_cpu_timestamp(unsigned long event);
50feather_callback void save_cpu_timestamp_time(unsigned long event, unsigned long time_ptr);
51feather_callback void save_cpu_timestamp_irq(unsigned long event, unsigned long irq_count_ptr);
52feather_callback void save_cpu_timestamp_task(unsigned long event, unsigned long t_ptr);
53feather_callback void save_cpu_timestamp_def(unsigned long event, unsigned long type);
54feather_callback void save_cpu_task_latency(unsigned long event, unsigned long when_ptr);
55
56#define CPU_TIMESTAMP_TIME(id, time_ptr) \
57 ft_event1(id, save_cpu_timestamp_time, (unsigned long) time_ptr)
58
59#define CPU_TIMESTAMP_IRQ(id, irq_count_ptr) \
60 ft_event1(id, save_cpu_timestamp_irq, (unsigned long) irq_count_ptr)
61
62#define CPU_TIMESTAMP(id) ft_event0(id, save_cpu_timestamp)
63
64#define CPU_DTIMESTAMP(id, def) ft_event1(id, save_cpu_timestamp_def, (unsigned long) def)
65
66#define CPU_TIMESTAMP_CUR(id) CPU_DTIMESTAMP(id, is_realtime(current) ? TSK_RT : TSK_BE)
67
68#define CPU_TTIMESTAMP(id, task) \
69 ft_event1(id, save_cpu_timestamp_task, (unsigned long) task)
70
71#define CPU_LTIMESTAMP(id, task) \
72 ft_event1(id, save_cpu_task_latency, (unsigned long) task)
73
74#else /* !CONFIG_SCHED_OVERHEAD_TRACE */
75
76#define MSG_TIMESTAMP_SENT(id, to)
77#define MSG_TIMESTAMP_RECEIVED(id)
78
79#define CPU_TIMESTAMP_TIME(id, time_ptr)
80#define CPU_TIMESTAMP_IRQ(id, irq_count_ptr)
81#define CPU_TIMESTAMP(id)
82#define CPU_DTIMESTAMP(id, def)
83#define CPU_TIMESTAMP_CUR(id)
84#define CPU_TTIMESTAMP(id, task)
85#define CPU_LTIMESTAMP(id, task)
86
87#endif
88
89
90/* Convention for timestamps
91 * =========================
92 *
93 * In order to process the trace files with a common tool, we use the following
94 * convention to measure execution times: The end time id of a code segment is
95 * always the next number after the start time event id.
96 */
97
98#define __TS_SYSCALL_IN_START(p) CPU_TIMESTAMP_TIME(10, p)
99#define __TS_SYSCALL_IN_END(p) CPU_TIMESTAMP_IRQ(11, p)
100
101#define TS_SYSCALL_OUT_START CPU_TIMESTAMP_CUR(20)
102#define TS_SYSCALL_OUT_END CPU_TIMESTAMP_CUR(21)
103
104#define TS_LOCK_START CPU_TIMESTAMP_CUR(30)
105#define TS_LOCK_END CPU_TIMESTAMP_CUR(31)
106
107#define TS_LOCK_SUSPEND CPU_TIMESTAMP_CUR(38)
108#define TS_LOCK_RESUME CPU_TIMESTAMP_CUR(39)
109
110#define TS_UNLOCK_START CPU_TIMESTAMP_CUR(40)
111#define TS_UNLOCK_END CPU_TIMESTAMP_CUR(41)
112
113#define TS_SCHED_START CPU_DTIMESTAMP(100, TSK_UNKNOWN) /* we only
114 * care
115 * about
116 * next */
117#define TS_SCHED_END(t) CPU_TTIMESTAMP(101, t)
118#define TS_SCHED2_START(t) CPU_TTIMESTAMP(102, t)
119#define TS_SCHED2_END(t) CPU_TTIMESTAMP(103, t)
120
121#define TS_CXS_START(t) CPU_TTIMESTAMP(104, t)
122#define TS_CXS_END(t) CPU_TTIMESTAMP(105, t)
123
124#define TS_RELEASE_START CPU_DTIMESTAMP(106, TSK_RT)
125#define TS_RELEASE_END CPU_DTIMESTAMP(107, TSK_RT)
126
127#define TS_XCALL_START CPU_DTIMESTAMP(108, TSK_RT)
128#define TS_XCALL_END CPU_DTIMESTAMP(109, TSK_RT)
129
130#define TS_TICK_START(t) CPU_TTIMESTAMP(110, t)
131#define TS_TICK_END(t) CPU_TTIMESTAMP(111, t)
132
133#define TS_QUANTUM_BOUNDARY_START CPU_TIMESTAMP_CUR(112)
134#define TS_QUANTUM_BOUNDARY_END CPU_TIMESTAMP_CUR(113)
135
136#define TS_SCHED_TIMER_START CPU_TIMESTAMP_CUR(114)
137#define TS_SCHED_TIMER_END CPU_TIMESTAMP_CUR(115)
138
139
140#define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */
141#define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */
142
143#define TS_PLUGIN_TICK_START /* TIMESTAMP(130) */
144#define TS_PLUGIN_TICK_END /* TIMESTAMP(131) */
145
146#define TS_ENTER_NP_START CPU_TIMESTAMP(140)
147#define TS_ENTER_NP_END CPU_TIMESTAMP(141)
148
149#define TS_EXIT_NP_START CPU_TIMESTAMP(150)
150#define TS_EXIT_NP_END CPU_TIMESTAMP(151)
151
152#define TS_SEND_RESCHED_START(c) MSG_TIMESTAMP_SENT(190, c)
153#define TS_SEND_RESCHED_END MSG_TIMESTAMP_RECEIVED(191)
154
155#define TS_SEND_XCALL_START(c) MSG_TIMESTAMP_SENT(192, c)
156#define TS_SEND_XCALL_END MSG_TIMESTAMP_RECEIVED(193)
157
158#define TS_RELEASE_LATENCY(when) CPU_LTIMESTAMP(208, &(when))
159#define TS_TIMER_LATENCY(when) CPU_LTIMESTAMP(209, &(when))
160
161#endif /* !_SYS_TRACE_H_ */
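
To illustrate the start/end convention (end event id = start id + 1), a lock-acquisition path could be bracketed as in the following sketch; the body is a placeholder:

static int demo_lock_acquire(void)
{
	int err = 0;

	TS_LOCK_START;
	/* ... attempt to acquire the lock; if the task suspends, the
	 * protocol additionally brackets the suspension with
	 * TS_LOCK_SUSPEND / TS_LOCK_RESUME ... */
	TS_LOCK_END;

	return err;
}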
diff --git a/include/litmus/trace_irq.h b/include/litmus/trace_irq.h
new file mode 100644
index 000000000000..0d0c042ba9c3
--- /dev/null
+++ b/include/litmus/trace_irq.h
@@ -0,0 +1,14 @@
1#ifndef _LITMUS_TRACE_IRQ_H_
2#define _LITMUS_TRACE_IRQ_H_
3
4#ifdef CONFIG_SCHED_OVERHEAD_TRACE
5
6void ft_irq_fired(void);
7
8#else
9
10#define ft_irq_fired() /* nothing to do */
11
12#endif
13
14#endif
diff --git a/include/litmus/wait.h b/include/litmus/wait.h
new file mode 100644
index 000000000000..ce1347c355f8
--- /dev/null
+++ b/include/litmus/wait.h
@@ -0,0 +1,57 @@
1#ifndef _LITMUS_WAIT_H_
2#define _LITMUS_WAIT_H_
3
4struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
5
6/* wrap regular wait_queue_t head */
7struct __prio_wait_queue {
8 wait_queue_t wq;
9
10 /* some priority point */
11 lt_t priority;
12 /* break ties in priority by lower tie_breaker */
13 unsigned int tie_breaker;
14};
15
16typedef struct __prio_wait_queue prio_wait_queue_t;
17
18static inline void init_prio_waitqueue_entry(prio_wait_queue_t *pwq,
19 struct task_struct* t,
20 lt_t priority)
21{
22 init_waitqueue_entry(&pwq->wq, t);
23 pwq->priority = priority;
24 pwq->tie_breaker = 0;
25}
26
27static inline void init_prio_waitqueue_entry_tie(prio_wait_queue_t *pwq,
28 struct task_struct* t,
29 lt_t priority,
30 unsigned int tie_breaker)
31{
32 init_waitqueue_entry(&pwq->wq, t);
33 pwq->priority = priority;
34 pwq->tie_breaker = tie_breaker;
35}
36
37unsigned int __add_wait_queue_prio_exclusive(
38 wait_queue_head_t* head,
39 prio_wait_queue_t *new);
40
41static inline unsigned int add_wait_queue_prio_exclusive(
42 wait_queue_head_t* head,
43 prio_wait_queue_t *new)
44{
45 unsigned long flags;
46 unsigned int passed;
47
48 spin_lock_irqsave(&head->lock, flags);
49 passed = __add_wait_queue_prio_exclusive(head, new);
50
51 spin_unlock_irqrestore(&head->lock, flags);
52
53 return passed;
54}
55
56
57#endif
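
A hedged sketch of enqueuing the current task in priority order, as a locking protocol might do; wq is assumed to be an initialized wait_queue_head_t owned by the caller, and dropping/re-taking the protocol's own locks is elided:

static void demo_wait_in_prio_order(wait_queue_head_t *wq, lt_t prio)
{
	prio_wait_queue_t pwq;

	init_prio_waitqueue_entry(&pwq, current, prio);
	set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue_prio_exclusive(wq, &pwq);

	schedule();    /* suspend until the lock holder wakes us up */

	remove_wait_queue(wq, &pwq.wq);
}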
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 000000000000..be25d7c53971
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,384 @@
1menu "LITMUS^RT"
2
3menu "Scheduling"
4
5config PLUGIN_CEDF
6 bool "Clustered-EDF"
7 depends on X86 && SYSFS
8 default y
9 help
10 Include the Clustered EDF (C-EDF) plugin in the kernel.
11 This is appropriate for large platforms with shared caches.
12 On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
13 makes little sense since there aren't any shared caches.
14
15config PLUGIN_PFAIR
16 bool "PFAIR"
17 default y
18 help
19 Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
20 The PFAIR plugin requires high resolution timers (for staggered
21 quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
22 even if a processor is idle, as quanta could be missed otherwise).
23 Further, the PFAIR plugin uses the system tick and thus requires
 24	  HZ=1000 to achieve reasonable granularity.
25
26 If unsure, say Yes.
27
28config RELEASE_MASTER
29 bool "Release-master Support"
30 depends on SMP
31 default n
32 help
33 Allow one processor to act as a dedicated interrupt processor
34 that services all timer interrupts, but that does not schedule
35 real-time tasks. See RTSS'09 paper for details
36 (http://www.cs.unc.edu/~anderson/papers.html).
37
38config PREFER_LOCAL_LINKING
39 bool "Link newly arrived tasks locally if possible"
40 depends on SMP
41 default y
42 help
43 In linking-based schedulers such as GSN-EDF, if an idle CPU processes
44 a job arrival (i.e., when a job resumed or was released), it can
45 either link the task to itself and schedule it immediately (to avoid
46 unnecessary scheduling latency) or it can try to link it to the CPU
47 where it executed previously (to maximize cache affinity, at the
48 expense of increased latency due to the need to send an IPI).
49
50 In lightly loaded systems, this option can significantly reduce
51 scheduling latencies. In heavily loaded systems (where CPUs are
 52	  rarely idle), it is unlikely to make much of a difference.
53
54 If unsure, say yes.
55
56config LITMUS_QUANTUM_LENGTH_US
57 int "quantum length (in us)"
58 default 1000
59 range 500 10000
60 help
61 Determine the desired quantum length, in microseconds, which
62 is used to determine the granularity of scheduling in
63 quantum-driven plugins (primarily PFAIR). This parameter does not
64 affect event-driven plugins (such as the EDF-based plugins and P-FP).
65 Default: 1000us = 1ms.
66
67config BUG_ON_MIGRATION_DEADLOCK
68 bool "Panic on suspected migration deadlock"
69 default y
70 help
71 This is a debugging option. The LITMUS^RT migration support code for
72 global scheduling contains a simple heuristic to detect when the
73 system deadlocks due to circular stack dependencies.
74
75 For example, such a deadlock exists if CPU 0 waits for task A's stack
76 to become available while using task B's stack, and CPU 1 waits for
77 task B's stack to become available while using task A's stack. Such
78 a situation can arise in (buggy) global scheduling plugins.
79
 80	  With this option enabled, such a scenario will result in a BUG().
81 You can turn off this option when debugging on real hardware (e.g.,
82 to rescue traces, etc. that would be hard to get after a panic).
83
84 Only turn this off if you really know what you are doing. If this
85 BUG() triggers, the scheduler is broken and turning off this option
86 won't fix it.
87
88
89endmenu
90
91menu "Real-Time Synchronization"
92
93config NP_SECTION
94 bool "Non-preemptive section support"
95 default y
96 help
97 Allow tasks to become non-preemptable.
98 Note that plugins still need to explicitly support non-preemptivity.
99 Currently, only the GSN-EDF, PSN-EDF, and P-FP plugins have such support.
100
101 This is required to support locking protocols such as the FMLP.
102 If disabled, all tasks will be considered preemptable at all times.
103
104config LITMUS_LOCKING
105 bool "Support for real-time locking protocols"
106 depends on NP_SECTION
107 default y
108 help
109 Enable LITMUS^RT's multiprocessor real-time locking protocols with
 110	  predictable maximum blocking times.
111
112 Say Yes if you want to include locking protocols such as the FMLP and
113 Baker's SRP.
114
115endmenu
116
117menu "Performance Enhancements"
118
119config SCHED_CPU_AFFINITY
120 bool "Local Migration Affinity"
121 depends on X86 && SYSFS
122 default y
123 help
 124	  Rescheduled tasks prefer CPUs near their previously used CPU.
125 This may improve cache performance through possible preservation of
126 cache affinity, at the expense of (slightly) more involved scheduling
127 logic.
128
129 Warning: May make bugs harder to find since tasks may migrate less often.
130
131 NOTES:
132 * Feature is not utilized by PFair/PD^2.
133
134 Say Yes if unsure.
135
136config ALLOW_EARLY_RELEASE
137 bool "Allow Early Releasing"
138 default y
139 help
140 Allow tasks to release jobs early (while still maintaining job
141 precedence constraints). Only supported by EDF schedulers. Early
142 releasing must be explicitly requested by real-time tasks via
143 the task_params passed to sys_set_task_rt_param().
144
145 Early releasing can improve job response times while maintaining
146 real-time correctness. However, it can easily peg your CPUs
147 since tasks never suspend to wait for their next job. As such, early
148 releasing is really only useful in the context of implementing
149 bandwidth servers, interrupt handling threads, or short-lived
150 computations.
151
152 Beware that early releasing may affect real-time analysis
153 if using locking protocols or I/O.
154
155 Say Yes if unsure.
156
157choice
158 prompt "EDF Tie-Break Behavior"
159 default EDF_TIE_BREAK_LATENESS_NORM
160 help
161 Allows the configuration of tie-breaking behavior when the deadlines
162 of two EDF-scheduled tasks are equal.
163
164 config EDF_TIE_BREAK_LATENESS
165 bool "Lateness-based Tie Break"
166 help
167 Break ties between two jobs, A and B, based upon the lateness of their
168 prior jobs. The job with the greatest lateness has priority. Note that
169 lateness has a negative value if the prior job finished before its
170 deadline.
171
172 config EDF_TIE_BREAK_LATENESS_NORM
173 bool "Normalized Lateness-based Tie Break"
174 help
175 Break ties between two jobs, A and B, based upon the lateness, normalized
176 by relative deadline, of their prior jobs. The job with the greatest
177 normalized lateness has priority. Note that lateness has a negative value
178 if the prior job finished before its deadline.
179
 180	    Normalized lateness tie-breaks are likely desirable over non-normalized
181 tie-breaks if the execution times and/or relative deadlines of tasks in a
182 task set vary greatly.
183
184 config EDF_TIE_BREAK_HASH
185 bool "Hash-based Tie Breaks"
186 help
187 Break ties between two jobs, A and B, with equal deadlines by using a
188 uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job
 189	    A has a ~50% chance of winning a given tie-break.
190
191 config EDF_PID_TIE_BREAK
192 bool "PID-based Tie Breaks"
193 help
194 Break ties based upon OS-assigned thread IDs. Use this option if
 195	    required by the algorithm's real-time analysis or if per-task
 196	    response-time jitter must be minimized.
197
198 NOTES:
 199	    * This tie-breaking method was the default in Litmus 2012.2 and before.
200
201endchoice
202
203endmenu
204
205menu "Tracing"
206
207config FEATHER_TRACE
208 bool "Feather-Trace Infrastructure"
209 default y
210 help
211 Feather-Trace basic tracing infrastructure. Includes device file
212 driver and instrumentation point support.
213
214 Note that this option only enables the basic Feather-Trace infrastructure;
215 you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
216 actually enable any events.
217
218config SCHED_TASK_TRACE
219 bool "Trace real-time tasks"
220 depends on FEATHER_TRACE
221 default y
222 help
223 Include support for the sched_trace_XXX() tracing functions. This
224 allows the collection of real-time task events such as job
225 completions, job releases, early completions, etc. This results in a
226 small overhead in the scheduling code. Disable if the overhead is not
227 acceptable (e.g., benchmarking).
228
229 Say Yes for debugging.
230 Say No for overhead tracing.
231
232config SCHED_TASK_TRACE_SHIFT
233 int "Buffer size for sched_trace_xxx() events"
234 depends on SCHED_TASK_TRACE
235 range 8 13
236 default 9
237 help
238
239 Select the buffer size of sched_trace_xxx() events as a power of two.
240 These buffers are statically allocated as per-CPU data. Each event
241 requires 24 bytes storage plus one additional flag byte. Too large
242 buffers can cause issues with the per-cpu allocator (and waste
243 memory). Too small buffers can cause scheduling events to be lost. The
244 "right" size is workload dependent and depends on the number of tasks,
245 each task's period, each task's number of suspensions, and how often
246 the buffer is flushed.
247
248 Examples: 12 => 4k events
249 10 => 1k events
250 8 => 512 events
251
252config SCHED_LITMUS_TRACEPOINT
253 bool "Enable Event/Tracepoint Tracing for real-time task tracing"
254 depends on TRACEPOINTS
255 default n
256 help
257 Enable kernel-style events (tracepoint) for Litmus. Litmus events
258 trace the same functions as the above sched_trace_XXX(), but can
259 be enabled independently.
260 Litmus tracepoints can be recorded and analyzed together (single
261 time reference) with all other kernel tracing events (e.g.,
262 sched:sched_switch, etc.).
263
264 This also enables a quick way to visualize schedule traces using
265 trace-cmd utility and kernelshark visualizer.
266
267 Say Yes for debugging and visualization purposes.
268 Say No for overhead tracing.
269
270config SCHED_OVERHEAD_TRACE
271 bool "Record timestamps for overhead measurements"
272 depends on FEATHER_TRACE
273 default y
274 help
275 Export event stream for overhead tracing.
276 Say Yes for overhead tracing.
277
278config SCHED_OVERHEAD_TRACE_SHIFT
279 int "Buffer size for Feather-Trace overhead data"
280 depends on SCHED_OVERHEAD_TRACE
281 range 15 32
282 default 22
283 help
284
285 Select the buffer size for the Feather-Trace overhead tracing
286 infrastructure (/dev/litmus/ft_trace0 & ftcat) as a power of two. The
287 larger the buffer, the lower the chance of buffer overflows if
288 the ftcat process is starved by real-time activity. On machines with
289 large memories, large buffer sizes are recommended.
290
291 Examples: 16 => 2 MB
292 24 => 512 MB
293 26 => 2 GB
294
295config SCHED_DEBUG_TRACE
296 bool "TRACE() debugging"
297 default n
298 help
299 Include support for sched_trace_log_message(), which is used to
300 implement TRACE(). If disabled, no TRACE() messages will be included
301 in the kernel, and no overheads due to debugging statements will be
302 incurred by the scheduler. Disable if the overhead is not acceptable
303 (e.g. benchmarking).
304
305 Say Yes for debugging.
306 Say No for overhead tracing.
307
308config SCHED_DEBUG_TRACE_SHIFT
309 int "Buffer size for TRACE() buffer"
310 depends on SCHED_DEBUG_TRACE
311 range 14 22
312 default 18
313 help
314
315 Select the amount of memory needed for the TRACE() buffer, as a
316 power of two. The TRACE() buffer is global and statically allocated. If
317 the buffer is too small, there will be holes in the TRACE() log if the
318 buffer-flushing task is starved.
319
320 The default should be sufficient for most systems. Increase the buffer
321 size if the log contains holes. Reduce the buffer size when running on
322 a memory-constrained system.
323
324 Examples: 14 => 16KB
325 18 => 256KB
326 20 => 1MB
327
328 This buffer is exported to userspace using a misc device as
329 'litmus/log'. On a system with default udev rules, a corresponding
330 character device node should be created at /dev/litmus/log. The buffer
331 can be flushed using cat, e.g., 'cat /dev/litmus/log > my_log_file.txt'.
332
333config SCHED_DEBUG_TRACE_CALLER
334 bool "Include [function@file:line] tag in TRACE() log"
335 depends on SCHED_DEBUG_TRACE
336 default n
337 help
338 With this option enabled, TRACE() prepends
339
340 "[<function name>@<filename>:<line number>]"
341
342 to each message in the debug log. Enable this to aid in figuring out
343 what was called in which order. The downside is that it adds a lot of
344 clutter.
345
346 If unsure, say No.
347
348config PREEMPT_STATE_TRACE
349 bool "Trace preemption state machine transitions"
350 depends on SCHED_DEBUG_TRACE && DEBUG_KERNEL
351 default n
352 help
353 With this option enabled, each CPU will log when it transitions
354 states in the preemption state machine. This state machine is
355 used to determine how to react to IPIs (avoid races with in-flight IPIs).
356
357 Warning: this creates a lot of information in the debug trace. Only
358 recommended when you are debugging preemption-related races.
359
360 If unsure, say No.
361
362config REPORT_TIMER_LATENCY
363 bool "Warn when hrtimers incur large latency"
364 default n
365 help
366 With this option enabled, the hrtimer code will printk()
367 a warning when a timer fires significantly later than its intended
368 time. This can be useful when debugging latency issues.
369
370 If unsure, say No.
371
372config REPORT_TIMER_LATENCY_THRESHOLD
373 int "Maximum acceptable timer latency (in nanoseconds)"
374 depends on REPORT_TIMER_LATENCY
375 range 10000 100000000
376 default 1000000
377 help
378 If a timer fires more than the given threshold after its intended
379 expiration time, a warning message is printed to the kernel log.
380 By default, the threshold is one millisecond (= one million nanoseconds).
381
382endmenu
383
384endmenu
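
The shift-based buffer options above are easy to misjudge. The following standalone sketch (illustrative only, not part of the patch; the helper names are made up) turns shift values into byte counts using the per-event sizes stated in the help texts.

#include <stdio.h>

/* 2^shift events per CPU, 24 bytes per record plus one flag byte,
 * as stated in the SCHED_TASK_TRACE_SHIFT help text. */
static unsigned long long task_trace_bytes_per_cpu(unsigned int shift)
{
        return (1ULL << shift) * (24 + 1);
}

/* The TRACE() buffer is a single global allocation of 2^shift bytes,
 * as stated in the SCHED_DEBUG_TRACE_SHIFT help text. */
static unsigned long long debug_trace_bytes(unsigned int shift)
{
        return 1ULL << shift;
}

int main(void)
{
        printf("SCHED_TASK_TRACE_SHIFT=9   -> %llu bytes per CPU\n",
               task_trace_bytes_per_cpu(9));    /* 512 events -> 12800 bytes */
        printf("SCHED_DEBUG_TRACE_SHIFT=18 -> %llu bytes total\n",
               debug_trace_bytes(18));          /* 262144 bytes = 256 KB */
        return 0;
}
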
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 000000000000..ecaa28dc68ad
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,36 @@
1#
2# Makefile for LITMUS^RT
3#
4
5obj-y = sched_plugin.o litmus.o \
6 preempt.o \
7 litmus_proc.o \
8 budget.o \
9 clustered.o \
10 jobs.o \
11 sync.o \
12 rt_domain.o \
13 edf_common.o \
14 fp_common.o \
15 fdso.o \
16 locking.o \
17 srp.o \
18 bheap.o \
19 binheap.o \
20 ctrldev.o \
21 uncachedev.o \
22 sched_gsn_edf.o \
23 sched_psn_edf.o \
24 sched_pfp.o
25
26obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
27obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
28
29obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
30obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
31obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
32obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
33
34obj-y += sched_pres.o
35
36obj-y += reservations/
diff --git a/litmus/bheap.c b/litmus/bheap.c
new file mode 100644
index 000000000000..2707e0122b6d
--- /dev/null
+++ b/litmus/bheap.c
@@ -0,0 +1,316 @@
1#include <linux/bug.h>
2#include <linux/kernel.h>
3#include <litmus/bheap.h>
4
5void bheap_init(struct bheap* heap)
6{
7 heap->head = NULL;
8 heap->min = NULL;
9}
10
11void bheap_node_init(struct bheap_node** _h, void* value)
12{
13 struct bheap_node* h = *_h;
14 h->parent = NULL;
15 h->next = NULL;
16 h->child = NULL;
17 h->degree = NOT_IN_HEAP;
18 h->value = value;
19 h->ref = _h;
20}
21
22
23/* make child a subtree of root */
24static void __bheap_link(struct bheap_node* root,
25 struct bheap_node* child)
26{
27 child->parent = root;
28 child->next = root->child;
29 root->child = child;
30 root->degree++;
31}
32
33/* merge root lists */
34static struct bheap_node* __bheap_merge(struct bheap_node* a,
35 struct bheap_node* b)
36{
37 struct bheap_node* head = NULL;
38 struct bheap_node** pos = &head;
39
40 while (a && b) {
41 if (a->degree < b->degree) {
42 *pos = a;
43 a = a->next;
44 } else {
45 *pos = b;
46 b = b->next;
47 }
48 pos = &(*pos)->next;
49 }
50 if (a)
51 *pos = a;
52 else
53 *pos = b;
54 return head;
55}
56
57/* reverse a linked list of nodes. also clears parent pointer */
58static struct bheap_node* __bheap_reverse(struct bheap_node* h)
59{
60 struct bheap_node* tail = NULL;
61 struct bheap_node* next;
62
63 if (!h)
64 return h;
65
66 h->parent = NULL;
67 while (h->next) {
68 next = h->next;
69 h->next = tail;
70 tail = h;
71 h = next;
72 h->parent = NULL;
73 }
74 h->next = tail;
75 return h;
76}
77
78static void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
79 struct bheap_node** prev, struct bheap_node** node)
80{
81 struct bheap_node *_prev, *cur;
82 *prev = NULL;
83
84 if (!heap->head) {
85 *node = NULL;
86 return;
87 }
88
89 *node = heap->head;
90 _prev = heap->head;
91 cur = heap->head->next;
92 while (cur) {
93 if (higher_prio(cur, *node)) {
94 *node = cur;
95 *prev = _prev;
96 }
97 _prev = cur;
98 cur = cur->next;
99 }
100}
101
102static void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
103 struct bheap_node* h2)
104{
105 struct bheap_node* h1;
106 struct bheap_node *prev, *x, *next;
107 if (!h2)
108 return;
109 h1 = heap->head;
110 if (!h1) {
111 heap->head = h2;
112 return;
113 }
114 h1 = __bheap_merge(h1, h2);
115 prev = NULL;
116 x = h1;
117 next = x->next;
118 while (next) {
119 if (x->degree != next->degree ||
120 (next->next && next->next->degree == x->degree)) {
121 /* nothing to do, advance */
122 prev = x;
123 x = next;
124 } else if (higher_prio(x, next)) {
125 /* x becomes the root of next */
126 x->next = next->next;
127 __bheap_link(x, next);
128 } else {
129 /* next becomes the root of x */
130 if (prev)
131 prev->next = next;
132 else
133 h1 = next;
134 __bheap_link(next, x);
135 x = next;
136 }
137 next = x->next;
138 }
139 heap->head = h1;
140}
141
142static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
143 struct bheap* heap)
144{
145 struct bheap_node *prev, *node;
146 __bheap_min(higher_prio, heap, &prev, &node);
147 if (!node)
148 return NULL;
149 if (prev)
150 prev->next = node->next;
151 else
152 heap->head = node->next;
153 __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
154 return node;
155}
156
157/* insert (and reinitialize) a node into the heap */
158void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
159 struct bheap_node* node)
160{
161 struct bheap_node *min;
162 node->child = NULL;
163 node->parent = NULL;
164 node->next = NULL;
165 node->degree = 0;
166 if (heap->min && higher_prio(node, heap->min)) {
167 /* swap min cache */
168 min = heap->min;
169 min->child = NULL;
170 min->parent = NULL;
171 min->next = NULL;
172 min->degree = 0;
173 __bheap_union(higher_prio, heap, min);
174 heap->min = node;
175 } else
176 __bheap_union(higher_prio, heap, node);
177}
178
179void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
180{
181 struct bheap_node* min;
182 if (heap->min) {
183 min = heap->min;
184 heap->min = NULL;
185 bheap_insert(higher_prio, heap, min);
186 }
187}
188
189/* merge addition into target */
190void bheap_union(bheap_prio_t higher_prio,
191 struct bheap* target, struct bheap* addition)
192{
193 /* first insert any cached minima, if necessary */
194 bheap_uncache_min(higher_prio, target);
195 bheap_uncache_min(higher_prio, addition);
196 __bheap_union(higher_prio, target, addition->head);
197 /* this is a destructive merge */
198 addition->head = NULL;
199}
200
201struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
202 struct bheap* heap)
203{
204 if (!heap->min)
205 heap->min = __bheap_extract_min(higher_prio, heap);
206 return heap->min;
207}
208
209struct bheap_node* bheap_take(bheap_prio_t higher_prio,
210 struct bheap* heap)
211{
212 struct bheap_node *node;
213 if (!heap->min)
214 heap->min = __bheap_extract_min(higher_prio, heap);
215 node = heap->min;
216 heap->min = NULL;
217 if (node)
218 node->degree = NOT_IN_HEAP;
219 return node;
220}
221
222int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
223{
224 struct bheap_node *parent;
225 struct bheap_node** tmp_ref;
226 void* tmp;
227
228 /* bubble up */
229 parent = node->parent;
230 while (parent && higher_prio(node, parent)) {
231 /* swap parent and node */
232 tmp = parent->value;
233 parent->value = node->value;
234 node->value = tmp;
235 /* swap references */
236 *(parent->ref) = node;
237 *(node->ref) = parent;
238 tmp_ref = parent->ref;
239 parent->ref = node->ref;
240 node->ref = tmp_ref;
241 /* step up */
242 node = parent;
243 parent = node->parent;
244 }
245
246 return parent != NULL;
247}
248
249void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
250 struct bheap_node* node)
251{
252 struct bheap_node *parent, *prev, *pos;
253 struct bheap_node** tmp_ref;
254 void* tmp;
255
256 if (heap->min != node) {
257 /* bubble up */
258 parent = node->parent;
259 while (parent) {
260 /* swap parent and node */
261 tmp = parent->value;
262 parent->value = node->value;
263 node->value = tmp;
264 /* swap references */
265 *(parent->ref) = node;
266 *(node->ref) = parent;
267 tmp_ref = parent->ref;
268 parent->ref = node->ref;
269 node->ref = tmp_ref;
270 /* step up */
271 node = parent;
272 parent = node->parent;
273 }
274 /* now delete:
275 * first find prev */
276 prev = NULL;
277 pos = heap->head;
278 while (pos != node) {
279 BUG_ON(!pos); /* fell off the list -> deleted from wrong heap */
280 prev = pos;
281 pos = pos->next;
282 }
283 /* we have prev, now remove node */
284 if (prev)
285 prev->next = node->next;
286 else
287 heap->head = node->next;
288 __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
289 } else
290 heap->min = NULL;
291 node->degree = NOT_IN_HEAP;
292}
293
294/* allocate a heap node for value and insert into the heap */
295int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
296 void* value, int gfp_flags)
297{
298 struct bheap_node* hn = bheap_node_alloc(gfp_flags);
299 if (likely(hn)) {
300 bheap_node_init(&hn, value);
301 bheap_insert(higher_prio, heap, hn);
302 }
303 return hn != NULL;
304}
305
306void* bheap_take_del(bheap_prio_t higher_prio,
307 struct bheap* heap)
308{
309 struct bheap_node* hn = bheap_take(higher_prio, heap);
310 void* ret = NULL;
311 if (hn) {
312 ret = hn->value;
313 bheap_node_free(hn);
314 }
315 return ret;
316}
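
For orientation, here is a minimal usage sketch of the binomial-heap API above, not taken from the patch: struct demo_job and the demo_* helpers are invented for illustration, while bheap_node_alloc()/bheap_node_free() are the allocators referenced by bheap_add() and bheap_take_del().

#include <linux/gfp.h>
#include <litmus/bheap.h>

struct demo_job {
        unsigned long long deadline;
};

/* bheap_prio_t comparator: an earlier deadline means higher priority. */
static int demo_higher_prio(struct bheap_node *a, struct bheap_node *b)
{
        struct demo_job *ja = a->value, *jb = b->value;
        return ja->deadline < jb->deadline;
}

static void demo_insert_and_take(struct bheap *heap, struct demo_job *job)
{
        struct bheap_node *hn = bheap_node_alloc(GFP_ATOMIC);

        if (!hn)
                return;
        bheap_node_init(&hn, job);               /* node carries job via ->value */
        bheap_insert(demo_higher_prio, heap, hn);

        hn = bheap_take(demo_higher_prio, heap); /* earliest-deadline node */
        if (hn)
                bheap_node_free(hn);
}
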
diff --git a/litmus/binheap.c b/litmus/binheap.c
new file mode 100644
index 000000000000..d3ab34b92096
--- /dev/null
+++ b/litmus/binheap.c
@@ -0,0 +1,387 @@
1#include <litmus/binheap.h>
2
3/* Returns true if the root ancestor of node is the root of the given heap. */
4int binheap_is_in_this_heap(struct binheap_node *node,
5 struct binheap* heap)
6{
7 if(!binheap_is_in_heap(node)) {
8 return 0;
9 }
10
11 while(node->parent != NULL) {
12 node = node->parent;
13 }
14
15 return (node == heap->root);
16}
17
18
19/* Update the node reference pointers. Same logic as Litmus binomial heap. */
20static void __update_ref(struct binheap_node *parent,
21 struct binheap_node *child)
22{
23 *(parent->ref_ptr) = child;
24 *(child->ref_ptr) = parent;
25
26 swap(parent->ref_ptr, child->ref_ptr);
27}
28
29
30/* Swaps data between two nodes. */
31static void __binheap_swap(struct binheap_node *parent,
32 struct binheap_node *child)
33{
34 swap(parent->data, child->data);
35 __update_ref(parent, child);
36}
37
38
39/* Swaps memory and data between two nodes. Actual nodes swap instead of
40 * just data. Needed when we delete nodes from the heap.
41 */
42static void __binheap_swap_safe(struct binheap *handle,
43 struct binheap_node *a,
44 struct binheap_node *b)
45{
46 swap(a->data, b->data);
47 __update_ref(a, b);
48
49 if((a->parent != NULL) && (a->parent == b->parent)) {
50 /* special case: shared parent */
51 swap(a->parent->left, a->parent->right);
52 }
53 else {
54 /* Update pointers to swap parents. */
55
56 if(a->parent) {
57 if(a == a->parent->left) {
58 a->parent->left = b;
59 }
60 else {
61 a->parent->right = b;
62 }
63 }
64
65 if(b->parent) {
66 if(b == b->parent->left) {
67 b->parent->left = a;
68 }
69 else {
70 b->parent->right = a;
71 }
72 }
73
74 swap(a->parent, b->parent);
75 }
76
77 /* swap children */
78
79 if(a->left) {
80 a->left->parent = b;
81
82 if(a->right) {
83 a->right->parent = b;
84 }
85 }
86
87 if(b->left) {
88 b->left->parent = a;
89
90 if(b->right) {
91 b->right->parent = a;
92 }
93 }
94
95 swap(a->left, b->left);
96 swap(a->right, b->right);
97
98
99 /* update next/last/root pointers */
100
101 if(a == handle->next) {
102 handle->next = b;
103 }
104 else if(b == handle->next) {
105 handle->next = a;
106 }
107
108 if(a == handle->last) {
109 handle->last = b;
110 }
111 else if(b == handle->last) {
112 handle->last = a;
113 }
114
115 if(a == handle->root) {
116 handle->root = b;
117 }
118 else if(b == handle->root) {
119 handle->root = a;
120 }
121}
122
123
124/**
125 * Update the pointer to the last node in the complete binary tree.
126 * Called internally after the root node has been deleted.
127 */
128static void __binheap_update_last(struct binheap *handle)
129{
130 struct binheap_node *temp = handle->last;
131
132 /* find a "bend" in the tree. */
133 while(temp->parent && (temp == temp->parent->left)) {
134 temp = temp->parent;
135 }
136
137 /* step over to sibling if we're not at root */
138 if(temp->parent != NULL) {
139 temp = temp->parent->left;
140 }
141
142 /* now travel right as far as possible. */
143 while(temp->right != NULL) {
144 temp = temp->right;
145 }
146
147 /* take one step to the left if we're not at the bottom-most level. */
148 if(temp->left != NULL) {
149 temp = temp->left;
150 }
151
152 handle->last = temp;
153}
154
155
156/**
157 * Update the pointer to the node that will take the next inserted node.
158 * Called internally after a node has been inserted.
159 */
160static void __binheap_update_next(struct binheap *handle)
161{
162 struct binheap_node *temp = handle->next;
163
164 /* find a "bend" in the tree. */
165 while(temp->parent && (temp == temp->parent->right)) {
166 temp = temp->parent;
167 }
168
169 /* step over to sibling if we're not at root */
170 if(temp->parent != NULL) {
171 temp = temp->parent->right;
172 }
173
174 /* now travel left as far as possible. */
175 while(temp->left != NULL) {
176 temp = temp->left;
177 }
178
179 handle->next = temp;
180}
181
182
183
184/* bubble node up towards root */
185static void __binheap_bubble_up(struct binheap *handle,
186 struct binheap_node *node)
187{
188 /* let BINHEAP_POISON data bubble to the top */
189
190 while((node->parent != NULL) &&
191 ((node->data == BINHEAP_POISON) ||
192 handle->compare(node, node->parent))) {
193 __binheap_swap(node->parent, node);
194 node = node->parent;
195 }
196}
197
198
199/* bubble node down, swapping with min-child */
200static void __binheap_bubble_down(struct binheap *handle)
201{
202 struct binheap_node *node = handle->root;
203
204 while(node->left != NULL) {
205 if(node->right && handle->compare(node->right, node->left)) {
206 if(handle->compare(node->right, node)) {
207 __binheap_swap(node, node->right);
208 node = node->right;
209 }
210 else {
211 break;
212 }
213 }
214 else {
215 if(handle->compare(node->left, node)) {
216 __binheap_swap(node, node->left);
217 node = node->left;
218 }
219 else {
220 break;
221 }
222 }
223 }
224}
225
226
227void __binheap_add(struct binheap_node *new_node,
228 struct binheap *handle,
229 void *data)
230{
231 new_node->data = data;
232 new_node->ref = new_node;
233 new_node->ref_ptr = &(new_node->ref);
234
235 if(!binheap_empty(handle)) {
236 /* insert left side first */
237 if(handle->next->left == NULL) {
238 handle->next->left = new_node;
239 new_node->parent = handle->next;
240 new_node->left = NULL;
241 new_node->right = NULL;
242
243 handle->last = new_node;
244
245 __binheap_bubble_up(handle, new_node);
246 }
247 else {
248 /* left occupied. insert right. */
249 handle->next->right = new_node;
250 new_node->parent = handle->next;
251 new_node->left = NULL;
252 new_node->right = NULL;
253
254 handle->last = new_node;
255
256 __binheap_update_next(handle);
257 __binheap_bubble_up(handle, new_node);
258 }
259 }
260 else {
261 /* first node in heap */
262
263 new_node->parent = NULL;
264 new_node->left = NULL;
265 new_node->right = NULL;
266
267 handle->root = new_node;
268 handle->next = new_node;
269 handle->last = new_node;
270 }
271}
272
273
274/**
275 * Removes the root node from the heap. The node is removed after coalescing
276 * the binheap_node with its original data pointer at the root of the tree.
277 *
278 * The 'last' node in the tree is then swapped up to the root and bubbled
279 * down.
280 */
281void __binheap_delete_root(struct binheap *handle,
282 struct binheap_node *container)
283{
284 struct binheap_node *root = handle->root;
285
286 if(root != container) {
287 /* coalesce */
288 __binheap_swap_safe(handle, root, container);
289 root = container;
290 }
291
292 if(handle->last != root) {
293 /* swap 'last' node up to root and bubble it down. */
294
295 struct binheap_node *to_move = handle->last;
296
297 if(to_move->parent != root) {
298 handle->next = to_move->parent;
299
300 if(handle->next->right == to_move) {
301 /* disconnect from parent */
302 to_move->parent->right = NULL;
303 handle->last = handle->next->left;
304 }
305 else {
306 /* find new 'last' before we disconnect */
307 __binheap_update_last(handle);
308
309 /* disconnect from parent */
310 to_move->parent->left = NULL;
311 }
312 }
313 else {
314 /* 'last' is direct child of root */
315
316 handle->next = to_move;
317
318 if(to_move == to_move->parent->right) {
319 to_move->parent->right = NULL;
320 handle->last = to_move->parent->left;
321 }
322 else {
323 to_move->parent->left = NULL;
324 handle->last = to_move;
325 }
326 }
327 to_move->parent = NULL;
328
329 /* reconnect as root. We can't just swap data ptrs since root node
330 * may be freed after this function returns.
331 */
332 to_move->left = root->left;
333 to_move->right = root->right;
334 if(to_move->left != NULL) {
335 to_move->left->parent = to_move;
336 }
337 if(to_move->right != NULL) {
338 to_move->right->parent = to_move;
339 }
340
341 handle->root = to_move;
342
343 /* bubble down */
344 __binheap_bubble_down(handle);
345 }
346 else {
347 /* removing last node in tree */
348 handle->root = NULL;
349 handle->next = NULL;
350 handle->last = NULL;
351 }
352
353 /* mark as removed */
354 container->parent = BINHEAP_POISON;
355}
356
357
358/**
359 * Delete an arbitrary node. Bubble node to delete up to the root,
360 * and then delete the root.
361 */
362void __binheap_delete(struct binheap_node *node_to_delete,
363 struct binheap *handle)
364{
365 struct binheap_node *target = node_to_delete->ref;
366 void *temp_data = target->data;
367
368 /* temporarily poison the data pointer so the node bubbles up to the top. */
369 target->data = BINHEAP_POISON;
370
371 __binheap_bubble_up(handle, target);
372 __binheap_delete_root(handle, node_to_delete);
373
374 node_to_delete->data = temp_data; /* restore node data pointer */
375}
376
377
378/**
379 * Bubble up a node whose key has decreased in value (priority increased).
380 */
381void __binheap_decrease(struct binheap_node *orig_node,
382 struct binheap *handle)
383{
384 struct binheap_node *target = orig_node->ref;
385
386 __binheap_bubble_up(handle, target);
387}
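
A brief, hypothetical sketch of how the low-level __binheap_* entry points above are driven: the embedded binheap_node carries a back-pointer to its container via ->data, and the handle's ->compare callback decides which node bubbles towards the root. struct demo_item and the demo_* helpers are invented; the handle is assumed to have been initialized empty with demo_min_first as its comparator (the convenience macros for that live in litmus/binheap.h).

#include <litmus/binheap.h>

struct demo_item {
        unsigned long long key;
        struct binheap_node node;
};

/* Min-heap order: the smaller key bubbles to the root. */
static int demo_min_first(struct binheap_node *a, struct binheap_node *b)
{
        struct demo_item *ia = a->data, *ib = b->data;
        return ia->key < ib->key;
}

static void demo_insert(struct binheap *handle, struct demo_item *item)
{
        __binheap_add(&item->node, handle, item);
}

static void demo_remove(struct binheap *handle, struct demo_item *item)
{
        if (binheap_is_in_this_heap(&item->node, handle))
                __binheap_delete(&item->node, handle);
}
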
diff --git a/litmus/budget.c b/litmus/budget.c
new file mode 100644
index 000000000000..18dac24e5632
--- /dev/null
+++ b/litmus/budget.c
@@ -0,0 +1,168 @@
1#include <linux/sched.h>
2#include <linux/percpu.h>
3#include <linux/hrtimer.h>
4#include <linux/uaccess.h>
5#include <linux/module.h>
6
7#include <litmus/debug_trace.h>
8#include <litmus/litmus.h>
9#include <litmus/preempt.h>
10#include <litmus/sched_plugin.h>
11#include <litmus/np.h>
12
13#include <litmus/budget.h>
14
15struct enforcement_timer {
16 /* The enforcement timer is used to accurately police
17 * slice budgets. */
18 struct hrtimer timer;
19 int armed;
20};
21
22DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
23
24static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
25{
26 struct enforcement_timer* et = container_of(timer,
27 struct enforcement_timer,
28 timer);
29 unsigned long flags;
30
31 local_irq_save(flags);
32 TRACE("enforcement timer fired.\n");
33 et->armed = 0;
34 /* activate scheduler */
35 litmus_reschedule_local();
36 local_irq_restore(flags);
37
38 return HRTIMER_NORESTART;
39}
40
41/* assumes called with IRQs off */
42static void cancel_enforcement_timer(struct enforcement_timer* et)
43{
44 int ret;
45
46 TRACE("cancelling enforcement timer.\n");
47
48 /* Since interrupts are disabled and et->armed is only
49 * modified locally, we do not need any locks.
50 */
51
52 if (et->armed) {
53 ret = hrtimer_try_to_cancel(&et->timer);
54 /* Should never be inactive. */
55 BUG_ON(ret == 0);
56 /* Should never be running concurrently. */
57 BUG_ON(ret == -1);
58
59 et->armed = 0;
60 }
61}
62
63/* assumes called with IRQs off */
64static void arm_enforcement_timer(struct enforcement_timer* et,
65 struct task_struct* t)
66{
67 lt_t when_to_fire;
68 TRACE_TASK(t, "arming enforcement timer.\n");
69
70 WARN_ONCE(!hrtimer_is_hres_active(&et->timer),
71 KERN_ERR "WARNING: no high resolution timers available!?\n");
72
73 /* Calling this when there is no budget left for the task
74 * makes no sense, unless the task is non-preemptive. */
75 BUG_ON(budget_exhausted(t) && (!is_np(t)));
76
77 /* hrtimer_start_range_ns() cancels the timer
78 * anyway, so we don't have to check whether it is still armed */
79
80 if (likely(!is_np(t))) {
81 when_to_fire = litmus_clock() + budget_remaining(t);
82 hrtimer_start(&et->timer, ns_to_ktime(when_to_fire),
83 HRTIMER_MODE_ABS_PINNED);
84 et->armed = 1;
85 }
86}
87
88
89/* expects to be called with IRQs off */
90void update_enforcement_timer(struct task_struct* t)
91{
92 struct enforcement_timer* et = this_cpu_ptr(&budget_timer);
93
94 if (t && budget_precisely_enforced(t)) {
95 /* Make sure we call into the scheduler when this budget
96 * expires. */
97 arm_enforcement_timer(et, t);
98 } else if (et->armed) {
99 /* Make sure we don't cause unnecessary interrupts. */
100 cancel_enforcement_timer(et);
101 }
102}
103
104
105static int __init init_budget_enforcement(void)
106{
107 int cpu;
108 struct enforcement_timer* et;
109
110 for (cpu = 0; cpu < NR_CPUS; cpu++) {
111 et = &per_cpu(budget_timer, cpu);
112 hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
113 et->timer.function = on_enforcement_timeout;
114 }
115 return 0;
116}
117
118void litmus_current_budget(lt_t *used_so_far, lt_t *remaining)
119{
120 struct task_struct *t = current;
121 unsigned long flags;
122 s64 delta;
123
124 local_irq_save(flags);
125
126 delta = sched_clock_cpu(smp_processor_id()) - t->se.exec_start;
127 if (delta < 0)
128 delta = 0;
129
130 TRACE_CUR("current_budget: sc:%llu start:%llu lt_t:%llu delta:%lld exec-time:%llu rem:%llu\n",
131 sched_clock_cpu(smp_processor_id()), t->se.exec_start,
132 litmus_clock(), delta,
133 tsk_rt(t)->job_params.exec_time,
134 budget_remaining(t));
135
136 if (used_so_far)
137 *used_so_far = tsk_rt(t)->job_params.exec_time + delta;
138
139 if (remaining) {
140 *remaining = budget_remaining(t);
141 if (*remaining > delta)
142 *remaining -= delta;
143 else
144 *remaining = 0;
145 }
146
147 local_irq_restore(flags);
148}
149
150asmlinkage long sys_get_current_budget(
151 lt_t __user * _expended,
152 lt_t __user *_remaining)
153{
154 lt_t expended = 0, remaining = 0;
155
156 if (is_realtime(current))
157 litmus->current_budget(&expended, &remaining);
158
159 if (_expended && put_user(expended, _expended))
160 return -EFAULT;
161
162 if (_remaining && put_user(remaining, _remaining))
163 return -EFAULT;
164
165 return 0;
166}
167
168module_init(init_budget_enforcement);
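
The accounting done by litmus_current_budget() is easiest to follow with concrete numbers. The standalone sketch below (illustrative only) mirrors it: the time executed since exec_start (delta) is charged on top of the recorded exec_time, and the remaining budget is clamped at zero.

#include <stdio.h>

int main(void)
{
        unsigned long long budget    = 10000000ULL; /* 10 ms allocation        */
        unsigned long long exec_time =  7000000ULL; /*  7 ms already accounted */
        unsigned long long delta     =  2000000ULL; /*  2 ms since exec_start  */

        unsigned long long used      = exec_time + delta;
        unsigned long long remaining = budget - exec_time; /* budget_remaining() */

        remaining = remaining > delta ? remaining - delta : 0;

        /* prints: used=9000000 ns remaining=1000000 ns */
        printf("used=%llu ns remaining=%llu ns\n", used, remaining);
        return 0;
}
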
diff --git a/litmus/clustered.c b/litmus/clustered.c
new file mode 100644
index 000000000000..de2aca2a271c
--- /dev/null
+++ b/litmus/clustered.c
@@ -0,0 +1,119 @@
1#include <linux/gfp.h>
2#include <linux/cpumask.h>
3#include <linux/list.h>
4#include <linux/cacheinfo.h>
5
6#include <litmus/debug_trace.h>
7#include <litmus/clustered.h>
8
9int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, unsigned int index)
10{
11 struct cpu_cacheinfo* info = get_cpu_cacheinfo(cpu);
12 struct cacheinfo *ci;
13
14 if (!info || index >= info->num_leaves) {
15 TRACE("no shared-cache CPUs: info=%d index=%u\n",
16 info != NULL, index);
17 return 1;
18 }
19
20 if (!info->info_list) {
21 TRACE("no shared-cache CPUs: no info_list (cpu=%u)\n", cpu);
22 }
23 ci = info->info_list + index;
24
25 cpumask_copy(mask, &ci->shared_cpu_map);
26
27 TRACE("get_shared: P%u@L%u -> %d siblings\n", cpu, index, cpumask_weight(mask));
28
29 return 0;
30}
31
32int get_cluster_size(enum cache_level level)
33{
34 cpumask_var_t mask;
35 int ok;
36 int num_cpus;
37
38 if (level == GLOBAL_CLUSTER)
39 return num_online_cpus();
40 else {
41 if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
42 return -ENOMEM;
43 /* assumes CPU 0 is representative of all CPUs */
44 ok = get_shared_cpu_map(mask, 0, level);
45 /* ok == 0 means we got the map; otherwise it's an invalid cache level */
46 if (ok == 0)
47 num_cpus = cpumask_weight(mask);
48 free_cpumask_var(mask);
49
50 if (ok == 0)
51 return num_cpus;
52 else
53 return -EINVAL;
54 }
55}
56
57int assign_cpus_to_clusters(enum cache_level level,
58 struct scheduling_cluster* clusters[],
59 unsigned int num_clusters,
60 struct cluster_cpu* cpus[],
61 unsigned int num_cpus)
62{
63 cpumask_var_t mask;
64 unsigned int i, free_cluster = 0, low_cpu;
65 int err = 0;
66
67 if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
68 return -ENOMEM;
69
70 /* clear cluster pointers */
71 for (i = 0; i < num_cpus; i++) {
72 cpus[i]->id = i;
73 cpus[i]->cluster = NULL;
74 }
75
76 /* initialize clusters */
77 for (i = 0; i < num_clusters; i++) {
78 clusters[i]->id = i;
79 INIT_LIST_HEAD(&clusters[i]->cpus);
80 }
81
82 /* Assign each CPU. Two assumptions are made:
83 * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask).
84 * 2) All cpus that belong to some cluster are online.
85 */
86 for_each_online_cpu(i) {
87 /* get lowest-id CPU in cluster */
88 if (level != GLOBAL_CLUSTER) {
89 err = get_shared_cpu_map(mask, cpus[i]->id, level);
90 if (err != 0) {
91 /* ugh... wrong cache level? Either caller screwed up
92 * or the CPU topology is weird. */
93 printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n",
94 level, err);
95 err = -EINVAL;
96 goto out;
97 }
98 low_cpu = cpumask_first(mask);
99 } else
100 low_cpu = 0;
101 if (low_cpu == i) {
102 /* caller must provide an appropriate number of clusters */
103 BUG_ON(free_cluster >= num_clusters);
104
105 /* create new cluster */
106 cpus[i]->cluster = clusters[free_cluster++];
107 } else {
108 /* low_cpu points to the right cluster
109 * Assumption: low_cpu is actually online and was processed earlier. */
110 cpus[i]->cluster = cpus[low_cpu]->cluster;
111 }
112 /* enqueue in cpus list */
113 list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);
114 printk(KERN_INFO "Assigning CPU%u to cluster %u.\n", i, cpus[i]->cluster->id);
115 }
116out:
117 free_cpumask_var(mask);
118 return err;
119}
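
As a rough sketch of the intended calling convention (assumptions: the clusters[] and cpus[] arrays are already allocated by the plugin; demo_setup_clusters() is invented, though a C-EDF-style plugin does something along these lines): size the cluster array from get_cluster_size() and then hand everything to assign_cpus_to_clusters().

#include <linux/errno.h>
#include <linux/kernel.h>
#include <litmus/clustered.h>

static int demo_setup_clusters(enum cache_level level,
                               struct scheduling_cluster *clusters[],
                               struct cluster_cpu *cpus[],
                               unsigned int num_cpus)
{
        int cluster_size = get_cluster_size(level);
        unsigned int num_clusters;

        if (cluster_size <= 0)
                return -EINVAL; /* unsupported or invalid cache level */

        /* one cluster per group of CPUs sharing the chosen cache level */
        num_clusters = DIV_ROUND_UP(num_cpus, cluster_size);

        return assign_cpus_to_clusters(level, clusters, num_clusters,
                                       cpus, num_cpus);
}
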
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
new file mode 100644
index 000000000000..cc74c5afa5c6
--- /dev/null
+++ b/litmus/ctrldev.c
@@ -0,0 +1,264 @@
1#include <linux/sched.h>
2#include <linux/mm.h>
3#include <linux/fs.h>
4#include <linux/miscdevice.h>
5#include <linux/module.h>
6#include <linux/uaccess.h>
7
8
9#include <litmus/litmus.h>
10#include <litmus/debug_trace.h>
11
12/* only one page for now, but we might want to add a RO version at some point */
13
14#define CTRL_NAME "litmus/ctrl"
15
16/* allocate t->rt_param.ctrl_page*/
17static int alloc_ctrl_page(struct task_struct *t)
18{
19 int err = 0;
20
21 /* only allocate if the task doesn't have one yet */
22 if (!tsk_rt(t)->ctrl_page) {
23 tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
24 if (!tsk_rt(t)->ctrl_page)
25 err = -ENOMEM;
26 /* will get de-allocated in task teardown */
27 TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
28 tsk_rt(t)->ctrl_page);
29 }
30 return err;
31}
32
33static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
34{
35 int err;
36
37 struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
38
39 TRACE_CUR(CTRL_NAME
40 ": mapping %p (pfn:%lx) to 0x%lx (prot:%lx)\n",
41 tsk_rt(t)->ctrl_page,page_to_pfn(ctrl), vma->vm_start,
42 vma->vm_page_prot);
43
44 /* Map it into the vma. */
45 err = vm_insert_page(vma, vma->vm_start, ctrl);
46
47 if (err)
48 TRACE_CUR(CTRL_NAME ": vm_insert_page() failed (%d)\n", err);
49
50 return err;
51}
52
53static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
54{
55 TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__,
56 vma->vm_flags, vma->vm_page_prot);
57
58 TRACE_CUR(CTRL_NAME
59 ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
60 (void*) vma->vm_start, (void*) vma->vm_end, vma,
61 vma->vm_private_data);
62}
63
64static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
65 struct vm_fault* vmf)
66{
67 TRACE_CUR("%s flags=0x%x (off:%ld)\n", __FUNCTION__,
68 vma->vm_flags, vmf->pgoff);
69
70 /* This function should never be called, since all pages should have
71 * been mapped by mmap() already. */
72 WARN_ONCE(1, "Page faults should be impossible in the control page\n");
73
74 return VM_FAULT_SIGBUS;
75}
76
77static struct vm_operations_struct litmus_ctrl_vm_ops = {
78 .close = litmus_ctrl_vm_close,
79 .fault = litmus_ctrl_vm_fault,
80};
81
82static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
83{
84 int err = 0;
85
86 /* first make sure mapper knows what he's doing */
87
88 /* you can only get one page */
89 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
90 return -EINVAL;
91
92 /* you can only map the "first" page */
93 if (vma->vm_pgoff != 0)
94 return -EINVAL;
95
96 /* you can't share it with anyone */
97 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
98 return -EINVAL;
99
100 vma->vm_ops = &litmus_ctrl_vm_ops;
101 /* This mapping should not be kept across forks,
102 * cannot be expanded, and is not a "normal" page. */
103 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_READ | VM_WRITE;
104
105 /* We don't want the first write access to trigger a "minor" page fault
106 * to mark the page as dirty. This is transient, private memory, we
107 * don't care if it was touched or not. PAGE_SHARED means RW access, but
108 * not execute, and avoids copy-on-write behavior.
109 * See protection_map in mmap.c. */
110 vma->vm_page_prot = PAGE_SHARED;
111
112 err = alloc_ctrl_page(current);
113 if (!err)
114 err = map_ctrl_page(current, vma);
115
116 TRACE_CUR("%s flags=0x%x prot=0x%lx\n",
117 __FUNCTION__, vma->vm_flags, vma->vm_page_prot);
118
119 return err;
120}
121
122/* LITMUS^RT system calls */
123
124asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param);
125asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param);
126asmlinkage long sys_reservation_create(int type, void __user *config);
127asmlinkage long sys_get_current_budget(lt_t __user * _expended, lt_t __user *_remaining);
128asmlinkage long sys_null_call(cycles_t __user *ts);
129asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config);
130asmlinkage long sys_od_close(int od);
131asmlinkage long sys_complete_job(void);
132asmlinkage long sys_litmus_lock(int lock_od);
133asmlinkage long sys_litmus_unlock(int lock_od);
134asmlinkage long sys_wait_for_job_release(unsigned int job);
135asmlinkage long sys_wait_for_ts_release(void);
136asmlinkage long sys_release_ts(lt_t __user *__when);
137
138static long litmus_ctrl_ioctl(struct file *filp,
139 unsigned int cmd, unsigned long arg)
140{
141 long err = -ENOIOCTLCMD;
142
143 /* LITMUS^RT syscall emulation: we expose LITMUS^RT-specific operations
144 * via ioctl() to avoid merge conflicts with the syscall tables when
145 * rebasing LITMUS^RT. While this is not the most elegant way to expose
146 * syscall-like functionality, it helps with reducing the effort
147 * required to maintain LITMUS^RT out of tree.
148 */
149
150 union litmus_syscall_args syscall_args;
151
152 switch (cmd) {
153 case LRT_set_rt_task_param:
154 case LRT_get_rt_task_param:
155 case LRT_reservation_create:
156 case LRT_get_current_budget:
157 case LRT_od_open:
158 /* multiple arguments => need to get args via pointer */
159 /* get syscall parameters */
160 if (copy_from_user(&syscall_args, (void*) arg,
161 sizeof(syscall_args))) {
162 return -EFAULT;
163 }
164
165 switch (cmd) {
166 case LRT_set_rt_task_param:
167 return sys_set_rt_task_param(
168 syscall_args.get_set_task_param.pid,
169 syscall_args.get_set_task_param.param);
170 case LRT_get_rt_task_param:
171 return sys_get_rt_task_param(
172 syscall_args.get_set_task_param.pid,
173 syscall_args.get_set_task_param.param);
174 case LRT_reservation_create:
175 return sys_reservation_create(
176 syscall_args.reservation_create.type,
177 syscall_args.reservation_create.config);
178 case LRT_get_current_budget:
179 return sys_get_current_budget(
180 syscall_args.get_current_budget.expended,
181 syscall_args.get_current_budget.remaining);
182 case LRT_od_open:
183 return sys_od_open(
184 syscall_args.od_open.fd,
185 syscall_args.od_open.obj_type,
186 syscall_args.od_open.obj_id,
187 syscall_args.od_open.config);
188 }
189
190
191 case LRT_null_call:
192 return sys_null_call((cycles_t __user *) arg);
193
194 case LRT_od_close:
195 return sys_od_close(arg);
196
197 case LRT_complete_job:
198 return sys_complete_job();
199
200 case LRT_litmus_lock:
201 return sys_litmus_lock(arg);
202
203 case LRT_litmus_unlock:
204 return sys_litmus_unlock(arg);
205
206 case LRT_wait_for_job_release:
207 return sys_wait_for_job_release(arg);
208
209 case LRT_wait_for_ts_release:
210 return sys_wait_for_ts_release();
211
212 case LRT_release_ts:
213 return sys_release_ts((lt_t __user *) arg);
214
215 default:
216 printk(KERN_DEBUG "ctrldev: strange ioctl (%u, %lu)\n", cmd, arg);
217 };
218
219 return err;
220}
221
222static struct file_operations litmus_ctrl_fops = {
223 .owner = THIS_MODULE,
224 .mmap = litmus_ctrl_mmap,
225 .unlocked_ioctl = litmus_ctrl_ioctl,
226};
227
228static struct miscdevice litmus_ctrl_dev = {
229 .name = CTRL_NAME,
230 .minor = MISC_DYNAMIC_MINOR,
231 .fops = &litmus_ctrl_fops,
232};
233
234static int __init init_litmus_ctrl_dev(void)
235{
236 int err;
237
238 BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
239
240 BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t));
241
242 BUILD_BUG_ON(offsetof(struct control_page, sched.raw)
243 != LITMUS_CP_OFFSET_SCHED);
244 BUILD_BUG_ON(offsetof(struct control_page, irq_count)
245 != LITMUS_CP_OFFSET_IRQ_COUNT);
246 BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start)
247 != LITMUS_CP_OFFSET_TS_SC_START);
248 BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start)
249 != LITMUS_CP_OFFSET_IRQ_SC_START);
250
251 printk("Initializing LITMUS^RT control device.\n");
252 err = misc_register(&litmus_ctrl_dev);
253 if (err)
254 printk("Could not allocate %s device (%d).\n", CTRL_NAME, err);
255 return err;
256}
257
258static void __exit exit_litmus_ctrl_dev(void)
259{
260 misc_deregister(&litmus_ctrl_dev);
261}
262
263module_init(init_litmus_ctrl_dev);
264module_exit(exit_litmus_ctrl_dev);
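
From userspace, the control device is used by mmap()ing exactly one private page at offset 0 and then issuing the LRT_* ioctls; liblitmus normally wraps this. The sketch below is illustrative only; it assumes a 4 KiB page size and that the LRT_* ioctl codes come from the exported LITMUS^RT headers, so the actual ioctl call is left in a comment.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        void *ctrl_page;
        int fd = open("/dev/litmus/ctrl", O_RDWR);

        if (fd < 0) {
                perror("open /dev/litmus/ctrl");
                return 1;
        }

        /* Exactly one page, offset 0, private mapping - anything else is
         * rejected by litmus_ctrl_mmap() above. */
        ctrl_page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE, fd, 0);
        if (ctrl_page == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return 1;
        }

        /* Single-argument emulated syscalls pass the argument directly, e.g.
         *
         *      ioctl(fd, LRT_complete_job, 0);
         *
         * multi-argument calls pass a pointer to union litmus_syscall_args. */

        munmap(ctrl_page, 4096);
        close(fd);
        return 0;
}
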
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 000000000000..1cd5ec711d28
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,201 @@
1/*
2 * litmus/edf_common.c
3 *
4 * Common functions for EDF-based schedulers.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14#include <litmus/debug_trace.h>
15
16#include <litmus/edf_common.h>
17
18#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM
19#include <litmus/fpmath.h>
20#endif
21
22#ifdef CONFIG_EDF_TIE_BREAK_HASH
23#include <linux/hash.h>
24static inline long edf_hash(struct task_struct *t)
25{
26 /* pid is 32 bits, so normally we would shove that into the
27 * upper 32-bits and put the job number in the bottom
28 * and hash the 64-bit number with hash_64(). Sadly,
29 * in testing, hash_64() doesn't distribute keys where the
30 * upper bits are close together (as would be the case with
31 * pids) and job numbers are equal (as would be the case with
32 * synchronous task sets with all relative deadlines equal).
33 *
34 * A 2006 Linux patch proposed the following solution
35 * (but for some reason it wasn't accepted...).
36 *
37 * At least this workaround works for 32-bit systems as well.
38 */
39 return hash_32(hash_32((u32)tsk_rt(t)->job_params.job_no, 32) ^ t->pid, 32);
40}
41#endif
42
43
44/* edf_higher_prio - returns true if first has a higher EDF priority
45 * than second. Deadline ties are broken by PID.
46 *
47 * both first and second may be NULL
48 */
49int edf_higher_prio(struct task_struct* first,
50 struct task_struct* second)
51{
52 struct task_struct *first_task = first;
53 struct task_struct *second_task = second;
54
55 /* There is no point in comparing a task to itself. */
56 if (first && first == second) {
57 TRACE_TASK(first,
58 "WARNING: pointless edf priority comparison.\n");
59 return 0;
60 }
61
62
63 /* check for NULL tasks */
64 if (!first || !second)
65 return first && !second;
66
67#ifdef CONFIG_LITMUS_LOCKING
68
69 /* Check for inherited priorities. Change task
70 * used for comparison in such a case.
71 */
72 if (unlikely(first->rt_param.inh_task))
73 first_task = first->rt_param.inh_task;
74 if (unlikely(second->rt_param.inh_task))
75 second_task = second->rt_param.inh_task;
76
77 /* Check for priority boosting. Tie-break by start of boosting.
78 */
79 if (unlikely(is_priority_boosted(first_task))) {
80 /* first_task is boosted, how about second_task? */
81 if (!is_priority_boosted(second_task) ||
82 lt_before(get_boost_start(first_task),
83 get_boost_start(second_task)))
84 return 1;
85 else
86 return 0;
87 } else if (unlikely(is_priority_boosted(second_task)))
88 /* second_task is boosted, first is not*/
89 return 0;
90
91#endif
92
93 if (earlier_deadline(first_task, second_task)) {
94 return 1;
95 }
96 else if (get_deadline(first_task) == get_deadline(second_task)) {
97 /* Need to tie break. If a tie-break method does not give first_task
98 * priority outright, it must set pid_break: 1 to fall back to PID
99 * tie-breaking, 0 if second_task has priority. */
100 int pid_break;
101
102
103#if defined(CONFIG_EDF_TIE_BREAK_LATENESS)
104 /* Tie break by lateness. Jobs with greater lateness get
105 * priority. This should spread tardiness across all tasks,
106 * especially in task sets where all tasks have the same
107 * period and relative deadlines.
108 */
109 if (get_lateness(first_task) > get_lateness(second_task)) {
110 return 1;
111 }
112 pid_break = (get_lateness(first_task) == get_lateness(second_task));
113
114
115#elif defined(CONFIG_EDF_TIE_BREAK_LATENESS_NORM)
116 /* Tie break by lateness, normalized by relative deadline. Jobs with
117 * greater normalized lateness get priority.
118 *
119 * Note: Considered using the algebraically equivalent
120 * lateness(first)*relative_deadline(second) >
121 lateness(second)*relative_deadline(first)
122 * to avoid fixed-point math, but values are prone to overflow if inputs
123 * are on the order of several seconds, even in 64-bit.
124 */
125 fp_t fnorm = _frac(get_lateness(first_task),
126 get_rt_relative_deadline(first_task));
127 fp_t snorm = _frac(get_lateness(second_task),
128 get_rt_relative_deadline(second_task));
129 if (_gt(fnorm, snorm)) {
130 return 1;
131 }
132 pid_break = _eq(fnorm, snorm);
133
134
135#elif defined(CONFIG_EDF_TIE_BREAK_HASH)
136 /* Tie break by comparing hashes of the (pid, job#) tuple. There should be
137 * a 50% chance that first_task has a higher priority than second_task.
138 */
139 long fhash = edf_hash(first_task);
140 long shash = edf_hash(second_task);
141 if (fhash < shash) {
142 return 1;
143 }
144 pid_break = (fhash == shash);
145#else
146
147
148 /* CONFIG_EDF_PID_TIE_BREAK */
149 pid_break = 1; // fall through to tie-break by pid;
150#endif
151
152 /* Tie break by pid */
153 if(pid_break) {
154 if (first_task->pid < second_task->pid) {
155 return 1;
156 }
157 else if (first_task->pid == second_task->pid) {
158 /* If the PIDs are the same then the task with the
159 * inherited priority wins.
160 */
161 if (!second->rt_param.inh_task) {
162 return 1;
163 }
164 }
165 }
166 }
167 return 0; /* fall-through. prio(second_task) > prio(first_task) */
168}
169
170int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
171{
172 return edf_higher_prio(bheap2task(a), bheap2task(b));
173}
174
175void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
176 release_jobs_t release)
177{
178 rt_domain_init(rt, edf_ready_order, resched, release);
179}
180
181/* need_to_preempt - check whether the task t needs to be preempted
182 * call only with irqs disabled and with ready_lock acquired
183 * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
184 */
185int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
186{
187 /* we need the read lock for edf_ready_queue */
188 /* no need to preempt if there is nothing pending */
189 if (!__jobs_pending(rt))
190 return 0;
191 /* we need to reschedule if t doesn't exist */
192 if (!t)
193 return 1;
194
195 /* NOTE: We cannot check for non-preemptibility since we
196 * don't know what address space we're currently in.
197 */
198
199 /* make sure to get non-rt stuff out of the way */
200 return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t);
201}
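
A small worked example (illustrative only, using floating point instead of the kernel's fp_t) of why the normalized-lateness tie-break can reorder jobs compared to raw lateness when relative deadlines differ widely:

#include <stdio.h>

int main(void)
{
        /* Prior job of A: 2 ms late, relative deadline 10 ms.  */
        double lateness_a = 2e6, rel_dl_a = 10e6;
        /* Prior job of B: 5 ms late, relative deadline 100 ms. */
        double lateness_b = 5e6, rel_dl_b = 100e6;

        /* Raw lateness favours B (5 ms > 2 ms)... */
        printf("raw lateness [ms]:   A=%.1f  B=%.1f\n",
               lateness_a / 1e6, lateness_b / 1e6);

        /* ...but normalized lateness favours A (0.20 > 0.05): A is
         * proportionally further behind relative to its deadline. */
        printf("normalized lateness: A=%.2f  B=%.2f\n",
               lateness_a / rel_dl_a, lateness_b / rel_dl_b);
        return 0;
}
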
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 000000000000..0ff54e41839c
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,308 @@
1/* fdso.c - file descriptor attached shared objects
2 *
3 * (c) 2007 B. Brandenburg, LITMUS^RT project
4 *
5 * Notes:
6 * - objects descriptor (OD) tables are not cloned during a fork.
7 * - objects are created on-demand, and freed after the last reference
8 * is dropped.
9 * - for now, object types are hard coded.
10 * - As long as we have live objects, we keep a reference to the inode.
11 */
12
13#include <linux/errno.h>
14#include <linux/sched.h>
15#include <linux/mutex.h>
16#include <linux/file.h>
17#include <asm/uaccess.h>
18
19#include <litmus/fdso.h>
20
21extern struct fdso_ops generic_lock_ops;
22
23static const struct fdso_ops* fdso_ops[] = {
24 &generic_lock_ops, /* FMLP_SEM */
25 &generic_lock_ops, /* SRP_SEM */
26 &generic_lock_ops, /* MPCP_SEM */
27 &generic_lock_ops, /* MPCP_VS_SEM */
28 &generic_lock_ops, /* DPCP_SEM */
29 &generic_lock_ops, /* PCP_SEM */
30 &generic_lock_ops, /* DFLP_SEM */
31};
32
33static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
34{
35 BUILD_BUG_ON(ARRAY_SIZE(fdso_ops) != MAX_OBJ_TYPE + 1);
36
37 if (fdso_ops[type]->create)
38 return fdso_ops[type]->create(obj_ref, type, config);
39 else
40 return -EINVAL;
41}
42
43static void fdso_destroy(obj_type_t type, void* obj)
44{
45 fdso_ops[type]->destroy(type, obj);
46}
47
48static int fdso_open(struct od_table_entry* entry, void* __user config)
49{
50 if (fdso_ops[entry->obj->type]->open)
51 return fdso_ops[entry->obj->type]->open(entry, config);
52 else
53 return 0;
54}
55
56static int fdso_close(struct od_table_entry* entry)
57{
58 if (fdso_ops[entry->obj->type]->close)
59 return fdso_ops[entry->obj->type]->close(entry);
60 else
61 return 0;
62}
63
64/* inode must be locked already */
65static int alloc_inode_obj(struct inode_obj_id** obj_ref,
66 struct inode* inode,
67 obj_type_t type,
68 unsigned int id,
69 void* __user config)
70{
71 struct inode_obj_id* obj;
72 void* raw_obj;
73 int err;
74
75 obj = kmalloc(sizeof(*obj), GFP_KERNEL);
76 if (!obj) {
77 return -ENOMEM;
78 }
79
80 err = fdso_create(&raw_obj, type, config);
81 if (err != 0) {
82 kfree(obj);
83 return err;
84 }
85
86 INIT_LIST_HEAD(&obj->list);
87 atomic_set(&obj->count, 1);
88 obj->type = type;
89 obj->id = id;
90 obj->obj = raw_obj;
91 obj->inode = inode;
92
93 list_add(&obj->list, &inode->i_obj_list);
94 atomic_inc(&inode->i_count);
95
96 printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
97
98 *obj_ref = obj;
99 return 0;
100}
101
102/* inode must be locked already */
103static struct inode_obj_id* get_inode_obj(struct inode* inode,
104 obj_type_t type,
105 unsigned int id)
106{
107 struct list_head* pos;
108 struct inode_obj_id* obj = NULL;
109
110 list_for_each(pos, &inode->i_obj_list) {
111 obj = list_entry(pos, struct inode_obj_id, list);
112 if (obj->id == id && obj->type == type) {
113 atomic_inc(&obj->count);
114 return obj;
115 }
116 }
117 printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
118 return NULL;
119}
120
121
122static void put_inode_obj(struct inode_obj_id* obj)
123{
124 struct inode* inode;
125 int let_go = 0;
126
127 inode = obj->inode;
128 if (atomic_dec_and_test(&obj->count)) {
129
130 mutex_lock(&inode->i_obj_mutex);
131 /* no new references can be obtained */
132 if (!atomic_read(&obj->count)) {
133 list_del(&obj->list);
134 fdso_destroy(obj->type, obj->obj);
135 kfree(obj);
136 let_go = 1;
137 }
138 mutex_unlock(&inode->i_obj_mutex);
139 if (let_go)
140 iput(inode);
141 }
142}
143
144static struct od_table_entry* get_od_entry(struct task_struct* t)
145{
146 struct od_table_entry* table;
147 int i;
148
149
150 table = t->od_table;
151 if (!table) {
152 table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS,
153 GFP_KERNEL);
154 t->od_table = table;
155 }
156
157 for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
158 if (!table[i].used) {
159 table[i].used = 1;
160 return table + i;
161 }
162 return NULL;
163}
164
165static int put_od_entry(struct od_table_entry* od)
166{
167 put_inode_obj(od->obj);
168 od->used = 0;
169 return 0;
170}
171
172static long close_od_entry(struct od_table_entry *od)
173{
174 long ret;
175
176 /* Give the class a chance to reject the close. */
177 ret = fdso_close(od);
178 if (ret == 0)
179 ret = put_od_entry(od);
180
181 return ret;
182}
183
184void exit_od_table(struct task_struct* t)
185{
186 int i;
187
188 if (t->od_table) {
189 for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
190 if (t->od_table[i].used)
191 close_od_entry(t->od_table + i);
192 kfree(t->od_table);
193 t->od_table = NULL;
194 }
195}
196
197static int do_sys_od_open(struct file* file, obj_type_t type, int id,
198 void* __user config)
199{
200 int idx = 0, err = 0;
201 struct inode* inode;
202 struct inode_obj_id* obj = NULL;
203 struct od_table_entry* entry;
204
205 inode = file_inode(file);
206
207 entry = get_od_entry(current);
208 if (!entry)
209 return -ENOMEM;
210
211 mutex_lock(&inode->i_obj_mutex);
212 obj = get_inode_obj(inode, type, id);
213 if (!obj)
214 err = alloc_inode_obj(&obj, inode, type, id, config);
215 if (err != 0) {
216 obj = NULL;
217 idx = err;
218 entry->used = 0;
219 } else {
220 entry->obj = obj;
221 entry->class = fdso_ops[type];
222 idx = entry - current->od_table;
223 }
224
225 mutex_unlock(&inode->i_obj_mutex);
226
227 /* open only if creation succeeded */
228 if (!err)
229 err = fdso_open(entry, config);
230 if (err < 0) {
231 /* The class rejected the open call.
232 * We need to clean up and tell user space.
233 */
234 if (obj)
235 put_od_entry(entry);
236 idx = err;
237 }
238
239 return idx;
240}
241
242
243struct od_table_entry* get_entry_for_od(int od)
244{
245 struct task_struct *t = current;
246
247 if (!t->od_table)
248 return NULL;
249 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
250 return NULL;
251 if (!t->od_table[od].used)
252 return NULL;
253 return t->od_table + od;
254}
255
256
257asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
258{
259 int ret = 0;
260 struct file* file;
261
262 /*
263 1) get file from fd, get inode from file
264 2) lock inode
265 3) try to lookup object
266 4) if not present create and enqueue object, inc inode refcnt
267 5) increment refcnt of object
268 6) alloc od_table_entry, setup ptrs
269 7) unlock inode
270 8) return offset in od_table as OD
271 */
272
273 if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
274 ret = -EINVAL;
275 goto out;
276 }
277
278 file = fget(fd);
279 if (!file) {
280 ret = -EBADF;
281 goto out;
282 }
283
284 ret = do_sys_od_open(file, type, obj_id, config);
285
286 fput(file);
287
288out:
289 return ret;
290}
291
292
293asmlinkage long sys_od_close(int od)
294{
295 int ret = -EINVAL;
296 struct task_struct *t = current;
297
298 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
299 return ret;
300
301 if (!t->od_table || !t->od_table[od].used)
302 return ret;
303
304
305 ret = close_od_entry(t->od_table + od);
306
307 return ret;
308}
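
To make the object-descriptor lifecycle concrete, here is a hypothetical userspace sketch (liblitmus normally hides this) that names a lock via an inode and obtains an OD through the LRT_od_open ioctl handled in ctrldev.c. It assumes the LRT_* codes and union litmus_syscall_args (with the od_open member used by that handler) are available from the exported LITMUS^RT headers; the corresponding include is omitted because its installed path varies. The namespace file path and demo_open_lock() are made up.

#include <fcntl.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Returns an object descriptor (an index into the task's OD table),
 * or -1 with errno set on failure. */
static int demo_open_lock(int ctrl_fd, int obj_type, unsigned int obj_id)
{
        union litmus_syscall_args args;
        int ns_fd, od;

        /* Any file that all participating tasks can open serves as the
         * shared namespace; the object is identified by (inode, type, id). */
        ns_fd = open("/tmp/demo-lock-namespace", O_RDONLY | O_CREAT, 0666);
        if (ns_fd < 0)
                return -1;

        args.od_open.fd       = ns_fd;
        args.od_open.obj_type = obj_type;       /* e.g. FMLP_SEM */
        args.od_open.obj_id   = obj_id;
        args.od_open.config   = NULL;

        od = ioctl(ctrl_fd, LRT_od_open, &args);
        close(ns_fd);   /* the object keeps its own reference to the inode */
        return od;
}
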
diff --git a/litmus/fp_common.c b/litmus/fp_common.c
new file mode 100644
index 000000000000..595c7b8e561d
--- /dev/null
+++ b/litmus/fp_common.c
@@ -0,0 +1,137 @@
1/*
2 * litmus/fp_common.c
3 *
4 * Common functions for fixed-priority scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14#include <litmus/debug_trace.h>
15
16#include <litmus/fp_common.h>
17
18/* fp_higher_prio - returns true if first has a higher static priority
19 * than second. Ties are broken by PID.
20 *
21 * both first and second may be NULL
22 */
23int fp_higher_prio(struct task_struct* first,
24 struct task_struct* second)
25{
26 struct task_struct *first_task = first;
27 struct task_struct *second_task = second;
28
29 /* There is no point in comparing a task to itself. */
30 if (unlikely(first && first == second)) {
31 TRACE_TASK(first,
32 "WARNING: pointless FP priority comparison.\n");
33 return 0;
34 }
35
36 /* check for NULL tasks */
37 if (!first || !second)
38 return first && !second;
39
40 if (!is_realtime(second_task))
41 return 1;
42
43#ifdef CONFIG_LITMUS_LOCKING
44
45 /* Check for inherited priorities. Change task
46 * used for comparison in such a case.
47 */
48 if (unlikely(first->rt_param.inh_task))
49 first_task = first->rt_param.inh_task;
50 if (unlikely(second->rt_param.inh_task))
51 second_task = second->rt_param.inh_task;
52
53 /* Comparisons to itself are only possible with
54 * priority inheritance when the svc_preempt interrupt fires just
55 * before scheduling (and everything that could follow in the
56 * ready queue). Always favour the original job, as that one will just
57 * suspend itself to resolve this.
58 */
59 if(first_task == second_task)
60 return first_task == first;
61
62 /* Check for priority boosting. Tie-break by start of boosting.
63 */
64 if (unlikely(is_priority_boosted(first_task))) {
65 /* first_task is boosted, how about second_task? */
66 if (is_priority_boosted(second_task))
67 /* break by priority point */
68 return lt_before(get_boost_start(first_task),
69 get_boost_start(second_task));
70 else
71 /* priority boosting wins. */
72 return 1;
73 } else if (unlikely(is_priority_boosted(second_task)))
74 /* second_task is boosted, first is not*/
75 return 0;
76
77#else
78 /* No locks, no priority inheritance, no comparisons to itself */
79 BUG_ON(first_task == second_task);
80#endif
81
82 if (get_priority(first_task) < get_priority(second_task))
83 return 1;
84 else if (get_priority(first_task) == get_priority(second_task))
85 /* Break by PID. */
86 return first_task->pid < second_task->pid;
87 else
88 return 0;
89}
90
91int fp_ready_order(struct bheap_node* a, struct bheap_node* b)
92{
93 return fp_higher_prio(bheap2task(a), bheap2task(b));
94}
95
96void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
97 release_jobs_t release)
98{
99 rt_domain_init(rt, fp_ready_order, resched, release);
100}
101
102/* need_to_preempt - check whether the task t needs to be preempted
103 */
104int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t)
105{
106 struct task_struct *pending;
107
108 pending = fp_prio_peek(q);
109
110 if (!pending)
111 return 0;
112 if (!t)
113 return 1;
114
115 /* make sure to get non-rt stuff out of the way */
116 return !is_realtime(t) || fp_higher_prio(pending, t);
117}
118
119void fp_prio_queue_init(struct fp_prio_queue* q)
120{
121 int i;
122
123 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
124 q->bitmask[i] = 0;
125 for (i = 0; i < LITMUS_MAX_PRIORITY; i++)
126 bheap_init(&q->queue[i]);
127}
128
129void fp_ready_list_init(struct fp_ready_list* q)
130{
131 int i;
132
133 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
134 q->bitmask[i] = 0;
135 for (i = 0; i < LITMUS_MAX_PRIORITY; i++)
136 INIT_LIST_HEAD(q->queue + i);
137}
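
One detail worth spelling out: fp_higher_prio() treats a numerically smaller priority value as a higher priority, with PIDs breaking ties. The toy program below (illustrative only; demo_fp_higher_prio() is invented) demonstrates just that convention.

#include <stdio.h>

static int demo_fp_higher_prio(unsigned int prio_a, int pid_a,
                               unsigned int prio_b, int pid_b)
{
        if (prio_a < prio_b)            /* smaller value = higher priority */
                return 1;
        if (prio_a == prio_b)           /* equal priority: lower PID wins  */
                return pid_a < pid_b;
        return 0;
}

int main(void)
{
        /* prio 1 beats prio 5; among two prio-5 tasks the lower PID wins. */
        printf("%d %d\n",
               demo_fp_higher_prio(1, 100, 5, 50),
               demo_fp_higher_prio(5, 40, 5, 50));      /* prints: 1 1 */
        return 0;
}
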
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 000000000000..dbf61f6c389a
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,43 @@
1#include <linux/types.h>
2
3#include <litmus/feather_trace.h>
4
5#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_RELOCATABLE)
6/* provide dummy implementation */
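/* Fallback used when the architecture provides no Feather-Trace trigger
 * support (or CONFIG_RELOCATABLE is set): events are merely counted in
 * ft_events[] so that the enable/disable bookkeeping keeps working. */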
7
8int ft_events[MAX_EVENTS];
9
10int ft_enable_event(unsigned long id)
11{
12 if (id < MAX_EVENTS) {
13 ft_events[id]++;
14 return 1;
15 } else
16 return 0;
17}
18
19int ft_disable_event(unsigned long id)
20{
21 if (id < MAX_EVENTS && ft_events[id]) {
22 ft_events[id]--;
23 return 1;
24 } else
25 return 0;
26}
27
28int ft_disable_all_events(void)
29{
30 int i;
31
32 for (i = 0; i < MAX_EVENTS; i++)
33 ft_events[i] = 0;
34
35 return MAX_EVENTS;
36}
37
38int ft_is_event_enabled(unsigned long id)
39{
40 return id < MAX_EVENTS && ft_events[id];
41}
42
43#endif
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
new file mode 100644
index 000000000000..646e8c9fe230
--- /dev/null
+++ b/litmus/ftdev.c
@@ -0,0 +1,457 @@
1#include <linux/sched.h>
2#include <linux/fs.h>
3#include <linux/slab.h>
4#include <linux/cdev.h>
5#include <asm/uaccess.h>
6#include <linux/module.h>
7#include <linux/device.h>
8#include <linux/vmalloc.h>
9#include <linux/mutex.h>
10
11#include <litmus/feather_trace.h>
12#include <litmus/ftdev.h>
13
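/* Allocate a Feather-Trace buffer with 'count' slots of 'size' bytes each.
 * The slot payloads are laid out first, followed by one status-marker byte
 * per slot, hence the (size + 1) * count allocation below. */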
14struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
15{
16 struct ft_buffer* buf;
17 size_t total = (size + 1) * count;
18 char* mem;
19
20 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
21 if (!buf)
22 return NULL;
23
24
25 mem = vmalloc(total);
26
27 if (!mem) {
28 kfree(buf);
29 return NULL;
30 }
31
32 if (!init_ft_buffer(buf, count, size,
33 mem + (count * size), /* markers at the end */
34 mem)) { /* buffer objects */
35 vfree(mem);
36 kfree(buf);
37 return NULL;
38 }
39 return buf;
40}
41
42void free_ft_buffer(struct ft_buffer* buf)
43{
44 if (buf) {
45 vfree(buf->buffer_mem);
46 kfree(buf);
47 }
48}
49
50struct ftdev_event {
51 int id;
52 struct ftdev_event* next;
53};
54
55static DEFINE_MUTEX(ft_event_activation_mutex);
56
57static int activate(struct ftdev_event** chain, int id)
58{
59 struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL);
60 if (ev) {
61 mutex_lock(&ft_event_activation_mutex);
62 printk(KERN_INFO
63 "Enabling feather-trace event %d.\n", (int) id);
64 ft_enable_event(id);
65 mutex_unlock(&ft_event_activation_mutex);
66 ev->id = id;
67 ev->next = *chain;
68 *chain = ev;
69 }
70 return ev ? 0 : -ENOMEM;
71}
72
73static void deactivate(struct ftdev_event** chain, int id)
74{
75 struct ftdev_event **cur = chain;
76 struct ftdev_event *nxt;
77 while (*cur) {
78 if ((*cur)->id == id) {
79 nxt = (*cur)->next;
80 kfree(*cur);
81 *cur = nxt;
82 printk(KERN_INFO
83 "Disabling feather-trace event %d.\n", (int) id);
84 mutex_lock(&ft_event_activation_mutex);
85 ft_disable_event(id);
86 mutex_unlock(&ft_event_activation_mutex);
87 break;
88 }
89 cur = &(*cur)->next;
90 }
91}
92
93static int ftdev_open(struct inode *in, struct file *filp)
94{
95 struct ftdev* ftdev;
96 struct ftdev_minor* ftdm;
97 unsigned int buf_idx = iminor(in);
98 int err = 0;
99
100 ftdev = container_of(in->i_cdev, struct ftdev, cdev);
101
102 if (buf_idx >= ftdev->minor_cnt) {
103 err = -ENODEV;
104 goto out;
105 }
106 if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx)))
107 goto out;
108
109 ftdm = ftdev->minor + buf_idx;
110 ftdm->ftdev = ftdev;
111 filp->private_data = ftdm;
112
113 if (mutex_lock_interruptible(&ftdm->lock)) {
114 err = -ERESTARTSYS;
115 goto out;
116 }
117
118 if (!ftdm->readers && ftdev->alloc)
119 err = ftdev->alloc(ftdev, buf_idx);
120 if (0 == err)
121 ftdm->readers++;
122
123 mutex_unlock(&ftdm->lock);
124out:
125 return err;
126}
127
128static int ftdev_release(struct inode *in, struct file *filp)
129{
130 struct ftdev* ftdev;
131 struct ftdev_minor* ftdm;
132 unsigned int buf_idx = iminor(in);
133 int err = 0;
134
135 ftdev = container_of(in->i_cdev, struct ftdev, cdev);
136
137 if (buf_idx >= ftdev->minor_cnt) {
138 err = -ENODEV;
139 goto out;
140 }
141 ftdm = ftdev->minor + buf_idx;
142
143 if (mutex_lock_interruptible(&ftdm->lock)) {
144 err = -ERESTARTSYS;
145 goto out;
146 }
147
148 if (ftdm->readers == 1) {
149 while (ftdm->events)
150 deactivate(&ftdm->events, ftdm->events->id);
151
152 /* wait for any pending events to complete */
153 set_current_state(TASK_UNINTERRUPTIBLE);
154 schedule_timeout(HZ);
155
156 printk(KERN_ALERT "Failed trace writes: %u\n",
157 atomic_read(&ftdm->buf->failed_writes));
158
159 if (ftdev->free)
160 ftdev->free(ftdev, buf_idx);
161 }
162
163 ftdm->readers--;
164 mutex_unlock(&ftdm->lock);
165out:
166 return err;
167}
168
169/* based on ft_buffer_read
170 * @returns < 0 : page fault
171 * = 0 : no data available
172 * = 1 : one slot copied
173 */
174static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest)
175{
176 unsigned int idx;
177 int err = 0;
178 if (atomic_read(&buf->free_count) != buf->slot_count) {
179 /* data available */
180 idx = buf->read_idx % buf->slot_count;
181 if (buf->slots[idx] == SLOT_READY) {
182 err = copy_to_user(dest, ((char*) buf->buffer_mem) +
183 idx * buf->slot_size,
184 buf->slot_size);
185 if (err == 0) {
186 /* copy ok */
187 buf->slots[idx] = SLOT_FREE;
188 buf->read_idx++;
189 atomic_fetch_inc(&buf->free_count);
190 err = 1;
191 }
192 }
193 }
194 return err;
195}
196
197static ssize_t ftdev_read(struct file *filp,
198 char __user *to, size_t len, loff_t *f_pos)
199{
200 /* we ignore f_pos, this is strictly sequential */
201
202 ssize_t err = 0;
203 size_t chunk;
204 int copied;
205 struct ftdev_minor* ftdm = filp->private_data;
206
207 if (mutex_lock_interruptible(&ftdm->lock)) {
208 err = -ERESTARTSYS;
209 goto out;
210 }
211
212
213 chunk = ftdm->buf->slot_size;
214 while (len >= chunk) {
215 copied = ft_buffer_copy_to_user(ftdm->buf, to);
216 if (copied == 1) {
217 len -= chunk;
218 to += chunk;
219 err += chunk;
220 } else if (err == 0 && copied == 0 && ftdm->events) {
221 /* Only wait if there are any events enabled and only
222 * if we haven't copied some data yet. We cannot wait
223 * here with copied data because that data would get
224 * lost if the task is interrupted (e.g., killed).
225 */
226
227			/* Before sleeping, check whether a non-blocking
228 * read was requested.
229 */
230 if (filp->f_flags & O_NONBLOCK)
231 {
232 /* bug out, userspace doesn't want us to sleep */
233 err = -EWOULDBLOCK;
234 break;
235 }
236
237 mutex_unlock(&ftdm->lock);
238 set_current_state(TASK_INTERRUPTIBLE);
239
240 schedule_timeout(50);
241
242 if (signal_pending(current)) {
243 if (err == 0)
244 /* nothing read yet, signal problem */
245 err = -ERESTARTSYS;
246 goto out;
247 }
248 if (mutex_lock_interruptible(&ftdm->lock)) {
249 err = -ERESTARTSYS;
250 goto out;
251 }
252 } else if (copied < 0) {
253 /* page fault */
254 err = copied;
255 break;
256 } else
257 /* nothing left to get, return to user space */
258 break;
259 }
260 mutex_unlock(&ftdm->lock);
261out:
262 return err;
263}
264
265static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
266{
267 long err = -ENOIOCTLCMD;
268 struct ftdev_minor* ftdm = filp->private_data;
269
270 if (mutex_lock_interruptible(&ftdm->lock)) {
271 err = -ERESTARTSYS;
272 goto out;
273 }
274
275 /* FIXME: check id against list of acceptable events */
276
277 switch (cmd) {
278 case FTDEV_ENABLE_CMD:
279 if (activate(&ftdm->events, arg))
280 err = -ENOMEM;
281 else
282 err = 0;
283 break;
284
285 case FTDEV_DISABLE_CMD:
286 deactivate(&ftdm->events, arg);
287 err = 0;
288 break;
289
290 case FTDEV_CALIBRATE:
291 if (ftdm->ftdev->calibrate) {
292 err = ftdm->ftdev->calibrate(ftdm->ftdev, iminor(file_inode(filp)), arg);
293 }
294 break;
295
296 default:
297 printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg);
298 };
299
300 mutex_unlock(&ftdm->lock);
301out:
302 return err;
303}
304
305static ssize_t ftdev_write(struct file *filp, const char __user *from,
306 size_t len, loff_t *f_pos)
307{
308 struct ftdev_minor* ftdm = filp->private_data;
309 ssize_t err = -EINVAL;
310 struct ftdev* ftdev = ftdm->ftdev;
311
312 /* dispatch write to buffer-specific code, if available */
313 if (ftdev->write)
314 err = ftdev->write(ftdm->buf, len, from);
315
316 return err;
317}
318
319struct file_operations ftdev_fops = {
320 .owner = THIS_MODULE,
321 .open = ftdev_open,
322 .release = ftdev_release,
323 .write = ftdev_write,
324 .read = ftdev_read,
325 .unlocked_ioctl = ftdev_ioctl,
326};
327
328int ftdev_init( struct ftdev* ftdev, struct module* owner,
329 const int minor_cnt, const char* name)
330{
331 int i, err;
332
333 BUG_ON(minor_cnt < 1);
334
335 cdev_init(&ftdev->cdev, &ftdev_fops);
336 ftdev->name = name;
337 ftdev->minor_cnt = minor_cnt;
338 ftdev->cdev.owner = owner;
339 ftdev->cdev.ops = &ftdev_fops;
340 ftdev->alloc = NULL;
341 ftdev->free = NULL;
342 ftdev->can_open = NULL;
343 ftdev->write = NULL;
344 ftdev->calibrate = NULL;
345
346 ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor),
347 GFP_KERNEL);
348 if (!ftdev->minor) {
349 printk(KERN_WARNING "ftdev(%s): Could not allocate memory\n",
350 ftdev->name);
351 err = -ENOMEM;
352 goto err_out;
353 }
354
355 for (i = 0; i < ftdev->minor_cnt; i++) {
356 mutex_init(&ftdev->minor[i].lock);
357 ftdev->minor[i].readers = 0;
358 ftdev->minor[i].buf = NULL;
359 ftdev->minor[i].events = NULL;
360 }
361
362 ftdev->class = class_create(owner, ftdev->name);
363 if (IS_ERR(ftdev->class)) {
364 err = PTR_ERR(ftdev->class);
365 printk(KERN_WARNING "ftdev(%s): "
366 "Could not create device class.\n", ftdev->name);
367 goto err_dealloc;
368 }
369
370 return 0;
371
372err_dealloc:
373 kfree(ftdev->minor);
374err_out:
375 return err;
376}
377
378/*
379 * Destroy minor devices up to, but not including, up_to.
380 */
381static void ftdev_device_destroy(struct ftdev* ftdev, unsigned int up_to)
382{
383 dev_t minor_cntr;
384
385	if (up_to < 1 || up_to > ftdev->minor_cnt)
386		up_to = ftdev->minor_cnt; /* out-of-range (e.g., (unsigned) -1) means "all" */
387
388 for (minor_cntr = 0; minor_cntr < up_to; ++minor_cntr)
389 device_destroy(ftdev->class, MKDEV(ftdev->major, minor_cntr));
390}
391
392void ftdev_exit(struct ftdev* ftdev)
393{
394 printk("ftdev(%s): Exiting\n", ftdev->name);
395 ftdev_device_destroy(ftdev, -1);
396 cdev_del(&ftdev->cdev);
397 unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
398 class_destroy(ftdev->class);
399 kfree(ftdev->minor);
400}
401
402int register_ftdev(struct ftdev* ftdev)
403{
404 struct device **device;
405 dev_t trace_dev_tmp, minor_cntr;
406 int err;
407
408 err = alloc_chrdev_region(&trace_dev_tmp, 0, ftdev->minor_cnt,
409 ftdev->name);
410 if (err) {
411 printk(KERN_WARNING "ftdev(%s): "
412 "Could not allocate char. device region (%d minors)\n",
413 ftdev->name, ftdev->minor_cnt);
414 goto err_out;
415 }
416
417 ftdev->major = MAJOR(trace_dev_tmp);
418
419 err = cdev_add(&ftdev->cdev, trace_dev_tmp, ftdev->minor_cnt);
420 if (err) {
421 printk(KERN_WARNING "ftdev(%s): "
422 "Could not add cdev for major %u with %u minor(s).\n",
423 ftdev->name, ftdev->major, ftdev->minor_cnt);
424 goto err_unregister;
425 }
426
427 /* create the minor device(s) */
428 for (minor_cntr = 0; minor_cntr < ftdev->minor_cnt; ++minor_cntr)
429 {
430 trace_dev_tmp = MKDEV(ftdev->major, minor_cntr);
431 device = &ftdev->minor[minor_cntr].device;
432
433 *device = device_create(ftdev->class, NULL, trace_dev_tmp, NULL,
434 "litmus/%s%d", ftdev->name, minor_cntr);
435 if (IS_ERR(*device)) {
436 err = PTR_ERR(*device);
437 printk(KERN_WARNING "ftdev(%s): "
438 "Could not create device major/minor number "
439 "%u/%u\n", ftdev->name, ftdev->major,
440 minor_cntr);
441 printk(KERN_WARNING "ftdev(%s): "
442 "will attempt deletion of allocated devices.\n",
443 ftdev->name);
444 goto err_minors;
445 }
446 }
447
448 return 0;
449
450err_minors:
451 ftdev_device_destroy(ftdev, minor_cntr);
452 cdev_del(&ftdev->cdev);
453err_unregister:
454 unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
455err_out:
456 return err;
457}
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 000000000000..43f1f94e0b6e
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,164 @@
1/* litmus/jobs.c - common job control code
2 */
3
4#include <linux/sched.h>
5
6#include <litmus/debug_trace.h>
7#include <litmus/preempt.h>
8#include <litmus/litmus.h>
9#include <litmus/sched_plugin.h>
10#include <litmus/sched_trace.h>
11#include <litmus/jobs.h>
12
13static inline void setup_release(struct task_struct *t, lt_t release)
14{
15 /* prepare next release */
16 t->rt_param.job_params.release = release;
17 t->rt_param.job_params.deadline = release + get_rt_relative_deadline(t);
18 t->rt_param.job_params.exec_time = 0;
19
20 /* update job sequence number */
21 t->rt_param.job_params.job_no++;
22
23 /* expose to user space */
24 if (has_control_page(t)) {
25 struct control_page* cp = get_control_page(t);
26 cp->deadline = t->rt_param.job_params.deadline;
27 cp->release = get_release(t);
28 cp->job_index = t->rt_param.job_params.job_no;
29 }
30}
31
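/* Advance task t to its next job: record how late the previous job finished
 * and set up the next release, either at the requested sporadic release time
 * or one period after the previous release. */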
32void prepare_for_next_period(struct task_struct *t)
33{
34 BUG_ON(!t);
35
36 /* Record lateness before we set up the next job's
37 * release and deadline. Lateness may be negative.
38 */
39 t->rt_param.job_params.lateness =
40 (long long)litmus_clock() -
41 (long long)t->rt_param.job_params.deadline;
42
43 if (tsk_rt(t)->sporadic_release) {
44 TRACE_TASK(t, "sporadic release at %llu\n",
45 tsk_rt(t)->sporadic_release_time);
46 /* sporadic release */
47 setup_release(t, tsk_rt(t)->sporadic_release_time);
48 tsk_rt(t)->sporadic_release = 0;
49 } else {
50 /* periodic release => add period */
51 setup_release(t, get_release(t) + get_rt_period(t));
52 }
53}
54
55void release_at(struct task_struct *t, lt_t start)
56{
57 BUG_ON(!t);
58 setup_release(t, start);
59 tsk_rt(t)->completed = 0;
60}
61
62void inferred_sporadic_job_release_at(struct task_struct *t, lt_t when)
63{
64 /* new sporadic release */
65 sched_trace_last_suspension_as_completion(t);
66 /* check if this task is resuming from a clock_nanosleep() call */
67 if (tsk_rt(t)->doing_abs_nanosleep &&
68 lt_after_eq(tsk_rt(t)->nanosleep_wakeup,
69 get_release(t) + get_rt_period(t))) {
70 /* clock_nanosleep() is supposed to wake up the task
71 * at a time that is a valid release time. Use that time
72 * rather than guessing the intended release time from the
73 * current time. */
74 TRACE_TASK(t, "nanosleep: backdating release "
75 "to %llu instead of %llu\n",
76 tsk_rt(t)->nanosleep_wakeup, when);
77 when = tsk_rt(t)->nanosleep_wakeup;
78 }
79 release_at(t, when);
80 sched_trace_task_release(t);
81}
82
83long default_wait_for_release_at(lt_t release_time)
84{
85 struct task_struct *t = current;
86 unsigned long flags;
87
88 local_irq_save(flags);
89 tsk_rt(t)->sporadic_release_time = release_time;
90 smp_wmb();
91 tsk_rt(t)->sporadic_release = 1;
92 local_irq_restore(flags);
93
94 return litmus->complete_job();
95}
96
97
98/*
99 * Deactivate current task until the beginning of the next period.
100 */
101long complete_job(void)
102{
103 preempt_disable();
104 TRACE_CUR("job completion indicated at %llu\n", litmus_clock());
105	/* Mark that we do not execute anymore */
106 tsk_rt(current)->completed = 1;
107 /* call schedule, this will return when a new job arrives
108 * it also takes care of preparing for the next release
109 */
110 litmus_reschedule_local();
111 preempt_enable();
112 return 0;
113}
114
115static long sleep_until_next_release(void);
116
117/* alternative job completion implementation that suspends the task */
118long complete_job_oneshot(void)
119{
120 struct task_struct *t = current;
121
122 preempt_disable();
123
124 TRACE_CUR("job completes at %llu (deadline: %llu)\n", litmus_clock(),
125 get_deadline(t));
126
127 sched_trace_task_completion(t, 0);
128 prepare_for_next_period(t);
129 sched_trace_task_release(t);
130
131 return sleep_until_next_release();
132}
133
134/* assumes caller has disabled preemptions;
135 * re-enables preemptions before returning */
136static long sleep_until_next_release(void)
137{
138 struct task_struct *t = current;
139 ktime_t next_release;
140 long err;
141
142 next_release = ns_to_ktime(get_release(t));
143
144 TRACE_CUR("next_release=%llu\n", get_release(t));
145
146 if (lt_after(get_release(t), litmus_clock())) {
147 set_current_state(TASK_INTERRUPTIBLE);
148 tsk_rt(t)->completed = 1;
149 preempt_enable_no_resched();
150 err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
151 /* If we get woken by a signal, we return early.
152 * This is intentional; we want to be able to kill tasks
153 * that are waiting for the next job release.
154 */
155 tsk_rt(t)->completed = 0;
156 } else {
157 err = 0;
158 TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(t), litmus_clock());
159 preempt_enable();
160 }
161
162 TRACE_CUR("return to next job at %llu\n", litmus_clock());
163 return err;
164}
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 000000000000..bd192180fef7
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,773 @@
1/*
2 * litmus.c -- Implementation of the LITMUS syscalls,
3 *             the LITMUS initialization code,
4 *             and the procfs interface.
5 */
6#include <asm/uaccess.h>
7#include <linux/uaccess.h>
8#include <linux/sysrq.h>
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/reboot.h>
13#include <linux/stop_machine.h>
14#include <linux/sched/rt.h>
15#include <linux/rwsem.h>
16#include <linux/interrupt.h>
17
18#include <litmus/debug_trace.h>
19#include <litmus/litmus.h>
20#include <litmus/bheap.h>
21#include <litmus/trace.h>
22#include <litmus/rt_domain.h>
23#include <litmus/litmus_proc.h>
24#include <litmus/sched_trace.h>
25
26#ifdef CONFIG_SCHED_CPU_AFFINITY
27#include <litmus/affinity.h>
28#endif
29
30#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
31#define CREATE_TRACE_POINTS
32#include <trace/events/litmus.h>
33#endif
34
35/* Number of RT tasks that exist in the system */
36atomic_t rt_task_count = ATOMIC_INIT(0);
37
38#ifdef CONFIG_RELEASE_MASTER
39/* current master CPU for handling timer IRQs */
40atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
41#endif
42
43static struct kmem_cache * bheap_node_cache;
44extern struct kmem_cache * release_heap_cache;
45
46struct bheap_node* bheap_node_alloc(int gfp_flags)
47{
48 return kmem_cache_alloc(bheap_node_cache, gfp_flags);
49}
50
51void bheap_node_free(struct bheap_node* hn)
52{
53 kmem_cache_free(bheap_node_cache, hn);
54}
55
56struct release_heap* release_heap_alloc(int gfp_flags);
57void release_heap_free(struct release_heap* rh);
58
59/**
60 * Get the quantum alignment as a cmdline option.
61 * Default is aligned quanta.
62 */
63static bool aligned_quanta = 1;
64module_param(aligned_quanta, bool, 0644);
65
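/* Per-CPU release offset used to stagger quanta across processors when
 * aligned_quanta is disabled; e.g., with a 1 ms quantum and 4 possible CPUs,
 * CPU 2 would be offset by 500 us. */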
66u64 cpu_stagger_offset(int cpu)
67{
68 u64 offset = 0;
69
70 if (!aligned_quanta) {
71 offset = LITMUS_QUANTUM_LENGTH_NS;
72 do_div(offset, num_possible_cpus());
73 offset *= cpu;
74 }
75 return offset;
76}
77
78/*
79 * sys_set_rt_task_param
80 * @pid: PID of the task whose scheduling parameters are to be changed
81 * @param: New real-time extension parameters such as the execution cost and
82 *         period
83 * Syscall for manipulating a task's real-time extension parameters.
84 * Returns EFAULT if copying the parameters from user space fails.
85 *         ESRCH if pid does not correspond to a valid task.
86 *         EINVAL if pid or param is invalid, if period or execution
87 *         cost is <= 0, or if the parameters fail the remaining checks
88 *         (density, task class, budget enforcement policy).
89 *         0 on success.
90 *
91 * If the target is not yet a real-time task, the parameters simply take
92 * effect once it is admitted; if it already is a real-time task, the
93 * active plugin decides whether run-time parameter changes are supported.
94 *
95 * find_task_by_vpid() assumes that we are in the same namespace as the
96 * target.
97 */
98asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
99{
100 struct rt_task tp;
101 struct task_struct *target;
102 int retval = -EINVAL;
103
104 printk("Setting up rt task parameters for process %d.\n", pid);
105
106 if (pid < 0 || param == 0) {
107 goto out;
108 }
109 if (copy_from_user(&tp, param, sizeof(tp))) {
110 retval = -EFAULT;
111 goto out;
112 }
113
114 /* Task search and manipulation must be protected */
115 read_lock_irq(&tasklist_lock);
116 rcu_read_lock();
117 if (!(target = find_task_by_vpid(pid))) {
118 retval = -ESRCH;
119 rcu_read_unlock();
120 goto out_unlock;
121 }
122 rcu_read_unlock();
123
124 /* set relative deadline to be implicit if left unspecified */
125 if (tp.relative_deadline == 0)
126 tp.relative_deadline = tp.period;
127
128 if (tp.exec_cost <= 0)
129 goto out_unlock;
130 if (tp.period <= 0)
131 goto out_unlock;
132 if (min(tp.relative_deadline, tp.period) < tp.exec_cost) /*density check*/
133 {
134 printk(KERN_INFO "litmus: real-time task %d rejected "
135 "because task density > 1.0\n", pid);
136 goto out_unlock;
137 }
138 if (tp.cls != RT_CLASS_HARD &&
139 tp.cls != RT_CLASS_SOFT &&
140 tp.cls != RT_CLASS_BEST_EFFORT)
141 {
142 printk(KERN_INFO "litmus: real-time task %d rejected "
143 "because its class is invalid\n", pid);
144 goto out_unlock;
145 }
146 if (tp.budget_policy != NO_ENFORCEMENT &&
147 tp.budget_policy != QUANTUM_ENFORCEMENT &&
148 tp.budget_policy != PRECISE_ENFORCEMENT)
149 {
150 printk(KERN_INFO "litmus: real-time task %d rejected "
151 "because unsupported budget enforcement policy "
152 "specified (%d)\n",
153 pid, tp.budget_policy);
154 goto out_unlock;
155 }
156
157 if (is_realtime(target)) {
158 /* The task is already a real-time task.
159 * Let plugin decide whether it wants to support
160 * parameter changes at runtime.
161 */
162 retval = litmus->task_change_params(target, &tp);
163 } else {
164 target->rt_param.task_params = tp;
165 retval = 0;
166 }
167 out_unlock:
168 read_unlock_irq(&tasklist_lock);
169 out:
170 return retval;
171}
172
173/*
174 * Getter of task's RT params
175 * returns EINVAL if param is NULL or pid is negative
176 * returns ESRCH if pid does not correspond to a valid task
177 * returns EFAULT if copying of parameters has failed.
178 *
179 * find_task_by_vpid() assumes that we are in the same namespace as the
180 * target.
181 */
182asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
183{
184 int retval = -EINVAL;
185 struct task_struct *source;
186 struct rt_task lp;
187
188 if (param == 0 || pid < 0)
189 goto out;
190
191 read_lock_irq(&tasklist_lock);
192 rcu_read_lock();
193 source = find_task_by_vpid(pid);
194 rcu_read_unlock();
195 if (!source) {
196 retval = -ESRCH;
197 read_unlock_irq(&tasklist_lock);
198 goto out;
199 }
200 lp = source->rt_param.task_params;
201 read_unlock_irq(&tasklist_lock);
202 /* Do copying outside the lock */
203 retval =
204 copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
205 out:
206 return retval;
207
208}
209
210/*
211 * This is the central syscall for the periodic task model.
212 * It checks that the calling task is a real-time task with a valid
213 * period and then invokes the plugin-specific completion handler,
214 * which puts the task to sleep until its next release.
215 * returns 0 on successful wakeup
216 * returns EINVAL if the current task is not a real-time task or has
217 * a non-positive period; otherwise the plugin's return value is passed on
218 */
219asmlinkage long sys_complete_job(void)
220{
221 int retval = -EPERM;
222 if (!is_realtime(current)) {
223 retval = -EINVAL;
224 goto out;
225 }
226 /* Task with negative or zero period cannot sleep */
227 if (get_rt_period(current) <= 0) {
228 retval = -EINVAL;
229 goto out;
230 }
231 /* The plugin has to put the task into an
232 * appropriate queue and call schedule
233 */
234 retval = litmus->complete_job();
235 out:
236 return retval;
237}
238
239/* This is an "improved" version of sys_complete_job that
240 * addresses the problem of unintentionally missing a job after
241 * an overrun.
242 *
243 * returns 0 on successful wakeup
244 * returns EINVAL if the current task is not a real-time task or has
245 * a non-positive period; otherwise the plugin's return value is passed on
246 */
247asmlinkage long sys_wait_for_job_release(unsigned int job)
248{
249 int retval = -EPERM;
250 if (!is_realtime(current)) {
251 retval = -EINVAL;
252 goto out;
253 }
254
255 /* Task with negative or zero period cannot sleep */
256 if (get_rt_period(current) <= 0) {
257 retval = -EINVAL;
258 goto out;
259 }
260
261 retval = 0;
262
263 /* first wait until we have "reached" the desired job
264 *
265 * This implementation has at least two problems:
266 *
267 * 1) It doesn't gracefully handle the wrap around of
268 * job_no. Since LITMUS is a prototype, this is not much
269 * of a problem right now.
270 *
271 * 2) It is theoretically racy if a job release occurs
272 * between checking job_no and calling sleep_next_period().
273 * A proper solution would require adding another callback
274 * in the plugin structure and testing the condition with
275 * interrupts disabled.
276 *
277 * FIXME: At least problem 2 should be taken care of eventually.
278 */
279 while (!retval && job > current->rt_param.job_params.job_no)
280 /* If the last job overran then job <= job_no and we
281 * don't send the task to sleep.
282 */
283 retval = litmus->complete_job();
284 out:
285 return retval;
286}
287
288/* This is a helper syscall to query the current job sequence number.
289 *
290 * returns 0 on successful query
291 * returns EPERM if task is not a real-time task.
292 * returns EFAULT if &job is not a valid pointer.
293 */
294asmlinkage long sys_query_job_no(unsigned int __user *job)
295{
296 int retval = -EPERM;
297 if (is_realtime(current))
298 retval = put_user(current->rt_param.job_params.job_no, job);
299
300 return retval;
301}
302
303/* sys_null_call() is only used for determining raw system call
304 * overheads (kernel entry, kernel exit). It has no useful side effects.
305 * If ts is non-NULL, then the current Feather-Trace time is recorded.
306 */
307asmlinkage long sys_null_call(cycles_t __user *ts)
308{
309 long ret = 0;
310 cycles_t now;
311
312 if (ts) {
313 now = get_cycles();
314 ret = put_user(now, ts);
315 }
316
317 return ret;
318}
319
320asmlinkage long sys_reservation_create(int type, void __user *config)
321{
322 return litmus->reservation_create(type, config);
323}
324
325asmlinkage long sys_reservation_destroy(unsigned int reservation_id, int cpu)
326{
327 return litmus->reservation_destroy(reservation_id, cpu);
328}
329
330/* p is a real-time task. Re-init its state as a best-effort task. */
331static void reinit_litmus_state(struct task_struct* p, int restore)
332{
333 struct rt_task user_config = {};
334 void* ctrl_page = NULL;
335
336 if (restore) {
337		/* Save the user-space provided configuration data
338		 * and the allocated control page. */
339 user_config = p->rt_param.task_params;
340 ctrl_page = p->rt_param.ctrl_page;
341 }
342
343 /* We probably should not be inheriting any task's priority
344 * at this point in time.
345 */
346 WARN_ON(p->rt_param.inh_task);
347
348 /* Cleanup everything else. */
349 memset(&p->rt_param, 0, sizeof(p->rt_param));
350
351 /* Restore preserved fields. */
352 if (restore) {
353 p->rt_param.task_params = user_config;
354 p->rt_param.ctrl_page = ctrl_page;
355 }
356}
357
358static long __litmus_admit_task(struct task_struct* tsk)
359{
360 long err;
361
362 INIT_LIST_HEAD(&tsk_rt(tsk)->list);
363
364 /* allocate heap node for this task */
365 tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
366 tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
367
368 if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
369 printk(KERN_WARNING "litmus: no more heap node memory!?\n");
370
371 return -ENOMEM;
372 } else {
373 bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
374 }
375
376 preempt_disable();
377
378 err = litmus->admit_task(tsk);
379
380 if (!err) {
381 sched_trace_task_name(tsk);
382 sched_trace_task_param(tsk);
383 atomic_inc(&rt_task_count);
384 }
385
386 preempt_enable();
387
388 return err;
389}
390
391long litmus_admit_task(struct task_struct* tsk)
392{
393 long retval = 0;
394
395 BUG_ON(is_realtime(tsk));
396
397 tsk_rt(tsk)->heap_node = NULL;
398 tsk_rt(tsk)->rel_heap = NULL;
399
400 if (get_rt_relative_deadline(tsk) == 0 ||
401 get_exec_cost(tsk) >
402 min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
403 TRACE_TASK(tsk,
404 "litmus admit: invalid task parameters "
405 "(e = %lu, p = %lu, d = %lu)\n",
406 get_exec_cost(tsk), get_rt_period(tsk),
407 get_rt_relative_deadline(tsk));
408 retval = -EINVAL;
409 goto out;
410 }
411
412 retval = __litmus_admit_task(tsk);
413
414out:
415 if (retval) {
416 if (tsk_rt(tsk)->heap_node)
417 bheap_node_free(tsk_rt(tsk)->heap_node);
418 if (tsk_rt(tsk)->rel_heap)
419 release_heap_free(tsk_rt(tsk)->rel_heap);
420 }
421 return retval;
422}
423
424void litmus_clear_state(struct task_struct* tsk)
425{
426 BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
427 bheap_node_free(tsk_rt(tsk)->heap_node);
428 release_heap_free(tsk_rt(tsk)->rel_heap);
429
430 atomic_dec(&rt_task_count);
431 reinit_litmus_state(tsk, 1);
432}
433
434/* called from sched_setscheduler() */
435void litmus_exit_task(struct task_struct* tsk)
436{
437 if (is_realtime(tsk)) {
438 sched_trace_task_completion(tsk, 1);
439
440 litmus->task_exit(tsk);
441 }
442}
443
444static DECLARE_RWSEM(plugin_switch_mutex);
445
446void litmus_plugin_switch_disable(void)
447{
448 down_read(&plugin_switch_mutex);
449}
450
451void litmus_plugin_switch_enable(void)
452{
453 up_read(&plugin_switch_mutex);
454}
455
456static int __do_plugin_switch(struct sched_plugin* plugin)
457{
458 int ret;
459
460
461 /* don't switch if there are active real-time tasks */
462 if (atomic_read(&rt_task_count) == 0) {
463 TRACE("deactivating plugin %s\n", litmus->plugin_name);
464 ret = litmus->deactivate_plugin();
465 if (0 != ret)
466 goto out;
467
468 TRACE("activating plugin %s\n", plugin->plugin_name);
469 ret = plugin->activate_plugin();
470 if (0 != ret) {
471 printk(KERN_INFO "Can't activate %s (%d).\n",
472 plugin->plugin_name, ret);
473 plugin = &linux_sched_plugin;
474 }
475
476 printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
477 litmus = plugin;
478 } else
479 ret = -EBUSY;
480out:
481 TRACE("do_plugin_switch() => %d\n", ret);
482 return ret;
483}
484
485static atomic_t ready_to_switch;
486
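/* Executed on every CPU via stop_cpus(): each CPU decrements
 * ready_to_switch, the last one to arrive performs the actual switch in
 * __do_plugin_switch(), and then releases the others by setting the
 * counter to INT_MAX so that all CPUs leave the stop-machine context
 * together. */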
487static int do_plugin_switch(void *_plugin)
488{
489 unsigned long flags;
490 int ret = 0;
491
492 local_save_flags(flags);
493 local_irq_disable();
494 hard_irq_disable();
495
496 if (atomic_dec_and_test(&ready_to_switch))
497 {
498 ret = __do_plugin_switch((struct sched_plugin*) _plugin);
499 atomic_set(&ready_to_switch, INT_MAX);
500 }
501
502 do {
503 cpu_relax();
504 } while (atomic_read(&ready_to_switch) != INT_MAX);
505
506 local_irq_restore(flags);
507 return ret;
508}
509
510/* Switching a plugin in use is tricky.
511 * We must watch out that no real-time tasks exist
512 * (and that none are created in parallel) and that the plugin is not
513 * currently in use on any processor (in theory).
514 */
515int switch_sched_plugin(struct sched_plugin* plugin)
516{
517 int err;
518 struct domain_proc_info* domain_info;
519
520 BUG_ON(!plugin);
521
522 if (atomic_read(&rt_task_count) == 0) {
523 down_write(&plugin_switch_mutex);
524
525 deactivate_domain_proc();
526
527 get_online_cpus();
528 atomic_set(&ready_to_switch, num_online_cpus());
529 err = stop_cpus(cpu_online_mask, do_plugin_switch, plugin);
530 put_online_cpus();
531
532 if (!litmus->get_domain_proc_info(&domain_info))
533 activate_domain_proc(domain_info);
534
535 up_write(&plugin_switch_mutex);
536 return err;
537 } else
538 return -EBUSY;
539}
540
541/* Called upon fork.
542 * p is the newly forked task.
543 */
544void litmus_fork(struct task_struct* p)
545{
546 /* non-rt tasks might have ctrl_page set */
547 tsk_rt(p)->ctrl_page = NULL;
548
549 if (is_realtime(p)) {
550 reinit_litmus_state(p, 1);
551 if (litmus->fork_task(p)) {
552 if (__litmus_admit_task(p))
553 /* something went wrong, give up */
554 p->sched_reset_on_fork = 1;
555 } else {
556 /* clean out any litmus related state */
557 reinit_litmus_state(p, 0);
558
559 TRACE_TASK(p, "fork: real-time status denied\n");
560 /* Don't let the child be a real-time task. */
561 p->sched_reset_on_fork = 1;
562 }
563 }
564
565 /* od tables are never inherited across a fork */
566 p->od_table = NULL;
567}
568
569/* Called upon execve().
570 * current is doing the exec.
571 * Don't let address space specific stuff leak.
572 */
573void litmus_exec(void)
574{
575 struct task_struct* p = current;
576
577 if (is_realtime(p)) {
578 WARN_ON(p->rt_param.inh_task);
579 if (tsk_rt(p)->ctrl_page) {
580 free_page((unsigned long) tsk_rt(p)->ctrl_page);
581 tsk_rt(p)->ctrl_page = NULL;
582 }
583 }
584}
585
586/* Called when dead_tsk is being deallocated
587 */
588void exit_litmus(struct task_struct *dead_tsk)
589{
590 /* We also allow non-RT tasks to
591 * allocate control pages to allow
592 * measurements with non-RT tasks.
593 * So check if we need to free the page
594 * in any case.
595 */
596 if (tsk_rt(dead_tsk)->ctrl_page) {
597 TRACE_TASK(dead_tsk,
598 "freeing ctrl_page %p\n",
599 tsk_rt(dead_tsk)->ctrl_page);
600 free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
601 }
602
603 /* Tasks should not be real-time tasks any longer at this point. */
604 BUG_ON(is_realtime(dead_tsk));
605}
606
607void litmus_do_exit(struct task_struct *exiting_tsk)
608{
609 /* This task called do_exit(), but is still a real-time task. To avoid
610 * complications later, we force it to be a non-real-time task now. */
611
612 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
613
614 TRACE_TASK(exiting_tsk, "exiting, demoted to SCHED_FIFO\n");
615 sched_setscheduler_nocheck(exiting_tsk, SCHED_FIFO, &param);
616}
617
618void litmus_dealloc(struct task_struct *tsk)
619{
620 /* tsk is no longer a real-time task */
621 TRACE_TASK(tsk, "Deallocating real-time task data\n");
622 litmus->task_cleanup(tsk);
623 litmus_clear_state(tsk);
624}
625
626/* move current non-RT task to a specific CPU */
627int litmus_be_migrate_to(int cpu)
628{
629 struct cpumask single_cpu_aff;
630
631 cpumask_clear(&single_cpu_aff);
632 cpumask_set_cpu(cpu, &single_cpu_aff);
633 return sched_setaffinity(current->pid, &single_cpu_aff);
634}
635
636#ifdef CONFIG_MAGIC_SYSRQ
637int sys_kill(int pid, int sig);
638
639static void sysrq_handle_kill_rt_tasks(int key)
640{
641 struct task_struct *t;
642 read_lock(&tasklist_lock);
643 for_each_process(t) {
644 if (is_realtime(t)) {
645 sys_kill(t->pid, SIGKILL);
646 }
647 }
648 read_unlock(&tasklist_lock);
649}
650
651static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
652 .handler = sysrq_handle_kill_rt_tasks,
653 .help_msg = "quit-rt-tasks(X)",
654 .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks",
655};
656#endif
657
658extern struct sched_plugin linux_sched_plugin;
659
660static int litmus_shutdown_nb(struct notifier_block *unused1,
661 unsigned long unused2, void *unused3)
662{
663 /* Attempt to switch back to regular Linux scheduling.
664 * Forces the active plugin to clean up.
665 */
666 if (litmus != &linux_sched_plugin) {
667 int ret = switch_sched_plugin(&linux_sched_plugin);
668 if (ret) {
669 printk("Auto-shutdown of active Litmus plugin failed.\n");
670 }
671 }
672 return NOTIFY_DONE;
673}
674
675static struct notifier_block shutdown_notifier = {
676 .notifier_call = litmus_shutdown_nb,
677};
678
679/**
680 * Triggering hrtimers on specific cpus as required by arm_release_timer(_on)
681 */
682#ifdef CONFIG_SMP
683
684/**
685 * hrtimer_pull - smp_call_function_single_async callback on remote cpu
686 */
687void hrtimer_pull(void *csd_info)
688{
689 struct hrtimer_start_on_info *info = csd_info;
690	TRACE("pulled timer %p\n", info->timer);
691 hrtimer_start_range_ns(info->timer, info->time, 0, info->mode);
692}
693
694/**
695 * hrtimer_start_on - trigger timer arming on remote cpu
696 * @cpu: remote cpu
697 * @info: save timer information for enqueuing on remote cpu
698 * @timer: timer to be pulled
699 * @time: expire time
700 * @mode: timer mode
701 */
702void hrtimer_start_on(int cpu, struct hrtimer_start_on_info *info,
703 struct hrtimer *timer, ktime_t time,
704 const enum hrtimer_mode mode)
705{
706 info->timer = timer;
707 info->time = time;
708 info->mode = mode;
709
710 /* initialize call_single_data struct */
711 info->csd.func = &hrtimer_pull;
712 info->csd.info = info;
713 info->csd.flags = 0;
714
715 /* initiate pull */
716 preempt_disable();
717 if (cpu == smp_processor_id()) {
718 /* start timer locally; we may get called
719 * with rq->lock held, do not wake up anything
720 */
721 TRACE("hrtimer_start_on: starting on local CPU\n");
722 hrtimer_start(info->timer, info->time, info->mode);
723 } else {
724 /* call hrtimer_pull() on remote cpu
725 * to start remote timer asynchronously
726 */
727 TRACE("hrtimer_start_on: pulling to remote CPU\n");
728 smp_call_function_single_async(cpu, &info->csd);
729 }
730 preempt_enable();
731}
732
733#endif /* CONFIG_SMP */
734
735static int __init _init_litmus(void)
736{
737	/* Common initialization: register the default Linux scheduling
738	 * plugin, create the slab caches, set up the /proc/litmus entries,
739	 * and register the reboot notifier.
740	 */
741 printk("Starting LITMUS^RT kernel\n");
742
743 register_sched_plugin(&linux_sched_plugin);
744
745 bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC);
746 release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
747
748#ifdef CONFIG_MAGIC_SYSRQ
749 /* offer some debugging help */
750 if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
751 printk("Registered kill rt tasks magic sysrq.\n");
752 else
753 printk("Could not register kill rt tasks magic sysrq.\n");
754#endif
755
756 init_litmus_proc();
757
758 register_reboot_notifier(&shutdown_notifier);
759
760 return 0;
761}
762
763static void _exit_litmus(void)
764{
765 unregister_reboot_notifier(&shutdown_notifier);
766
767 exit_litmus_proc();
768 kmem_cache_destroy(bheap_node_cache);
769 kmem_cache_destroy(release_heap_cache);
770}
771
772module_init(_init_litmus);
773module_exit(_exit_litmus);
diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c
new file mode 100644
index 000000000000..de5e3f37fe88
--- /dev/null
+++ b/litmus/litmus_proc.c
@@ -0,0 +1,574 @@
1/*
2 * litmus_proc.c -- Implementation of the /proc/litmus directory tree.
3 */
4
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/uaccess.h>
8#include <linux/seq_file.h>
9
10#include <litmus/debug_trace.h>
11#include <litmus/litmus.h>
12#include <litmus/litmus_proc.h>
13
14#include <litmus/clustered.h>
15
16/* in litmus/litmus.c */
17extern atomic_t rt_task_count;
18
19static struct proc_dir_entry *litmus_dir = NULL,
20 *curr_file = NULL,
21 *stat_file = NULL,
22 *plugs_dir = NULL,
23#ifdef CONFIG_RELEASE_MASTER
24 *release_master_file = NULL,
25#endif
26 *plugs_file = NULL,
27 *domains_dir = NULL,
28 *cpus_dir = NULL;
29
30
31/* in litmus/sync.c */
32int count_tasks_waiting_for_release(void);
33
34static int litmus_stats_proc_show(struct seq_file *m, void *v)
35{
36 seq_printf(m,
37 "real-time tasks = %d\n"
38 "ready for release = %d\n",
39 atomic_read(&rt_task_count),
40 count_tasks_waiting_for_release());
41 return 0;
42}
43
44static int litmus_stats_proc_open(struct inode *inode, struct file *file)
45{
46 return single_open(file, litmus_stats_proc_show, PDE_DATA(inode));
47}
48
49static const struct file_operations litmus_stats_proc_fops = {
50 .open = litmus_stats_proc_open,
51 .read = seq_read,
52 .llseek = seq_lseek,
53 .release = single_release,
54};
55
56
57static int litmus_loaded_proc_show(struct seq_file *m, void *v)
58{
59 print_sched_plugins(m);
60 return 0;
61}
62
63static int litmus_loaded_proc_open(struct inode *inode, struct file *file)
64{
65 return single_open(file, litmus_loaded_proc_show, PDE_DATA(inode));
66}
67
68static const struct file_operations litmus_loaded_proc_fops = {
69 .open = litmus_loaded_proc_open,
70 .read = seq_read,
71 .llseek = seq_lseek,
72 .release = single_release,
73};
74
75
76
77
78/* in litmus/litmus.c */
79int switch_sched_plugin(struct sched_plugin*);
80
81static ssize_t litmus_active_proc_write(struct file *file,
82 const char __user *buffer, size_t count,
83 loff_t *ppos)
84{
85 char name[65];
86 struct sched_plugin* found;
87 ssize_t ret = -EINVAL;
88 int err;
89
90
91 ret = copy_and_chomp(name, sizeof(name), buffer, count);
92 if (ret < 0)
93 return ret;
94
95 found = find_sched_plugin(name);
96
97 if (found) {
98 err = switch_sched_plugin(found);
99 if (err) {
100 printk(KERN_INFO "Could not switch plugin: %d\n", err);
101 ret = err;
102 }
103 } else {
104 printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
105 ret = -ESRCH;
106 }
107
108 return ret;
109}
110
111static int litmus_active_proc_show(struct seq_file *m, void *v)
112{
113 seq_printf(m, "%s\n", litmus->plugin_name);
114 return 0;
115}
116
117static int litmus_active_proc_open(struct inode *inode, struct file *file)
118{
119 return single_open(file, litmus_active_proc_show, PDE_DATA(inode));
120}
121
122static const struct file_operations litmus_active_proc_fops = {
123 .open = litmus_active_proc_open,
124 .read = seq_read,
125 .llseek = seq_lseek,
126 .release = single_release,
127 .write = litmus_active_proc_write,
128};
129
130
131#ifdef CONFIG_RELEASE_MASTER
132static ssize_t litmus_release_master_proc_write(
133 struct file *file,
134 const char __user *buffer, size_t count,
135 loff_t *ppos)
136{
137 int cpu, err, online = 0;
138 char msg[64];
139 ssize_t len;
140
141 len = copy_and_chomp(msg, sizeof(msg), buffer, count);
142
143 if (len < 0)
144 return len;
145
146 if (strcmp(msg, "NO_CPU") == 0)
147 atomic_set(&release_master_cpu, NO_CPU);
148 else {
149 err = sscanf(msg, "%d", &cpu);
150 if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
151 atomic_set(&release_master_cpu, cpu);
152 } else {
153 TRACE("invalid release master: '%s' "
154 "(err:%d cpu:%d online:%d)\n",
155 msg, err, cpu, online);
156 len = -EINVAL;
157 }
158 }
159 return len;
160}
161
162static int litmus_release_master_proc_show(struct seq_file *m, void *v)
163{
164 int master;
165 master = atomic_read(&release_master_cpu);
166 if (master == NO_CPU)
167 seq_printf(m, "NO_CPU\n");
168 else
169 seq_printf(m, "%d\n", master);
170 return 0;
171}
172
173static int litmus_release_master_proc_open(struct inode *inode, struct file *file)
174{
175 return single_open(file, litmus_release_master_proc_show, PDE_DATA(inode));
176}
177
178static const struct file_operations litmus_release_master_proc_fops = {
179 .open = litmus_release_master_proc_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = single_release,
183 .write = litmus_release_master_proc_write,
184};
185#endif
186
187int __init init_litmus_proc(void)
188{
189 litmus_dir = proc_mkdir("litmus", NULL);
190 if (!litmus_dir) {
191 printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
192 return -ENOMEM;
193 }
194
195 curr_file = proc_create("active_plugin", 0644, litmus_dir,
196 &litmus_active_proc_fops);
197
198 if (!curr_file) {
199 printk(KERN_ERR "Could not allocate active_plugin "
200 "procfs entry.\n");
201 return -ENOMEM;
202 }
203
204#ifdef CONFIG_RELEASE_MASTER
205 release_master_file = proc_create("release_master", 0644, litmus_dir,
206 &litmus_release_master_proc_fops);
207 if (!release_master_file) {
208 printk(KERN_ERR "Could not allocate release_master "
209 "procfs entry.\n");
210 return -ENOMEM;
211 }
212#endif
213
214 stat_file = proc_create("stats", 0444, litmus_dir, &litmus_stats_proc_fops);
215
216 plugs_dir = proc_mkdir("plugins", litmus_dir);
217 if (!plugs_dir){
218 printk(KERN_ERR "Could not allocate plugins directory "
219 "procfs entry.\n");
220 return -ENOMEM;
221 }
222
223 plugs_file = proc_create("loaded", 0444, plugs_dir,
224 &litmus_loaded_proc_fops);
225
226 domains_dir = proc_mkdir("domains", litmus_dir);
227 if (!domains_dir) {
228 printk(KERN_ERR "Could not allocate domains directory "
229 "procfs entry.\n");
230 return -ENOMEM;
231 }
232
233 cpus_dir = proc_mkdir("cpus", litmus_dir);
234 if (!cpus_dir) {
235 printk(KERN_ERR "Could not allocate cpus directory "
236 "procfs entry.\n");
237 return -ENOMEM;
238 }
239
240 return 0;
241}
242
243void exit_litmus_proc(void)
244{
245 if (cpus_dir || domains_dir) {
246 deactivate_domain_proc();
247 if (cpus_dir)
248 remove_proc_entry("cpus", litmus_dir);
249 if (domains_dir)
250 remove_proc_entry("domains", litmus_dir);
251 }
252 if (plugs_file)
253 remove_proc_entry("loaded", plugs_dir);
254 if (plugs_dir)
255 remove_proc_entry("plugins", litmus_dir);
256 if (stat_file)
257 remove_proc_entry("stats", litmus_dir);
258 if (curr_file)
259 remove_proc_entry("active_plugin", litmus_dir);
260#ifdef CONFIG_RELEASE_MASTER
261 if (release_master_file)
262 remove_proc_entry("release_master", litmus_dir);
263#endif
264 if (litmus_dir)
265 remove_proc_entry("litmus", NULL);
266}
267
268long make_plugin_proc_dir(struct sched_plugin* plugin,
269 struct proc_dir_entry** pde_in)
270{
271 struct proc_dir_entry *pde_new = NULL;
272 long rv;
273
274 if (!plugin || !plugin->plugin_name){
275 printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
276 __func__);
277 rv = -EINVAL;
278 goto out_no_pde;
279 }
280
281 if (!plugs_dir){
282 printk(KERN_ERR "Could not make plugin sub-directory, because "
283 "/proc/litmus/plugins does not exist.\n");
284 rv = -ENOENT;
285 goto out_no_pde;
286 }
287
288 pde_new = proc_mkdir(plugin->plugin_name, plugs_dir);
289 if (!pde_new){
290 printk(KERN_ERR "Could not make plugin sub-directory: "
291			"out of memory?\n");
292 rv = -ENOMEM;
293 goto out_no_pde;
294 }
295
296 rv = 0;
297 *pde_in = pde_new;
298 goto out_ok;
299
300out_no_pde:
301 *pde_in = NULL;
302out_ok:
303 return rv;
304}
305
306void remove_plugin_proc_dir(struct sched_plugin* plugin)
307{
308 if (!plugin || !plugin->plugin_name){
309 printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
310 __func__);
311 return;
312 }
313 remove_proc_entry(plugin->plugin_name, plugs_dir);
314}
315
316
317
318/* misc. I/O helper functions */
319
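/* Copy at most ksize - 1 bytes from user space into kbuf, NUL-terminate the
 * result, and strip a single trailing newline (e.g., the newline appended by
 * echo(1) when writing to the /proc/litmus files). Returns the number of
 * bytes copied or -EFAULT. */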
320int copy_and_chomp(char *kbuf, unsigned long ksize,
321 __user const char* ubuf, unsigned long ulength)
322{
323 /* caller must provide buffer space */
324 BUG_ON(!ksize);
325
326 ksize--; /* leave space for null byte */
327
328 if (ksize > ulength)
329 ksize = ulength;
330
331 if(copy_from_user(kbuf, ubuf, ksize))
332 return -EFAULT;
333
334 kbuf[ksize] = '\0';
335
336 /* chomp kbuf */
337 if (ksize > 0 && kbuf[ksize - 1] == '\n')
338 kbuf[ksize - 1] = '\0';
339
340 return ksize;
341}
342
343/* helper functions for clustered plugins */
344static const char* cache_level_names[] = {
345 "ALL",
346 "L1",
347 "L2",
348 "L3",
349};
350
351int parse_cache_level(const char *cache_name, enum cache_level *level)
352{
353 int err = -EINVAL;
354 int i;
355 /* do a quick and dirty comparison to find the cluster size */
356 for (i = GLOBAL_CLUSTER; i <= L3_CLUSTER; i++)
357 if (!strcmp(cache_name, cache_level_names[i])) {
358 *level = (enum cache_level) i;
359 err = 0;
360 break;
361 }
362 return err;
363}
364
365const char* cache_level_name(enum cache_level level)
366{
367 int idx = level;
368
369 if (idx >= GLOBAL_CLUSTER && idx <= L3_CLUSTER)
370 return cache_level_names[idx];
371 else
372 return "INVALID";
373}
374
375
376
377
378/* proc file interface to configure the cluster size */
379
380static ssize_t litmus_cluster_proc_write(struct file *file,
381 const char __user *buffer, size_t count,
382 loff_t *ppos)
383{
384 enum cache_level *level = (enum cache_level *) PDE_DATA(file_inode(file));
385 ssize_t len;
386 char cache_name[8];
387
388 len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count);
389
390 if (len > 0 && parse_cache_level(cache_name, level)) {
391 printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
392 len = -EINVAL;
393 }
394
395 return len;
396}
397
398static int litmus_cluster_proc_show(struct seq_file *m, void *v)
399{
400 enum cache_level *level = (enum cache_level *) m->private;
401
402 seq_printf(m, "%s\n", cache_level_name(*level));
403 return 0;
404}
405
406static int litmus_cluster_proc_open(struct inode *inode, struct file *file)
407{
408 return single_open(file, litmus_cluster_proc_show, PDE_DATA(inode));
409}
410
411static const struct file_operations litmus_cluster_proc_fops = {
412 .open = litmus_cluster_proc_open,
413 .read = seq_read,
414 .llseek = seq_lseek,
415 .release = single_release,
416 .write = litmus_cluster_proc_write,
417};
418
419struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
420 enum cache_level* level)
421{
422 struct proc_dir_entry* cluster_file;
423
424
425 cluster_file = proc_create_data("cluster", 0644, parent,
426 &litmus_cluster_proc_fops,
427 (void *) level);
428 if (!cluster_file) {
429 printk(KERN_ERR
430		       "Could not create cluster procfs entry.\n");
431 }
432 return cluster_file;
433}
434
435static struct domain_proc_info* active_mapping = NULL;
436
437static int litmus_mapping_proc_show(struct seq_file *m, void *v)
438{
439 struct cd_mapping *mapping = (struct cd_mapping*) m->private;
440
441 if(!mapping)
442 return 0;
443
444 seq_printf(m, "%*pb\n", cpumask_pr_args(mapping->mask));
445 return 0;
446}
447
448static int litmus_mapping_proc_open(struct inode *inode, struct file *file)
449{
450 return single_open(file, litmus_mapping_proc_show, PDE_DATA(inode));
451}
452
453static const struct file_operations litmus_domain_proc_fops = {
454 .open = litmus_mapping_proc_open,
455 .read = seq_read,
456 .llseek = seq_lseek,
457 .release = single_release,
458};
459
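/* Publish the CPU <-> scheduling-domain mapping provided by the active
 * plugin: one read-only file per CPU under /proc/litmus/cpus/ and one per
 * domain under /proc/litmus/domains/, each showing a cpumask. */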
460long activate_domain_proc(struct domain_proc_info* map)
461{
462 int i;
463 char name[8];
464
465 if (!map)
466 return -EINVAL;
467 if (cpus_dir == NULL || domains_dir == NULL)
468 return -EINVAL;
469
470 if (active_mapping)
471 deactivate_domain_proc();
472
473 active_mapping = map;
474
475 for (i = 0; i < map->num_cpus; ++i) {
476 struct cd_mapping* m = &map->cpu_to_domains[i];
477 snprintf(name, sizeof(name), "%d", m->id);
478 m->proc_file = proc_create_data(name, 0444, cpus_dir,
479 &litmus_domain_proc_fops, (void*)m);
480 }
481
482 for (i = 0; i < map->num_domains; ++i) {
483 struct cd_mapping* m = &map->domain_to_cpus[i];
484 snprintf(name, sizeof(name), "%d", m->id);
485 m->proc_file = proc_create_data(name, 0444, domains_dir,
486 &litmus_domain_proc_fops, (void*)m);
487 }
488
489 return 0;
490}
491
492long deactivate_domain_proc(void)
493{
494 int i;
495 char name[65];
496
497 struct domain_proc_info* map = active_mapping;
498
499 if (!map)
500 return -EINVAL;
501
502 for (i = 0; i < map->num_cpus; ++i) {
503 struct cd_mapping* m = &map->cpu_to_domains[i];
504 snprintf(name, sizeof(name), "%d", m->id);
505 remove_proc_entry(name, cpus_dir);
506 m->proc_file = NULL;
507 }
508 for (i = 0; i < map->num_domains; ++i) {
509 struct cd_mapping* m = &map->domain_to_cpus[i];
510 snprintf(name, sizeof(name), "%d", m->id);
511 remove_proc_entry(name, domains_dir);
512 m->proc_file = NULL;
513 }
514
515 active_mapping = NULL;
516
517 return 0;
518}
519
520long init_domain_proc_info(struct domain_proc_info* m,
521 int num_cpus, int num_domains)
522{
523 int i;
524 int num_alloced_cpu_masks = 0;
525 int num_alloced_domain_masks = 0;
526
527 m->cpu_to_domains =
528 kmalloc(sizeof(*(m->cpu_to_domains))*num_cpus,
529 GFP_ATOMIC);
530 if(!m->cpu_to_domains)
531 goto failure;
532
533 m->domain_to_cpus =
534 kmalloc(sizeof(*(m->domain_to_cpus))*num_domains,
535 GFP_ATOMIC);
536 if(!m->domain_to_cpus)
537 goto failure;
538
539 for(i = 0; i < num_cpus; ++i) {
540 if(!zalloc_cpumask_var(&m->cpu_to_domains[i].mask, GFP_ATOMIC))
541 goto failure;
542 ++num_alloced_cpu_masks;
543 }
544 for(i = 0; i < num_domains; ++i) {
545 if(!zalloc_cpumask_var(&m->domain_to_cpus[i].mask, GFP_ATOMIC))
546 goto failure;
547 ++num_alloced_domain_masks;
548 }
549
550 return 0;
551
552failure:
553 for(i = 0; i < num_alloced_cpu_masks; ++i)
554 free_cpumask_var(m->cpu_to_domains[i].mask);
555 for(i = 0; i < num_alloced_domain_masks; ++i)
556 free_cpumask_var(m->domain_to_cpus[i].mask);
557 if(m->cpu_to_domains)
558 kfree(m->cpu_to_domains);
559 if(m->domain_to_cpus)
560 kfree(m->domain_to_cpus);
561 return -ENOMEM;
562}
563
564void destroy_domain_proc_info(struct domain_proc_info* m)
565{
566 int i;
567 for(i = 0; i < m->num_cpus; ++i)
568 free_cpumask_var(m->cpu_to_domains[i].mask);
569 for(i = 0; i < m->num_domains; ++i)
570 free_cpumask_var(m->domain_to_cpus[i].mask);
571 kfree(m->cpu_to_domains);
572 kfree(m->domain_to_cpus);
573 memset(m, 0, sizeof(*m));
574}
diff --git a/litmus/locking.c b/litmus/locking.c
new file mode 100644
index 000000000000..a1d0515c5613
--- /dev/null
+++ b/litmus/locking.c
@@ -0,0 +1,189 @@
1#include <linux/sched.h>
2#include <litmus/litmus.h>
3#include <litmus/fdso.h>
4#include <litmus/debug_trace.h>
5
6#ifdef CONFIG_LITMUS_LOCKING
7
8#include <linux/sched.h>
9#include <litmus/litmus.h>
10#include <litmus/sched_plugin.h>
11#include <litmus/trace.h>
12#include <litmus/wait.h>
13
14static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
15static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
16static int close_generic_lock(struct od_table_entry* entry);
17static void destroy_generic_lock(obj_type_t type, void* sem);
18
19struct fdso_ops generic_lock_ops = {
20 .create = create_generic_lock,
21 .open = open_generic_lock,
22 .close = close_generic_lock,
23 .destroy = destroy_generic_lock
24};
25
26static inline bool is_lock(struct od_table_entry* entry)
27{
28 return entry->class == &generic_lock_ops;
29}
30
31static inline struct litmus_lock* get_lock(struct od_table_entry* entry)
32{
33 BUG_ON(!is_lock(entry));
34 return (struct litmus_lock*) entry->obj->obj;
35}
36
37static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg)
38{
39 struct litmus_lock* lock;
40 int err;
41
42 err = litmus->allocate_lock(&lock, type, arg);
43 if (err == 0)
44 *obj_ref = lock;
45 return err;
46}
47
48static int open_generic_lock(struct od_table_entry* entry, void* __user arg)
49{
50 struct litmus_lock* lock = get_lock(entry);
51 if (lock->ops->open)
52 return lock->ops->open(lock, arg);
53 else
54 return 0; /* default: any task can open it */
55}
56
57static int close_generic_lock(struct od_table_entry* entry)
58{
59 struct litmus_lock* lock = get_lock(entry);
60 if (lock->ops->close)
61 return lock->ops->close(lock);
62 else
63 return 0; /* default: closing succeeds */
64}
65
66static void destroy_generic_lock(obj_type_t type, void* obj)
67{
68 struct litmus_lock* lock = (struct litmus_lock*) obj;
69 lock->ops->deallocate(lock);
70}
71
72asmlinkage long sys_litmus_lock(int lock_od)
73{
74 long err = -EINVAL;
75 struct od_table_entry* entry;
76 struct litmus_lock* l;
77
78 TS_SYSCALL_IN_START;
79
80 TS_SYSCALL_IN_END;
81
82 TS_LOCK_START;
83
84 entry = get_entry_for_od(lock_od);
85 if (entry && is_lock(entry)) {
86 l = get_lock(entry);
87 TRACE_CUR("attempts to lock 0x%p\n", l);
88 err = l->ops->lock(l);
89 }
90
91	/* Note: task may have been suspended or preempted in between! Take
92 * this into account when computing overheads. */
93 TS_LOCK_END;
94
95 TS_SYSCALL_OUT_START;
96
97 return err;
98}
99
100asmlinkage long sys_litmus_unlock(int lock_od)
101{
102 long err = -EINVAL;
103 struct od_table_entry* entry;
104 struct litmus_lock* l;
105
106 TS_SYSCALL_IN_START;
107
108 TS_SYSCALL_IN_END;
109
110 TS_UNLOCK_START;
111
112 entry = get_entry_for_od(lock_od);
113 if (entry && is_lock(entry)) {
114 l = get_lock(entry);
115 TRACE_CUR("attempts to unlock 0x%p\n", l);
116 err = l->ops->unlock(l);
117 }
118
119	/* Note: task may have been preempted in between! Take this into
120 * account when computing overheads. */
121 TS_UNLOCK_END;
122
123 TS_SYSCALL_OUT_START;
124
125 return err;
126}
127
128struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
129{
130 wait_queue_t* q;
131 struct task_struct* t = NULL;
132
133 if (waitqueue_active(wq)) {
134 q = list_entry(wq->task_list.next,
135 wait_queue_t, task_list);
136 t = (struct task_struct*) q->private;
137 __remove_wait_queue(wq, q);
138 }
139 return(t);
140}
141
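/* Add 'new' to the priority-ordered exclusive wait queue 'head', keeping
 * entries sorted by (priority, tie_breaker). Returns the number of queued
 * entries that remain ahead of the new entry. */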
142unsigned int __add_wait_queue_prio_exclusive(
143 wait_queue_head_t* head,
144 prio_wait_queue_t *new)
145{
146 struct list_head *pos;
147 unsigned int passed = 0;
148
149 new->wq.flags |= WQ_FLAG_EXCLUSIVE;
150
151	/* find the first queued entry that the new entry should precede */
152 list_for_each(pos, &head->task_list) {
153 prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t,
154 wq.task_list);
155
156 if (unlikely(lt_before(new->priority, queued->priority) ||
157 (new->priority == queued->priority &&
158 new->tie_breaker < queued->tie_breaker))) {
159			/* new precedes pos: insert it right before pos */
160 __list_add(&new->wq.task_list, pos->prev, pos);
161 goto out;
162 }
163 passed++;
164 }
165
166	/* If we get to this point, either the list is empty or every
167	 * queued element is less than new.
168 * Let's add new to the end. */
169 list_add_tail(&new->wq.task_list, &head->task_list);
170out:
171 return passed;
172}
173
174
175#else
176
177struct fdso_ops generic_lock_ops = {};
178
179asmlinkage long sys_litmus_lock(int sem_od)
180{
181 return -ENOSYS;
182}
183
184asmlinkage long sys_litmus_unlock(int sem_od)
185{
186 return -ENOSYS;
187}
188
189#endif
diff --git a/litmus/preempt.c b/litmus/preempt.c
new file mode 100644
index 000000000000..5f678536b7fa
--- /dev/null
+++ b/litmus/preempt.c
@@ -0,0 +1,144 @@
1#include <linux/sched.h>
2
3#include <litmus/debug_trace.h>
4#include <litmus/litmus.h>
5#include <litmus/preempt.h>
6#include <litmus/trace.h>
7
8DEFINE_PER_CPU(bool, litmus_preemption_in_progress);
9
10/* The rescheduling state of each processor.
11 */
12DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
13
14void sched_state_will_schedule(struct task_struct* tsk)
15{
16 /* Litmus hack: we only care about processor-local invocations of
17 * set_tsk_need_resched(). We can't reliably set the flag remotely
18 * since it might race with other updates to the scheduling state. We
19 * can't rely on the runqueue lock protecting updates to the sched
20 * state since processors do not acquire the runqueue locks for all
21 * updates to the sched state (to avoid acquiring two runqueue locks at
22 * the same time). Further, if tsk is residing on a remote processor,
23 * then that processor doesn't actually know yet that it is going to
24 * reschedule; it still must receive an IPI (unless a local invocation
25 * races).
26 */
27 if (likely(task_cpu(tsk) == smp_processor_id())) {
28 VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE);
29 if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK))
30 set_sched_state(PICKED_WRONG_TASK);
31 else
32 set_sched_state(WILL_SCHEDULE);
33 } else
34 /* Litmus tasks should never be subject to a remote
35 * set_tsk_need_resched(). */
36 BUG_ON(is_realtime(tsk));
37#ifdef CONFIG_PREEMPT_STATE_TRACE
38 TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
39 __builtin_return_address(0));
40#endif
41}
42
43/* Called by the IPI handler after another CPU called smp_send_reschedule(). */
44void sched_state_ipi(void)
45{
46 /* If the IPI was slow, we might be in any state right now. The IPI is
47 * only meaningful if we are in SHOULD_SCHEDULE. */
48 if (is_in_sched_state(SHOULD_SCHEDULE)) {
49 /* Cause scheduler to be invoked.
50 * This will cause a transition to WILL_SCHEDULE. */
51 set_tsk_need_resched(current);
52 TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
53 current->comm, current->pid);
54 TS_SEND_RESCHED_END;
55 } else {
56 /* ignore */
57 TRACE_STATE("ignoring IPI in state %x (%s)\n",
58 get_sched_state(),
59 sched_state_name(get_sched_state()));
60 }
61}
62
63/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must
64 * hold the lock that is used to serialize scheduling decisions. */
65void litmus_reschedule(int cpu)
66{
67 int picked_transition_ok = 0;
68 int scheduled_transition_ok = 0;
69
70 /* The (remote) CPU could be in any state. */
71
72 /* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU
73 * is not aware of the need to reschedule at this point. */
74
75 /* is a context switch in progress? */
76 if (cpu_is_in_sched_state(cpu, TASK_PICKED))
77 picked_transition_ok = sched_state_transition_on(
78 cpu, TASK_PICKED, PICKED_WRONG_TASK);
79
80 if (!picked_transition_ok &&
81 cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) {
82 /* We either raced with the end of the context switch, or the
83 * CPU was in TASK_SCHEDULED anyway. */
84 scheduled_transition_ok = sched_state_transition_on(
85 cpu, TASK_SCHEDULED, SHOULD_SCHEDULE);
86 }
87
88 /* If the CPU was in state TASK_SCHEDULED, then we need to cause the
89 * scheduler to be invoked. */
90 if (scheduled_transition_ok) {
91 if (smp_processor_id() == cpu) {
92 set_tsk_need_resched(current);
93 preempt_set_need_resched();
94 } else {
95 TS_SEND_RESCHED_START(cpu);
96 smp_send_reschedule(cpu);
97 }
98 }
99
100 TRACE_STATE("%s picked-ok:%d sched-ok:%d\n",
101 __FUNCTION__,
102 picked_transition_ok,
103 scheduled_transition_ok);
104}
105
106void litmus_reschedule_local(void)
107{
108 if (is_in_sched_state(TASK_PICKED))
109 set_sched_state(PICKED_WRONG_TASK);
110 else if (is_in_sched_state(TASK_SCHEDULED
111 | SHOULD_SCHEDULE
112 | PICKED_WRONG_TASK)) {
113 set_sched_state(WILL_SCHEDULE);
114 set_tsk_need_resched(current);
115 preempt_set_need_resched();
116 }
117}
118
119#ifdef CONFIG_DEBUG_KERNEL
120
121void sched_state_plugin_check(void)
122{
123 if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) {
124 TRACE("!!!! plugin did not call sched_state_task_picked()!"
125 "Calling sched_state_task_picked() is mandatory---fix this.\n");
126 set_sched_state(TASK_PICKED);
127 }
128}
129
130#define NAME_CHECK(x) case x: return #x
131const char* sched_state_name(int s)
132{
133 switch (s) {
134 NAME_CHECK(TASK_SCHEDULED);
135 NAME_CHECK(SHOULD_SCHEDULE);
136 NAME_CHECK(WILL_SCHEDULE);
137 NAME_CHECK(TASK_PICKED);
138 NAME_CHECK(PICKED_WRONG_TASK);
139 default:
140 return "UNKNOWN";
141 };
142}
143
144#endif
diff --git a/litmus/reservations/Makefile b/litmus/reservations/Makefile
new file mode 100644
index 000000000000..517fc2ff8a76
--- /dev/null
+++ b/litmus/reservations/Makefile
@@ -0,0 +1,3 @@
1obj-y += core.o budget-notifier.o alloc.o
2obj-y += polling.o
3obj-y += table-driven.o
diff --git a/litmus/reservations/alloc.c b/litmus/reservations/alloc.c
new file mode 100644
index 000000000000..1f93f223f504
--- /dev/null
+++ b/litmus/reservations/alloc.c
@@ -0,0 +1,143 @@
1#include <linux/slab.h>
2#include <asm/uaccess.h>
3
4#include <litmus/rt_param.h>
5
6#include <litmus/reservations/alloc.h>
7#include <litmus/reservations/polling.h>
8#include <litmus/reservations/table-driven.h>
9
10
11long alloc_polling_reservation(
12 int res_type,
13 struct reservation_config *config,
14 struct reservation **_res)
15{
16 struct polling_reservation *pres;
17 int use_edf = config->priority == LITMUS_NO_PRIORITY;
18 int periodic = res_type == PERIODIC_POLLING;
19
20 if (config->polling_params.budget >
21 config->polling_params.period) {
22 printk(KERN_ERR "invalid polling reservation (%u): "
23 "budget > period\n", config->id);
24 return -EINVAL;
25 }
26 if (config->polling_params.budget >
27 config->polling_params.relative_deadline
28 && config->polling_params.relative_deadline) {
29 printk(KERN_ERR "invalid polling reservation (%u): "
30 "budget > deadline\n", config->id);
31 return -EINVAL;
32 }
33 if (config->polling_params.offset >
34 config->polling_params.period) {
35 printk(KERN_ERR "invalid polling reservation (%u): "
36 "offset > period\n", config->id);
37 return -EINVAL;
38 }
39
40 /* XXX: would be nice to use a core-local allocation. */
41 pres = kzalloc(sizeof(*pres), GFP_KERNEL);
42 if (!pres)
43 return -ENOMEM;
44
45 polling_reservation_init(pres, use_edf, periodic,
46 config->polling_params.budget,
47 config->polling_params.period,
48 config->polling_params.relative_deadline,
49 config->polling_params.offset);
50 pres->res.id = config->id;
51 if (!use_edf)
52 pres->res.priority = config->priority;
53
54 *_res = &pres->res;
55 return 0;
56}
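/* Editorial sketch (not part of this patch): how a caller might fill in a
 * reservation_config for the checks above. The field names match those used
 * in this function; treating lt_t values as nanoseconds and the example_
 * helper itself are assumptions, not part of the LITMUS^RT API. */
static long example_alloc_periodic_polling(struct reservation **res)
{
	struct reservation_config config = {
		.id = 1,
		.priority = LITMUS_NO_PRIORITY,	/* no fixed priority => EDF */
		.polling_params = {
			.budget = 10000000ULL,		/* 10 ms, if lt_t is in ns */
			.period = 100000000ULL,		/* 100 ms */
			.relative_deadline = 0,		/* 0 => implicit deadline */
			.offset = 0,
		},
	};

	return alloc_polling_reservation(PERIODIC_POLLING, &config, res);
}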
57
58
59#define MAX_INTERVALS 1024
60
61long alloc_table_driven_reservation(
62 struct reservation_config *config,
63 struct reservation **_res)
64{
65 struct table_driven_reservation *td_res = NULL;
66 struct lt_interval *slots = NULL;
67 size_t slots_size;
68 unsigned int i, num_slots;
69 long err = -EINVAL;
70 void *mem;
71
72 if (!config->table_driven_params.num_intervals) {
73 printk(KERN_ERR "invalid table-driven reservation (%u): "
74 "no intervals\n", config->id);
75 return -EINVAL;
76 }
77
78 if (config->table_driven_params.num_intervals > MAX_INTERVALS) {
79 printk(KERN_ERR "invalid table-driven reservation (%u): "
80 "too many intervals (max: %d)\n", config->id, MAX_INTERVALS);
81 return -EINVAL;
82 }
83
84 num_slots = config->table_driven_params.num_intervals;
85 slots_size = sizeof(slots[0]) * num_slots;
86
87 mem = kzalloc(sizeof(*td_res) + slots_size, GFP_KERNEL);
88 if (!mem) {
89 return -ENOMEM;
90 } else {
91 slots = mem + sizeof(*td_res);
92 td_res = mem;
93 err = copy_from_user(slots,
94 config->table_driven_params.intervals, slots_size);
95 }
96
97 if (!err) {
98 /* sanity checks */
99 for (i = 0; !err && i < num_slots; i++)
100 if (slots[i].end <= slots[i].start) {
101 printk(KERN_ERR
102 "invalid table-driven reservation (%u): "
103 "invalid interval %u => [%llu, %llu]\n",
104 config->id, i,
105 slots[i].start, slots[i].end);
106 err = -EINVAL;
107 }
108
109 for (i = 0; !err && i + 1 < num_slots; i++)
110 if (slots[i + 1].start <= slots[i].end) {
111 printk(KERN_ERR
112 "invalid table-driven reservation (%u): "
113 "overlapping intervals %u, %u\n",
114 config->id, i, i + 1);
115 err = -EINVAL;
116 }
117
118 if (slots[num_slots - 1].end >
119 config->table_driven_params.major_cycle_length) {
120 printk(KERN_ERR
121 "invalid table-driven reservation (%u): last "
122 "interval ends past major cycle %llu > %llu\n",
123 config->id,
124 slots[num_slots - 1].end,
125 config->table_driven_params.major_cycle_length);
126 err = -EINVAL;
127 }
128 }
129
130 if (err) {
131 kfree(td_res);
132 } else {
133 table_driven_reservation_init(td_res,
134 config->table_driven_params.major_cycle_length,
135 slots, num_slots);
136 td_res->res.id = config->id;
137 td_res->res.priority = config->priority;
138 *_res = &td_res->res;
139 }
140
141 return err;
142}
143
diff --git a/litmus/reservations/budget-notifier.c b/litmus/reservations/budget-notifier.c
new file mode 100644
index 000000000000..0b0f42687882
--- /dev/null
+++ b/litmus/reservations/budget-notifier.c
@@ -0,0 +1,26 @@
1#include <litmus/reservations/budget-notifier.h>
2
3void budget_notifier_list_init(struct budget_notifier_list* bnl)
4{
5 INIT_LIST_HEAD(&bnl->list);
6 raw_spin_lock_init(&bnl->lock);
7}
8
9void budget_notifiers_fire(struct budget_notifier_list *bnl, bool replenished)
10{
11 struct budget_notifier *bn, *next;
12
13 unsigned long flags;
14
15 raw_spin_lock_irqsave(&bnl->lock, flags);
16
17 list_for_each_entry_safe(bn, next, &bnl->list, list) {
18 if (replenished)
19 bn->budget_replenished(bn);
20 else
21 bn->budget_exhausted(bn);
22 }
23
24 raw_spin_unlock_irqrestore(&bnl->lock, flags);
25}
26
diff --git a/litmus/reservations/core.c b/litmus/reservations/core.c
new file mode 100644
index 000000000000..5137eda0f643
--- /dev/null
+++ b/litmus/reservations/core.c
@@ -0,0 +1,393 @@
1#include <linux/sched.h>
2
3#include <litmus/litmus.h>
4#include <litmus/debug_trace.h>
5#include <litmus/reservations/reservation.h>
6
7void reservation_init(struct reservation *res)
8{
9 memset(res, 0, sizeof(*res));
10 res->state = RESERVATION_INACTIVE;
11 INIT_LIST_HEAD(&res->clients);
12 INIT_LIST_HEAD(&res->replenish_list);
13 budget_notifier_list_init(&res->budget_notifiers);
14}
15
16struct task_struct* default_dispatch_client(
17 struct reservation *res,
18 lt_t *for_at_most)
19{
20 struct reservation_client *client, *next;
21 struct task_struct* tsk;
22
23 BUG_ON(res->state != RESERVATION_ACTIVE);
24 *for_at_most = 0;
25
26 list_for_each_entry_safe(client, next, &res->clients, list) {
27 tsk = client->dispatch(client);
28 if (likely(tsk)) {
29 /* Primitive form of round-robin scheduling:
30 * make sure we alternate between multiple clients
31 * with at least the granularity of the replenishment
32 * period. Reservations that need more fine-grained
33 * or more predictable alternation between threads
34 * within a reservation should provide a custom
35 * dispatch function. */
36 list_del(&client->list);
37 /* move to back of list */
38 list_add_tail(&client->list, &res->clients);
39 return tsk;
40 }
41 }
42 return NULL;
43}
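/* Editorial sketch (not part of this patch): a reservation that needs a
 * different policy than the round-robin rotation above can supply its own
 * dispatch callback. The example below asks the clients in list order but
 * never rotates the list; the example_ name is hypothetical. */
static struct task_struct* example_fifo_dispatch_client(
	struct reservation *res,
	lt_t *for_at_most)
{
	struct reservation_client *client;
	struct task_struct *tsk;

	*for_at_most = 0;

	/* query each client in queue order without moving anyone to the back */
	list_for_each_entry(client, &res->clients, list) {
		tsk = client->dispatch(client);
		if (tsk)
			return tsk;
	}
	return NULL;
}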
44
45void common_drain_budget(
46 struct reservation *res,
47 lt_t how_much)
48{
49 if (how_much >= res->cur_budget)
50 res->cur_budget = 0;
51 else
52 res->cur_budget -= how_much;
53
54 res->budget_consumed += how_much;
55 res->budget_consumed_total += how_much;
56
57 switch (res->state) {
58 case RESERVATION_DEPLETED:
59 case RESERVATION_INACTIVE:
60 BUG();
61 break;
62
63 case RESERVATION_ACTIVE_IDLE:
64 case RESERVATION_ACTIVE:
65 if (!res->cur_budget) {
66 res->env->change_state(res->env, res,
67 RESERVATION_DEPLETED);
68 } /* else: stay in current state */
69 break;
70 }
71}
72
73static struct task_struct * task_client_dispatch(struct reservation_client *client)
74{
75 struct task_client *tc = container_of(client, struct task_client, client);
76 return tc->task;
77}
78
79void task_client_init(struct task_client *tc, struct task_struct *tsk,
80 struct reservation *res)
81{
82 memset(&tc->client, 0, sizeof(tc->client));
83 tc->client.dispatch = task_client_dispatch;
84 tc->client.reservation = res;
85 tc->task = tsk;
86}
87
88static void sup_scheduler_update_at(
89 struct sup_reservation_environment* sup_env,
90 lt_t when)
91{
92 if (sup_env->next_scheduler_update > when)
93 sup_env->next_scheduler_update = when;
94}
95
96static void sup_scheduler_update_after(
97 struct sup_reservation_environment* sup_env,
98 lt_t timeout)
99{
100 sup_scheduler_update_at(sup_env, sup_env->env.current_time + timeout);
101}
102
103static int _sup_queue_depleted(
104 struct sup_reservation_environment* sup_env,
105 struct reservation *res)
106{
107 struct list_head *pos;
108 struct reservation *queued;
109 int passed_earlier = 0;
110
111 BUG_ON(in_list(&res->replenish_list));
112
113 list_for_each(pos, &sup_env->depleted_reservations) {
114 queued = list_entry(pos, struct reservation, replenish_list);
115 if (queued->next_replenishment > res->next_replenishment) {
116 list_add(&res->replenish_list, pos->prev);
117 return passed_earlier;
118 } else
119 passed_earlier = 1;
120 }
121
122 list_add_tail(&res->replenish_list, &sup_env->depleted_reservations);
123
124 return passed_earlier;
125}
126
127static void sup_queue_depleted(
128 struct sup_reservation_environment* sup_env,
129 struct reservation *res)
130{
131 int passed_earlier = _sup_queue_depleted(sup_env, res);
132
133 /* check for updated replenishment time */
134 if (!passed_earlier)
135 sup_scheduler_update_at(sup_env, res->next_replenishment);
136}
137
138static int _sup_queue_active(
139 struct sup_reservation_environment* sup_env,
140 struct reservation *res)
141{
142 struct list_head *pos;
143 struct reservation *queued;
144 int passed_active = 0;
145
146 if (likely(res->priority != RESERVATION_BACKGROUND_PRIORITY)) {
147 /* enqueue in order of priority */
148 list_for_each(pos, &sup_env->active_reservations) {
149 queued = list_entry(pos, struct reservation, list);
150 if (queued->priority > res->priority) {
151 list_add(&res->list, pos->prev);
152 return passed_active;
153 } else if (queued->state == RESERVATION_ACTIVE)
154 passed_active = 1;
155 }
156 } else {
157 /* don't preempt unless the list happens to be empty */
158 passed_active = !list_empty(&sup_env->active_reservations);
159 }
160 /* Either a background reservation, or we fell off the end of the list.
161 * In both cases, just add the reservation to the end of the list of
162 * active reservations. */
163 list_add_tail(&res->list, &sup_env->active_reservations);
164 return passed_active;
165}
166
167static void sup_queue_active(
168 struct sup_reservation_environment* sup_env,
169 struct reservation *res)
170{
171 int passed_active = _sup_queue_active(sup_env, res);
172
173 /* check for possible preemption */
174 if (res->state == RESERVATION_ACTIVE && !passed_active)
175 sup_env->next_scheduler_update = SUP_RESCHEDULE_NOW;
176 else if (res == list_first_entry(&sup_env->active_reservations,
177 struct reservation, list)) {
178 /* First reservation is draining budget => make sure
179 * the scheduler is called to notice when the reservation
180 * budget has been drained completely. */
181 sup_scheduler_update_after(sup_env, res->cur_budget);
182 }
183}
184
185static void sup_queue_reservation(
186 struct sup_reservation_environment* sup_env,
187 struct reservation *res)
188{
189 switch (res->state) {
190 case RESERVATION_INACTIVE:
191 list_add(&res->list, &sup_env->inactive_reservations);
192 break;
193
194 case RESERVATION_DEPLETED:
195 sup_queue_depleted(sup_env, res);
196 break;
197
198 case RESERVATION_ACTIVE_IDLE:
199 case RESERVATION_ACTIVE:
200 sup_queue_active(sup_env, res);
201 break;
202 }
203}
204
205void sup_add_new_reservation(
206 struct sup_reservation_environment* sup_env,
207 struct reservation* new_res)
208{
209 new_res->env = &sup_env->env;
210 list_add(&new_res->all_list, &sup_env->all_reservations);
211 sup_queue_reservation(sup_env, new_res);
212}
213
214struct reservation* sup_find_by_id(struct sup_reservation_environment* sup_env,
215 unsigned int id)
216{
217 struct reservation *res;
218
219 list_for_each_entry(res, &sup_env->all_reservations, all_list) {
220 if (res->id == id)
221 return res;
222 }
223
224 return NULL;
225}
226
227static void sup_charge_budget(
228 struct sup_reservation_environment* sup_env,
229 lt_t delta)
230{
231 struct reservation *res;
232
233 /* charge the highest-priority ACTIVE or ACTIVE_IDLE reservation */
234
235 res = list_first_entry_or_null(
236 &sup_env->active_reservations, struct reservation, list);
237
238 if (res) {
239 TRACE("R%d: charging at %llu for %llu execution, budget before: %llu\n",
240 res->id, res->env->current_time, delta, res->cur_budget);
241 res->ops->drain_budget(res, delta);
242 TRACE("R%d: budget now: %llu, priority: %llu\n",
243 res->id, res->cur_budget, res->priority);
244 }
245
246 /* check when the next budget expires */
247
248 res = list_first_entry_or_null(
249 &sup_env->active_reservations, struct reservation, list);
250
251 if (res) {
252 /* make sure scheduler is invoked when this reservation expires
253 * its remaining budget */
254 TRACE("requesting scheduler update for reservation %u "
255 "in %llu nanoseconds\n",
256 res->id, res->cur_budget);
257 sup_scheduler_update_after(sup_env, res->cur_budget);
258 }
259}
260
261static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
262{
263 struct list_head *pos, *next;
264 struct reservation *res;
265
266 list_for_each_safe(pos, next, &sup_env->depleted_reservations) {
267 res = list_entry(pos, struct reservation, replenish_list);
268 if (res->next_replenishment <= sup_env->env.current_time) {
269 TRACE("R%d: replenishing budget at %llu, "
270 "priority: %llu\n",
271 res->id, res->env->current_time, res->priority);
272 res->ops->replenish(res);
273 } else {
274 /* list is ordered by increasing depletion times */
275 break;
276 }
277 }
278
279 /* request a scheduler update at the next replenishment instant */
280 res = list_first_entry_or_null(&sup_env->depleted_reservations,
281 struct reservation, replenish_list);
282 if (res)
283 sup_scheduler_update_at(sup_env, res->next_replenishment);
284}
285
286void sup_update_time(
287 struct sup_reservation_environment* sup_env,
288 lt_t now)
289{
290 lt_t delta;
291
292 /* If the time didn't advance, there is nothing to do.
293	 * This check makes it safe to call sup_update_time() potentially
294	 * multiple times (e.g., via different code paths). */
295 if (!list_empty(&sup_env->active_reservations))
296 TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now,
297 sup_env->env.current_time);
298 if (unlikely(now <= sup_env->env.current_time))
299 return;
300
301 delta = now - sup_env->env.current_time;
302 sup_env->env.current_time = now;
303
304 /* check if future updates are required */
305 if (sup_env->next_scheduler_update <= sup_env->env.current_time)
306 sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
307
308 /* deplete budgets by passage of time */
309 sup_charge_budget(sup_env, delta);
310
311 /* check if any budgets were replenished */
312 sup_replenish_budgets(sup_env);
313}
314
315struct task_struct* sup_dispatch(struct sup_reservation_environment* sup_env)
316{
317 struct reservation *res, *next;
318 struct task_struct *tsk = NULL;
319 lt_t time_slice;
320
321 list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
322 if (res->state == RESERVATION_ACTIVE) {
323 tsk = res->ops->dispatch_client(res, &time_slice);
324 if (likely(tsk)) {
325 if (time_slice)
326 sup_scheduler_update_after(sup_env, time_slice);
327 sup_scheduler_update_after(sup_env, res->cur_budget);
328 return tsk;
329 }
330 }
331 }
332
333 return NULL;
334}
335
336static void sup_res_change_state(
337 struct reservation_environment* env,
338 struct reservation *res,
339 reservation_state_t new_state)
340{
341 struct sup_reservation_environment* sup_env;
342
343 sup_env = container_of(env, struct sup_reservation_environment, env);
344
345 TRACE("reservation R%d state %d->%d at %llu\n",
346 res->id, res->state, new_state, env->current_time);
347
348 if (new_state == RESERVATION_DEPLETED
349 && (res->state == RESERVATION_ACTIVE ||
350 res->state == RESERVATION_ACTIVE_IDLE)) {
351 budget_notifiers_fire(&res->budget_notifiers, false);
352 } else if (res->state == RESERVATION_DEPLETED
353 && new_state == RESERVATION_ACTIVE) {
354 budget_notifiers_fire(&res->budget_notifiers, true);
355 }
356
357 /* dequeue prior to re-queuing */
358 if (res->state == RESERVATION_DEPLETED)
359 list_del(&res->replenish_list);
360 else
361 list_del(&res->list);
362
363 /* check if we need to reschedule because we lost an active reservation */
364 if (res->state == RESERVATION_ACTIVE && !sup_env->will_schedule)
365 sup_env->next_scheduler_update = SUP_RESCHEDULE_NOW;
366 res->state = new_state;
367 sup_queue_reservation(sup_env, res);
368}
369
370static void sup_request_replenishment(
371 struct reservation_environment* env,
372 struct reservation *res)
373{
374 struct sup_reservation_environment* sup_env;
375
376 sup_env = container_of(env, struct sup_reservation_environment, env);
377 sup_queue_depleted(sup_env, res);
378}
379
380void sup_init(struct sup_reservation_environment* sup_env)
381{
382 memset(sup_env, 0, sizeof(*sup_env));
383
384 INIT_LIST_HEAD(&sup_env->all_reservations);
385 INIT_LIST_HEAD(&sup_env->active_reservations);
386 INIT_LIST_HEAD(&sup_env->depleted_reservations);
387 INIT_LIST_HEAD(&sup_env->inactive_reservations);
388
389 sup_env->env.change_state = sup_res_change_state;
390 sup_env->env.request_replenishment = sup_request_replenishment;
391
392 sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
393}
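/* Editorial sketch (not part of this patch): one plausible call sequence for
 * a uniprocessor reservation plugin driving this environment. Locking and
 * plugin glue are omitted; the example_ names are hypothetical. */
static struct sup_reservation_environment example_sup_env;

static void example_setup(struct reservation *res)
{
	sup_init(&example_sup_env);
	sup_add_new_reservation(&example_sup_env, res);
}

static struct task_struct* example_pick_next_task(void)
{
	/* advance time first: this charges the running reservation's budget
	 * and replenishes any depleted reservations that are due */
	sup_update_time(&example_sup_env, litmus_clock());

	/* then ask the highest-priority active reservation for a task */
	return sup_dispatch(&example_sup_env);
}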
diff --git a/litmus/reservations/polling.c b/litmus/reservations/polling.c
new file mode 100644
index 000000000000..63e0bed566e8
--- /dev/null
+++ b/litmus/reservations/polling.c
@@ -0,0 +1,256 @@
1#include <linux/sched.h>
2
3#include <litmus/litmus.h>
4#include <litmus/reservations/reservation.h>
5#include <litmus/reservations/polling.h>
6
7
8static void periodic_polling_client_arrives(
9 struct reservation* res,
10 struct reservation_client *client
11)
12{
13 struct polling_reservation *pres =
14 container_of(res, struct polling_reservation, res);
15 lt_t instances, tmp;
16
17 list_add_tail(&client->list, &res->clients);
18
19 switch (res->state) {
20 case RESERVATION_INACTIVE:
21 /* Figure out next replenishment time. */
22 tmp = res->env->current_time - res->env->time_zero;
23 instances = div64_u64(tmp, pres->period);
24 res->next_replenishment =
25 (instances + 1) * pres->period + pres->offset;
26
27 TRACE("pol-res: activate tmp=%llu instances=%llu period=%llu nextrp=%llu cur=%llu\n",
28 tmp, instances, pres->period, res->next_replenishment,
29 res->env->current_time);
30
31 res->env->change_state(res->env, res,
32 RESERVATION_DEPLETED);
33 break;
34
35 case RESERVATION_ACTIVE:
36 case RESERVATION_DEPLETED:
37 /* do nothing */
38 break;
39
40 case RESERVATION_ACTIVE_IDLE:
41 res->env->change_state(res->env, res,
42 RESERVATION_ACTIVE);
43 break;
44 }
45}
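/* Editorial note (not part of this patch): a worked instance of the
 * computation above. With time_zero = 0, period = 10, offset = 2, and
 * current_time = 27, we get tmp = 27 and instances = 2, hence
 * next_replenishment = (2 + 1) * 10 + 2 = 32, i.e., the reservation first
 * replenishes at the start of the next period plus the configured offset. */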
46
47
48static void periodic_polling_client_departs(
49 struct reservation *res,
50 struct reservation_client *client,
51 int did_signal_job_completion
52)
53{
54 list_del(&client->list);
55
56 switch (res->state) {
57 case RESERVATION_INACTIVE:
58 case RESERVATION_ACTIVE_IDLE:
59 BUG(); /* INACTIVE or IDLE <=> no client */
60 break;
61
62 case RESERVATION_ACTIVE:
63 if (list_empty(&res->clients)) {
64 res->env->change_state(res->env, res,
65 RESERVATION_ACTIVE_IDLE);
66 } /* else: nothing to do, more clients ready */
67 break;
68
69 case RESERVATION_DEPLETED:
70 /* do nothing */
71 break;
72 }
73}
74
75static void periodic_polling_on_replenishment(
76 struct reservation *res
77)
78{
79 struct polling_reservation *pres =
80 container_of(res, struct polling_reservation, res);
81
82 /* replenish budget */
83 res->cur_budget = pres->max_budget;
84 res->next_replenishment += pres->period;
85 res->budget_consumed = 0;
86
87 switch (res->state) {
88 case RESERVATION_DEPLETED:
89 case RESERVATION_INACTIVE:
90 case RESERVATION_ACTIVE_IDLE:
91 if (list_empty(&res->clients))
92 /* no clients => poll again later */
93 res->env->change_state(res->env, res,
94 RESERVATION_INACTIVE);
95 else
96 /* we have clients & budget => ACTIVE */
97 res->env->change_state(res->env, res,
98 RESERVATION_ACTIVE);
99 break;
100
101 case RESERVATION_ACTIVE:
102 /* Replenished while active => tardy? In any case,
103 * go ahead and stay active. */
104 break;
105 }
106}
107
108static void periodic_polling_on_replenishment_edf(
109 struct reservation *res
110)
111{
112 struct polling_reservation *pres =
113 container_of(res, struct polling_reservation, res);
114
115 /* update current priority */
116 res->priority = res->next_replenishment + pres->deadline;
117
118 /* do common updates */
119 periodic_polling_on_replenishment(res);
120}
121
122static struct reservation_ops periodic_polling_ops_fp = {
123 .dispatch_client = default_dispatch_client,
124 .client_arrives = periodic_polling_client_arrives,
125 .client_departs = periodic_polling_client_departs,
126 .replenish = periodic_polling_on_replenishment,
127 .drain_budget = common_drain_budget,
128};
129
130static struct reservation_ops periodic_polling_ops_edf = {
131 .dispatch_client = default_dispatch_client,
132 .client_arrives = periodic_polling_client_arrives,
133 .client_departs = periodic_polling_client_departs,
134 .replenish = periodic_polling_on_replenishment_edf,
135 .drain_budget = common_drain_budget,
136};
137
138
139
140
141static void sporadic_polling_client_arrives_fp(
142 struct reservation* res,
143 struct reservation_client *client
144)
145{
146 struct polling_reservation *pres =
147 container_of(res, struct polling_reservation, res);
148
149 list_add_tail(&client->list, &res->clients);
150
151 switch (res->state) {
152 case RESERVATION_INACTIVE:
153 /* Replenish now. */
154 res->cur_budget = pres->max_budget;
155 res->next_replenishment =
156 res->env->current_time + pres->period;
157
158 res->env->change_state(res->env, res,
159 RESERVATION_ACTIVE);
160 break;
161
162 case RESERVATION_ACTIVE:
163 case RESERVATION_DEPLETED:
164 /* do nothing */
165 break;
166
167 case RESERVATION_ACTIVE_IDLE:
168 res->env->change_state(res->env, res,
169 RESERVATION_ACTIVE);
170 break;
171 }
172}
173
174static void sporadic_polling_client_arrives_edf(
175 struct reservation* res,
176 struct reservation_client *client
177)
178{
179 struct polling_reservation *pres =
180 container_of(res, struct polling_reservation, res);
181
182 list_add_tail(&client->list, &res->clients);
183
184 switch (res->state) {
185 case RESERVATION_INACTIVE:
186 /* Replenish now. */
187 res->cur_budget = pres->max_budget;
188 res->next_replenishment =
189 res->env->current_time + pres->period;
190 res->priority =
191 res->env->current_time + pres->deadline;
192
193 res->env->change_state(res->env, res,
194 RESERVATION_ACTIVE);
195 break;
196
197 case RESERVATION_ACTIVE:
198 case RESERVATION_DEPLETED:
199 /* do nothing */
200 break;
201
202 case RESERVATION_ACTIVE_IDLE:
203 res->env->change_state(res->env, res,
204 RESERVATION_ACTIVE);
205 break;
206 }
207}
208
209static struct reservation_ops sporadic_polling_ops_fp = {
210 .dispatch_client = default_dispatch_client,
211 .client_arrives = sporadic_polling_client_arrives_fp,
212 .client_departs = periodic_polling_client_departs,
213 .replenish = periodic_polling_on_replenishment,
214 .drain_budget = common_drain_budget,
215};
216
217static struct reservation_ops sporadic_polling_ops_edf = {
218 .dispatch_client = default_dispatch_client,
219 .client_arrives = sporadic_polling_client_arrives_edf,
220 .client_departs = periodic_polling_client_departs,
221 .replenish = periodic_polling_on_replenishment_edf,
222 .drain_budget = common_drain_budget,
223};
224
225void polling_reservation_init(
226 struct polling_reservation *pres,
227 int use_edf_prio,
228 int use_periodic_polling,
229 lt_t budget, lt_t period, lt_t deadline, lt_t offset
230)
231{
232 if (!deadline)
233 deadline = period;
234 BUG_ON(budget > period);
235 BUG_ON(budget > deadline);
236 BUG_ON(offset >= period);
237
238 reservation_init(&pres->res);
239 pres->max_budget = budget;
240 pres->period = period;
241 pres->deadline = deadline;
242 pres->offset = offset;
243 if (use_periodic_polling) {
244 pres->res.kind = PERIODIC_POLLING;
245 if (use_edf_prio)
246 pres->res.ops = &periodic_polling_ops_edf;
247 else
248 pres->res.ops = &periodic_polling_ops_fp;
249 } else {
250 pres->res.kind = SPORADIC_POLLING;
251 if (use_edf_prio)
252 pres->res.ops = &sporadic_polling_ops_edf;
253 else
254 pres->res.ops = &sporadic_polling_ops_fp;
255 }
256}
diff --git a/litmus/reservations/table-driven.c b/litmus/reservations/table-driven.c
new file mode 100644
index 000000000000..e4debcb5d4d2
--- /dev/null
+++ b/litmus/reservations/table-driven.c
@@ -0,0 +1,269 @@
1#include <linux/sched.h>
2
3#include <litmus/litmus.h>
4#include <litmus/reservations/reservation.h>
5#include <litmus/reservations/table-driven.h>
6
7static lt_t td_cur_major_cycle_start(struct table_driven_reservation *tdres)
8{
9 lt_t x, tmp;
10
11 tmp = tdres->res.env->current_time - tdres->res.env->time_zero;
12 x = div64_u64(tmp, tdres->major_cycle);
13 x *= tdres->major_cycle;
14 return x;
15}
16
17
18static lt_t td_next_major_cycle_start(struct table_driven_reservation *tdres)
19{
20 lt_t x, tmp;
21
22 tmp = tdres->res.env->current_time - tdres->res.env->time_zero;
23 x = div64_u64(tmp, tdres->major_cycle) + 1;
24 x *= tdres->major_cycle;
25 return x;
26}
27
28static void td_client_arrives(
29 struct reservation* res,
30 struct reservation_client *client
31)
32{
33 struct table_driven_reservation *tdres =
34 container_of(res, struct table_driven_reservation, res);
35
36 list_add_tail(&client->list, &res->clients);
37
38 switch (res->state) {
39 case RESERVATION_INACTIVE:
40 /* Figure out first replenishment time. */
41 tdres->major_cycle_start = td_next_major_cycle_start(tdres);
42 res->next_replenishment = tdres->major_cycle_start;
43 res->next_replenishment += tdres->intervals[0].start;
44 tdres->next_interval = 0;
45
46 res->env->change_state(res->env, res,
47 RESERVATION_DEPLETED);
48 break;
49
50 case RESERVATION_ACTIVE:
51 case RESERVATION_DEPLETED:
52 /* do nothing */
53 break;
54
55 case RESERVATION_ACTIVE_IDLE:
56 res->env->change_state(res->env, res,
57 RESERVATION_ACTIVE);
58 break;
59 }
60}
61
62static void td_client_departs(
63 struct reservation *res,
64 struct reservation_client *client,
65 int did_signal_job_completion
66)
67{
68 list_del(&client->list);
69
70 switch (res->state) {
71 case RESERVATION_INACTIVE:
72 case RESERVATION_ACTIVE_IDLE:
73 BUG(); /* INACTIVE or IDLE <=> no client */
74 break;
75
76 case RESERVATION_ACTIVE:
77 if (list_empty(&res->clients)) {
78 res->env->change_state(res->env, res,
79 RESERVATION_ACTIVE_IDLE);
80 } /* else: nothing to do, more clients ready */
81 break;
82
83 case RESERVATION_DEPLETED:
84 /* do nothing */
85 break;
86 }
87}
88
89static lt_t td_time_remaining_until_end(struct table_driven_reservation *tdres)
90{
91 lt_t now = tdres->res.env->current_time;
92 lt_t end = tdres->cur_interval.end;
93 TRACE("td_remaining(%u): start=%llu now=%llu end=%llu state=%d\n",
94 tdres->res.id,
95 tdres->cur_interval.start,
96 now, end,
97 tdres->res.state);
98 if (now >= end)
99 return 0;
100 else
101 return end - now;
102}
103
104static void td_replenish(
105 struct reservation *res)
106{
107 struct table_driven_reservation *tdres =
108 container_of(res, struct table_driven_reservation, res);
109
110 TRACE("td_replenish(%u): expected_replenishment=%llu\n", res->id,
111 res->next_replenishment);
112
113 /* figure out current interval */
114 tdres->cur_interval.start = tdres->major_cycle_start +
115 tdres->intervals[tdres->next_interval].start;
116 tdres->cur_interval.end = tdres->major_cycle_start +
117 tdres->intervals[tdres->next_interval].end;
118 TRACE("major_cycle_start=%llu => [%llu, %llu]\n",
119 tdres->major_cycle_start,
120 tdres->cur_interval.start,
121 tdres->cur_interval.end);
122
123 /* reset budget */
124 res->cur_budget = td_time_remaining_until_end(tdres);
125 res->budget_consumed = 0;
126 TRACE("td_replenish(%u): %s budget=%llu\n", res->id,
127 res->cur_budget ? "" : "WARNING", res->cur_budget);
128
129 /* prepare next slot */
130 tdres->next_interval = (tdres->next_interval + 1) % tdres->num_intervals;
131 if (!tdres->next_interval)
132 /* wrap to next major cycle */
133 tdres->major_cycle_start += tdres->major_cycle;
134
135 /* determine next time this reservation becomes eligible to execute */
136 res->next_replenishment = tdres->major_cycle_start;
137 res->next_replenishment += tdres->intervals[tdres->next_interval].start;
138 TRACE("td_replenish(%u): next_replenishment=%llu\n", res->id,
139 res->next_replenishment);
140
141
142 switch (res->state) {
143 case RESERVATION_DEPLETED:
144 case RESERVATION_ACTIVE:
145 case RESERVATION_ACTIVE_IDLE:
146 if (list_empty(&res->clients))
147 res->env->change_state(res->env, res,
148 RESERVATION_ACTIVE_IDLE);
149 else
150 /* we have clients & budget => ACTIVE */
151 res->env->change_state(res->env, res,
152 RESERVATION_ACTIVE);
153 break;
154
155 case RESERVATION_INACTIVE:
156 BUG();
157 break;
158 }
159}
160
161static void td_drain_budget(
162 struct reservation *res,
163 lt_t how_much)
164{
165 struct table_driven_reservation *tdres =
166 container_of(res, struct table_driven_reservation, res);
167
168 res->budget_consumed += how_much;
169 res->budget_consumed_total += how_much;
170
171 /* Table-driven scheduling: instead of tracking the budget, we compute
172 * how much time is left in this allocation interval. */
173
174 /* sanity check: we should never try to drain from future slots */
175 BUG_ON(tdres->cur_interval.start > res->env->current_time);
176
177 switch (res->state) {
178 case RESERVATION_DEPLETED:
179 case RESERVATION_INACTIVE:
180 BUG();
181 break;
182
183 case RESERVATION_ACTIVE_IDLE:
184 case RESERVATION_ACTIVE:
185 res->cur_budget = td_time_remaining_until_end(tdres);
186 TRACE("td_drain_budget(%u): drained to budget=%llu\n",
187 res->id, res->cur_budget);
188 if (!res->cur_budget) {
189 res->env->change_state(res->env, res,
190 RESERVATION_DEPLETED);
191 } else {
192 /* sanity check budget calculation */
193 BUG_ON(res->env->current_time >= tdres->cur_interval.end);
194 BUG_ON(res->env->current_time < tdres->cur_interval.start);
195 }
196
197 break;
198 }
199}
200
201static struct task_struct* td_dispatch_client(
202 struct reservation *res,
203 lt_t *for_at_most)
204{
205 struct task_struct *t;
206 struct table_driven_reservation *tdres =
207 container_of(res, struct table_driven_reservation, res);
208
209 /* usual logic for selecting a client */
210 t = default_dispatch_client(res, for_at_most);
211
212 TRACE_TASK(t, "td_dispatch_client(%u): selected, budget=%llu\n",
213 res->id, res->cur_budget);
214
215 /* check how much budget we have left in this time slot */
216 res->cur_budget = td_time_remaining_until_end(tdres);
217
218 TRACE_TASK(t, "td_dispatch_client(%u): updated to budget=%llu next=%d\n",
219 res->id, res->cur_budget, tdres->next_interval);
220
221 if (unlikely(!res->cur_budget)) {
222 /* Unlikely case: if we ran out of budget, the user configured
223 * a broken scheduling table (overlapping table slots).
224 * Not much we can do about this, but we can't dispatch a job
225 * now without causing overload. So let's register this reservation
226 * as depleted and wait for the next allocation. */
227 TRACE("td_dispatch_client(%u): budget unexpectedly depleted "
228 "(check scheduling table for unintended overlap)\n",
229 res->id);
230 res->env->change_state(res->env, res,
231 RESERVATION_DEPLETED);
232 return NULL;
233 } else
234 return t;
235}
236
237static struct reservation_ops td_ops = {
238 .dispatch_client = td_dispatch_client,
239 .client_arrives = td_client_arrives,
240 .client_departs = td_client_departs,
241 .replenish = td_replenish,
242 .drain_budget = td_drain_budget,
243};
244
245void table_driven_reservation_init(
246 struct table_driven_reservation *tdres,
247 lt_t major_cycle,
248 struct lt_interval *intervals,
249 unsigned int num_intervals)
250{
251 unsigned int i;
252
253 /* sanity checking */
254 BUG_ON(!num_intervals);
255 for (i = 0; i < num_intervals; i++)
256 BUG_ON(intervals[i].end <= intervals[i].start);
257 for (i = 0; i + 1 < num_intervals; i++)
258 BUG_ON(intervals[i + 1].start <= intervals[i].end);
259 BUG_ON(intervals[num_intervals - 1].end > major_cycle);
260
261 reservation_init(&tdres->res);
262 tdres->res.kind = TABLE_DRIVEN;
263 tdres->major_cycle = major_cycle;
264 tdres->intervals = intervals;
265 tdres->cur_interval.start = 0;
266 tdres->cur_interval.end = 0;
267 tdres->num_intervals = num_intervals;
268 tdres->res.ops = &td_ops;
269}
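/* Editorial sketch (not part of this patch): a two-slot table covering
 * [0, 10) and [50, 60) within a major cycle of 100 time units, which
 * satisfies every BUG_ON above. Static storage and the example_ names are
 * used only to keep the sketch short; interpreting lt_t as nanoseconds is
 * an assumption. */
static struct lt_interval example_slots[] = {
	{ .start =  0, .end = 10 },
	{ .start = 50, .end = 60 },
};

static struct table_driven_reservation example_td_res;

static void example_table_setup(void)
{
	table_driven_reservation_init(&example_td_res,
		100, /* major cycle length */
		example_slots, 2);
}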
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 000000000000..733a483e3084
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,351 @@
1/*
2 * litmus/rt_domain.c
3 *
4 * LITMUS real-time infrastructure. This file contains the
5 * functions that manipulate RT domains. RT domains are an abstraction
6 * of a ready queue and a release queue.
7 */
8
9#include <linux/percpu.h>
10#include <linux/sched.h>
11#include <linux/list.h>
12#include <linux/slab.h>
13
14#include <litmus/litmus.h>
15#include <litmus/sched_plugin.h>
16#include <litmus/sched_trace.h>
17#include <litmus/debug_trace.h>
18
19#include <litmus/rt_domain.h>
20
21#include <litmus/trace.h>
22
23#include <litmus/bheap.h>
24
25/* Uncomment when debugging timer races... */
26#if 0
27#define VTRACE_TASK TRACE_TASK
28#define VTRACE TRACE
29#else
30#define VTRACE_TASK(t, fmt, args...) /* shut up */
31#define VTRACE(fmt, args...) /* be quiet already */
32#endif
33
34static int dummy_resched(rt_domain_t *rt)
35{
36 return 0;
37}
38
39static int dummy_order(struct bheap_node* a, struct bheap_node* b)
40{
41 return 0;
42}
43
44/* default implementation: use default lock */
45static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
46{
47 merge_ready(rt, tasks);
48}
49
50static unsigned int time2slot(lt_t time)
51{
52 return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS;
53}
54
55static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
56{
57 unsigned long flags;
58 struct release_heap* rh;
59 rh = container_of(timer, struct release_heap, timer);
60
61 TS_RELEASE_LATENCY(rh->release_time);
62
63 VTRACE("on_release_timer(0x%p) starts.\n", timer);
64
65 TS_RELEASE_START;
66
67
68 raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
69 VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
70 /* remove from release queue */
71 list_del(&rh->list);
72 raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags);
73 VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock);
74
75 /* call release callback */
76 rh->dom->release_jobs(rh->dom, &rh->heap);
77 /* WARNING: rh can be referenced from other CPUs from now on. */
78
79 TS_RELEASE_END;
80
81 VTRACE("on_release_timer(0x%p) ends.\n", timer);
82
83 return HRTIMER_NORESTART;
84}
85
86/* allocated in litmus.c */
87struct kmem_cache * release_heap_cache;
88
89struct release_heap* release_heap_alloc(int gfp_flags)
90{
91 struct release_heap* rh;
92 rh= kmem_cache_alloc(release_heap_cache, gfp_flags);
93 if (rh) {
94 /* initialize timer */
95 hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
96 rh->timer.function = on_release_timer;
97 }
98 return rh;
99}
100
101void release_heap_free(struct release_heap* rh)
102{
103 /* make sure timer is no longer in use */
104 hrtimer_cancel(&rh->timer);
105 kmem_cache_free(release_heap_cache, rh);
106}
107
108/* Caller must hold release lock.
109 * Will return heap for given time. If no such heap exists prior to
110 * the invocation it will be created.
111 */
112static struct release_heap* get_release_heap(rt_domain_t *rt,
113 struct task_struct* t,
114 int use_task_heap)
115{
116 struct list_head* pos;
117 struct release_heap* heap = NULL;
118 struct release_heap* rh;
119 lt_t release_time = get_release(t);
120 unsigned int slot = time2slot(release_time);
121
122 /* initialize pos for the case that the list is empty */
123 pos = rt->release_queue.slot[slot].next;
124 list_for_each(pos, &rt->release_queue.slot[slot]) {
125 rh = list_entry(pos, struct release_heap, list);
126 if (release_time == rh->release_time) {
127 /* perfect match -- this happens on hyperperiod
128 * boundaries
129 */
130 heap = rh;
131 break;
132 } else if (lt_before(release_time, rh->release_time)) {
133 /* we need to insert a new node since rh is
134 * already in the future
135 */
136 break;
137 }
138 }
139 if (!heap && use_task_heap) {
140 /* use pre-allocated release heap */
141 rh = tsk_rt(t)->rel_heap;
142
143 rh->dom = rt;
144 rh->release_time = release_time;
145
146 /* add to release queue */
147 list_add(&rh->list, pos->prev);
148 heap = rh;
149 }
150 return heap;
151}
152
153static void reinit_release_heap(struct task_struct* t)
154{
155 struct release_heap* rh;
156
157 /* use pre-allocated release heap */
158 rh = tsk_rt(t)->rel_heap;
159
160 /* Make sure it is safe to use. The timer callback could still
161 * be executing on another CPU; hrtimer_cancel() will wait
162 * until the timer callback has completed. However, under no
163 * circumstances should the timer be active (= yet to be
164 * triggered).
165 *
166 * WARNING: If the CPU still holds the release_lock at this point,
167 * deadlock may occur!
168 */
169 BUG_ON(hrtimer_cancel(&rh->timer));
170
171 /* initialize */
172 bheap_init(&rh->heap);
173}
174/* arm_release_timer() - start local release timer or trigger
175 * remote timer (pull timer)
176 *
177 * Called by add_release() with:
178 * - tobe_lock taken
179 * - IRQ disabled
180 */
181#ifdef CONFIG_RELEASE_MASTER
182#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU)
183static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu)
184#else
185static void arm_release_timer(rt_domain_t *_rt)
186#endif
187{
188 rt_domain_t *rt = _rt;
189 struct list_head list;
190 struct list_head *pos, *safe;
191 struct task_struct* t;
192 struct release_heap* rh;
193
194 VTRACE("arm_release_timer() at %llu\n", litmus_clock());
195 list_replace_init(&rt->tobe_released, &list);
196
197 list_for_each_safe(pos, safe, &list) {
198 /* pick task of work list */
199 t = list_entry(pos, struct task_struct, rt_param.list);
200 sched_trace_task_release(t);
201 list_del(pos);
202
203 /* put into release heap while holding release_lock */
204 raw_spin_lock(&rt->release_lock);
205 VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock);
206
207 rh = get_release_heap(rt, t, 0);
208 if (!rh) {
209 /* need to use our own, but drop lock first */
210 raw_spin_unlock(&rt->release_lock);
211 VTRACE_TASK(t, "Dropped release_lock 0x%p\n",
212 &rt->release_lock);
213
214 reinit_release_heap(t);
215 VTRACE_TASK(t, "release_heap ready\n");
216
217 raw_spin_lock(&rt->release_lock);
218 VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n",
219 &rt->release_lock);
220
221 rh = get_release_heap(rt, t, 1);
222 }
223 bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node);
224 VTRACE_TASK(t, "arm_release_timer(): added to release heap\n");
225
226 raw_spin_unlock(&rt->release_lock);
227 VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock);
228
229 /* To avoid arming the timer multiple times, we only let the
230 * owner do the arming (which is the "first" task to reference
231 * this release_heap anyway).
232 */
233 if (rh == tsk_rt(t)->rel_heap) {
234 VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer);
235
236 if (!hrtimer_is_hres_active(&rh->timer)) {
237 TRACE_TASK(t, "WARNING: no hires timer!!!\n");
238 }
239
240 /* we cannot arm the timer using hrtimer_start()
241 * as it may deadlock on rq->lock
242 *
243 * PINNED mode is ok on both local and remote CPU
244 */
245#ifdef CONFIG_RELEASE_MASTER
246 if (rt->release_master == NO_CPU &&
247 target_cpu == NO_CPU)
248#endif
249 hrtimer_start(&rh->timer,
250 ns_to_ktime(rh->release_time),
251 HRTIMER_MODE_ABS_PINNED);
252#ifdef CONFIG_RELEASE_MASTER
253 else
254 hrtimer_start_on(
255 /* target_cpu overrides release master */
256 (target_cpu != NO_CPU ?
257 target_cpu : rt->release_master),
258 &rh->info, &rh->timer,
259 ns_to_ktime(rh->release_time),
260 HRTIMER_MODE_ABS_PINNED);
261#endif
262 } else
263 VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer);
264 }
265}
266
267void rt_domain_init(rt_domain_t *rt,
268 bheap_prio_t order,
269 check_resched_needed_t check,
270 release_jobs_t release
271 )
272{
273 int i;
274
275 BUG_ON(!rt);
276 if (!check)
277 check = dummy_resched;
278 if (!release)
279 release = default_release_jobs;
280 if (!order)
281 order = dummy_order;
282
283#ifdef CONFIG_RELEASE_MASTER
284 rt->release_master = NO_CPU;
285#endif
286
287 bheap_init(&rt->ready_queue);
288 INIT_LIST_HEAD(&rt->tobe_released);
289 for (i = 0; i < RELEASE_QUEUE_SLOTS; i++)
290 INIT_LIST_HEAD(&rt->release_queue.slot[i]);
291
292 raw_spin_lock_init(&rt->ready_lock);
293 raw_spin_lock_init(&rt->release_lock);
294 raw_spin_lock_init(&rt->tobe_lock);
295
296 rt->check_resched = check;
297 rt->release_jobs = release;
298 rt->order = order;
299}
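/* Editorial sketch (not part of this patch): minimal initialization of an
 * rt_domain with a custom priority order. Passing NULL for the resched and
 * release hooks selects the dummy/default implementations installed above;
 * the example_ names are hypothetical. */
static int example_order(struct bheap_node *a, struct bheap_node *b)
{
	/* a real plugin compares the tasks stored in the heap nodes here,
	 * e.g. via edf_higher_prio() in the EDF-based plugins */
	return 0;
}

static rt_domain_t example_domain;

static void example_domain_setup(void)
{
	rt_domain_init(&example_domain, example_order, NULL, NULL);
}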
300
301/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
302 * @new: the newly released task
303 */
304void __add_ready(rt_domain_t* rt, struct task_struct *new)
305{
306 TRACE("rt: adding %s/%d (%llu, %llu, %llu) rel=%llu "
307 "to ready queue at %llu\n",
308 new->comm, new->pid,
309 get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new),
310 get_release(new), litmus_clock());
311
312 BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
313
314 bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
315 rt->check_resched(rt);
316}
317
318/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable.
319 * @tasks - the newly released tasks
320 */
321void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
322{
323 bheap_union(rt->order, &rt->ready_queue, tasks);
324 rt->check_resched(rt);
325}
326
327
328#ifdef CONFIG_RELEASE_MASTER
329void __add_release_on(rt_domain_t* rt, struct task_struct *task,
330 int target_cpu)
331{
332 TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n",
333 get_release(task), target_cpu);
334 list_add(&tsk_rt(task)->list, &rt->tobe_released);
335 task->rt_param.domain = rt;
336
337 arm_release_timer_on(rt, target_cpu);
338}
339#endif
340
341/* add_release - add a real-time task to the rt release queue.
342 * @task: the sleeping task
343 */
344void __add_release(rt_domain_t* rt, struct task_struct *task)
345{
346 TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
347 list_add(&tsk_rt(task)->list, &rt->tobe_released);
348 task->rt_param.domain = rt;
349
350 arm_release_timer(rt);
351}
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
new file mode 100644
index 000000000000..d12a5958f5dc
--- /dev/null
+++ b/litmus/sched_cedf.c
@@ -0,0 +1,890 @@
1/*
2 * litmus/sched_cedf.c
3 *
4 * Implementation of the C-EDF scheduling algorithm.
5 *
6 * This implementation is based on G-EDF:
7 * - CPUs are clustered around L2 or L3 caches.
8 * - Cluster topology is automatically detected (this is arch-dependent
9 *   and currently works only on x86 --- and only with modern
10 *   CPUs that export cpuid4 information)
11 * - The plugin _does not_ attempt to put tasks in the right cluster, i.e.,
12 *   the programmer needs to be aware of the topology to place tasks
13 * in the desired cluster
14 * - default clustering is around L2 cache (cache index = 2)
15 * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all
16 * online_cpus are placed in a single cluster).
17 *
18 * For details on functions, take a look at sched_gsn_edf.c
19 *
20 * Currently, we do not support changes in the number of online cpus.
21 * If the num_online_cpus() dynamically changes, the plugin is broken.
22 *
23 * This version uses the simple approach and serializes all scheduling
24 * decisions by the use of a queue lock. This is probably not the
25 * best way to do it, but it should suffice for now.
26 */
27
28#include <linux/spinlock.h>
29#include <linux/percpu.h>
30#include <linux/sched.h>
31#include <linux/slab.h>
32
33#include <linux/module.h>
34
35#include <litmus/debug_trace.h>
36#include <litmus/litmus.h>
37#include <litmus/jobs.h>
38#include <litmus/preempt.h>
39#include <litmus/budget.h>
40#include <litmus/np.h>
41#include <litmus/sched_plugin.h>
42#include <litmus/edf_common.h>
43#include <litmus/sched_trace.h>
44
45#include <litmus/clustered.h>
46
47#include <litmus/bheap.h>
48
49#ifdef CONFIG_SCHED_CPU_AFFINITY
50#include <litmus/affinity.h>
51#endif
52
53/* to configure the cluster size */
54#include <litmus/litmus_proc.h>
55#include <linux/uaccess.h>
56
57/* Reference configuration variable. Determines which cache level is used to
58 * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that
59 * all CPUs form a single cluster (just like GSN-EDF).
60 */
61static enum cache_level cluster_config = GLOBAL_CLUSTER;
62
63struct clusterdomain;
64
65/* cpu_entry_t - maintain the linked and scheduled state
66 *
67 * A cpu also contains a pointer to the cedf_domain_t cluster
68 * that owns it (struct clusterdomain*)
69 */
70typedef struct {
71 int cpu;
72 struct clusterdomain* cluster; /* owning cluster */
73 struct task_struct* linked; /* only RT tasks */
74 struct task_struct* scheduled; /* only RT tasks */
75 atomic_t will_schedule; /* prevent unneeded IPIs */
76 struct bheap_node* hn;
77} cpu_entry_t;
78
79/* one cpu_entry_t per CPU */
80DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
81
82/*
83 * In C-EDF there is a cedf domain _per_ cluster
84 * The number of clusters is dynamically determined according to the
85 * total cpu number and the cluster size
86 */
87typedef struct clusterdomain {
88 /* rt_domain for this cluster */
89 rt_domain_t domain;
90 /* cpus in this cluster */
91 cpu_entry_t* *cpus;
92 /* map of this cluster cpus */
93 cpumask_var_t cpu_map;
94 /* the cpus queue themselves according to priority in here */
95 struct bheap_node *heap_node;
96 struct bheap cpu_heap;
97 /* lock for this cluster */
98#define cluster_lock domain.ready_lock
99} cedf_domain_t;
100
101/* a cedf_domain per cluster; allocation is done at init/activation time */
102cedf_domain_t *cedf;
103
104#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
105#define task_cpu_cluster(task) remote_cluster(get_partition(task))
106
107/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
108 * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose
109 * information during the initialization of the plugin (e.g., topology)
110#define WANT_ALL_SCHED_EVENTS
111 */
112#define VERBOSE_INIT
113
114static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
115{
116 cpu_entry_t *a, *b;
117 a = _a->value;
118 b = _b->value;
119 /* Note that a and b are inverted: we want the lowest-priority CPU at
120 * the top of the heap.
121 */
122 return edf_higher_prio(b->linked, a->linked);
123}
124
125/* update_cpu_position - Move the cpu entry to the correct place to maintain
126 * order in the cpu queue. Caller must hold cedf lock.
127 */
128static void update_cpu_position(cpu_entry_t *entry)
129{
130 cedf_domain_t *cluster = entry->cluster;
131
132 if (likely(bheap_node_in_heap(entry->hn)))
133 bheap_delete(cpu_lower_prio,
134 &cluster->cpu_heap,
135 entry->hn);
136
137 bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn);
138}
139
140/* caller must hold cedf lock */
141static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
142{
143 struct bheap_node* hn;
144 hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
145 return hn->value;
146}
147
148
149/* link_task_to_cpu - Update the link of a CPU.
150 * Handles the case where the to-be-linked task is already
151 * scheduled on a different CPU.
152 */
153static noinline void link_task_to_cpu(struct task_struct* linked,
154 cpu_entry_t *entry)
155{
156 cpu_entry_t *sched;
157 struct task_struct* tmp;
158 int on_cpu;
159
160 BUG_ON(linked && !is_realtime(linked));
161
162 /* Currently linked task is set to be unlinked. */
163 if (entry->linked) {
164 entry->linked->rt_param.linked_on = NO_CPU;
165 }
166
167 /* Link new task to CPU. */
168 if (linked) {
169		/* handle the case where the task is already scheduled somewhere! */
170 on_cpu = linked->rt_param.scheduled_on;
171 if (on_cpu != NO_CPU) {
172 sched = &per_cpu(cedf_cpu_entries, on_cpu);
173 /* this should only happen if not linked already */
174 BUG_ON(sched->linked == linked);
175
176 /* If we are already scheduled on the CPU to which we
177 * wanted to link, we don't need to do the swap --
178 * we just link ourselves to the CPU and depend on
179 * the caller to get things right.
180 */
181 if (entry != sched) {
182 TRACE_TASK(linked,
183 "already scheduled on %d, updating link.\n",
184 sched->cpu);
185 tmp = sched->linked;
186 linked->rt_param.linked_on = sched->cpu;
187 sched->linked = linked;
188 update_cpu_position(sched);
189 linked = tmp;
190 }
191 }
192 if (linked) /* might be NULL due to swap */
193 linked->rt_param.linked_on = entry->cpu;
194 }
195 entry->linked = linked;
196#ifdef WANT_ALL_SCHED_EVENTS
197 if (linked)
198 TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
199 else
200 TRACE("NULL linked to %d.\n", entry->cpu);
201#endif
202 update_cpu_position(entry);
203}
204
205/* unlink - Make sure a task is not linked any longer to an entry
206 * where it was linked before. Must hold cedf_lock.
207 */
208static noinline void unlink(struct task_struct* t)
209{
210 cpu_entry_t *entry;
211
212 if (t->rt_param.linked_on != NO_CPU) {
213 /* unlink */
214 entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
215 t->rt_param.linked_on = NO_CPU;
216 link_task_to_cpu(NULL, entry);
217 } else if (is_queued(t)) {
218 /* This is an interesting situation: t is scheduled,
219 * but was just recently unlinked. It cannot be
220 * linked anywhere else (because then it would have
221 * been relinked to this CPU), thus it must be in some
222 * queue. We must remove it from the list in this
223 * case.
224 *
225	 * In the C-EDF case it should be somewhere in the queue for
226	 * its domain, so we can get the domain using
227	 * task_cpu_cluster().
228 */
229 remove(&(task_cpu_cluster(t))->domain, t);
230 }
231}
232
233
234/* preempt - force a CPU to reschedule
235 */
236static void preempt(cpu_entry_t *entry)
237{
238 preempt_if_preemptable(entry->scheduled, entry->cpu);
239}
240
241/* requeue - Put an unlinked task into the cedf domain.
242 * Caller must hold cedf_lock.
243 */
244static noinline void requeue(struct task_struct* task)
245{
246 cedf_domain_t *cluster = task_cpu_cluster(task);
247 BUG_ON(!task);
248 /* sanity check before insertion */
249 BUG_ON(is_queued(task));
250
251 if (is_early_releasing(task) || is_released(task, litmus_clock()))
252 __add_ready(&cluster->domain, task);
253 else {
254 /* it has got to wait */
255 add_release(&cluster->domain, task);
256 }
257}
258
259#ifdef CONFIG_SCHED_CPU_AFFINITY
260static cpu_entry_t* cedf_get_nearest_available_cpu(
261 cedf_domain_t *cluster, cpu_entry_t *start)
262{
263 cpu_entry_t *affinity;
264
265 get_nearest_available_cpu(affinity, start, cedf_cpu_entries,
266#ifdef CONFIG_RELEASE_MASTER
267 cluster->domain.release_master,
268#else
269 NO_CPU,
270#endif
271 cluster->cpu_map);
272
273 /* make sure CPU is in our cluster */
274 if (affinity && cpumask_test_cpu(affinity->cpu, cluster->cpu_map))
275 return(affinity);
276 else
277 return(NULL);
278}
279#endif
280
281
282/* check for any necessary preemptions */
283static void check_for_preemptions(cedf_domain_t *cluster)
284{
285 struct task_struct *task;
286 cpu_entry_t *last;
287
288#ifdef CONFIG_PREFER_LOCAL_LINKING
289 cpu_entry_t *local;
290
291 /* Before linking to other CPUs, check first whether the local CPU is
292 * idle. */
293 local = this_cpu_ptr(&cedf_cpu_entries);
294 task = __peek_ready(&cluster->domain);
295
296 if (task && !local->linked
297#ifdef CONFIG_RELEASE_MASTER
298 && likely(local->cpu != cluster->domain.release_master)
299#endif
300 ) {
301 task = __take_ready(&cluster->domain);
302 TRACE_TASK(task, "linking to local CPU %d to avoid IPI\n", local->cpu);
303 link_task_to_cpu(task, local);
304 preempt(local);
305 }
306#endif
307
308
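	/* Link the highest-priority ready job to the lowest-priority CPU of
	 * this cluster until no linked job has lower priority than a job
	 * that is still waiting in the ready queue. */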
309 for(last = lowest_prio_cpu(cluster);
310 edf_preemption_needed(&cluster->domain, last->linked);
311 last = lowest_prio_cpu(cluster)) {
312 /* preemption necessary */
313 task = __take_ready(&cluster->domain);
314 TRACE("check_for_preemptions: attempting to link task %d to %d\n",
315 task->pid, last->cpu);
316#ifdef CONFIG_SCHED_CPU_AFFINITY
317 {
318 cpu_entry_t *affinity =
319 cedf_get_nearest_available_cpu(cluster,
320 &per_cpu(cedf_cpu_entries, task_cpu(task)));
321 if(affinity)
322 last = affinity;
323 else if(requeue_preempted_job(last->linked))
324 requeue(last->linked);
325 }
326#else
327 if (requeue_preempted_job(last->linked))
328 requeue(last->linked);
329#endif
330 link_task_to_cpu(task, last);
331 preempt(last);
332 }
333}
334
335/* cedf_job_arrival: task is either resumed or released */
336static noinline void cedf_job_arrival(struct task_struct* task)
337{
338 cedf_domain_t *cluster = task_cpu_cluster(task);
339 BUG_ON(!task);
340
341 requeue(task);
342 check_for_preemptions(cluster);
343}
344
345static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
346{
347 cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain);
348 unsigned long flags;
349
350 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
351
352 __merge_ready(&cluster->domain, tasks);
353 check_for_preemptions(cluster);
354
355 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
356}
357
358/* caller holds cedf_lock */
359static noinline void current_job_completion(int forced)
360{
361 struct task_struct *t = current;
362
363 sched_trace_task_completion(t, forced);
364
365 TRACE_TASK(t, "job_completion(forced=%d).\n", forced);
366
367 /* set flags */
368 tsk_rt(t)->completed = 0;
369 /* prepare for next period */
370 prepare_for_next_period(t);
371 if (is_early_releasing(t) || is_released(t, litmus_clock()))
372 sched_trace_task_release(t);
373 /* unlink */
374 unlink(t);
375 /* requeue
376 * But don't requeue a blocking task. */
377 if (is_current_running())
378 cedf_job_arrival(t);
379}
380
381/* Getting schedule() right is a bit tricky. schedule() may not make any
382 * assumptions on the state of the current task since it may be called for a
383 * number of reasons. These include that a scheduler_tick() determined it
384 * was necessary, that sys_exit_np() was called, that some Linux
385 * subsystem determined so, or even (in the worst case) that there is a bug
386 * hidden somewhere. Thus, we must take extreme care to determine what the
387 * current state is.
388 *
389 * The CPU could currently be scheduling a task (or not), be linked (or not).
390 *
391 * The following assertions for the scheduled task could hold:
392 *
393 * - !is_running(scheduled) // the job blocks
394 * - scheduled->timeslice == 0 // the job completed (forcefully)
395 * - is_completed() // the job completed (by syscall)
396 * - linked != scheduled // we need to reschedule (for any reason)
397 * - is_np(scheduled) // rescheduling must be delayed,
398 * sys_exit_np must be requested
399 *
400 * Any of these can occur together.
401 */
402static struct task_struct* cedf_schedule(struct task_struct * prev)
403{
404 cpu_entry_t* entry = this_cpu_ptr(&cedf_cpu_entries);
405 cedf_domain_t *cluster = entry->cluster;
406 int out_of_time, sleep, preempt, np, exists, blocks;
407 struct task_struct* next = NULL;
408
409#ifdef CONFIG_RELEASE_MASTER
410 /* Bail out early if we are the release master.
411 * The release master never schedules any real-time tasks.
412 */
413 if (unlikely(cluster->domain.release_master == entry->cpu)) {
414 sched_state_task_picked();
415 return NULL;
416 }
417#endif
418
419 raw_spin_lock(&cluster->cluster_lock);
420
421 /* sanity checking */
422 BUG_ON(entry->scheduled && entry->scheduled != prev);
423 BUG_ON(entry->scheduled && !is_realtime(prev));
424 BUG_ON(is_realtime(prev) && !entry->scheduled);
425
426 /* (0) Determine state */
427 exists = entry->scheduled != NULL;
428 blocks = exists && !is_current_running();
429 out_of_time = exists && budget_enforced(entry->scheduled)
430 && budget_exhausted(entry->scheduled);
431 np = exists && is_np(entry->scheduled);
432 sleep = exists && is_completed(entry->scheduled);
433 preempt = entry->scheduled != entry->linked;
434
435#ifdef WANT_ALL_SCHED_EVENTS
436 TRACE_TASK(prev, "invoked cedf_schedule.\n");
437#endif
438
439 if (exists)
440 TRACE_TASK(prev,
441 "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
442 "state:%d sig:%d\n",
443 blocks, out_of_time, np, sleep, preempt,
444 prev->state, signal_pending(prev));
445 if (entry->linked && preempt)
446 TRACE_TASK(prev, "will be preempted by %s/%d\n",
447 entry->linked->comm, entry->linked->pid);
448
449
450 /* If a task blocks we have no choice but to reschedule.
451 */
452 if (blocks)
453 unlink(entry->scheduled);
454
455 /* Request a sys_exit_np() call if we would like to preempt but cannot.
456 * We need to make sure to update the link structure anyway in case
457 * that we are still linked. Multiple calls to request_exit_np() don't
458 * hurt.
459 */
460 if (np && (out_of_time || preempt || sleep)) {
461 unlink(entry->scheduled);
462 request_exit_np(entry->scheduled);
463 }
464
465 /* Any task that is preemptable and either exhausts its execution
466 * budget or wants to sleep completes. We may have to reschedule after
467 * this. Don't do a job completion if we block (can't have timers running
468 * for blocked jobs).
469 */
470 if (!np && (out_of_time || sleep))
471 current_job_completion(!sleep);
472
473 /* Link pending task if we became unlinked.
474 */
475 if (!entry->linked)
476 link_task_to_cpu(__take_ready(&cluster->domain), entry);
477
478 /* The final scheduling decision. Do we need to switch for some reason?
479 * If linked is different from scheduled, then select linked as next.
480 */
481 if ((!np || blocks) &&
482 entry->linked != entry->scheduled) {
483 /* Schedule a linked job? */
484 if (entry->linked) {
485 entry->linked->rt_param.scheduled_on = entry->cpu;
486 next = entry->linked;
487 }
488 if (entry->scheduled) {
489 /* not gonna be scheduled soon */
490 entry->scheduled->rt_param.scheduled_on = NO_CPU;
491 TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
492 }
493 } else
494 /* Only override Linux scheduler if we have a real-time task
495 * scheduled that needs to continue.
496 */
497 if (exists)
498 next = prev;
499
500 sched_state_task_picked();
501 raw_spin_unlock(&cluster->cluster_lock);
502
503#ifdef WANT_ALL_SCHED_EVENTS
504 TRACE("cedf_lock released, next=0x%p\n", next);
505
506 if (next)
507 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
508 else if (exists && !next)
509 TRACE("becomes idle at %llu.\n", litmus_clock());
510#endif
511
512
513 return next;
514}
515
516
517/* _finish_switch - we just finished the switch away from prev
518 */
519static void cedf_finish_switch(struct task_struct *prev)
520{
521 cpu_entry_t* entry = this_cpu_ptr(&cedf_cpu_entries);
522
523 entry->scheduled = is_realtime(current) ? current : NULL;
524#ifdef WANT_ALL_SCHED_EVENTS
525 TRACE_TASK(prev, "switched away from\n");
526#endif
527}
528
529
530/* Prepare a task for running in RT mode
531 */
532static void cedf_task_new(struct task_struct * t, int on_rq, int is_scheduled)
533{
534 unsigned long flags;
535 cpu_entry_t* entry;
536 cedf_domain_t* cluster;
537
538	TRACE("C-EDF: task new %d\n", t->pid);
539
540 /* the cluster doesn't change even if t is scheduled */
541 cluster = task_cpu_cluster(t);
542
543 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
544
545 /* setup job params */
546 release_at(t, litmus_clock());
547
548 if (is_scheduled) {
549 entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
550 BUG_ON(entry->scheduled);
551
552#ifdef CONFIG_RELEASE_MASTER
553 if (entry->cpu != cluster->domain.release_master) {
554#endif
555 entry->scheduled = t;
556 tsk_rt(t)->scheduled_on = task_cpu(t);
557#ifdef CONFIG_RELEASE_MASTER
558 } else {
559 /* do not schedule on release master */
560 preempt(entry); /* force resched */
561 tsk_rt(t)->scheduled_on = NO_CPU;
562 }
563#endif
564 } else {
565 t->rt_param.scheduled_on = NO_CPU;
566 }
567 t->rt_param.linked_on = NO_CPU;
568
569 if (on_rq || is_scheduled)
570 cedf_job_arrival(t);
571 raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags);
572}
573
574static void cedf_task_wake_up(struct task_struct *task)
575{
576 unsigned long flags;
577 lt_t now;
578 cedf_domain_t *cluster;
579
580 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
581
582 cluster = task_cpu_cluster(task);
583
584 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
585 now = litmus_clock();
586 if (is_sporadic(task) && is_tardy(task, now)) {
587 inferred_sporadic_job_release_at(task, now);
588 }
589 cedf_job_arrival(task);
590 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
591}
592
593static void cedf_task_block(struct task_struct *t)
594{
595 unsigned long flags;
596 cedf_domain_t *cluster;
597
598 TRACE_TASK(t, "block at %llu\n", litmus_clock());
599
600 cluster = task_cpu_cluster(t);
601
602 /* unlink if necessary */
603 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
604 unlink(t);
605 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
606
607 BUG_ON(!is_realtime(t));
608}
609
610
611static void cedf_task_exit(struct task_struct * t)
612{
613 unsigned long flags;
614 cedf_domain_t *cluster = task_cpu_cluster(t);
615
616 /* unlink if necessary */
617 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
618 unlink(t);
619 if (tsk_rt(t)->scheduled_on != NO_CPU) {
620 cpu_entry_t *cpu;
621 cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on);
622 cpu->scheduled = NULL;
623 tsk_rt(t)->scheduled_on = NO_CPU;
624 }
625 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
626
627 BUG_ON(!is_realtime(t));
628 TRACE_TASK(t, "RIP\n");
629}
630
631static long cedf_admit_task(struct task_struct* tsk)
632{
633 return (remote_cluster(task_cpu(tsk)) == task_cpu_cluster(tsk)) ?
634 0 : -EINVAL;
635}
636
637/* total number of clusters */
638static int num_clusters;
639/* we do not support clusters of different sizes */
640static unsigned int cluster_size;
641
642#ifdef VERBOSE_INIT
643static void print_cluster_topology(cpumask_var_t mask, int cpu)
644{
645 printk(KERN_INFO "CPU = %d, shared cpu(s) = %*pbl\n", cpu,
646 cpumask_pr_args(mask));
647
648}
649#endif
650
651static int clusters_allocated = 0;
652
653static void cleanup_cedf(void)
654{
655 int i;
656
657 if (clusters_allocated) {
658 for (i = 0; i < num_clusters; i++) {
659 kfree(cedf[i].cpus);
660 kfree(cedf[i].heap_node);
661 free_cpumask_var(cedf[i].cpu_map);
662 }
663
664 kfree(cedf);
665 }
666}
667
668static struct domain_proc_info cedf_domain_proc_info;
669static long cedf_get_domain_proc_info(struct domain_proc_info **ret)
670{
671 *ret = &cedf_domain_proc_info;
672 return 0;
673}
674
675static void cedf_setup_domain_proc(void)
676{
677 int i, cpu, domain;
678#ifdef CONFIG_RELEASE_MASTER
679 int release_master = atomic_read(&release_master_cpu);
680 /* skip over the domain with the release master if cluster size is 1 */
681 int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
682 release_master : NO_CPU;
683#else
684 int release_master = NO_CPU;
685 int skip_domain = NO_CPU;
686#endif
687 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
688 int num_rt_domains = num_clusters - (skip_domain != NO_CPU);
689 struct cd_mapping *map;
690
691 memset(&cedf_domain_proc_info, 0, sizeof(cedf_domain_proc_info));
692 init_domain_proc_info(&cedf_domain_proc_info, num_rt_cpus, num_rt_domains);
693 cedf_domain_proc_info.num_cpus = num_rt_cpus;
694 cedf_domain_proc_info.num_domains = num_rt_domains;
695
696 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
697 if (cpu == release_master)
698 continue;
699 map = &cedf_domain_proc_info.cpu_to_domains[i];
700 /* pointer math to figure out the domain index */
701 domain = remote_cluster(cpu) - cedf;
702 map->id = cpu;
703 cpumask_set_cpu(domain, map->mask);
704 ++i;
705 }
706
707 for (domain = 0, i = 0; domain < num_clusters; ++domain) {
708 if (domain == skip_domain)
709 continue;
710 map = &cedf_domain_proc_info.domain_to_cpus[i];
711 map->id = i;
712 cpumask_copy(map->mask, cedf[domain].cpu_map);
713 ++i;
714 }
715}
716
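/* Plugin activation: determine the cluster size from the configured cache
 * level, allocate one cedf_domain_t per cluster, assign each online CPU to
 * exactly one cluster (initializing its cpu_entry_t and heap node), and
 * finally export the resulting CPU/domain mapping via /proc. */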
717static long cedf_activate_plugin(void)
718{
719 int i, j, cpu, ccpu, cpu_count;
720 cpu_entry_t *entry;
721
722 cpumask_var_t mask;
723 int chk = 0;
724
725 /* de-allocate old clusters, if any */
726 cleanup_cedf();
727
728 printk(KERN_INFO "C-EDF: Activate Plugin, cluster configuration = %d\n",
729 cluster_config);
730
731 /* need to get cluster_size first */
732 if(!zalloc_cpumask_var(&mask, GFP_ATOMIC))
733 return -ENOMEM;
734
735 if (cluster_config == GLOBAL_CLUSTER) {
736 cluster_size = num_online_cpus();
737 } else {
738 chk = get_shared_cpu_map(mask, 0, cluster_config);
739 if (chk) {
740 /* if chk != 0 then it is the max allowed index */
741 printk(KERN_INFO "C-EDF: Cluster configuration = %d "
742 "is not supported on this hardware.\n",
743 cluster_config);
744 /* User should notice that the configuration failed, so
745 * let's bail out. */
746 return -EINVAL;
747 }
748
749 cluster_size = cpumask_weight(mask);
750 }
751
752 if ((num_online_cpus() % cluster_size) != 0) {
753 /* this can't be right, some cpus are left out */
754 printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n",
755 num_online_cpus(), cluster_size);
756 return -1;
757 }
758
759 num_clusters = num_online_cpus() / cluster_size;
760 printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
761 num_clusters, cluster_size);
762
763 /* initialize clusters */
764 cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC);
765 for (i = 0; i < num_clusters; i++) {
766
767 cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t),
768 GFP_ATOMIC);
769 cedf[i].heap_node = kmalloc(
770 cluster_size * sizeof(struct bheap_node),
771 GFP_ATOMIC);
772 bheap_init(&(cedf[i].cpu_heap));
773 edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
774
775 if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
776 return -ENOMEM;
777#ifdef CONFIG_RELEASE_MASTER
778 cedf[i].domain.release_master = atomic_read(&release_master_cpu);
779#endif
780 }
781
782	/* cycle through clusters and add cpus to them */
783 for (i = 0; i < num_clusters; i++) {
784
785 for_each_online_cpu(cpu) {
786 /* check if the cpu is already in a cluster */
787 for (j = 0; j < num_clusters; j++)
788 if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
789 break;
790 /* if it is in a cluster go to next cpu */
791 if (j < num_clusters &&
792 cpumask_test_cpu(cpu, cedf[j].cpu_map))
793 continue;
794
795 /* this cpu isn't in any cluster */
796 /* get the shared cpus */
797 if (unlikely(cluster_config == GLOBAL_CLUSTER))
798 cpumask_copy(mask, cpu_online_mask);
799 else
800 get_shared_cpu_map(mask, cpu, cluster_config);
801
802 cpumask_copy(cedf[i].cpu_map, mask);
803#ifdef VERBOSE_INIT
804 print_cluster_topology(mask, cpu);
805#endif
806 /* add cpus to current cluster and init cpu_entry_t */
807 cpu_count = 0;
808 for_each_cpu(ccpu, cedf[i].cpu_map) {
809
810 entry = &per_cpu(cedf_cpu_entries, ccpu);
811 cedf[i].cpus[cpu_count] = entry;
812 atomic_set(&entry->will_schedule, 0);
813 entry->cpu = ccpu;
814 entry->cluster = &cedf[i];
815 entry->hn = &(cedf[i].heap_node[cpu_count]);
816 bheap_node_init(&entry->hn, entry);
817
818 cpu_count++;
819
820 entry->linked = NULL;
821 entry->scheduled = NULL;
822#ifdef CONFIG_RELEASE_MASTER
823 /* only add CPUs that should schedule jobs */
824 if (entry->cpu != entry->cluster->domain.release_master)
825#endif
826 update_cpu_position(entry);
827 }
828 /* done with this cluster */
829 break;
830 }
831 }
832
833 clusters_allocated = 1;
834 free_cpumask_var(mask);
835
836 cedf_setup_domain_proc();
837
838 return 0;
839}
840
841static long cedf_deactivate_plugin(void)
842{
843 destroy_domain_proc_info(&cedf_domain_proc_info);
844 return 0;
845}
846
847/* Plugin object */
848static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
849 .plugin_name = "C-EDF",
850 .finish_switch = cedf_finish_switch,
851 .task_new = cedf_task_new,
852 .complete_job = complete_job,
853 .task_exit = cedf_task_exit,
854 .schedule = cedf_schedule,
855 .task_wake_up = cedf_task_wake_up,
856 .task_block = cedf_task_block,
857 .admit_task = cedf_admit_task,
858 .activate_plugin = cedf_activate_plugin,
859 .deactivate_plugin = cedf_deactivate_plugin,
860 .get_domain_proc_info = cedf_get_domain_proc_info,
861};
862
863static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL;
864
865static int __init init_cedf(void)
866{
867 int err, fs;
868
869 err = register_sched_plugin(&cedf_plugin);
870 if (!err) {
871 fs = make_plugin_proc_dir(&cedf_plugin, &cedf_dir);
872 if (!fs)
873 cluster_file = create_cluster_file(cedf_dir, &cluster_config);
874 else
875 printk(KERN_ERR "Could not allocate C-EDF procfs dir.\n");
876 }
877 return err;
878}
879
880static void clean_cedf(void)
881{
882 cleanup_cedf();
883 if (cluster_file)
884 remove_proc_entry("cluster", cedf_dir);
885 if (cedf_dir)
886 remove_plugin_proc_dir(&cedf_plugin);
887}
888
889module_init(init_cedf);
890module_exit(clean_cedf);
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
new file mode 100644
index 000000000000..8f28dc4e5192
--- /dev/null
+++ b/litmus/sched_gsn_edf.c
@@ -0,0 +1,1070 @@
1/*
2 * litmus/sched_gsn_edf.c
3 *
4 * Implementation of the GSN-EDF scheduling algorithm.
5 *
6 * This version uses the simple approach and serializes all scheduling
7 * decisions by the use of a queue lock. This is probably not the
8 * best way to do it, but it should suffice for now.
9 */
10
11#include <linux/spinlock.h>
12#include <linux/percpu.h>
13#include <linux/sched.h>
14#include <linux/slab.h>
15
16#include <litmus/debug_trace.h>
17#include <litmus/litmus.h>
18#include <litmus/jobs.h>
19#include <litmus/sched_plugin.h>
20#include <litmus/edf_common.h>
21#include <litmus/sched_trace.h>
22#include <litmus/trace.h>
23
24#include <litmus/preempt.h>
25#include <litmus/budget.h>
26#include <litmus/np.h>
27
28#include <litmus/bheap.h>
29
30#ifdef CONFIG_SCHED_CPU_AFFINITY
31#include <litmus/affinity.h>
32#endif
33
34/* to set up domain/cpu mappings */
35#include <litmus/litmus_proc.h>
36
37#include <linux/module.h>
38
39/* Overview of GSN-EDF operations.
40 *
41 * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
42 * description only covers how the individual operations are implemented in
43 * LITMUS.
44 *
45 * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
46 * structure (NOT the actually scheduled
47 * task). If there is another linked task To
48 * already it will set To->linked_on = NO_CPU
49 * (thereby removing its association with this
50 * CPU). However, it will not requeue the
51 * previously linked task (if any). It will set
52 * T's state to not completed and check whether
53 * it is already running somewhere else. If T
54 * is scheduled somewhere else it will link
55 * it to that CPU instead (and pull the linked
56 * task to cpu). T may be NULL.
57 *
58 * unlink(T) - Unlink removes T from all scheduler data
59 * structures. If it is linked to some CPU it
60 * will link NULL to that CPU. If it is
61 * currently queued in the gsnedf queue it will
62 * be removed from the rt_domain. It is safe to
63 * call unlink(T) if T is not linked. T may not
64 * be NULL.
65 *
66 * requeue(T) - Requeue will insert T into the appropriate
67 * queue. If the system is in real-time mode and
68 * T has already been released, it will go into the
69 * ready queue. If the system is not in
70 * real-time mode, then T will go into the
71 * release queue. If T's release time is in the
72 * future, it will go into the release
73 * queue. That means that T's release time/job
74 * no/etc. has to be updated before requeue(T) is
75 * called. It is not safe to call requeue(T)
76 * when T is already queued. T may not be NULL.
77 *
78 * gsnedf_job_arrival(T) - This is the catch all function when T enters
79 * the system after either a suspension or at a
80 * the system after either a suspension or a
81 * is not safe to call gsnedf_job_arrival(T) if
82 * T is already queued) and then check whether a
83 * preemption is necessary. If a preemption is
84 * necessary it will update the linkage
85 * accordingly and cause schedule() to be called
86 * (either with an IPI or need_resched). It is
87 * safe to call gsnedf_job_arrival(T) if T's
88 * next job has not been actually released yet
89 * (release time in the future). T will be put
90 * on the release queue in that case.
91 *
92 * curr_job_completion() - Take care of everything that needs to be done
93 * to prepare the current task for its next
94 * release and place it in the right queue with
95 * gsnedf_job_arrival().
96 *
97 *
98 * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
99 * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
100 * the functions will automatically promote a pending task from the ready queue
101 * to the linked state. This is the job of the calling function (by means of
102 * __take_ready).
103 */
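/* Putting these pieces together, a job release boils down to the following
 * call chain (a sketch in terms of the helpers defined below):
 *
 *	gsnedf_release_jobs()
 *		__merge_ready(&gsnedf, tasks)
 *		check_for_preemptions()
 *			task = __take_ready(&gsnedf)
 *			last = lowest_prio_cpu()
 *			link_task_to_cpu(task, last)
 *			preempt(last)	-- triggers rescheduling, via IPI if last is remote
 */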
104
105
106/* cpu_entry_t - maintain the linked and scheduled state
107 */
108typedef struct {
109 int cpu;
110 struct task_struct* linked; /* only RT tasks */
111 struct task_struct* scheduled; /* only RT tasks */
112 struct bheap_node* hn;
113} cpu_entry_t;
114DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
115
116cpu_entry_t* gsnedf_cpus[NR_CPUS];
117
118/* the cpus queue themselves according to priority in here */
119static struct bheap_node gsnedf_heap_node[NR_CPUS];
120static struct bheap gsnedf_cpu_heap;
121
122static rt_domain_t gsnedf;
123#define gsnedf_lock (gsnedf.ready_lock)
124
125
126/* Uncomment this if you want to see all scheduling decisions in the
127 * TRACE() log.
128#define WANT_ALL_SCHED_EVENTS
129 */
130
131static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
132{
133 cpu_entry_t *a, *b;
134 a = _a->value;
135 b = _b->value;
136 /* Note that a and b are inverted: we want the lowest-priority CPU at
137 * the top of the heap.
138 */
139 return edf_higher_prio(b->linked, a->linked);
140}
141
142/* update_cpu_position - Move the cpu entry to the correct place to maintain
143 * order in the cpu queue. Caller must hold gsnedf lock.
144 */
145static void update_cpu_position(cpu_entry_t *entry)
146{
147 if (likely(bheap_node_in_heap(entry->hn)))
148 bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
149 bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
150}
151
152/* caller must hold gsnedf lock */
153static cpu_entry_t* lowest_prio_cpu(void)
154{
155 struct bheap_node* hn;
156 hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
157 return hn->value;
158}
159
160
161/* link_task_to_cpu - Update the link of a CPU.
162 * Handles the case where the to-be-linked task is already
163 * scheduled on a different CPU.
164 */
165static noinline void link_task_to_cpu(struct task_struct* linked,
166 cpu_entry_t *entry)
167{
168 cpu_entry_t *sched;
169 struct task_struct* tmp;
170 int on_cpu;
171
172 BUG_ON(linked && !is_realtime(linked));
173
174 /* Currently linked task is set to be unlinked. */
175 if (entry->linked) {
176 entry->linked->rt_param.linked_on = NO_CPU;
177 }
178
179 /* Link new task to CPU. */
180 if (linked) {
181		/* handle the case that the task is already scheduled somewhere! */
182 on_cpu = linked->rt_param.scheduled_on;
183 if (on_cpu != NO_CPU) {
184 sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
185 /* this should only happen if not linked already */
186 BUG_ON(sched->linked == linked);
187
188 /* If we are already scheduled on the CPU to which we
189 * wanted to link, we don't need to do the swap --
190 * we just link ourselves to the CPU and depend on
191 * the caller to get things right.
192 */
193 if (entry != sched) {
194 TRACE_TASK(linked,
195 "already scheduled on %d, updating link.\n",
196 sched->cpu);
197 tmp = sched->linked;
198 linked->rt_param.linked_on = sched->cpu;
199 sched->linked = linked;
200 update_cpu_position(sched);
201 linked = tmp;
202 }
203 }
204 if (linked) /* might be NULL due to swap */
205 linked->rt_param.linked_on = entry->cpu;
206 }
207 entry->linked = linked;
208#ifdef WANT_ALL_SCHED_EVENTS
209 if (linked)
210 TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
211 else
212 TRACE("NULL linked to %d.\n", entry->cpu);
213#endif
214 update_cpu_position(entry);
215}
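/* Example of the swap above: suppose T1 is to be linked to CPU 0, but is
 * still scheduled on CPU 1, whose currently linked task is T2. The code
 * links T1 to CPU 1 instead (where it already runs) and then links T2 to
 * CPU 0, so that linkage always follows where tasks actually execute. */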
216
217/* unlink - Make sure a task is not linked any longer to an entry
218 * where it was linked before. Must hold gsnedf_lock.
219 */
220static noinline void unlink(struct task_struct* t)
221{
222 cpu_entry_t *entry;
223
224 if (t->rt_param.linked_on != NO_CPU) {
225 /* unlink */
226 entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
227 t->rt_param.linked_on = NO_CPU;
228 link_task_to_cpu(NULL, entry);
229 } else if (is_queued(t)) {
230 /* This is an interesting situation: t is scheduled,
231 * but was just recently unlinked. It cannot be
232 * linked anywhere else (because then it would have
233 * been relinked to this CPU), thus it must be in some
234 * queue. We must remove it from the list in this
235 * case.
236 */
237 remove(&gsnedf, t);
238 }
239}
240
241
242/* preempt - force a CPU to reschedule
243 */
244static void preempt(cpu_entry_t *entry)
245{
246 preempt_if_preemptable(entry->scheduled, entry->cpu);
247}
248
249/* requeue - Put an unlinked task into gsn-edf domain.
250 * Caller must hold gsnedf_lock.
251 */
252static noinline void requeue(struct task_struct* task)
253{
254 BUG_ON(!task);
255 /* sanity check before insertion */
256 BUG_ON(is_queued(task));
257
258 if (is_early_releasing(task) || is_released(task, litmus_clock()))
259 __add_ready(&gsnedf, task);
260 else {
261 /* it has got to wait */
262 add_release(&gsnedf, task);
263 }
264}
265
266#ifdef CONFIG_SCHED_CPU_AFFINITY
267static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start)
268{
269 cpu_entry_t *affinity;
270
271 get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries,
272#ifdef CONFIG_RELEASE_MASTER
273 gsnedf.release_master,
274#else
275 NO_CPU,
276#endif
277 cpu_online_mask);
278
279 return(affinity);
280}
281#endif
282
283/* check for any necessary preemptions */
284static void check_for_preemptions(void)
285{
286 struct task_struct *task;
287 cpu_entry_t *last;
288
289
290#ifdef CONFIG_PREFER_LOCAL_LINKING
291 cpu_entry_t *local;
292
293 /* Before linking to other CPUs, check first whether the local CPU is
294 * idle. */
295 local = this_cpu_ptr(&gsnedf_cpu_entries);
296 task = __peek_ready(&gsnedf);
297
298 if (task && !local->linked
299#ifdef CONFIG_RELEASE_MASTER
300 && likely(local->cpu != gsnedf.release_master)
301#endif
302 ) {
303 task = __take_ready(&gsnedf);
304 TRACE_TASK(task, "linking to local CPU %d to avoid IPI\n", local->cpu);
305 link_task_to_cpu(task, local);
306 preempt(local);
307 }
308#endif
309
310 for (last = lowest_prio_cpu();
311 edf_preemption_needed(&gsnedf, last->linked);
312 last = lowest_prio_cpu()) {
313 /* preemption necessary */
314 task = __take_ready(&gsnedf);
315 TRACE("check_for_preemptions: attempting to link task %d to %d\n",
316 task->pid, last->cpu);
317
318#ifdef CONFIG_SCHED_CPU_AFFINITY
319 {
320 cpu_entry_t *affinity =
321 gsnedf_get_nearest_available_cpu(
322 &per_cpu(gsnedf_cpu_entries, task_cpu(task)));
323 if (affinity)
324 last = affinity;
325 else if (requeue_preempted_job(last->linked))
326 requeue(last->linked);
327 }
328#else
329 if (requeue_preempted_job(last->linked))
330 requeue(last->linked);
331#endif
332
333 link_task_to_cpu(task, last);
334 preempt(last);
335 }
336}
337
338/* gsnedf_job_arrival: task is either resumed or released */
339static noinline void gsnedf_job_arrival(struct task_struct* task)
340{
341 BUG_ON(!task);
342
343 requeue(task);
344 check_for_preemptions();
345}
346
347static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
348{
349 unsigned long flags;
350
351 raw_spin_lock_irqsave(&gsnedf_lock, flags);
352
353 __merge_ready(rt, tasks);
354 check_for_preemptions();
355
356 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
357}
358
359/* caller holds gsnedf_lock */
360static noinline void curr_job_completion(int forced)
361{
362 struct task_struct *t = current;
363 BUG_ON(!t);
364
365 sched_trace_task_completion(t, forced);
366
367 TRACE_TASK(t, "job_completion(forced=%d).\n", forced);
368
369 /* set flags */
370 tsk_rt(t)->completed = 0;
371 /* prepare for next period */
372 prepare_for_next_period(t);
373 if (is_early_releasing(t) || is_released(t, litmus_clock()))
374 sched_trace_task_release(t);
375 /* unlink */
376 unlink(t);
377 /* requeue
378 * But don't requeue a blocking task. */
379 if (is_current_running())
380 gsnedf_job_arrival(t);
381}
382
383/* Getting schedule() right is a bit tricky. schedule() may not make any
384 * assumptions on the state of the current task since it may be called for a
385 * number of reasons. These include that a scheduler_tick() determined it
386 * was necessary, that sys_exit_np() was called, that some Linux
387 * subsystem determined so, or even (in the worst case) that there is a bug
388 * hidden somewhere. Thus, we must take extreme care to determine what the
389 * current state is.
390 *
391 * The CPU could currently be scheduling a task (or not), be linked (or not).
392 *
393 * The following assertions for the scheduled task could hold:
394 *
395 * - !is_running(scheduled) // the job blocks
396 * - scheduled->timeslice == 0 // the job completed (forcefully)
397 * - is_completed() // the job completed (by syscall)
398 * - linked != scheduled // we need to reschedule (for any reason)
399 * - is_np(scheduled) // rescheduling must be delayed,
400 * sys_exit_np must be requested
401 *
402 * Any of these can occur together.
403 */
404static struct task_struct* gsnedf_schedule(struct task_struct * prev)
405{
406 cpu_entry_t* entry = this_cpu_ptr(&gsnedf_cpu_entries);
407 int out_of_time, sleep, preempt, np, exists, blocks;
408 struct task_struct* next = NULL;
409
410#ifdef CONFIG_RELEASE_MASTER
411 /* Bail out early if we are the release master.
412 * The release master never schedules any real-time tasks.
413 */
414 if (unlikely(gsnedf.release_master == entry->cpu)) {
415 sched_state_task_picked();
416 return NULL;
417 }
418#endif
419
420 raw_spin_lock(&gsnedf_lock);
421
422 /* sanity checking */
423 BUG_ON(entry->scheduled && entry->scheduled != prev);
424 BUG_ON(entry->scheduled && !is_realtime(prev));
425 BUG_ON(is_realtime(prev) && !entry->scheduled);
426
427 /* (0) Determine state */
428 exists = entry->scheduled != NULL;
429 blocks = exists && !is_current_running();
430 out_of_time = exists && budget_enforced(entry->scheduled)
431 && budget_exhausted(entry->scheduled);
432 np = exists && is_np(entry->scheduled);
433 sleep = exists && is_completed(entry->scheduled);
434 preempt = entry->scheduled != entry->linked;
435
436#ifdef WANT_ALL_SCHED_EVENTS
437 TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
438#endif
439
440 if (exists)
441 TRACE_TASK(prev,
442 "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
443 "state:%d sig:%d\n",
444 blocks, out_of_time, np, sleep, preempt,
445 prev->state, signal_pending(prev));
446 if (entry->linked && preempt)
447 TRACE_TASK(prev, "will be preempted by %s/%d\n",
448 entry->linked->comm, entry->linked->pid);
449
450
451 /* If a task blocks we have no choice but to reschedule.
452 */
453 if (blocks)
454 unlink(entry->scheduled);
455
456 /* Request a sys_exit_np() call if we would like to preempt but cannot.
457 * We need to make sure to update the link structure anyway in case
458 * that we are still linked. Multiple calls to request_exit_np() don't
459 * hurt.
460 */
461 if (np && (out_of_time || preempt || sleep)) {
462 unlink(entry->scheduled);
463 request_exit_np(entry->scheduled);
464 }
465
466 /* Any task that is preemptable and either exhausts its execution
467 * budget or wants to sleep completes. We may have to reschedule after
468 * this. Don't do a job completion if we block (can't have timers running
469 * for blocked jobs).
470 */
471 if (!np && (out_of_time || sleep))
472 curr_job_completion(!sleep);
473
474 /* Link pending task if we became unlinked.
475 */
476 if (!entry->linked)
477 link_task_to_cpu(__take_ready(&gsnedf), entry);
478
479 /* The final scheduling decision. Do we need to switch for some reason?
480 * If linked is different from scheduled, then select linked as next.
481 */
482 if ((!np || blocks) &&
483 entry->linked != entry->scheduled) {
484 /* Schedule a linked job? */
485 if (entry->linked) {
486 entry->linked->rt_param.scheduled_on = entry->cpu;
487 next = entry->linked;
488 TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id());
489 }
490 if (entry->scheduled) {
491 /* not gonna be scheduled soon */
492 entry->scheduled->rt_param.scheduled_on = NO_CPU;
493 TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
494 }
495 } else
496 /* Only override Linux scheduler if we have a real-time task
497 * scheduled that needs to continue.
498 */
499 if (exists)
500 next = prev;
501
502 sched_state_task_picked();
503
504 raw_spin_unlock(&gsnedf_lock);
505
506#ifdef WANT_ALL_SCHED_EVENTS
507 TRACE("gsnedf_lock released, next=0x%p\n", next);
508
509 if (next)
510 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
511 else if (exists && !next)
512 TRACE("becomes idle at %llu.\n", litmus_clock());
513#endif
514
515
516 return next;
517}
518
519
520/* _finish_switch - we just finished the switch away from prev
521 */
522static void gsnedf_finish_switch(struct task_struct *prev)
523{
524 cpu_entry_t* entry = this_cpu_ptr(&gsnedf_cpu_entries);
525
526 entry->scheduled = is_realtime(current) ? current : NULL;
527#ifdef WANT_ALL_SCHED_EVENTS
528 TRACE_TASK(prev, "switched away from\n");
529#endif
530}
531
532
533/* Prepare a task for running in RT mode
534 */
535static void gsnedf_task_new(struct task_struct * t, int on_rq, int is_scheduled)
536{
537 unsigned long flags;
538 cpu_entry_t* entry;
539
540 TRACE("gsn edf: task new %d\n", t->pid);
541
542 raw_spin_lock_irqsave(&gsnedf_lock, flags);
543
544 /* setup job params */
545 release_at(t, litmus_clock());
546
547 if (is_scheduled) {
548 entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
549 BUG_ON(entry->scheduled);
550
551#ifdef CONFIG_RELEASE_MASTER
552 if (entry->cpu != gsnedf.release_master) {
553#endif
554 entry->scheduled = t;
555 tsk_rt(t)->scheduled_on = task_cpu(t);
556#ifdef CONFIG_RELEASE_MASTER
557 } else {
558 /* do not schedule on release master */
559 preempt(entry); /* force resched */
560 tsk_rt(t)->scheduled_on = NO_CPU;
561 }
562#endif
563 } else {
564 t->rt_param.scheduled_on = NO_CPU;
565 }
566 t->rt_param.linked_on = NO_CPU;
567
568 if (on_rq || is_scheduled)
569 gsnedf_job_arrival(t);
570 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
571}
572
573static void gsnedf_task_wake_up(struct task_struct *task)
574{
575 unsigned long flags;
576 lt_t now;
577
578 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
579
580 raw_spin_lock_irqsave(&gsnedf_lock, flags);
581 now = litmus_clock();
582 if (is_sporadic(task) && is_tardy(task, now)) {
583 inferred_sporadic_job_release_at(task, now);
584 }
585 gsnedf_job_arrival(task);
586 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
587}
588
589static void gsnedf_task_block(struct task_struct *t)
590{
591 unsigned long flags;
592
593 TRACE_TASK(t, "block at %llu\n", litmus_clock());
594
595 /* unlink if necessary */
596 raw_spin_lock_irqsave(&gsnedf_lock, flags);
597 unlink(t);
598 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
599
600 BUG_ON(!is_realtime(t));
601}
602
603
604static void gsnedf_task_exit(struct task_struct * t)
605{
606 unsigned long flags;
607
608 /* unlink if necessary */
609 raw_spin_lock_irqsave(&gsnedf_lock, flags);
610 unlink(t);
611 if (tsk_rt(t)->scheduled_on != NO_CPU) {
612 gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
613 tsk_rt(t)->scheduled_on = NO_CPU;
614 }
615 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
616
617 BUG_ON(!is_realtime(t));
618 TRACE_TASK(t, "RIP\n");
619}
620
621
622static long gsnedf_admit_task(struct task_struct* tsk)
623{
624 return 0;
625}
626
627#ifdef CONFIG_LITMUS_LOCKING
628
629#include <litmus/fdso.h>
630
631/* called with IRQs off */
632static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh)
633{
634 int linked_on;
635 int check_preempt = 0;
636
637 raw_spin_lock(&gsnedf_lock);
638
639 TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid);
640 tsk_rt(t)->inh_task = prio_inh;
641
642 linked_on = tsk_rt(t)->linked_on;
643
644 /* If it is scheduled, then we need to reorder the CPU heap. */
645 if (linked_on != NO_CPU) {
646 TRACE_TASK(t, "%s: linked on %d\n",
647 __FUNCTION__, linked_on);
648 /* Holder is scheduled; need to re-order CPUs.
649 * We can't use heap_decrease() here since
650 * the cpu_heap is ordered in reverse direction, so
651 * it is actually an increase. */
652 bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap,
653 gsnedf_cpus[linked_on]->hn);
654 bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap,
655 gsnedf_cpus[linked_on]->hn);
656 } else {
657 /* holder may be queued: first stop queue changes */
658 raw_spin_lock(&gsnedf.release_lock);
659 if (is_queued(t)) {
660 TRACE_TASK(t, "%s: is queued\n",
661 __FUNCTION__);
662 /* We need to update the position of holder in some
663			 * heap. Note that this could be a release heap if
664			 * budget enforcement is used and this job overran. */
665 check_preempt =
666 !bheap_decrease(edf_ready_order,
667 tsk_rt(t)->heap_node);
668 } else {
669 /* Nothing to do: if it is not queued and not linked
670 * then it is either sleeping or currently being moved
671 * by other code (e.g., a timer interrupt handler) that
672 * will use the correct priority when enqueuing the
673 * task. */
674 TRACE_TASK(t, "%s: is NOT queued => Done.\n",
675 __FUNCTION__);
676 }
677 raw_spin_unlock(&gsnedf.release_lock);
678
679 /* If holder was enqueued in a release heap, then the following
680 * preemption check is pointless, but we can't easily detect
681 * that case. If you want to fix this, then consider that
682 * simply adding a state flag requires O(n) time to update when
683 * releasing n tasks, which conflicts with the goal to have
684 * O(log n) merges. */
685 if (check_preempt) {
686 /* heap_decrease() hit the top level of the heap: make
687 * sure preemption checks get the right task, not the
688 * potentially stale cache. */
689 bheap_uncache_min(edf_ready_order,
690 &gsnedf.ready_queue);
691 check_for_preemptions();
692 }
693 }
694
695 raw_spin_unlock(&gsnedf_lock);
696}
697
698/* called with IRQs off */
699static void clear_priority_inheritance(struct task_struct* t)
700{
701 raw_spin_lock(&gsnedf_lock);
702
703 /* A job only stops inheriting a priority when it releases a
704 * resource. Thus we can make the following assumption.*/
705 BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU);
706
707 TRACE_TASK(t, "priority restored\n");
708 tsk_rt(t)->inh_task = NULL;
709
710 /* Check if rescheduling is necessary. We can't use heap_decrease()
711 * since the priority was effectively lowered. */
712 unlink(t);
713 gsnedf_job_arrival(t);
714
715 raw_spin_unlock(&gsnedf_lock);
716}
717
718
719/* ******************** FMLP support ********************** */
720
721/* struct for semaphore with priority inheritance */
722struct fmlp_semaphore {
723 struct litmus_lock litmus_lock;
724
725 /* current resource holder */
726 struct task_struct *owner;
727
728 /* highest-priority waiter */
729 struct task_struct *hp_waiter;
730
731 /* FIFO queue of waiting tasks */
732 wait_queue_head_t wait;
733};
734
735static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
736{
737 return container_of(lock, struct fmlp_semaphore, litmus_lock);
738}
739
740/* caller is responsible for locking */
741struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem,
742 struct task_struct* skip)
743{
744 struct list_head *pos;
745 struct task_struct *queued, *found = NULL;
746
747 list_for_each(pos, &sem->wait.task_list) {
748 queued = (struct task_struct*) list_entry(pos, wait_queue_t,
749 task_list)->private;
750
751 /* Compare task prios, find high prio task. */
752 if (queued != skip && edf_higher_prio(queued, found))
753 found = queued;
754 }
755 return found;
756}
757
758int gsnedf_fmlp_lock(struct litmus_lock* l)
759{
760 struct task_struct* t = current;
761 struct fmlp_semaphore *sem = fmlp_from_lock(l);
762 wait_queue_t wait;
763 unsigned long flags;
764
765 if (!is_realtime(t))
766 return -EPERM;
767
768 /* prevent nested lock acquisition --- not supported by FMLP */
769 if (tsk_rt(t)->num_locks_held)
770 return -EBUSY;
771
772 spin_lock_irqsave(&sem->wait.lock, flags);
773
774 if (sem->owner) {
775 /* resource is not free => must suspend and wait */
776
777 init_waitqueue_entry(&wait, t);
778
779 /* FIXME: interruptible would be nice some day */
780 set_task_state(t, TASK_UNINTERRUPTIBLE);
781
782 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
783
784 /* check if we need to activate priority inheritance */
785 if (edf_higher_prio(t, sem->hp_waiter)) {
786 sem->hp_waiter = t;
787 if (edf_higher_prio(t, sem->owner))
788 set_priority_inheritance(sem->owner, sem->hp_waiter);
789 }
790
791 TS_LOCK_SUSPEND;
792
793 /* release lock before sleeping */
794 spin_unlock_irqrestore(&sem->wait.lock, flags);
795
796 /* We depend on the FIFO order. Thus, we don't need to recheck
797 * when we wake up; we are guaranteed to have the lock since
798 * there is only one wake up per release.
799 */
800
801 schedule();
802
803 TS_LOCK_RESUME;
804
805 /* Since we hold the lock, no other task will change
806 * ->owner. We can thus check it without acquiring the spin
807 * lock. */
808 BUG_ON(sem->owner != t);
809 } else {
810 /* it's ours now */
811 sem->owner = t;
812
813 spin_unlock_irqrestore(&sem->wait.lock, flags);
814 }
815
816 tsk_rt(t)->num_locks_held++;
817
818 return 0;
819}
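/* To summarize the locking protocol: lock() suspends the caller in FIFO
 * order if the semaphore is held and, if the caller becomes the
 * highest-priority waiter, lets the owner inherit its priority; unlock()
 * below hands ownership to the head of the FIFO queue and drops any
 * inherited priority. */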
820
821int gsnedf_fmlp_unlock(struct litmus_lock* l)
822{
823 struct task_struct *t = current, *next;
824 struct fmlp_semaphore *sem = fmlp_from_lock(l);
825 unsigned long flags;
826 int err = 0;
827
828 spin_lock_irqsave(&sem->wait.lock, flags);
829
830 if (sem->owner != t) {
831 err = -EINVAL;
832 goto out;
833 }
834
835 tsk_rt(t)->num_locks_held--;
836
837 /* check if there are jobs waiting for this resource */
838 next = __waitqueue_remove_first(&sem->wait);
839 if (next) {
840		/* next becomes the resource holder */
841 sem->owner = next;
842 TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid);
843
844 /* determine new hp_waiter if necessary */
845 if (next == sem->hp_waiter) {
846 TRACE_TASK(next, "was highest-prio waiter\n");
847 /* next has the highest priority --- it doesn't need to
848 * inherit. However, we need to make sure that the
849 * next-highest priority in the queue is reflected in
850 * hp_waiter. */
851 sem->hp_waiter = find_hp_waiter(sem, next);
852 if (sem->hp_waiter)
853 TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n");
854 else
855 TRACE("no further waiters\n");
856 } else {
857 /* Well, if next is not the highest-priority waiter,
858 * then it ought to inherit the highest-priority
859 * waiter's priority. */
860 set_priority_inheritance(next, sem->hp_waiter);
861 }
862
863 /* wake up next */
864 wake_up_process(next);
865 } else
866 /* becomes available */
867 sem->owner = NULL;
868
869 /* we lose the benefit of priority inheritance (if any) */
870 if (tsk_rt(t)->inh_task)
871 clear_priority_inheritance(t);
872
873out:
874 spin_unlock_irqrestore(&sem->wait.lock, flags);
875
876 return err;
877}
878
879int gsnedf_fmlp_close(struct litmus_lock* l)
880{
881 struct task_struct *t = current;
882 struct fmlp_semaphore *sem = fmlp_from_lock(l);
883 unsigned long flags;
884
885 int owner;
886
887 spin_lock_irqsave(&sem->wait.lock, flags);
888
889 owner = sem->owner == t;
890
891 spin_unlock_irqrestore(&sem->wait.lock, flags);
892
893 if (owner)
894 gsnedf_fmlp_unlock(l);
895
896 return 0;
897}
898
899void gsnedf_fmlp_free(struct litmus_lock* lock)
900{
901 kfree(fmlp_from_lock(lock));
902}
903
904static struct litmus_lock_ops gsnedf_fmlp_lock_ops = {
905 .close = gsnedf_fmlp_close,
906 .lock = gsnedf_fmlp_lock,
907 .unlock = gsnedf_fmlp_unlock,
908 .deallocate = gsnedf_fmlp_free,
909};
910
911static struct litmus_lock* gsnedf_new_fmlp(void)
912{
913 struct fmlp_semaphore* sem;
914
915 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
916 if (!sem)
917 return NULL;
918
919 sem->owner = NULL;
920 sem->hp_waiter = NULL;
921 init_waitqueue_head(&sem->wait);
922 sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops;
923
924 return &sem->litmus_lock;
925}
926
927/* **** lock constructor **** */
928
929
930static long gsnedf_allocate_lock(struct litmus_lock **lock, int type,
931 void* __user unused)
932{
933 int err = -ENXIO;
934
935 /* GSN-EDF currently only supports the FMLP for global resources. */
936 switch (type) {
937
938 case FMLP_SEM:
939 /* Flexible Multiprocessor Locking Protocol */
940 *lock = gsnedf_new_fmlp();
941 if (*lock)
942 err = 0;
943 else
944 err = -ENOMEM;
945 break;
946
947 };
948
949 return err;
950}
951
952#endif
953
954static struct domain_proc_info gsnedf_domain_proc_info;
955static long gsnedf_get_domain_proc_info(struct domain_proc_info **ret)
956{
957 *ret = &gsnedf_domain_proc_info;
958 return 0;
959}
960
961static void gsnedf_setup_domain_proc(void)
962{
963 int i, cpu;
964 int release_master =
965#ifdef CONFIG_RELEASE_MASTER
966 atomic_read(&release_master_cpu);
967#else
968 NO_CPU;
969#endif
970 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
971 struct cd_mapping *map;
972
973 memset(&gsnedf_domain_proc_info, 0, sizeof(gsnedf_domain_proc_info));
974 init_domain_proc_info(&gsnedf_domain_proc_info, num_rt_cpus, 1);
975 gsnedf_domain_proc_info.num_cpus = num_rt_cpus;
976 gsnedf_domain_proc_info.num_domains = 1;
977
978 gsnedf_domain_proc_info.domain_to_cpus[0].id = 0;
979 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
980 if (cpu == release_master)
981 continue;
982 map = &gsnedf_domain_proc_info.cpu_to_domains[i];
983 map->id = cpu;
984 cpumask_set_cpu(0, map->mask);
985 ++i;
986
987 /* add cpu to the domain */
988 cpumask_set_cpu(cpu,
989 gsnedf_domain_proc_info.domain_to_cpus[0].mask);
990 }
991}
992
993static long gsnedf_activate_plugin(void)
994{
995 int cpu;
996 cpu_entry_t *entry;
997
998 bheap_init(&gsnedf_cpu_heap);
999#ifdef CONFIG_RELEASE_MASTER
1000 gsnedf.release_master = atomic_read(&release_master_cpu);
1001#endif
1002
1003 for_each_online_cpu(cpu) {
1004 entry = &per_cpu(gsnedf_cpu_entries, cpu);
1005 bheap_node_init(&entry->hn, entry);
1006 entry->linked = NULL;
1007 entry->scheduled = NULL;
1008#ifdef CONFIG_RELEASE_MASTER
1009 if (cpu != gsnedf.release_master) {
1010#endif
1011 TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu);
1012 update_cpu_position(entry);
1013#ifdef CONFIG_RELEASE_MASTER
1014 } else {
1015 TRACE("GSN-EDF: CPU %d is release master.\n", cpu);
1016 }
1017#endif
1018 }
1019
1020 gsnedf_setup_domain_proc();
1021
1022 return 0;
1023}
1024
1025static long gsnedf_deactivate_plugin(void)
1026{
1027 destroy_domain_proc_info(&gsnedf_domain_proc_info);
1028 return 0;
1029}
1030
1031/* Plugin object */
1032static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
1033 .plugin_name = "GSN-EDF",
1034 .finish_switch = gsnedf_finish_switch,
1035 .task_new = gsnedf_task_new,
1036 .complete_job = complete_job,
1037 .task_exit = gsnedf_task_exit,
1038 .schedule = gsnedf_schedule,
1039 .task_wake_up = gsnedf_task_wake_up,
1040 .task_block = gsnedf_task_block,
1041 .admit_task = gsnedf_admit_task,
1042 .activate_plugin = gsnedf_activate_plugin,
1043 .deactivate_plugin = gsnedf_deactivate_plugin,
1044 .get_domain_proc_info = gsnedf_get_domain_proc_info,
1045#ifdef CONFIG_LITMUS_LOCKING
1046 .allocate_lock = gsnedf_allocate_lock,
1047#endif
1048};
1049
1050
1051static int __init init_gsn_edf(void)
1052{
1053 int cpu;
1054 cpu_entry_t *entry;
1055
1056 bheap_init(&gsnedf_cpu_heap);
1057 /* initialize CPU state */
1058 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1059 entry = &per_cpu(gsnedf_cpu_entries, cpu);
1060 gsnedf_cpus[cpu] = entry;
1061 entry->cpu = cpu;
1062 entry->hn = &gsnedf_heap_node[cpu];
1063 bheap_node_init(&entry->hn, entry);
1064 }
1065 edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
1066 return register_sched_plugin(&gsn_edf_plugin);
1067}
1068
1069
1070module_init(init_gsn_edf);
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..f66488dc6a12
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1231 @@
1/*
2 * litmus/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/debug_trace.h>
19#include <litmus/litmus.h>
20#include <litmus/jobs.h>
21#include <litmus/preempt.h>
22#include <litmus/rt_domain.h>
23#include <litmus/sched_plugin.h>
24#include <litmus/sched_trace.h>
25#include <litmus/trace.h>
26
27#include <litmus/bheap.h>
28
29/* to configure the cluster size */
30#include <litmus/litmus_proc.h>
31
32#include <litmus/clustered.h>
33
34static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
35
36struct subtask {
37 /* measured in quanta relative to job release */
38 quanta_t release;
39 quanta_t deadline;
40 quanta_t overlap; /* called "b bit" by PD^2 */
41 quanta_t group_deadline;
42};
43
44struct pfair_param {
45 quanta_t quanta; /* number of subtasks */
46 quanta_t cur; /* index of current subtask */
47
48 quanta_t release; /* in quanta */
49 quanta_t period; /* in quanta */
50
51 quanta_t last_quantum; /* when scheduled last */
52 int last_cpu; /* where scheduled last */
53
54 unsigned int needs_requeue:1;
55
56 struct pfair_cluster* cluster; /* where this task is scheduled */
57
58 struct subtask subtasks[0]; /* allocate together with pfair_param */
59};
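/* Example (a sketch using the standard PD^2 windowing rules): a task that
 * needs 2 quanta of execution every 5 quanta (weight 2/5) is split into
 * two subtasks with windows, relative to the job release,
 *	subtask 0: release 0, deadline 3, b-bit 1
 *	subtask 1: release 2, deadline 5, b-bit 0
 * and group deadline 0, as group deadlines only matter for heavy tasks. */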
60
61#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
62
63struct pfair_state {
64 struct cluster_cpu topology;
65
66 struct hrtimer quantum_timer;
67
68 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
69 * the time */
70 volatile quanta_t local_tick; /* What tick is the local CPU currently
71 * executing? Updated only by the local
72 * CPU. In QEMU, this may lag behind the
73 * current tick. In a real system, with
74 * proper timers and aligned quanta,
75 * that should only be the case for a
76 * very short time after the time
77 * advanced. With staggered quanta, it
78 * will lag for the duration of the
79 * offset.
80 */
81
82 struct task_struct* linked; /* the task that should be executing */
83 struct task_struct* local; /* the local copy of linked */
84 struct task_struct* scheduled; /* what is actually scheduled */
85
86 struct list_head out_of_budget; /* list of tasks that exhausted their allocation */
87
88 lt_t offset; /* stagger offset */
89 unsigned int missed_updates;
90 unsigned int missed_quanta;
91};
92
93struct pfair_cluster {
94 struct scheduling_cluster topology;
95
96 /* The "global" time in this cluster. */
97 quanta_t pfair_time; /* the "official" PFAIR clock */
98
99 /* The ready queue for this cluster. */
100 rt_domain_t pfair;
101
102 /* The set of jobs that should have their release enacted at the next
103 * quantum boundary.
104 */
105 struct bheap release_queue;
106 raw_spinlock_t release_lock;
107};
108
109static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
110{
111 return container_of(state->topology.cluster, struct pfair_cluster, topology);
112}
113
114static inline int cpu_id(struct pfair_state* state)
115{
116 return state->topology.id;
117}
118
119static inline struct pfair_state* from_cluster_list(struct list_head* pos)
120{
121 return list_entry(pos, struct pfair_state, topology.cluster_list);
122}
123
124static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
125{
126 return container_of(rt, struct pfair_cluster, pfair);
127}
128
129static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
130{
131 /* The ready_lock is used to serialize all scheduling events. */
132 return &cluster->pfair.ready_lock;
133}
134
135static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
136{
137 return cluster_lock(cpu_cluster(state));
138}
139
140DEFINE_PER_CPU(struct pfair_state, pfair_state);
141struct pfair_state* *pstate; /* short cut */
142
143static struct pfair_cluster* pfair_clusters;
144static int num_pfair_clusters;
145
146/* Enable for lots of trace info.
147 * #define PFAIR_DEBUG
148 */
149
150#ifdef PFAIR_DEBUG
151#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
152#define PTRACE(f, args...) TRACE(f, ## args)
153#else
154#define PTRACE_TASK(t, f, args...)
155#define PTRACE(f, args...)
156#endif
157
158/* gcc will inline all of these accessor functions... */
159static struct subtask* cur_subtask(struct task_struct* t)
160{
161 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
162}
163
164static quanta_t cur_deadline(struct task_struct* t)
165{
166 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
167}
168
169static quanta_t cur_release(struct task_struct* t)
170{
171 /* This is early releasing: only the release of the first subtask
172 * counts. */
173 return tsk_pfair(t)->release;
174}
175
176static quanta_t cur_overlap(struct task_struct* t)
177{
178 return cur_subtask(t)->overlap;
179}
180
181static quanta_t cur_group_deadline(struct task_struct* t)
182{
183 quanta_t gdl = cur_subtask(t)->group_deadline;
184 if (gdl)
185 return gdl + tsk_pfair(t)->release;
186 else
187 return gdl;
188}
189
190
191static int pfair_higher_prio(struct task_struct* first,
192 struct task_struct* second)
193{
194 return /* first task must exist */
195 first && (
196 /* Does the second task exist and is it a real-time task? If
197 * not, the first task (which is a RT task) has higher
198 * priority.
199 */
200 !second || !is_realtime(second) ||
201
202 /* Is the (subtask) deadline of the first task earlier?
203 * Then it has higher priority.
204 */
205 time_before(cur_deadline(first), cur_deadline(second)) ||
206
207 /* Do we have a deadline tie?
208 * Then break by B-bit.
209 */
210 (cur_deadline(first) == cur_deadline(second) &&
211 (cur_overlap(first) > cur_overlap(second) ||
212
213 /* Do we have a B-bit tie?
214 * Then break by group deadline.
215 */
216 (cur_overlap(first) == cur_overlap(second) &&
217 (time_after(cur_group_deadline(first),
218 cur_group_deadline(second)) ||
219
220 /* Do we have a group deadline tie?
221			 * Then break by PID, which is unique.
222 */
223 (cur_group_deadline(first) ==
224 cur_group_deadline(second) &&
225 first->pid < second->pid))))));
226}
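/* In short, PD^2 priority is decided by, in this order: earlier subtask
 * deadline, then larger b-bit, then later group deadline, and finally
 * smaller PID as the unique tie-breaker. */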
227
228int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
229{
230 return pfair_higher_prio(bheap2task(a), bheap2task(b));
231}
232
233static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
234{
235 struct pfair_cluster* cluster = from_domain(rt);
236 unsigned long flags;
237
238 raw_spin_lock_irqsave(&cluster->release_lock, flags);
239
240 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
241
242 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
243}
244
245static void prepare_release(struct task_struct* t, quanta_t at)
246{
247 tsk_pfair(t)->release = at;
248 tsk_pfair(t)->cur = 0;
249}
250
251/* pull released tasks from the release queue */
252static void poll_releases(struct pfair_cluster* cluster)
253{
254 raw_spin_lock(&cluster->release_lock);
255 __merge_ready(&cluster->pfair, &cluster->release_queue);
256 raw_spin_unlock(&cluster->release_lock);
257}
258
259static void check_preempt(struct task_struct* t)
260{
261 int cpu = NO_CPU;
262 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
263 is_present(t)) {
264 /* the task can be scheduled and
265 * is not scheduled where it ought to be scheduled
266 */
267 cpu = tsk_rt(t)->linked_on != NO_CPU ?
268 tsk_rt(t)->linked_on :
269 tsk_rt(t)->scheduled_on;
270 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
271 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
272 /* preempt */
273 litmus_reschedule(cpu);
274 }
275}
276
277/* caller must hold pfair.ready_lock */
278static void drop_all_references(struct task_struct *t)
279{
280 int cpu;
281 struct pfair_state* s;
282 struct pfair_cluster* cluster;
283 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
284		/* It must be in the ready queue; drop_all_references() isn't
285		 * called when the job is in a release queue. */
286 cluster = tsk_pfair(t)->cluster;
287 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
288 tsk_rt(t)->heap_node);
289 }
290 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
291 s = &per_cpu(pfair_state, cpu);
292 if (s->linked == t)
293 s->linked = NULL;
294 if (s->local == t)
295 s->local = NULL;
296 if (s->scheduled == t)
297 s->scheduled = NULL;
298 }
299 /* make sure we don't have a stale linked_on field */
300 tsk_rt(t)->linked_on = NO_CPU;
301
302 /* make sure we're not queued for re-releasing */
303 if (in_list(&tsk_rt(t)->list))
304 {
305 TRACE_TASK(t, "removing from out_of_budget queue\n");
306 list_del(&tsk_rt(t)->list);
307 }
308}
309
310static void pfair_prepare_next_period(struct task_struct* t)
311{
312 struct pfair_param* p = tsk_pfair(t);
313
314 prepare_for_next_period(t);
315 tsk_rt(t)->completed = 0;
316 p->release = time2quanta(get_release(t), CEIL);
317}
318
319/* returns 1 if the task needs to go the release queue */
320static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
321{
322 struct pfair_param* p = tsk_pfair(t);
323 int to_relq;
324 p->cur = (p->cur + 1) % p->quanta;
325 if (!p->cur) {
326 if (is_present(t)) {
327 /* The job overran; we start a new budget allocation. */
328 TRACE_TASK(t, "overran budget, preparing next period\n");
329 sched_trace_task_completion(t, 1);
330 pfair_prepare_next_period(t);
331 } else {
332 /* remove task from system until it wakes */
333 drop_all_references(t);
334 p->needs_requeue = 1;
335 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
336 cpu, p->cur);
337 return 0;
338 }
339 }
340 to_relq = time_after(cur_release(t), time);
341 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d "
342 "(cur_release:%lu time:%lu present:%d on_cpu=%d)\n",
343 cpu, p->cur, to_relq, cur_release(t), time,
344 tsk_rt(t)->present, tsk_rt(t)->scheduled_on);
345 return to_relq;
346}
347
348static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
349{
350 struct task_struct* l;
351 struct pfair_param* p;
352 struct list_head* pos;
353 struct pfair_state* cpu;
354
355 list_for_each(pos, &cluster->topology.cpus) {
356 cpu = from_cluster_list(pos);
357 l = cpu->linked;
358 cpu->missed_updates += cpu->linked != cpu->local;
359 if (l) {
360 p = tsk_pfair(l);
361 p->last_quantum = time;
362 p->last_cpu = cpu_id(cpu);
363 if (advance_subtask(time, l, cpu_id(cpu))) {
364 cpu->linked = NULL;
365 tsk_rt(l)->linked_on = NO_CPU;
366 PTRACE_TASK(l, "should go to release queue. "
367 "scheduled_on=%d present=%d\n",
368 tsk_rt(l)->scheduled_on,
369 tsk_rt(l)->present);
370 list_add(&tsk_rt(l)->list, &cpu->out_of_budget);
371 }
372 }
373 }
374}
375
376static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
377{
378 int cpu;
379 if (tsk_rt(t)->scheduled_on != NO_CPU) {
380 /* always observe scheduled_on linkage */
381 default_cpu = tsk_rt(t)->scheduled_on;
382 } else if (tsk_pfair(t)->last_quantum == time - 1) {
383 /* back2back quanta */
384 /* Only observe last_quantum if no scheduled_on is in the way.
385 * This should only kick in if a CPU missed quanta, and that
386 * *should* only happen in QEMU.
387 */
388 cpu = tsk_pfair(t)->last_cpu;
389 if (!pstate[cpu]->linked ||
390 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
391 default_cpu = cpu;
392 }
393 }
394 return default_cpu;
395}
396
397/* returns one if linking was redirected */
398static int pfair_link(quanta_t time, int cpu,
399 struct task_struct* t)
400{
401 int target = target_cpu(time, t, cpu);
402 struct task_struct* prev = pstate[cpu]->linked;
403 struct task_struct* other;
404 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
405
406 if (target != cpu) {
407 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
408 other = pstate[target]->linked;
409 pstate[target]->linked = t;
410 tsk_rt(t)->linked_on = target;
411 if (!other)
412 /* linked ok, but reschedule this CPU */
413 return 1;
414 if (target < cpu) {
415 /* link other to cpu instead */
416 tsk_rt(other)->linked_on = cpu;
417 pstate[cpu]->linked = other;
418 if (prev) {
419 /* prev got pushed back into the ready queue */
420 tsk_rt(prev)->linked_on = NO_CPU;
421 __add_ready(&cluster->pfair, prev);
422 }
423 /* we are done with this cpu */
424 return 0;
425 } else {
426			/* re-add other, its original CPU was not considered yet */
427 tsk_rt(other)->linked_on = NO_CPU;
428 __add_ready(&cluster->pfair, other);
429 /* reschedule this CPU */
430 return 1;
431 }
432 } else {
433 pstate[cpu]->linked = t;
434 tsk_rt(t)->linked_on = cpu;
435 if (prev) {
436 /* prev got pushed back into the ready queue */
437 tsk_rt(prev)->linked_on = NO_CPU;
438 __add_ready(&cluster->pfair, prev);
439 }
440 /* we are done with this CPU */
441 return 0;
442 }
443}
444
445static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
446{
447 int retry;
448 struct list_head *pos;
449 struct pfair_state *cpu_state;
450
451 list_for_each(pos, &cluster->topology.cpus) {
452 cpu_state = from_cluster_list(pos);
453 retry = 1;
454#ifdef CONFIG_RELEASE_MASTER
455 /* skip release master */
456 if (cluster->pfair.release_master == cpu_id(cpu_state))
457 continue;
458#endif
459 while (retry) {
460 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
461 cpu_state->linked))
462 retry = pfair_link(time, cpu_id(cpu_state),
463 __take_ready(&cluster->pfair));
464 else
465 retry = 0;
466 }
467 }
468}
469
470static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
471{
472 struct pfair_state *cpu;
473 struct list_head* pos;
474
475 /* called with interrupts disabled */
476 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
477 time, litmus_clock());
478 raw_spin_lock(cluster_lock(cluster));
479 PTRACE("<<< Q %lu at %llu\n",
480 time, litmus_clock());
481
482 sched_trace_quantum_boundary();
483
484 advance_subtasks(cluster, time);
485 poll_releases(cluster);
486 schedule_subtasks(cluster, time);
487
488 list_for_each(pos, &cluster->topology.cpus) {
489 cpu = from_cluster_list(pos);
490 if (cpu->linked)
491 PTRACE_TASK(cpu->linked,
492 " linked on %d.\n", cpu_id(cpu));
493 else
494 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
495 }
496 /* We are done. Advance time. */
497 mb();
498 list_for_each(pos, &cluster->topology.cpus) {
499 cpu = from_cluster_list(pos);
500 if (cpu->local_tick != cpu->cur_tick) {
501 TRACE("BAD Quantum not acked on %d "
502 "(l:%lu c:%lu p:%lu)\n",
503 cpu_id(cpu),
504 cpu->local_tick,
505 cpu->cur_tick,
506 cluster->pfair_time);
507 cpu->missed_quanta++;
508 }
509 cpu->cur_tick = time;
510 }
511 PTRACE(">>> Q %lu at %llu\n",
512 time, litmus_clock());
513 raw_spin_unlock(cluster_lock(cluster));
514}
515
516static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
517{
518 quanta_t loc;
519
520 goto first; /* skip mb() on first iteration */
521 do {
522 cpu_relax();
523 mb();
524 first: loc = state->cur_tick;
525 /* FIXME: what if loc > cur? */
526 } while (time_before(loc, q));
527 PTRACE("observed cur_tick:%lu >= q:%lu\n",
528 loc, q);
529}
530
531static quanta_t current_quantum(struct pfair_state* state)
532{
533 lt_t t = litmus_clock() - state->offset;
534 return time2quanta(t, FLOOR);
535}
536
537static void catchup_quanta(quanta_t from, quanta_t target,
538 struct pfair_state* state)
539{
540 quanta_t cur = from, time;
541 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
542 from, target);
543 while (time_before(cur, target)) {
544 wait_for_quantum(cur, state);
545 cur++;
546 time = cmpxchg(&cpu_cluster(state)->pfair_time,
547 cur - 1, /* expected */
548 cur /* next */
549 );
550 if (time == cur - 1)
551 schedule_next_quantum(cpu_cluster(state), cur);
552 }
553 TRACE("+++> catching up done\n");
554}
555
556/* pfair_tick - this function is called on every quantum boundary
557 * (from the per-CPU quantum timer).
558 */
559static void pfair_tick(struct task_struct* t)
560{
561 struct pfair_state* state = this_cpu_ptr(&pfair_state);
562 quanta_t time, cur;
563 int retry = 10;
564
565 do {
566 cur = current_quantum(state);
567 PTRACE("q %lu at %llu\n", cur, litmus_clock());
568
569 /* Attempt to advance time. First CPU to get here
570 * will prepare the next quantum.
571 */
572 time = cpu_cluster(state)->pfair_time;
573 if (time == cur - 1)
574 {
575 /* looks good, see if we can advance the time */
576 time = cmpxchg(&cpu_cluster(state)->pfair_time,
577 cur - 1, /* expected */
578 cur /* next */
579 );
580 }
581
582 if (time == cur - 1) {
583 /* exchange succeeded */
584 wait_for_quantum(cur - 1, state);
585 schedule_next_quantum(cpu_cluster(state), cur);
586 retry = 0;
587 } else if (time_before(time, cur - 1)) {
588 /* the whole system missed a tick !? */
589 catchup_quanta(time, cur, state);
590 retry--;
591 } else if (time_after(time, cur)) {
592 /* our timer lagging behind!? */
593 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
594 retry--;
595 } else {
596 /* Some other CPU already started scheduling
597 * this quantum. Let it do its job and then update.
598 */
599 retry = 0;
600 }
601 } while (retry);
602
603 /* Spin locally until time advances. */
604 wait_for_quantum(cur, state);
605
606 /* copy assignment */
607 /* FIXME: what if we race with a future update? Corrupted state? */
608 state->local = state->linked;
609 /* signal that we are done */
610 mb();
611 state->local_tick = state->cur_tick;
612
613 if (state->local != current
614 && (is_realtime(current) || is_present(state->local)))
615 litmus_reschedule_local();
616}
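/* A minimal userspace sketch of the "first CPU to get here prepares the
 * next quantum" pattern used in pfair_tick() above, assuming C11 atomics
 * in place of the kernel's cmpxchg() (which returns the old value rather
 * than a boolean, but the winner test is the same idea).
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long pfair_time_sketch = 41;

/* returns 1 iff this caller won the right to set up quantum `cur' */
static int try_advance(unsigned long cur)
{
	unsigned long expected = cur - 1;
	return atomic_compare_exchange_strong(&pfair_time_sketch,
					      &expected, cur);
}

int main(void)
{
	/* two CPUs racing to advance from quantum 41 to 42: only one wins */
	printf("first attempt wins:  %d\n", try_advance(42)); /* 1 */
	printf("second attempt wins: %d\n", try_advance(42)); /* 0 */
	return 0;
}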
617
618static void process_out_of_budget_tasks(
619 struct pfair_state* state,
620 struct task_struct* prev,
621 unsigned int blocks)
622{
623 struct task_struct *t;
624
625 while (!list_empty(&state->out_of_budget))
626 {
627
628 t = list_first_entry(&state->out_of_budget,
629 struct task_struct, rt_param.list);
630 TRACE_TASK(t, "found on out_of_budget queue is_prev=%d\n", t == prev);
631 list_del(&tsk_rt(t)->list);
632 if (t != prev || !blocks)
633 {
634 if (time_after(cur_release(t), state->local_tick)) {
635 TRACE_TASK(t, "adding to release queue (budget exhausted)\n");
636 add_release(&cpu_cluster(state)->pfair, t);
637 } else {
638 TRACE_TASK(t, "adding to ready queue (budget exhausted)\n");
639 sched_trace_task_release(t);
640 __add_ready(&cpu_cluster(state)->pfair, t);
641 }
642 } else {
643 TRACE_TASK(t, "not added to release queue (blocks=%d)\n", blocks);
644 tsk_pfair(t)->needs_requeue = 1;
645 }
646 if (unlikely(state->local == t)) {
647 TRACE_TASK(t, "still linked as ->local, cleaning up\n");
648 state->local = NULL;
649 }
650 }
651}
652
653/* Custom scheduling tick: called on each quantum boundary. */
654static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
655{
656 TS_QUANTUM_BOUNDARY_START;
657
658 pfair_tick(current);
659 hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);
660
661 TS_QUANTUM_BOUNDARY_END;
662 return HRTIMER_RESTART;
663}
664
665static int safe_to_schedule(struct task_struct* t, int cpu)
666{
667 int where = tsk_rt(t)->scheduled_on;
668 if (where != NO_CPU && where != cpu) {
669 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
670 "scheduled already on %d.\n", cpu, where);
671 return 0;
672 } else
673 return is_present(t) && !is_completed(t);
674}
675
676static struct task_struct* pfair_schedule(struct task_struct * prev)
677{
678 struct pfair_state* state = this_cpu_ptr(&pfair_state);
679 struct pfair_cluster* cluster = cpu_cluster(state);
680 int blocks, completion, out_of_time;
681 struct task_struct* next = NULL;
682
683#ifdef CONFIG_RELEASE_MASTER
684 /* Bail out early if we are the release master.
685 * The release master never schedules any real-time tasks.
686 */
687 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
688 goto out;
689 }
690#endif
691
692 raw_spin_lock(cpu_lock(state));
693
694 blocks = is_realtime(prev) && !is_current_running();
695 completion = is_realtime(prev) && is_completed(prev);
696 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
697 state->local_tick);
698
699 if (is_realtime(prev))
700 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
701 blocks, completion, out_of_time);
702
703 if (completion && !out_of_time) {
704 sched_trace_task_completion(prev, 0);
705 pfair_prepare_next_period(prev);
706 prepare_release(prev, cur_release(prev));
707 drop_all_references(prev);
708 list_add(&tsk_rt(prev)->list, &state->out_of_budget);
709 }
710
711 process_out_of_budget_tasks(state, prev, blocks);
712
713 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
714 next = state->local;
715
716 if (prev != next) {
717 tsk_rt(prev)->scheduled_on = NO_CPU;
718 if (next)
719 tsk_rt(next)->scheduled_on = cpu_id(state);
720 }
721 sched_state_task_picked();
722 raw_spin_unlock(cpu_lock(state));
723
724 if (next)
725 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
726 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
727 else if (is_realtime(prev))
728 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
729
730#ifdef CONFIG_RELEASE_MASTER
731out:
732#endif
733
734 if (unlikely(!hrtimer_active(&state->quantum_timer))) {
735 TRACE("activating quantum timer start=%llu\n",
736 hrtimer_get_expires(&state->quantum_timer));
737 hrtimer_start(&state->quantum_timer,
738 hrtimer_get_expires(&state->quantum_timer),
739 HRTIMER_MODE_ABS);
740 }
741
742 return next;
743}
744
745static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
746{
747 unsigned long flags;
748 struct pfair_cluster* cluster;
749
750 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
751
752 cluster = tsk_pfair(t)->cluster;
753
754 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
755
756 prepare_release(t, cluster->pfair_time + 1);
757 release_at(t, quanta2time(cur_release(t)));
758
759 t->rt_param.scheduled_on = NO_CPU;
760 t->rt_param.linked_on = NO_CPU;
761
762 if (is_scheduled) {
763#ifdef CONFIG_RELEASE_MASTER
764 if (task_cpu(t) != cluster->pfair.release_master)
765#endif
766 t->rt_param.scheduled_on = task_cpu(t);
767 }
768
769 if (on_rq || is_scheduled) {
770 tsk_rt(t)->present = 1;
771 __add_ready(&cluster->pfair, t);
772 } else {
773 tsk_rt(t)->present = 0;
774 tsk_pfair(t)->needs_requeue = 1;
775 }
776
777 check_preempt(t);
778
779 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
780}
781
782static void pfair_task_wake_up(struct task_struct *t)
783{
784 unsigned long flags;
785 lt_t now;
786 struct pfair_cluster* cluster;
787 struct pfair_state* state;
788 int sporadic_release = 0;
789
790 cluster = tsk_pfair(t)->cluster;
791
792 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
793 litmus_clock(), cur_release(t), cluster->pfair_time);
794
795 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
796
797 state = this_cpu_ptr(&pfair_state);
798
799 /* If a task blocks and wakes before its next job release,
800 * then it may resume if it is currently linked somewhere
801 * (as if it never blocked at all). Otherwise, we have a
802 * new sporadic job release.
803 */
804 now = litmus_clock();
805 if (is_tardy(t, now)) {
806 TRACE_TASK(t, "sporadic release!\n");
807 sporadic_release = 1;
808 inferred_sporadic_job_release_at(t, now);
809 prepare_release(t, time2quanta(now, CEIL));
810 }
811
812 /* only add to ready queue if the task isn't still linked somewhere */
813 if (tsk_pfair(t)->needs_requeue) {
814 tsk_pfair(t)->needs_requeue = 0;
815 TRACE_TASK(t, "requeueing required (released:%d)\n",
816 !time_after(cur_release(t), state->local_tick));
817 tsk_rt(t)->completed = 0;
818 if (time_after(cur_release(t), state->local_tick)
819 && !sporadic_release)
820 add_release(&cluster->pfair, t);
821 else
822 __add_ready(&cluster->pfair, t);
823 }
824
825 check_preempt(t);
826
827 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
828 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
829}
830
831static void pfair_task_block(struct task_struct *t)
832{
833 BUG_ON(!is_realtime(t));
834 TRACE_TASK(t, "blocks at %llu, state:%d\n",
835 litmus_clock(), t->state);
836}
837
838static void pfair_task_exit(struct task_struct * t)
839{
840 unsigned long flags;
841 struct pfair_cluster *cluster;
842
843 BUG_ON(!is_realtime(t));
844
845 cluster = tsk_pfair(t)->cluster;
846
847	/* Remove task from release or ready queue, and ensure
848	 * that it is not the scheduled task for ANY CPU. We
849	 * do this blanket check because occasionally when
850 * tasks exit while blocked, the task_cpu of the task
851 * might not be the same as the CPU that the PFAIR scheduler
852 * has chosen for it.
853 */
854 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
855
856 TRACE_TASK(t, "RIP, state:%d\n", t->state);
857 drop_all_references(t);
858
859 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
860
861 kfree(t->rt_param.pfair);
862 t->rt_param.pfair = NULL;
863}
864
865static void init_subtask(struct subtask* sub, unsigned long i,
866 lt_t quanta, lt_t period)
867{
868 /* since i is zero-based, the formulas are shifted by one */
869 lt_t tmp;
870
871 /* release */
872 tmp = period * i;
873 do_div(tmp, quanta); /* floor */
874 sub->release = (quanta_t) tmp;
875
876 /* deadline */
877 tmp = period * (i + 1);
878 if (do_div(tmp, quanta)) /* ceil */
879 tmp++;
880 sub->deadline = (quanta_t) tmp;
881
882 /* next release */
883 tmp = period * (i + 1);
884 do_div(tmp, quanta); /* floor */
885 sub->overlap = sub->deadline - (quanta_t) tmp;
886
887 /* Group deadline.
888 * Based on the formula given in Uma's thesis.
889 */
890 if (2 * quanta >= period) {
891 /* heavy */
892 tmp = (sub->deadline - (i + 1)) * period;
893 if (period > quanta &&
894 do_div(tmp, (period - quanta))) /* ceil */
895 tmp++;
896 sub->group_deadline = (quanta_t) tmp;
897 } else
898 sub->group_deadline = 0;
899}
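/* A minimal standalone sketch of the windowing formulas above, assuming
 * `quanta' (execution requirement) and `period' are both already
 * expressed in quanta and using plain integer arithmetic instead of
 * do_div(); "heavy" means a weight of at least 1/2.
 */
#include <stdio.h>

static void sketch_subtask(unsigned long i, unsigned long quanta,
			   unsigned long period)
{
	unsigned long release  = (period * i) / quanta;                    /* floor */
	unsigned long deadline = (period * (i + 1) + quanta - 1) / quanta; /* ceil  */
	unsigned long overlap  = deadline - (period * (i + 1)) / quanta;   /* b-bit */
	unsigned long gdl = 0;

	if (2 * quanta >= period && period > quanta) {                     /* heavy */
		unsigned long tmp = (deadline - (i + 1)) * period;
		gdl = (tmp + (period - quanta) - 1) / (period - quanta);   /* ceil  */
	}
	printf("subtask %lu: rel=%lu dl=%lu b=%lu gdl=%lu\n",
	       i + 1, release, deadline, overlap, gdl);
}

int main(void)
{
	/* e.g., a task needing 3 quanta every 5 quanta (weight 3/5, heavy):
	 * prints rel/dl pairs 0/2, 1/4, 3/5 with b-bits 1, 1, 0 */
	unsigned long i;
	for (i = 0; i < 3; i++)
		sketch_subtask(i, 3, 5);
	return 0;
}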
900
901static void dump_subtasks(struct task_struct* t)
902{
903 unsigned long i;
904 for (i = 0; i < t->rt_param.pfair->quanta; i++)
905 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
906 i + 1,
907 t->rt_param.pfair->subtasks[i].release,
908 t->rt_param.pfair->subtasks[i].deadline,
909 t->rt_param.pfair->subtasks[i].overlap,
910 t->rt_param.pfair->subtasks[i].group_deadline);
911}
912
913static long pfair_admit_task(struct task_struct* t)
914{
915 lt_t quanta;
916 lt_t period;
917 s64 quantum_length = LITMUS_QUANTUM_LENGTH_NS;
918 struct pfair_param* param;
919 unsigned long i;
920
921 /* first check that the task is in the right cluster */
922 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
923 cpu_cluster(pstate[task_cpu(t)]))
924 return -EINVAL;
925
926 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
927 printk(KERN_INFO "%s: Admission rejected. "
928 "Only implicit deadlines are currently supported.\n",
929 litmus->plugin_name);
930 return -EINVAL;
931 }
932
933 /* Pfair is a tick-based scheduler, so the unit of time
934 * is one quantum. Calculate quantum-based parameters for everything.
935 * (Ceiling of exec cost, floor of period.)
936 */
937
938 quanta = get_exec_cost(t);
939 period = get_rt_period(t);
940
941 quanta = time2quanta(get_exec_cost(t), CEIL);
942
943 if (do_div(period, quantum_length))
944 printk(KERN_WARNING
945 "The period of %s/%d is not a multiple of %llu.\n",
946 t->comm, t->pid, (unsigned long long) quantum_length);
947
948 if (quanta == period) {
949 PTRACE_TASK(t, "Admitting weight 1.0 task. (%llu, %llu).\n", quanta, period);
950 }
951
952 param = kzalloc(sizeof(*param) +
953 quanta * sizeof(struct subtask), GFP_ATOMIC);
954
955 if (!param)
956 return -ENOMEM;
957
958 param->quanta = quanta;
959 param->period = period;
960
961 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
962
963 for (i = 0; i < quanta; i++)
964 init_subtask(param->subtasks + i, i, quanta, period);
965
966 if (t->rt_param.pfair)
967 /* get rid of stale allocation */
968 kfree(t->rt_param.pfair);
969
970 t->rt_param.pfair = param;
971
972 /* spew out some debug info */
973 dump_subtasks(t);
974
975 /* Disable generic budget enforcement (if enabled).
976 * The plugin provides its own (non-optional) enforcement
977 * of allocations at quantum granularity. */
978 tsk_rt(t)->task_params.budget_policy = NO_ENFORCEMENT;
979
980 return 0;
981}
982
983static void pfair_init_cluster(struct pfair_cluster* cluster)
984{
985 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
986 bheap_init(&cluster->release_queue);
987 raw_spin_lock_init(&cluster->release_lock);
988 INIT_LIST_HEAD(&cluster->topology.cpus);
989}
990
991static void cleanup_clusters(void)
992{
993 int i;
994
995 if (num_pfair_clusters)
996 kfree(pfair_clusters);
997 pfair_clusters = NULL;
998 num_pfair_clusters = 0;
999
1000 /* avoid stale pointers */
1001 for (i = 0; i < num_online_cpus(); i++) {
1002 pstate[i]->topology.cluster = NULL;
1003 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
1004 pstate[i]->missed_updates, pstate[i]->missed_quanta);
1005 }
1006}
1007
1008static struct domain_proc_info pfair_domain_proc_info;
1009static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
1010{
1011 *ret = &pfair_domain_proc_info;
1012 return 0;
1013}
1014
1015static void pfair_setup_domain_proc(void)
1016{
1017 int i, cpu, domain;
1018#ifdef CONFIG_RELEASE_MASTER
1019 int release_master = atomic_read(&release_master_cpu);
1020 /* skip over the domain with the release master if cluster size is 1 */
1021 int cluster_size = num_online_cpus() / num_pfair_clusters;
1022 int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
1023 release_master : NO_CPU;
1024#else
1025 int release_master = NO_CPU;
1026 int skip_domain = NO_CPU;
1027#endif
1028 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
1029 int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
1030 struct cd_mapping *map;
1031
1032 memset(&pfair_domain_proc_info, 0, sizeof(pfair_domain_proc_info));
1033 init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
1034 pfair_domain_proc_info.num_cpus = num_rt_cpus;
1035 pfair_domain_proc_info.num_domains = num_rt_domains;
1036
1037 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
1038 if (cpu == release_master)
1039 continue;
1040 map = &pfair_domain_proc_info.cpu_to_domains[i];
1041 /* pointer math to figure out the domain index */
1042 domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
1043 map->id = cpu;
1044 cpumask_set_cpu(domain, map->mask);
1045 ++i;
1046 }
1047
1048 for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
1049 struct pfair_cluster *cluster;
1050 struct list_head *pos;
1051
1052 if (domain == skip_domain)
1053 continue;
1054
1055 cluster = &pfair_clusters[domain];
1056 map = &pfair_domain_proc_info.domain_to_cpus[i];
1057 map->id = i;
1058
1059 list_for_each(pos, &cluster->topology.cpus) {
1060 cpu = cpu_id(from_cluster_list(pos));
1061 if (cpu != release_master)
1062 cpumask_set_cpu(cpu, map->mask);
1063 }
1064 ++i;
1065 }
1066}
1067
1068static long pfair_activate_plugin(void)
1069{
1070 int err, i;
1071 struct pfair_state* state;
1072 struct pfair_cluster* cluster;
1073 quanta_t now, start;
1074 int cluster_size;
1075 struct cluster_cpu* cpus[NR_CPUS];
1076 struct scheduling_cluster* clust[NR_CPUS];
1077 lt_t quantum_timer_start;
1078
1079 cluster_size = get_cluster_size(pfair_cluster_level);
1080
1081 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
1082 return -EINVAL;
1083
1084 num_pfair_clusters = num_online_cpus() / cluster_size;
1085
1086 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
1087 if (!pfair_clusters) {
1088 num_pfair_clusters = 0;
1089 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
1090 return -ENOMEM;
1091 }
1092
1093 state = this_cpu_ptr(&pfair_state);
1094 now = current_quantum(state);
1095 start = now + 50;
1096 quantum_timer_start = quanta2time(start);
1097 TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
1098 litmus_clock(),
1099 now,
1100 quantum_timer_start,
1101 time2quanta(quantum_timer_start, CEIL));
1102
1103 for (i = 0; i < num_pfair_clusters; i++) {
1104 cluster = &pfair_clusters[i];
1105 pfair_init_cluster(cluster);
1106 cluster->pfair_time = start;
1107 clust[i] = &cluster->topology;
1108#ifdef CONFIG_RELEASE_MASTER
1109 cluster->pfair.release_master = atomic_read(&release_master_cpu);
1110#endif
1111 }
1112
1113 for_each_online_cpu(i) {
1114 state = &per_cpu(pfair_state, i);
1115 state->cur_tick = start;
1116 state->local_tick = start;
1117 state->missed_quanta = 0;
1118 state->missed_updates = 0;
1119 state->offset = cpu_stagger_offset(i);
1120 hrtimer_set_expires(&state->quantum_timer,
1121 ns_to_ktime(quantum_timer_start + state->offset));
1122 cpus[i] = &state->topology;
1123 TRACE("cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
1124 INIT_LIST_HEAD(&state->out_of_budget);
1125 /* force rescheduling to start quantum timer */
1126 litmus_reschedule(i);
1127
1128 WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
1129 KERN_ERR "WARNING: no high resolution timers available!?\n");
1130 }
1131
1132 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
1133 cpus, num_online_cpus());
1134
1135 if (err < 0)
1136 cleanup_clusters();
1137 else
1138 pfair_setup_domain_proc();
1139
1140 return err;
1141}
1142
1143static long pfair_deactivate_plugin(void)
1144{
1145 int cpu;
1146 struct pfair_state* state;
1147
1148 for_each_online_cpu(cpu) {
1149 state = &per_cpu(pfair_state, cpu);
1150 TRACE("stopping quantum timer on CPU%d\n", cpu);
1151 hrtimer_cancel(&state->quantum_timer);
1152 }
1153 cleanup_clusters();
1154 destroy_domain_proc_info(&pfair_domain_proc_info);
1155 return 0;
1156}
1157
1158/* Plugin object */
1159static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1160 .plugin_name = "PFAIR",
1161 .task_new = pfair_task_new,
1162 .task_exit = pfair_task_exit,
1163 .schedule = pfair_schedule,
1164 .task_wake_up = pfair_task_wake_up,
1165 .task_block = pfair_task_block,
1166 .admit_task = pfair_admit_task,
1167 .complete_job = complete_job,
1168 .activate_plugin = pfair_activate_plugin,
1169 .deactivate_plugin = pfair_deactivate_plugin,
1170 .get_domain_proc_info = pfair_get_domain_proc_info,
1171};
1172
1173
1174static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1175
1176static int __init init_pfair(void)
1177{
1178 int cpu, err, fs;
1179 struct pfair_state *state;
1180
1181 /*
1182	 * initialize shortcut for per-CPU pfair state;
1183	 * there may be a problem here if someone removes a CPU
1184	 * while we are doing this initialization, and if CPUs
1185	 * are added/removed later, but we don't support CPU hotplug at the moment anyway.
1186 */
1187 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1188
1189 /* initialize CPU state */
1190 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1191 state = &per_cpu(pfair_state, cpu);
1192 hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1193 state->quantum_timer.function = on_quantum_boundary;
1194 state->topology.id = cpu;
1195 state->cur_tick = 0;
1196 state->local_tick = 0;
1197 state->linked = NULL;
1198 state->local = NULL;
1199 state->scheduled = NULL;
1200 state->missed_quanta = 0;
1201 state->offset = cpu_stagger_offset(cpu);
1202 pstate[cpu] = state;
1203 }
1204
1205 pfair_clusters = NULL;
1206 num_pfair_clusters = 0;
1207
1208 err = register_sched_plugin(&pfair_plugin);
1209 if (!err) {
1210 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1211 if (!fs)
1212 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1213 else
1214 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1215 }
1216
1217 return err;
1218}
1219
1220static void __exit clean_pfair(void)
1221{
1222 kfree(pstate);
1223
1224 if (cluster_file)
1225 remove_proc_entry("cluster", pfair_dir);
1226 if (pfair_dir)
1227 remove_plugin_proc_dir(&pfair_plugin);
1228}
1229
1230module_init(init_pfair);
1231module_exit(clean_pfair);
diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c
new file mode 100644
index 000000000000..c7f2e60d010b
--- /dev/null
+++ b/litmus/sched_pfp.c
@@ -0,0 +1,2048 @@
1/*
2 * litmus/sched_pfp.c
3 *
4 * Implementation of partitioned fixed-priority scheduling.
5 * Based on PSN-EDF.
6 */
7
8#include <linux/percpu.h>
9#include <linux/sched.h>
10#include <linux/list.h>
11#include <linux/spinlock.h>
12#include <linux/module.h>
13
14#include <litmus/debug_trace.h>
15#include <litmus/litmus.h>
16#include <litmus/wait.h>
17#include <litmus/jobs.h>
18#include <litmus/preempt.h>
19#include <litmus/fp_common.h>
20#include <litmus/sched_plugin.h>
21#include <litmus/sched_trace.h>
22#include <litmus/trace.h>
23#include <litmus/budget.h>
24#include <litmus/np.h>
25
26/* to set up domain/cpu mappings */
27#include <litmus/litmus_proc.h>
28#include <linux/uaccess.h>
29
30
31typedef struct {
32 rt_domain_t domain;
33 struct fp_prio_queue ready_queue;
34 int cpu;
35 struct task_struct* scheduled; /* only RT tasks */
36/*
37 * scheduling lock slock
38 * protects the domain and serializes scheduling decisions
39 */
40#define slock domain.ready_lock
41
42} pfp_domain_t;
43
44DEFINE_PER_CPU(pfp_domain_t, pfp_domains);
45
46pfp_domain_t* pfp_doms[NR_CPUS];
47
48#define local_pfp (this_cpu_ptr(&pfp_domains))
49#define remote_dom(cpu) (&per_cpu(pfp_domains, cpu).domain)
50#define remote_pfp(cpu) (&per_cpu(pfp_domains, cpu))
51#define task_dom(task) remote_dom(get_partition(task))
52#define task_pfp(task) remote_pfp(get_partition(task))
53
54
55#ifdef CONFIG_LITMUS_LOCKING
56DEFINE_PER_CPU(uint64_t,fmlp_timestamp);
57#endif
58
59/* we assume the lock is being held */
60static void preempt(pfp_domain_t *pfp)
61{
62 preempt_if_preemptable(pfp->scheduled, pfp->cpu);
63}
64
65static unsigned int priority_index(struct task_struct* t)
66{
67#ifdef CONFIG_LITMUS_LOCKING
68 if (unlikely(t->rt_param.inh_task))
69 /* use effective priority */
70 t = t->rt_param.inh_task;
71
72 if (is_priority_boosted(t)) {
73 /* zero is reserved for priority-boosted tasks */
74 return 0;
75 } else
76#endif
77 return get_priority(t);
78}
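/* A minimal sketch of the index mapping above, assuming the fixed-priority
 * ready queue is indexed from 0 with lower indices served first and regular
 * priorities starting at 1: boosted lock holders collapse to the reserved
 * index 0 and therefore precede every regular priority level.
 */
#include <stdio.h>

static unsigned int sketch_priority_index(unsigned int prio, int boosted)
{
	return boosted ? 0 : prio; /* index 0 is reserved for boosted tasks */
}

int main(void)
{
	printf("regular prio 3 -> index %u\n", sketch_priority_index(3, 0)); /* 3 */
	printf("boosted prio 3 -> index %u\n", sketch_priority_index(3, 1)); /* 0 */
	return 0;
}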
79
80static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks)
81{
82 pfp_domain_t *pfp = container_of(rt, pfp_domain_t, domain);
83 unsigned long flags;
84 struct task_struct* t;
85 struct bheap_node* hn;
86
87 raw_spin_lock_irqsave(&pfp->slock, flags);
88
89 while (!bheap_empty(tasks)) {
90 hn = bheap_take(fp_ready_order, tasks);
91 t = bheap2task(hn);
92 TRACE_TASK(t, "released (part:%d prio:%d)\n",
93 get_partition(t), get_priority(t));
94 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
95 }
96
97 /* do we need to preempt? */
98 if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) {
99 TRACE_CUR("preempted by new release\n");
100 preempt(pfp);
101 }
102
103 raw_spin_unlock_irqrestore(&pfp->slock, flags);
104}
105
106static void pfp_preempt_check(pfp_domain_t *pfp)
107{
108 if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
109 preempt(pfp);
110}
111
112static void pfp_domain_init(pfp_domain_t* pfp,
113 int cpu)
114{
115 fp_domain_init(&pfp->domain, NULL, pfp_release_jobs);
116 pfp->cpu = cpu;
117 pfp->scheduled = NULL;
118 fp_prio_queue_init(&pfp->ready_queue);
119}
120
121static void requeue(struct task_struct* t, pfp_domain_t *pfp)
122{
123 tsk_rt(t)->completed = 0;
124 if (is_released(t, litmus_clock())) {
125 TRACE_TASK(t, "add to ready\n");
126 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
127 } else
128 add_release(&pfp->domain, t); /* it has got to wait */
129}
130
131static void job_completion(struct task_struct* t, int forced)
132{
133 sched_trace_task_completion(t, forced);
134 TRACE_TASK(t, "job_completion(forced=%d).\n", forced);
135
136 tsk_rt(t)->completed = 0;
137 prepare_for_next_period(t);
138 if (is_released(t, litmus_clock()))
139 sched_trace_task_release(t);
140}
141
142static struct task_struct* pfp_schedule(struct task_struct * prev)
143{
144 pfp_domain_t* pfp = local_pfp;
145 struct task_struct* next;
146
147 int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate;
148
149 raw_spin_lock(&pfp->slock);
150
151 /* sanity checking
152	 * unlike G-EDF, when a task exits (dead),
153	 * pfp->scheduled may be NULL and prev _is_ realtime
154 */
155 BUG_ON(pfp->scheduled && pfp->scheduled != prev);
156 BUG_ON(pfp->scheduled && !is_realtime(prev));
157
158 /* (0) Determine state */
159 exists = pfp->scheduled != NULL;
160 blocks = exists && !is_current_running();
161 out_of_time = exists && budget_enforced(pfp->scheduled)
162 && budget_exhausted(pfp->scheduled);
163 np = exists && is_np(pfp->scheduled);
164 sleep = exists && is_completed(pfp->scheduled);
165 migrate = exists && get_partition(pfp->scheduled) != pfp->cpu;
166 preempt = !blocks && (migrate || fp_preemption_needed(&pfp->ready_queue, prev));
167
168 /* If we need to preempt do so.
169 * The following checks set resched to 1 in case of special
170 * circumstances.
171 */
172 resched = preempt;
173
174 /* If a task blocks we have no choice but to reschedule.
175 */
176 if (blocks)
177 resched = 1;
178
179 /* Request a sys_exit_np() call if we would like to preempt but cannot.
180 * Multiple calls to request_exit_np() don't hurt.
181 */
182 if (np && (out_of_time || preempt || sleep))
183 request_exit_np(pfp->scheduled);
184
185 /* Any task that is preemptable and either exhausts its execution
186 * budget or wants to sleep completes. We may have to reschedule after
187 * this.
188 */
189 if (!np && (out_of_time || sleep)) {
190 job_completion(pfp->scheduled, !sleep);
191 resched = 1;
192 }
193
194 if (exists)
195 TRACE_TASK(pfp->scheduled, "state:%d blocks:%d oot:%d np:%d sleep:%d "
196 "mig:%d preempt:%d resched:%d on_rq:%d on_cpu:%d\n",
197 pfp->scheduled->state,
198 blocks, out_of_time, np, sleep, migrate, preempt, resched,
199 pfp->scheduled->on_rq, pfp->scheduled->on_cpu);
200
201 /* The final scheduling decision. Do we need to switch for some reason?
202 * Switch if we are in RT mode and have no task or if we need to
203 * resched.
204 */
205 next = NULL;
206 if ((!np || blocks) && (resched || !exists)) {
207 /* When preempting a task that does not block, then
208 * re-insert it into either the ready queue or the
209 * release queue (if it completed). requeue() picks
210 * the appropriate queue.
211 */
212 if (pfp->scheduled && !blocks && !migrate)
213 requeue(pfp->scheduled, pfp);
214 next = fp_prio_take(&pfp->ready_queue);
215 if (next == prev) {
216 struct task_struct *t = fp_prio_peek(&pfp->ready_queue);
217 TRACE_TASK(next, "next==prev sleep=%d oot=%d np=%d preempt=%d migrate=%d "
218 "boost=%d empty=%d prio-idx=%u prio=%u\n",
219 sleep, out_of_time, np, preempt, migrate,
220 is_priority_boosted(next),
221 t == NULL,
222 priority_index(next),
223 get_priority(next));
224 if (t)
225 TRACE_TASK(t, "waiter boost=%d prio-idx=%u prio=%u\n",
226 is_priority_boosted(t),
227 priority_index(t),
228 get_priority(t));
229 }
230 /* If preempt is set, we should not see the same task again. */
231 BUG_ON(preempt && next == prev);
232 /* Similarly, if preempt is set, then next may not be NULL,
233 * unless it's a migration. */
234 BUG_ON(preempt && !migrate && next == NULL);
235 } else
236 /* Only override Linux scheduler if we have a real-time task
237 * scheduled that needs to continue.
238 */
239 if (exists)
240 next = prev;
241
242 if (next) {
243 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
244 } else if (exists) {
245 TRACE("becoming idle at %llu\n", litmus_clock());
246 }
247
248 pfp->scheduled = next;
249 sched_state_task_picked();
250 raw_spin_unlock(&pfp->slock);
251
252 return next;
253}
254
255#ifdef CONFIG_LITMUS_LOCKING
256
257/* prev is no longer scheduled --- see if it needs to migrate */
258static void pfp_finish_switch(struct task_struct *prev)
259{
260 pfp_domain_t *to;
261
262 if (is_realtime(prev))
263 TRACE_TASK(prev, "state:%d on_rq:%d on_cpu:%d\n",
264 prev->state, prev->on_rq, prev->on_cpu);
265
266 if (is_realtime(prev) &&
267 prev->state == TASK_RUNNING &&
268 get_partition(prev) != smp_processor_id()) {
269 TRACE_TASK(prev, "needs to migrate from P%d to P%d\n",
270 smp_processor_id(), get_partition(prev));
271
272 to = task_pfp(prev);
273
274 raw_spin_lock(&to->slock);
275
276 TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu);
277 requeue(prev, to);
278 if (fp_preemption_needed(&to->ready_queue, to->scheduled))
279 preempt(to);
280
281 raw_spin_unlock(&to->slock);
282
283 }
284}
285
286#endif
287
288/* Prepare a task for running in RT mode
289 */
290static void pfp_task_new(struct task_struct * t, int on_rq, int is_scheduled)
291{
292 pfp_domain_t* pfp = task_pfp(t);
293 unsigned long flags;
294
295 TRACE_TASK(t, "P-FP: task new, cpu = %d\n",
296 t->rt_param.task_params.cpu);
297
298 /* setup job parameters */
299 release_at(t, litmus_clock());
300
301 raw_spin_lock_irqsave(&pfp->slock, flags);
302 if (is_scheduled) {
303 /* there shouldn't be anything else running at the time */
304 BUG_ON(pfp->scheduled);
305 pfp->scheduled = t;
306 } else if (on_rq) {
307 requeue(t, pfp);
308 /* maybe we have to reschedule */
309 pfp_preempt_check(pfp);
310 }
311 raw_spin_unlock_irqrestore(&pfp->slock, flags);
312}
313
314static void pfp_task_wake_up(struct task_struct *task)
315{
316 unsigned long flags;
317 pfp_domain_t* pfp = task_pfp(task);
318 lt_t now;
319
320 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
321 raw_spin_lock_irqsave(&pfp->slock, flags);
322
323#ifdef CONFIG_LITMUS_LOCKING
324 /* Should only be queued when processing a fake-wake up due to a
325 * migration-related state change. */
326 if (unlikely(is_queued(task))) {
327 TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n");
328 goto out_unlock;
329 }
330#else
331 BUG_ON(is_queued(task));
332#endif
333 now = litmus_clock();
334 if (is_sporadic(task) && is_tardy(task, now)
335#ifdef CONFIG_LITMUS_LOCKING
336 /* We need to take suspensions because of semaphores into
337 * account! If a job resumes after being suspended due to acquiring
338 * a semaphore, it should never be treated as a new job release.
339 */
340 && !is_priority_boosted(task)
341#endif
342 ) {
343 inferred_sporadic_job_release_at(task, now);
344 }
345
346 /* Only add to ready queue if it is not the currently-scheduled
347 * task. This could be the case if a task was woken up concurrently
348 * on a remote CPU before the executing CPU got around to actually
349 * de-scheduling the task, i.e., wake_up() raced with schedule()
350 * and won. Also, don't requeue if it is still queued, which can
351	 * happen under the DPCP due to wake-ups racing with migrations.
352 */
353 if (pfp->scheduled != task) {
354 requeue(task, pfp);
355 pfp_preempt_check(pfp);
356 }
357
358#ifdef CONFIG_LITMUS_LOCKING
359out_unlock:
360#endif
361 raw_spin_unlock_irqrestore(&pfp->slock, flags);
362 TRACE_TASK(task, "wake up done\n");
363}
364
365static void pfp_task_block(struct task_struct *t)
366{
367 /* only running tasks can block, thus t is in no queue */
368 TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
369
370 BUG_ON(!is_realtime(t));
371
372 /* If this task blocked normally, it shouldn't be queued. The exception is
373 * if this is a simulated block()/wakeup() pair from the pull-migration code path.
374 * This should only happen if the DPCP is being used.
375 */
376#ifdef CONFIG_LITMUS_LOCKING
377 if (unlikely(is_queued(t)))
378 TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n");
379#else
380 BUG_ON(is_queued(t));
381#endif
382}
383
384static void pfp_task_exit(struct task_struct * t)
385{
386 unsigned long flags;
387 pfp_domain_t* pfp = task_pfp(t);
388 rt_domain_t* dom;
389
390 raw_spin_lock_irqsave(&pfp->slock, flags);
391 if (is_queued(t)) {
392 BUG(); /* This currently doesn't work. */
393 /* dequeue */
394 dom = task_dom(t);
395 remove(dom, t);
396 }
397 if (pfp->scheduled == t) {
398 pfp->scheduled = NULL;
399 preempt(pfp);
400 }
401 TRACE_TASK(t, "RIP, now reschedule\n");
402
403 raw_spin_unlock_irqrestore(&pfp->slock, flags);
404}
405
406#ifdef CONFIG_LITMUS_LOCKING
407
408#include <litmus/fdso.h>
409#include <litmus/srp.h>
410
411static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t)
412{
413 BUG_ON(pfp->scheduled == t && is_queued(t));
414 if (is_queued(t))
415 fp_prio_remove(&pfp->ready_queue, t, priority_index(t));
416}
417
418static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t,
419 struct task_struct* prio_inh)
420{
421 int requeue;
422
423 if (!t || t->rt_param.inh_task == prio_inh) {
424 /* no update required */
425 if (t)
426 TRACE_TASK(t, "no prio-inh update required\n");
427 return;
428 }
429
430 requeue = is_queued(t);
431 TRACE_TASK(t, "prio-inh: is_queued:%d\n", requeue);
432
433 if (requeue)
434 /* first remove */
435 fp_dequeue(pfp, t);
436
437 t->rt_param.inh_task = prio_inh;
438
439 if (requeue)
440 /* add again to the right queue */
441 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
442}
443
444static int effective_agent_priority(int prio)
445{
446 /* make sure agents have higher priority */
447 return prio - LITMUS_MAX_PRIORITY;
448}
449
450static lt_t prio_point(int eprio)
451{
452 /* make sure we have non-negative prio points */
453 return eprio + LITMUS_MAX_PRIORITY;
454}
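/* A minimal sketch of the two mappings above, assuming a stand-in value of
 * 512 for LITMUS_MAX_PRIORITY: agent priorities are shifted below zero so
 * that they dominate every regular priority (lower value = higher
 * priority), and prio_point() shifts them back into a non-negative key
 * space for the priority-ordered wait queues.
 */
#include <stdio.h>

#define SKETCH_MAX_PRIORITY 512 /* stand-in for LITMUS_MAX_PRIORITY */

static int  sketch_agent_prio(int prio)  { return prio - SKETCH_MAX_PRIORITY; }
static long sketch_prio_point(int eprio) { return eprio + SKETCH_MAX_PRIORITY; }

int main(void)
{
	int regular = 7;                          /* a regular task priority */
	int agent   = sketch_agent_prio(regular); /* 7 - 512 = -505 */

	/* the agent now beats every regular priority >= 0 ... */
	printf("agent prio: %d\n", agent);
	/* ... and its wait-queue key is non-negative again */
	printf("prio point: %ld\n", sketch_prio_point(agent)); /* 7 */
	return 0;
}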
455
456static void boost_priority(struct task_struct* t, lt_t priority_point)
457{
458 unsigned long flags;
459 pfp_domain_t* pfp = task_pfp(t);
460
461 raw_spin_lock_irqsave(&pfp->slock, flags);
462
463
464 TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock());
465
466 tsk_rt(t)->priority_boosted = 1;
467 /* tie-break by protocol-specific priority point */
468 tsk_rt(t)->boost_start_time = priority_point;
469
470 /* Priority boosting currently only takes effect for already-scheduled
471 * tasks. This is sufficient since priority boosting only kicks in as
472 * part of lock acquisitions. */
473 BUG_ON(pfp->scheduled != t);
474
475 raw_spin_unlock_irqrestore(&pfp->slock, flags);
476}
477
478static void unboost_priority(struct task_struct* t)
479{
480 unsigned long flags;
481 pfp_domain_t* pfp = task_pfp(t);
482
483 raw_spin_lock_irqsave(&pfp->slock, flags);
484
485 /* Assumption: this only happens when the job is scheduled.
486 * Exception: If t transitioned to non-real-time mode, we no longer
487	 * care about it. */
488 BUG_ON(pfp->scheduled != t && is_realtime(t));
489
490 TRACE_TASK(t, "priority restored at %llu\n", litmus_clock());
491
492 tsk_rt(t)->priority_boosted = 0;
493 tsk_rt(t)->boost_start_time = 0;
494
495 /* check if this changes anything */
496 if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled))
497 preempt(pfp);
498
499 raw_spin_unlock_irqrestore(&pfp->slock, flags);
500}
501
502/* ******************** SRP support ************************ */
503
504static unsigned int pfp_get_srp_prio(struct task_struct* t)
505{
506 return get_priority(t);
507}
508
509/* ******************** FMLP support ********************** */
510
511struct fmlp_semaphore {
512 struct litmus_lock litmus_lock;
513
514 /* current resource holder */
515 struct task_struct *owner;
516
517 /* FIFO queue of waiting tasks */
518 wait_queue_head_t wait;
519};
520
521static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
522{
523 return container_of(lock, struct fmlp_semaphore, litmus_lock);
524}
525
526static inline lt_t
527fmlp_clock(void)
528{
529 return (lt_t) this_cpu_inc_return(fmlp_timestamp);
530}
531
532int pfp_fmlp_lock(struct litmus_lock* l)
533{
534 struct task_struct* t = current;
535 struct fmlp_semaphore *sem = fmlp_from_lock(l);
536 wait_queue_t wait;
537 unsigned long flags;
538 lt_t time_of_request;
539
540 if (!is_realtime(t))
541 return -EPERM;
542
543 /* prevent nested lock acquisition --- not supported by FMLP */
544 if (tsk_rt(t)->num_locks_held ||
545 tsk_rt(t)->num_local_locks_held)
546 return -EBUSY;
547
548 spin_lock_irqsave(&sem->wait.lock, flags);
549
550 /* tie-break by this point in time */
551 time_of_request = fmlp_clock();
552
553 /* Priority-boost ourself *before* we suspend so that
554 * our priority is boosted when we resume. */
555 boost_priority(t, time_of_request);
556
557 if (sem->owner) {
558 /* resource is not free => must suspend and wait */
559
560 init_waitqueue_entry(&wait, t);
561
562 /* FIXME: interruptible would be nice some day */
563 set_task_state(t, TASK_UNINTERRUPTIBLE);
564
565 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
566
567 TS_LOCK_SUSPEND;
568
569 /* release lock before sleeping */
570 spin_unlock_irqrestore(&sem->wait.lock, flags);
571
572 /* We depend on the FIFO order. Thus, we don't need to recheck
573 * when we wake up; we are guaranteed to have the lock since
574 * there is only one wake up per release.
575 */
576
577 schedule();
578
579 TS_LOCK_RESUME;
580
581 /* Since we hold the lock, no other task will change
582 * ->owner. We can thus check it without acquiring the spin
583 * lock. */
584 BUG_ON(sem->owner != t);
585 } else {
586 /* it's ours now */
587 sem->owner = t;
588
589 spin_unlock_irqrestore(&sem->wait.lock, flags);
590 }
591
592 tsk_rt(t)->num_locks_held++;
593
594 return 0;
595}
596
597int pfp_fmlp_unlock(struct litmus_lock* l)
598{
599 struct task_struct *t = current, *next = NULL;
600 struct fmlp_semaphore *sem = fmlp_from_lock(l);
601 unsigned long flags;
602 int err = 0;
603
604 preempt_disable();
605
606 spin_lock_irqsave(&sem->wait.lock, flags);
607
608 if (sem->owner != t) {
609 err = -EINVAL;
610 goto out;
611 }
612
613 tsk_rt(t)->num_locks_held--;
614
615 /* we lose the benefit of priority boosting */
616
617 unboost_priority(t);
618
619 /* check if there are jobs waiting for this resource */
620 next = __waitqueue_remove_first(&sem->wait);
621 sem->owner = next;
622
623out:
624 spin_unlock_irqrestore(&sem->wait.lock, flags);
625
626 /* Wake up next. The waiting job is already priority-boosted. */
627 if(next) {
628 wake_up_process(next);
629 }
630
631 preempt_enable();
632
633 return err;
634}
635
636int pfp_fmlp_close(struct litmus_lock* l)
637{
638 struct task_struct *t = current;
639 struct fmlp_semaphore *sem = fmlp_from_lock(l);
640 unsigned long flags;
641
642 int owner;
643
644 spin_lock_irqsave(&sem->wait.lock, flags);
645
646 owner = sem->owner == t;
647
648 spin_unlock_irqrestore(&sem->wait.lock, flags);
649
650 if (owner)
651 pfp_fmlp_unlock(l);
652
653 return 0;
654}
655
656void pfp_fmlp_free(struct litmus_lock* lock)
657{
658 kfree(fmlp_from_lock(lock));
659}
660
661static struct litmus_lock_ops pfp_fmlp_lock_ops = {
662 .close = pfp_fmlp_close,
663 .lock = pfp_fmlp_lock,
664 .unlock = pfp_fmlp_unlock,
665 .deallocate = pfp_fmlp_free,
666};
667
668static struct litmus_lock* pfp_new_fmlp(void)
669{
670 struct fmlp_semaphore* sem;
671
672 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
673 if (!sem)
674 return NULL;
675
676 sem->owner = NULL;
677 init_waitqueue_head(&sem->wait);
678 sem->litmus_lock.ops = &pfp_fmlp_lock_ops;
679
680 return &sem->litmus_lock;
681}
682
683/* ******************** MPCP support ********************** */
684
685struct mpcp_semaphore {
686 struct litmus_lock litmus_lock;
687
688 /* current resource holder */
689 struct task_struct *owner;
690
691 /* priority queue of waiting tasks */
692 wait_queue_head_t wait;
693
694 /* priority ceiling per cpu */
695 unsigned int prio_ceiling[NR_CPUS];
696
697 /* should jobs spin "virtually" for this resource? */
698 int vspin;
699};
700
701#define OMEGA_CEILING UINT_MAX
702
703/* Since jobs spin "virtually" while waiting to acquire a lock,
704 * they first must acquire a local per-cpu resource.
705 */
706static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait);
707static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin);
708
709/* called with preemptions off <=> no local modifications */
710static void mpcp_vspin_enter(void)
711{
712 struct task_struct* t = current;
713
714 while (1) {
715 if (this_cpu_read(mpcpvs_vspin) == NULL) {
716 /* good, we get to issue our request */
717 this_cpu_write(mpcpvs_vspin, t);
718 break;
719 } else {
720 /* some job is spinning => enqueue in request queue */
721 prio_wait_queue_t wait;
722 wait_queue_head_t* vspin = this_cpu_ptr(&mpcpvs_vspin_wait);
723 unsigned long flags;
724
725 /* ordered by regular priority */
726 init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
727
728 spin_lock_irqsave(&vspin->lock, flags);
729
730 set_task_state(t, TASK_UNINTERRUPTIBLE);
731
732 __add_wait_queue_prio_exclusive(vspin, &wait);
733
734 spin_unlock_irqrestore(&vspin->lock, flags);
735
736 TS_LOCK_SUSPEND;
737
738 preempt_enable_no_resched();
739
740 schedule();
741
742 preempt_disable();
743
744 TS_LOCK_RESUME;
745 /* Recheck if we got it --- some higher-priority process might
746 * have swooped in. */
747 }
748 }
749 /* ok, now it is ours */
750}
751
752/* called with preemptions off */
753static void mpcp_vspin_exit(void)
754{
755 struct task_struct* t = current, *next;
756 unsigned long flags;
757 wait_queue_head_t* vspin = this_cpu_ptr(&mpcpvs_vspin_wait);
758
759 BUG_ON(this_cpu_read(mpcpvs_vspin) != t);
760
761 /* no spinning job */
762 this_cpu_write(mpcpvs_vspin, NULL);
763
764 /* see if anyone is waiting for us to stop "spinning" */
765 spin_lock_irqsave(&vspin->lock, flags);
766 next = __waitqueue_remove_first(vspin);
767
768 if (next)
769 wake_up_process(next);
770
771 spin_unlock_irqrestore(&vspin->lock, flags);
772}
773
774static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock)
775{
776 return container_of(lock, struct mpcp_semaphore, litmus_lock);
777}
778
779int pfp_mpcp_lock(struct litmus_lock* l)
780{
781 struct task_struct* t = current;
782 struct mpcp_semaphore *sem = mpcp_from_lock(l);
783 prio_wait_queue_t wait;
784 unsigned long flags;
785
786 if (!is_realtime(t))
787 return -EPERM;
788
789 /* prevent nested lock acquisition */
790 if (tsk_rt(t)->num_locks_held ||
791 tsk_rt(t)->num_local_locks_held)
792 return -EBUSY;
793
794 preempt_disable();
795
796 if (sem->vspin)
797 mpcp_vspin_enter();
798
799 /* Priority-boost ourself *before* we suspend so that
800 * our priority is boosted when we resume. Use the priority
801 * ceiling for the local partition. */
802 boost_priority(t, sem->prio_ceiling[get_partition(t)]);
803
804 spin_lock_irqsave(&sem->wait.lock, flags);
805
806 preempt_enable_no_resched();
807
808 if (sem->owner) {
809 /* resource is not free => must suspend and wait */
810
811 /* ordered by regular priority */
812 init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
813
814 /* FIXME: interruptible would be nice some day */
815 set_task_state(t, TASK_UNINTERRUPTIBLE);
816
817 __add_wait_queue_prio_exclusive(&sem->wait, &wait);
818
819 TS_LOCK_SUSPEND;
820
821 /* release lock before sleeping */
822 spin_unlock_irqrestore(&sem->wait.lock, flags);
823
824		/* We depend on there being exactly one wake-up per release:
825		 * thus, we don't need to recheck when we wake up; we are
826		 * guaranteed to hold the lock once we have been woken.
827 */
828
829 schedule();
830
831 TS_LOCK_RESUME;
832
833 /* Since we hold the lock, no other task will change
834 * ->owner. We can thus check it without acquiring the spin
835 * lock. */
836 BUG_ON(sem->owner != t);
837 } else {
838 /* it's ours now */
839 sem->owner = t;
840
841 spin_unlock_irqrestore(&sem->wait.lock, flags);
842 }
843
844 tsk_rt(t)->num_locks_held++;
845
846 return 0;
847}
848
849int pfp_mpcp_unlock(struct litmus_lock* l)
850{
851 struct task_struct *t = current, *next = NULL;
852 struct mpcp_semaphore *sem = mpcp_from_lock(l);
853 unsigned long flags;
854 int err = 0;
855
856 preempt_disable();
857
858 spin_lock_irqsave(&sem->wait.lock, flags);
859
860 if (sem->owner != t) {
861 err = -EINVAL;
862 goto out;
863 }
864
865 tsk_rt(t)->num_locks_held--;
866
867 /* we lose the benefit of priority boosting */
868 unboost_priority(t);
869
870 /* check if there are jobs waiting for this resource */
871 next = __waitqueue_remove_first(&sem->wait);
872 sem->owner = next;
873
874out:
875 spin_unlock_irqrestore(&sem->wait.lock, flags);
876
877 /* Wake up next. The waiting job is already priority-boosted. */
878 if(next) {
879 wake_up_process(next);
880 }
881
882 if (sem->vspin && err == 0) {
883 mpcp_vspin_exit();
884 }
885
886 preempt_enable();
887
888 return err;
889}
890
891int pfp_mpcp_open(struct litmus_lock* l, void* config)
892{
893 struct task_struct *t = current;
894 int cpu, local_cpu;
895 struct mpcp_semaphore *sem = mpcp_from_lock(l);
896 unsigned long flags;
897
898 if (!is_realtime(t))
899 /* we need to know the real-time priority */
900 return -EPERM;
901
902 local_cpu = get_partition(t);
903
904 spin_lock_irqsave(&sem->wait.lock, flags);
905 for (cpu = 0; cpu < NR_CPUS; cpu++) {
906 if (cpu != local_cpu) {
907 sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu],
908 get_priority(t));
909 TRACE_CUR("priority ceiling for sem %p is now %d on cpu %d\n",
910 sem, sem->prio_ceiling[cpu], cpu);
911 }
912 }
913 spin_unlock_irqrestore(&sem->wait.lock, flags);
914
915 return 0;
916}
917
918int pfp_mpcp_close(struct litmus_lock* l)
919{
920 struct task_struct *t = current;
921 struct mpcp_semaphore *sem = mpcp_from_lock(l);
922 unsigned long flags;
923
924 int owner;
925
926 spin_lock_irqsave(&sem->wait.lock, flags);
927
928 owner = sem->owner == t;
929
930 spin_unlock_irqrestore(&sem->wait.lock, flags);
931
932 if (owner)
933 pfp_mpcp_unlock(l);
934
935 return 0;
936}
937
938void pfp_mpcp_free(struct litmus_lock* lock)
939{
940 kfree(mpcp_from_lock(lock));
941}
942
943static struct litmus_lock_ops pfp_mpcp_lock_ops = {
944 .close = pfp_mpcp_close,
945 .lock = pfp_mpcp_lock,
946 .open = pfp_mpcp_open,
947 .unlock = pfp_mpcp_unlock,
948 .deallocate = pfp_mpcp_free,
949};
950
951static struct litmus_lock* pfp_new_mpcp(int vspin)
952{
953 struct mpcp_semaphore* sem;
954 int cpu;
955
956 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
957 if (!sem)
958 return NULL;
959
960 sem->owner = NULL;
961 init_waitqueue_head(&sem->wait);
962 sem->litmus_lock.ops = &pfp_mpcp_lock_ops;
963
964 for (cpu = 0; cpu < NR_CPUS; cpu++)
965 sem->prio_ceiling[cpu] = OMEGA_CEILING;
966
967 /* mark as virtual spinning */
968 sem->vspin = vspin;
969
970 return &sem->litmus_lock;
971}
972
973
974/* ******************** PCP support ********************** */
975
976
977struct pcp_semaphore {
978 struct litmus_lock litmus_lock;
979
980 struct list_head ceiling;
981
982 /* current resource holder */
983 struct task_struct *owner;
984
985 /* priority ceiling --- can be negative due to DPCP support */
986 int prio_ceiling;
987
988 /* on which processor is this PCP semaphore allocated? */
989 int on_cpu;
990};
991
992static inline struct pcp_semaphore* pcp_from_lock(struct litmus_lock* lock)
993{
994 return container_of(lock, struct pcp_semaphore, litmus_lock);
995}
996
997
998struct pcp_state {
999 struct list_head system_ceiling;
1000
1001 /* highest-priority waiting task */
1002 struct task_struct* hp_waiter;
1003
1004 /* list of jobs waiting to get past the system ceiling */
1005 wait_queue_head_t ceiling_blocked;
1006};
1007
1008static void pcp_init_state(struct pcp_state* s)
1009{
1010 INIT_LIST_HEAD(&s->system_ceiling);
1011 s->hp_waiter = NULL;
1012 init_waitqueue_head(&s->ceiling_blocked);
1013}
1014
1015static DEFINE_PER_CPU(struct pcp_state, pcp_state);
1016
1017/* assumes preemptions are off */
1018static struct pcp_semaphore* pcp_get_ceiling(void)
1019{
1020 struct list_head* top = &(this_cpu_ptr(&pcp_state)->system_ceiling);
1021 return list_first_entry_or_null(top, struct pcp_semaphore, ceiling);
1022}
1023
1024/* assumes preempt off */
1025static void pcp_add_ceiling(struct pcp_semaphore* sem)
1026{
1027 struct list_head *pos;
1028 struct list_head *in_use = &(this_cpu_ptr(&pcp_state)->system_ceiling);
1029 struct pcp_semaphore* held;
1030
1031 BUG_ON(sem->on_cpu != smp_processor_id());
1032 BUG_ON(in_list(&sem->ceiling));
1033
1034 list_for_each(pos, in_use) {
1035 held = list_entry(pos, struct pcp_semaphore, ceiling);
1036 if (held->prio_ceiling >= sem->prio_ceiling) {
1037 __list_add(&sem->ceiling, pos->prev, pos);
1038 return;
1039 }
1040 }
1041
1042 /* we hit the end of the list */
1043
1044 list_add_tail(&sem->ceiling, in_use);
1045}
1046
1047/* assumes preempt off */
1048static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling,
1049 struct task_struct* task,
1050 int effective_prio)
1051{
1052 return ceiling == NULL ||
1053 ceiling->prio_ceiling > effective_prio ||
1054 ceiling->owner == task;
1055}
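/* A minimal standalone sketch of the classic PCP rule encoded above,
 * assuming integer priorities where a lower value means higher priority:
 * a job may acquire a new resource only if its priority exceeds the
 * current system ceiling, or if it itself holds the ceiling resource.
 */
#include <stdio.h>
#include <stddef.h>

struct sketch_sem {
	int prio_ceiling;
	int owner_pid; /* 0 = free */
};

static int sketch_exceeds_ceiling(const struct sketch_sem *ceiling,
				  int pid, int effective_prio)
{
	return ceiling == NULL ||
	       ceiling->prio_ceiling > effective_prio ||
	       ceiling->owner_pid == pid;
}

int main(void)
{
	struct sketch_sem ceiling = { .prio_ceiling = 5, .owner_pid = 42 };

	printf("%d\n", sketch_exceeds_ceiling(NULL, 17, 9));     /* 1: no ceiling */
	printf("%d\n", sketch_exceeds_ceiling(&ceiling, 17, 3)); /* 1: prio 3 beats ceiling 5 */
	printf("%d\n", sketch_exceeds_ceiling(&ceiling, 17, 9)); /* 0: ceiling-blocked */
	printf("%d\n", sketch_exceeds_ceiling(&ceiling, 42, 9)); /* 1: owns the ceiling */
	return 0;
}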
1056
1057/* assumes preempt off */
1058static void pcp_priority_inheritance(void)
1059{
1060 unsigned long flags;
1061 pfp_domain_t* pfp = local_pfp;
1062
1063 struct pcp_semaphore* ceiling = pcp_get_ceiling();
1064 struct task_struct *blocker, *blocked;
1065
1066 blocker = ceiling ? ceiling->owner : NULL;
1067 blocked = this_cpu_ptr(&pcp_state)->hp_waiter;
1068
1069 raw_spin_lock_irqsave(&pfp->slock, flags);
1070
1071 /* Current is no longer inheriting anything by default. This should be
1072 * the currently scheduled job, and hence not currently queued.
1073 * Special case: if current stopped being a real-time task, it will no longer
1074 * be registered as pfp->scheduled. */
1075 BUG_ON(current != pfp->scheduled && is_realtime(current));
1076
1077 fp_set_prio_inh(pfp, current, NULL);
1078 fp_set_prio_inh(pfp, blocked, NULL);
1079 fp_set_prio_inh(pfp, blocker, NULL);
1080
1081 /* Let blocking job inherit priority of blocked job, if required. */
1082 if (blocker && blocked &&
1083 fp_higher_prio(blocked, blocker)) {
1084 TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n",
1085 blocked->comm, blocked->pid,
1086 get_priority(blocker), get_priority(blocked));
1087 fp_set_prio_inh(pfp, blocker, blocked);
1088 }
1089
1090 /* Check if anything changed. If the blocked job is current, then it is
1091 * just blocking and hence is going to call the scheduler anyway. */
1092 if (blocked != current &&
1093 fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
1094 preempt(pfp);
1095
1096 raw_spin_unlock_irqrestore(&pfp->slock, flags);
1097}
1098
1099/* called with preemptions off */
1100static void pcp_raise_ceiling(struct pcp_semaphore* sem,
1101 int effective_prio)
1102{
1103 struct task_struct* t = current;
1104 struct pcp_semaphore* ceiling;
1105 prio_wait_queue_t wait;
1106 unsigned int waiting_higher_prio;
1107
1108 while(1) {
1109 ceiling = pcp_get_ceiling();
1110 if (pcp_exceeds_ceiling(ceiling, t, effective_prio))
1111 break;
1112
1113 TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n",
1114 sem, ceiling->owner->comm, ceiling->owner->pid);
1115
1116 /* we need to wait until the ceiling is lowered */
1117
1118 /* enqueue in priority order */
1119 init_prio_waitqueue_entry(&wait, t, effective_prio);
1120 set_task_state(t, TASK_UNINTERRUPTIBLE);
1121 waiting_higher_prio = add_wait_queue_prio_exclusive(
1122 &(this_cpu_ptr(&pcp_state)->ceiling_blocked), &wait);
1123
1124 if (waiting_higher_prio == 0) {
1125 TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n");
1126
1127 /* we are the new highest-priority waiting job
1128 * => update inheritance */
1129 this_cpu_ptr(&pcp_state)->hp_waiter = t;
1130 pcp_priority_inheritance();
1131 }
1132
1133 TS_LOCK_SUSPEND;
1134
1135 preempt_enable_no_resched();
1136 schedule();
1137 preempt_disable();
1138
1139 /* pcp_resume_unblocked() removed us from wait queue */
1140
1141 TS_LOCK_RESUME;
1142 }
1143
1144 TRACE_CUR("PCP got the ceiling and sem %p\n", sem);
1145
1146 /* We are good to go. The semaphore should be available. */
1147 BUG_ON(sem->owner != NULL);
1148
1149 sem->owner = t;
1150
1151 pcp_add_ceiling(sem);
1152}
1153
1154static void pcp_resume_unblocked(void)
1155{
1156 wait_queue_head_t *blocked = &(this_cpu_ptr(&pcp_state)->ceiling_blocked);
1157 unsigned long flags;
1158 prio_wait_queue_t* q;
1159 struct task_struct* t = NULL;
1160
1161 struct pcp_semaphore* ceiling = pcp_get_ceiling();
1162
1163 spin_lock_irqsave(&blocked->lock, flags);
1164
1165 while (waitqueue_active(blocked)) {
1166 /* check first == highest-priority waiting job */
1167 q = list_entry(blocked->task_list.next,
1168 prio_wait_queue_t, wq.task_list);
1169 t = (struct task_struct*) q->wq.private;
1170
1171 /* can it proceed now? => let it go */
1172 if (pcp_exceeds_ceiling(ceiling, t, q->priority)) {
1173 __remove_wait_queue(blocked, &q->wq);
1174 wake_up_process(t);
1175 } else {
1176 /* We are done. Update highest-priority waiter. */
1177 this_cpu_ptr(&pcp_state)->hp_waiter = t;
1178 goto out;
1179 }
1180 }
1181 /* If we get here, then there are no more waiting
1182 * jobs. */
1183 this_cpu_ptr(&pcp_state)->hp_waiter = NULL;
1184out:
1185 spin_unlock_irqrestore(&blocked->lock, flags);
1186}
1187
1188/* assumes preempt off */
1189static void pcp_lower_ceiling(struct pcp_semaphore* sem)
1190{
1191 BUG_ON(!in_list(&sem->ceiling));
1192 BUG_ON(sem->owner != current);
1193 BUG_ON(sem->on_cpu != smp_processor_id());
1194
1195 /* remove from ceiling list */
1196 list_del(&sem->ceiling);
1197
1198 /* release */
1199 sem->owner = NULL;
1200
1201 TRACE_CUR("PCP released sem %p\n", sem);
1202
1203 /* Wake up all ceiling-blocked jobs that now pass the ceiling. */
1204 pcp_resume_unblocked();
1205
1206 pcp_priority_inheritance();
1207}
1208
1209static void pcp_update_prio_ceiling(struct pcp_semaphore* sem,
1210 int effective_prio)
1211{
1212 /* This needs to be synchronized on something.
1213 * Might as well use waitqueue lock for the processor.
1214	 * We assume this happens only before the task set starts execution
1215	 * (i.e., during initialization), but it may happen on multiple processors
1216 * at the same time.
1217 */
1218 unsigned long flags;
1219
1220 struct pcp_state* s = &per_cpu(pcp_state, sem->on_cpu);
1221
1222 spin_lock_irqsave(&s->ceiling_blocked.lock, flags);
1223
1224 sem->prio_ceiling = min(sem->prio_ceiling, effective_prio);
1225
1226 spin_unlock_irqrestore(&s->ceiling_blocked.lock, flags);
1227}
1228
1229static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu)
1230{
1231 sem->owner = NULL;
1232 INIT_LIST_HEAD(&sem->ceiling);
1233 sem->prio_ceiling = INT_MAX;
1234 sem->on_cpu = cpu;
1235}
1236
1237int pfp_pcp_lock(struct litmus_lock* l)
1238{
1239 struct task_struct* t = current;
1240 struct pcp_semaphore *sem = pcp_from_lock(l);
1241
1242 /* The regular PCP uses the regular task priorities, not agent
1243 * priorities. */
1244 int eprio = get_priority(t);
1245 int from = get_partition(t);
1246 int to = sem->on_cpu;
1247
1248 if (!is_realtime(t) || from != to)
1249 return -EPERM;
1250
1251	/* prevent nested lock acquisition while inside a global critical section */
1252 if (tsk_rt(t)->num_locks_held)
1253 return -EBUSY;
1254
1255 preempt_disable();
1256
1257 pcp_raise_ceiling(sem, eprio);
1258
1259 preempt_enable();
1260
1261 tsk_rt(t)->num_local_locks_held++;
1262
1263 return 0;
1264}
1265
1266int pfp_pcp_unlock(struct litmus_lock* l)
1267{
1268 struct task_struct *t = current;
1269 struct pcp_semaphore *sem = pcp_from_lock(l);
1270
1271 int err = 0;
1272
1273 preempt_disable();
1274
1275 if (sem->owner != t) {
1276 err = -EINVAL;
1277 goto out;
1278 }
1279
1280 /* The current owner should be executing on the correct CPU.
1281 *
1282 * If the owner transitioned out of RT mode or is exiting, then
1283	 * it might have already been migrated away by the best-effort
1284	 * scheduler, and we just have to deal with it. */
1285 if (unlikely(!is_realtime(t) && sem->on_cpu != smp_processor_id())) {
1286 TRACE_TASK(t, "PCP unlock cpu=%d, sem->on_cpu=%d\n",
1287 smp_processor_id(), sem->on_cpu);
1288 preempt_enable();
1289 err = litmus_be_migrate_to(sem->on_cpu);
1290 preempt_disable();
1291 TRACE_TASK(t, "post-migrate: cpu=%d, sem->on_cpu=%d err=%d\n",
1292 smp_processor_id(), sem->on_cpu, err);
1293 }
1294 BUG_ON(sem->on_cpu != smp_processor_id());
1295 err = 0;
1296
1297 tsk_rt(t)->num_local_locks_held--;
1298
1299 /* give it back */
1300 pcp_lower_ceiling(sem);
1301
1302out:
1303 preempt_enable();
1304
1305 return err;
1306}
1307
1308int pfp_pcp_open(struct litmus_lock* l, void* __user config)
1309{
1310 struct task_struct *t = current;
1311 struct pcp_semaphore *sem = pcp_from_lock(l);
1312
1313 int cpu, eprio;
1314
1315 if (!is_realtime(t))
1316 /* we need to know the real-time priority */
1317 return -EPERM;
1318
1319 if (!config)
1320 cpu = get_partition(t);
1321 else if (get_user(cpu, (int*) config))
1322 return -EFAULT;
1323
1324 /* make sure the resource location matches */
1325 if (cpu != sem->on_cpu)
1326 return -EINVAL;
1327
1328	/* The regular PCP uses regular task priorities, not agent
1329 * priorities. */
1330 eprio = get_priority(t);
1331
1332 pcp_update_prio_ceiling(sem, eprio);
1333
1334 return 0;
1335}
1336
1337int pfp_pcp_close(struct litmus_lock* l)
1338{
1339 struct task_struct *t = current;
1340 struct pcp_semaphore *sem = pcp_from_lock(l);
1341
1342 int owner = 0;
1343
1344 preempt_disable();
1345
1346 if (sem->on_cpu == smp_processor_id())
1347 owner = sem->owner == t;
1348
1349 preempt_enable();
1350
1351 if (owner)
1352 pfp_pcp_unlock(l);
1353
1354 return 0;
1355}
1356
1357void pfp_pcp_free(struct litmus_lock* lock)
1358{
1359 kfree(pcp_from_lock(lock));
1360}
1361
1362
1363static struct litmus_lock_ops pfp_pcp_lock_ops = {
1364 .close = pfp_pcp_close,
1365 .lock = pfp_pcp_lock,
1366 .open = pfp_pcp_open,
1367 .unlock = pfp_pcp_unlock,
1368 .deallocate = pfp_pcp_free,
1369};
1370
1371
1372static struct litmus_lock* pfp_new_pcp(int on_cpu)
1373{
1374 struct pcp_semaphore* sem;
1375
1376 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1377 if (!sem)
1378 return NULL;
1379
1380 sem->litmus_lock.ops = &pfp_pcp_lock_ops;
1381 pcp_init_semaphore(sem, on_cpu);
1382
1383 return &sem->litmus_lock;
1384}
1385
1386/* ******************** DPCP support ********************** */
1387
1388struct dpcp_semaphore {
1389 struct litmus_lock litmus_lock;
1390 struct pcp_semaphore pcp;
1391 int owner_cpu;
1392};
1393
1394static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock)
1395{
1396 return container_of(lock, struct dpcp_semaphore, litmus_lock);
1397}
1398
1399/* called with preemptions disabled */
1400static void pfp_migrate_to(int target_cpu)
1401{
1402 struct task_struct* t = current;
1403 pfp_domain_t *from;
1404
1405 if (get_partition(t) == target_cpu)
1406 return;
1407
1408 if (!is_realtime(t))
1409 {
1410 TRACE_TASK(t, "not migrating, not a RT task (anymore?)\n");
1411 return;
1412 }
1413
1414 /* make sure target_cpu makes sense */
1415 BUG_ON(target_cpu >= NR_CPUS || !cpu_online(target_cpu));
1416
1417 local_irq_disable();
1418
1419 from = task_pfp(t);
1420 raw_spin_lock(&from->slock);
1421
1422	/* The scheduled task should not be in any ready or release queue. Check
1423	 * this while holding the lock to avoid races with RT-mode transitions. */
1424 BUG_ON(is_realtime(t) && is_queued(t));
1425
1426 /* switch partitions */
1427 tsk_rt(t)->task_params.cpu = target_cpu;
1428
1429 raw_spin_unlock(&from->slock);
1430
1431 /* Don't trace scheduler costs as part of
1432 * locking overhead. Scheduling costs are accounted for
1433 * explicitly. */
1434 TS_LOCK_SUSPEND;
1435
1436 local_irq_enable();
1437 preempt_enable_no_resched();
1438
1439 /* deschedule to be migrated */
1440 schedule();
1441
1442 /* we are now on the target processor */
1443 preempt_disable();
1444
1445 /* start recording costs again */
1446 TS_LOCK_RESUME;
1447
1448 BUG_ON(smp_processor_id() != target_cpu && is_realtime(t));
1449}
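/*
 * The DPCP lock/unlock pair below follows the distributed-agent pattern:
 * the requesting job boosts its priority, migrates to the processor that
 * hosts the semaphore (sem->pcp.on_cpu), executes the request under that
 * processor's local PCP via pcp_raise_ceiling()/pcp_lower_ceiling(), and
 * finally unboosts and migrates back to its home partition (owner_cpu).
 */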
1450
1451int pfp_dpcp_lock(struct litmus_lock* l)
1452{
1453 struct task_struct* t = current;
1454 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1455 int eprio = effective_agent_priority(get_priority(t));
1456 int from = get_partition(t);
1457 int to = sem->pcp.on_cpu;
1458
1459 if (!is_realtime(t))
1460 return -EPERM;
1461
1462	/* prevent nested lock acquisition */
1463 if (tsk_rt(t)->num_locks_held ||
1464 tsk_rt(t)->num_local_locks_held)
1465 return -EBUSY;
1466
1467 preempt_disable();
1468
1469 /* Priority-boost ourself *before* we suspend so that
1470 * our priority is boosted when we resume. */
1471
1472 boost_priority(t, get_priority(t));
1473
1474 pfp_migrate_to(to);
1475
1476 pcp_raise_ceiling(&sem->pcp, eprio);
1477
1478 /* yep, we got it => execute request */
1479 sem->owner_cpu = from;
1480
1481 preempt_enable();
1482
1483 tsk_rt(t)->num_locks_held++;
1484
1485 return 0;
1486}
1487
1488int pfp_dpcp_unlock(struct litmus_lock* l)
1489{
1490 struct task_struct *t = current;
1491 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1492 int err = 0;
1493 int home;
1494
1495 preempt_disable();
1496
1497 if (sem->pcp.owner != t) {
1498 err = -EINVAL;
1499 goto out;
1500 }
1501
1502 /* The current owner should be executing on the correct CPU.
1503 *
1504 * If the owner transitioned out of RT mode or is exiting, then
1505	 * it might have already been migrated away by the best-effort
1506	 * scheduler, and we just have to deal with it. */
1507 if (unlikely(!is_realtime(t) && sem->pcp.on_cpu != smp_processor_id())) {
1508 TRACE_TASK(t, "DPCP unlock cpu=%d, sem->pcp.on_cpu=%d\n", smp_processor_id(), sem->pcp.on_cpu);
1509 preempt_enable();
1510 err = litmus_be_migrate_to(sem->pcp.on_cpu);
1511 preempt_disable();
1512 TRACE_TASK(t, "post-migrate: cpu=%d, sem->pcp.on_cpu=%d err=%d\n", smp_processor_id(), sem->pcp.on_cpu, err);
1513 }
1514 BUG_ON(sem->pcp.on_cpu != smp_processor_id());
1515 err = 0;
1516
1517 tsk_rt(t)->num_locks_held--;
1518
1519 home = sem->owner_cpu;
1520
1521 /* give it back */
1522 pcp_lower_ceiling(&sem->pcp);
1523
1524 /* we lose the benefit of priority boosting */
1525 unboost_priority(t);
1526
1527 pfp_migrate_to(home);
1528
1529out:
1530 preempt_enable();
1531
1532 return err;
1533}
1534
1535int pfp_dpcp_open(struct litmus_lock* l, void* __user config)
1536{
1537 struct task_struct *t = current;
1538 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1539 int cpu, eprio;
1540
1541 if (!is_realtime(t))
1542 /* we need to know the real-time priority */
1543 return -EPERM;
1544
1545 if (get_user(cpu, (int*) config))
1546 return -EFAULT;
1547
1548 /* make sure the resource location matches */
1549 if (cpu != sem->pcp.on_cpu)
1550 return -EINVAL;
1551
1552 eprio = effective_agent_priority(get_priority(t));
1553
1554 pcp_update_prio_ceiling(&sem->pcp, eprio);
1555
1556 return 0;
1557}
1558
1559int pfp_dpcp_close(struct litmus_lock* l)
1560{
1561 struct task_struct *t = current;
1562 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1563 int owner = 0;
1564
1565 preempt_disable();
1566
1567 if (sem->pcp.on_cpu == smp_processor_id())
1568 owner = sem->pcp.owner == t;
1569
1570 preempt_enable();
1571
1572 if (owner)
1573 pfp_dpcp_unlock(l);
1574
1575 return 0;
1576}
1577
1578void pfp_dpcp_free(struct litmus_lock* lock)
1579{
1580 kfree(dpcp_from_lock(lock));
1581}
1582
1583static struct litmus_lock_ops pfp_dpcp_lock_ops = {
1584 .close = pfp_dpcp_close,
1585 .lock = pfp_dpcp_lock,
1586 .open = pfp_dpcp_open,
1587 .unlock = pfp_dpcp_unlock,
1588 .deallocate = pfp_dpcp_free,
1589};
1590
1591static struct litmus_lock* pfp_new_dpcp(int on_cpu)
1592{
1593 struct dpcp_semaphore* sem;
1594
1595 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1596 if (!sem)
1597 return NULL;
1598
1599 sem->litmus_lock.ops = &pfp_dpcp_lock_ops;
1600 sem->owner_cpu = NO_CPU;
1601 pcp_init_semaphore(&sem->pcp, on_cpu);
1602
1603 return &sem->litmus_lock;
1604}
1605
1606
1607/* ******************** DFLP support ********************** */
1608
1609struct dflp_semaphore {
1610 struct litmus_lock litmus_lock;
1611
1612 /* current resource holder */
1613 struct task_struct *owner;
1614 int owner_cpu;
1615
1616 /* FIFO queue of waiting tasks */
1617 wait_queue_head_t wait;
1618
1619 /* where is the resource assigned to */
1620 int on_cpu;
1621};
1622
1623static inline struct dflp_semaphore* dflp_from_lock(struct litmus_lock* lock)
1624{
1625 return container_of(lock, struct dflp_semaphore, litmus_lock);
1626}
1627
1628int pfp_dflp_lock(struct litmus_lock* l)
1629{
1630 struct task_struct* t = current;
1631 struct dflp_semaphore *sem = dflp_from_lock(l);
1632 int from = get_partition(t);
1633 int to = sem->on_cpu;
1634 unsigned long flags;
1635 wait_queue_t wait;
1636 lt_t time_of_request;
1637
1638 if (!is_realtime(t))
1639 return -EPERM;
1640
1641	/* prevent nested lock acquisition */
1642 if (tsk_rt(t)->num_locks_held ||
1643 tsk_rt(t)->num_local_locks_held)
1644 return -EBUSY;
1645
1646 preempt_disable();
1647
1648 /* tie-break by this point in time */
1649 time_of_request = litmus_clock();
1650
1651 /* Priority-boost ourself *before* we suspend so that
1652 * our priority is boosted when we resume. */
1653 boost_priority(t, time_of_request);
1654
1655 pfp_migrate_to(to);
1656
1657 /* Now on the right CPU, preemptions still disabled. */
1658
1659 spin_lock_irqsave(&sem->wait.lock, flags);
1660
1661 if (sem->owner) {
1662 /* resource is not free => must suspend and wait */
1663
1664 init_waitqueue_entry(&wait, t);
1665
1666 /* FIXME: interruptible would be nice some day */
1667 set_task_state(t, TASK_UNINTERRUPTIBLE);
1668
1669 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
1670
1671 TS_LOCK_SUSPEND;
1672
1673 /* release lock before sleeping */
1674 spin_unlock_irqrestore(&sem->wait.lock, flags);
1675
1676 /* We depend on the FIFO order. Thus, we don't need to recheck
1677 * when we wake up; we are guaranteed to have the lock since
1678 * there is only one wake up per release.
1679 */
1680
1681 preempt_enable_no_resched();
1682
1683 schedule();
1684
1685 preempt_disable();
1686
1687 TS_LOCK_RESUME;
1688
1689 /* Since we hold the lock, no other task will change
1690 * ->owner. We can thus check it without acquiring the spin
1691 * lock. */
1692 BUG_ON(sem->owner != t);
1693 } else {
1694 /* it's ours now */
1695 sem->owner = t;
1696
1697 spin_unlock_irqrestore(&sem->wait.lock, flags);
1698 }
1699
1700 sem->owner_cpu = from;
1701
1702 preempt_enable();
1703
1704 tsk_rt(t)->num_locks_held++;
1705
1706 return 0;
1707}
1708
1709int pfp_dflp_unlock(struct litmus_lock* l)
1710{
1711 struct task_struct *t = current, *next;
1712 struct dflp_semaphore *sem = dflp_from_lock(l);
1713 int err = 0;
1714 int home;
1715 unsigned long flags;
1716
1717 preempt_disable();
1718
1719 spin_lock_irqsave(&sem->wait.lock, flags);
1720
1721 if (sem->owner != t) {
1722 err = -EINVAL;
1723 spin_unlock_irqrestore(&sem->wait.lock, flags);
1724 goto out;
1725 }
1726
1727 /* check if there are jobs waiting for this resource */
1728 next = __waitqueue_remove_first(&sem->wait);
1729 if (next) {
1730		/* next becomes the resource holder */
1731 sem->owner = next;
1732
1733 /* Wake up next. The waiting job is already priority-boosted. */
1734 wake_up_process(next);
1735 } else
1736 /* resource becomes available */
1737 sem->owner = NULL;
1738
1739 tsk_rt(t)->num_locks_held--;
1740
1741 home = sem->owner_cpu;
1742
1743 spin_unlock_irqrestore(&sem->wait.lock, flags);
1744
1745 /* we lose the benefit of priority boosting */
1746 unboost_priority(t);
1747
1748 pfp_migrate_to(home);
1749
1750out:
1751 preempt_enable();
1752
1753 return err;
1754}
1755
1756int pfp_dflp_open(struct litmus_lock* l, void* __user config)
1757{
1758 struct dflp_semaphore *sem = dflp_from_lock(l);
1759 int cpu;
1760
1761 if (get_user(cpu, (int*) config))
1762 return -EFAULT;
1763
1764 /* make sure the resource location matches */
1765 if (cpu != sem->on_cpu)
1766 return -EINVAL;
1767
1768 return 0;
1769}
1770
1771int pfp_dflp_close(struct litmus_lock* l)
1772{
1773 struct task_struct *t = current;
1774 struct dflp_semaphore *sem = dflp_from_lock(l);
1775 int owner = 0;
1776
1777 preempt_disable();
1778
1779 if (sem->on_cpu == smp_processor_id())
1780 owner = sem->owner == t;
1781
1782 preempt_enable();
1783
1784 if (owner)
1785 pfp_dflp_unlock(l);
1786
1787 return 0;
1788}
1789
1790void pfp_dflp_free(struct litmus_lock* lock)
1791{
1792 kfree(dflp_from_lock(lock));
1793}
1794
1795static struct litmus_lock_ops pfp_dflp_lock_ops = {
1796 .close = pfp_dflp_close,
1797 .lock = pfp_dflp_lock,
1798 .open = pfp_dflp_open,
1799 .unlock = pfp_dflp_unlock,
1800 .deallocate = pfp_dflp_free,
1801};
1802
1803static struct litmus_lock* pfp_new_dflp(int on_cpu)
1804{
1805 struct dflp_semaphore* sem;
1806
1807 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1808 if (!sem)
1809 return NULL;
1810
1811 sem->litmus_lock.ops = &pfp_dflp_lock_ops;
1812 sem->owner_cpu = NO_CPU;
1813 sem->owner = NULL;
1814 sem->on_cpu = on_cpu;
1815 init_waitqueue_head(&sem->wait);
1816
1817 return &sem->litmus_lock;
1818}
1819
1820
1821/* **** lock constructor **** */
1822
1823
1824static long pfp_allocate_lock(struct litmus_lock **lock, int type,
1825 void* __user config)
1826{
1827 int err = -ENXIO, cpu;
1828 struct srp_semaphore* srp;
1829
1830	/* P-FP supports the SRP and PCP for local resources and the FMLP, MPCP,
1831	 * DPCP, and DFLP for global/distributed resources. */
1832 switch (type) {
1833 case FMLP_SEM:
1834 /* FIFO Mutex Locking Protocol */
1835 *lock = pfp_new_fmlp();
1836 if (*lock)
1837 err = 0;
1838 else
1839 err = -ENOMEM;
1840 break;
1841
1842 case MPCP_SEM:
1843		/* Multiprocessor Priority Ceiling Protocol */
1844 *lock = pfp_new_mpcp(0);
1845 if (*lock)
1846 err = 0;
1847 else
1848 err = -ENOMEM;
1849 break;
1850
1851 case MPCP_VS_SEM:
1852		/* Multiprocessor Priority Ceiling Protocol with virtual spinning */
1853 *lock = pfp_new_mpcp(1);
1854 if (*lock)
1855 err = 0;
1856 else
1857 err = -ENOMEM;
1858 break;
1859
1860 case DPCP_SEM:
1861 /* Distributed Priority Ceiling Protocol */
1862 if (get_user(cpu, (int*) config))
1863 return -EFAULT;
1864
1865 TRACE("DPCP_SEM: provided cpu=%d\n", cpu);
1866
1867 if (cpu >= NR_CPUS || !cpu_online(cpu))
1868 return -EINVAL;
1869
1870 *lock = pfp_new_dpcp(cpu);
1871 if (*lock)
1872 err = 0;
1873 else
1874 err = -ENOMEM;
1875 break;
1876
1877 case DFLP_SEM:
1878 /* Distributed FIFO Locking Protocol */
1879 if (get_user(cpu, (int*) config))
1880 return -EFAULT;
1881
1882		TRACE("DFLP_SEM: provided cpu=%d\n", cpu);
1883
1884 if (cpu >= NR_CPUS || !cpu_online(cpu))
1885 return -EINVAL;
1886
1887 *lock = pfp_new_dflp(cpu);
1888 if (*lock)
1889 err = 0;
1890 else
1891 err = -ENOMEM;
1892 break;
1893
1894 case SRP_SEM:
1895 /* Baker's Stack Resource Policy */
1896 srp = allocate_srp_semaphore();
1897 if (srp) {
1898 *lock = &srp->litmus_lock;
1899 err = 0;
1900 } else
1901 err = -ENOMEM;
1902 break;
1903
1904 case PCP_SEM:
1905 /* Priority Ceiling Protocol */
1906 if (!config)
1907 cpu = get_partition(current);
1908 else if (get_user(cpu, (int*) config))
1909 return -EFAULT;
1910
1911 if (cpu >= NR_CPUS || !cpu_online(cpu))
1912 return -EINVAL;
1913
1914 *lock = pfp_new_pcp(cpu);
1915 if (*lock)
1916 err = 0;
1917 else
1918 err = -ENOMEM;
1919 break;
1920 };
1921
1922 return err;
1923}
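/*
 * Summary of the config argument per protocol (as implemented above):
 * FMLP_SEM, MPCP_SEM, MPCP_VS_SEM, and SRP_SEM ignore config; DPCP_SEM and
 * DFLP_SEM require the CPU on which the resource is hosted (validated to be
 * online); PCP_SEM optionally takes a CPU and otherwise defaults to the
 * caller's partition.
 */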
1924
1925#endif
1926
1927static long pfp_admit_task(struct task_struct* tsk)
1928{
1929 if (task_cpu(tsk) == tsk->rt_param.task_params.cpu &&
1930#ifdef CONFIG_RELEASE_MASTER
1931 /* don't allow tasks on release master CPU */
1932 task_cpu(tsk) != remote_dom(task_cpu(tsk))->release_master &&
1933#endif
1934 litmus_is_valid_fixed_prio(get_priority(tsk)))
1935 return 0;
1936 else
1937 return -EINVAL;
1938}
1939
1940static struct domain_proc_info pfp_domain_proc_info;
1941static long pfp_get_domain_proc_info(struct domain_proc_info **ret)
1942{
1943 *ret = &pfp_domain_proc_info;
1944 return 0;
1945}
1946
1947static void pfp_setup_domain_proc(void)
1948{
1949 int i, cpu;
1950 int release_master =
1951#ifdef CONFIG_RELEASE_MASTER
1952 atomic_read(&release_master_cpu);
1953#else
1954 NO_CPU;
1955#endif
1956 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
1957 struct cd_mapping *cpu_map, *domain_map;
1958
1959 memset(&pfp_domain_proc_info, 0, sizeof(pfp_domain_proc_info));
1960 init_domain_proc_info(&pfp_domain_proc_info, num_rt_cpus, num_rt_cpus);
1961 pfp_domain_proc_info.num_cpus = num_rt_cpus;
1962 pfp_domain_proc_info.num_domains = num_rt_cpus;
1963 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
1964 if (cpu == release_master)
1965 continue;
1966 cpu_map = &pfp_domain_proc_info.cpu_to_domains[i];
1967 domain_map = &pfp_domain_proc_info.domain_to_cpus[i];
1968
1969 cpu_map->id = cpu;
1970 domain_map->id = i; /* enumerate w/o counting the release master */
1971 cpumask_set_cpu(i, cpu_map->mask);
1972 cpumask_set_cpu(cpu, domain_map->mask);
1973 ++i;
1974 }
1975}
1976
1977static long pfp_activate_plugin(void)
1978{
1979#if defined(CONFIG_RELEASE_MASTER) || defined(CONFIG_LITMUS_LOCKING)
1980 int cpu;
1981#endif
1982
1983#ifdef CONFIG_RELEASE_MASTER
1984 for_each_online_cpu(cpu) {
1985 remote_dom(cpu)->release_master = atomic_read(&release_master_cpu);
1986 }
1987#endif
1988
1989#ifdef CONFIG_LITMUS_LOCKING
1990 get_srp_prio = pfp_get_srp_prio;
1991
1992 for_each_online_cpu(cpu) {
1993 init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, cpu));
1994 per_cpu(mpcpvs_vspin, cpu) = NULL;
1995
1996 pcp_init_state(&per_cpu(pcp_state, cpu));
1997 pfp_doms[cpu] = remote_pfp(cpu);
1998 per_cpu(fmlp_timestamp,cpu) = 0;
1999 }
2000
2001#endif
2002
2003 pfp_setup_domain_proc();
2004
2005 return 0;
2006}
2007
2008static long pfp_deactivate_plugin(void)
2009{
2010 destroy_domain_proc_info(&pfp_domain_proc_info);
2011 return 0;
2012}
2013
2014/* Plugin object */
2015static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = {
2016 .plugin_name = "P-FP",
2017 .task_new = pfp_task_new,
2018 .complete_job = complete_job,
2019 .task_exit = pfp_task_exit,
2020 .schedule = pfp_schedule,
2021 .task_wake_up = pfp_task_wake_up,
2022 .task_block = pfp_task_block,
2023 .admit_task = pfp_admit_task,
2024 .activate_plugin = pfp_activate_plugin,
2025 .deactivate_plugin = pfp_deactivate_plugin,
2026 .get_domain_proc_info = pfp_get_domain_proc_info,
2027#ifdef CONFIG_LITMUS_LOCKING
2028 .allocate_lock = pfp_allocate_lock,
2029 .finish_switch = pfp_finish_switch,
2030#endif
2031};
2032
2033
2034static int __init init_pfp(void)
2035{
2036 int i;
2037
2038 /* We do not really want to support cpu hotplug, do we? ;)
2039	 * However, if we were ever so crazy as to do so,
2040	 * we could not use num_online_cpus()
2041 */
2042 for (i = 0; i < num_online_cpus(); i++) {
2043 pfp_domain_init(remote_pfp(i), i);
2044 }
2045 return register_sched_plugin(&pfp_plugin);
2046}
2047
2048module_init(init_pfp);
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 000000000000..9390eb9141bf
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,290 @@
1/* sched_plugin.c -- core infrastructure for the scheduler plugin system
2 *
3 * This file includes the initialization of the plugin system, the no-op Linux
4 * scheduler plugin, some dummy functions, and some helper functions.
5 */
6
7#include <linux/list.h>
8#include <linux/spinlock.h>
9#include <linux/sched.h>
10#include <linux/seq_file.h>
11
12#include <litmus/debug_trace.h>
13#include <litmus/litmus.h>
14#include <litmus/sched_plugin.h>
15#include <litmus/preempt.h>
16#include <litmus/jobs.h>
17#include <litmus/budget.h>
18#include <litmus/np.h>
19
20/*
21 * Generic function to trigger preemption on either local or remote cpu
22 * from scheduler plugins. The key feature is that this function is
23 * non-preemptive section aware and does not invoke the scheduler / send
24 * IPIs if the to-be-preempted task is actually non-preemptive.
25 */
26void preempt_if_preemptable(struct task_struct* t, int cpu)
27{
28	/* t is the real-time task executing on CPU cpu. If t is NULL, then
29	 * CPU cpu is currently scheduling background work.
30 */
31
32 int reschedule = 0;
33
34 if (!t)
35 /* move non-real-time task out of the way */
36 reschedule = 1;
37 else {
38 if (smp_processor_id() == cpu) {
39 /* local CPU case */
40 /* check if we need to poke userspace */
41 if (is_user_np(t))
42 /* Yes, poke it. This doesn't have to be atomic since
43 * the task is definitely not executing. */
44 request_exit_np(t);
45 else if (!is_kernel_np(t))
46 /* only if we are allowed to preempt the
47 * currently-executing task */
48 reschedule = 1;
49 } else {
50 /* Remote CPU case. Only notify if it's not a kernel
51 * NP section and if we didn't set the userspace
52 * flag. */
53 reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
54 }
55 }
56 if (likely(reschedule))
57 litmus_reschedule(cpu);
58}
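/*
 * In short: a NULL task always triggers a reschedule; on the local CPU a
 * user-space non-preemptive section only gets its exit-request flag set and
 * a kernel non-preemptive section suppresses the reschedule entirely; for a
 * remote CPU an IPI is sent only if neither a kernel NP section nor a
 * successfully flagged user-space NP section applies.
 */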
59
60
61/*************************************************************
62 * Dummy plugin functions *
63 *************************************************************/
64
65static void litmus_dummy_finish_switch(struct task_struct * prev)
66{
67}
68
69static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
70{
71 sched_state_task_picked();
72 return NULL;
73}
74
75static bool litmus_dummy_should_wait_for_stack(struct task_struct *next)
76{
77 return true; /* by default, wait indefinitely */
78}
79
80static void litmus_dummy_next_became_invalid(struct task_struct *next)
81{
82}
83
84static bool litmus_dummy_post_migration_validate(struct task_struct *next)
85{
86 return true; /* by default, anything is ok */
87}
88
89static long litmus_dummy_admit_task(struct task_struct* tsk)
90{
91 printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
92 tsk->comm, tsk->pid);
93 return -EINVAL;
94}
95
96static bool litmus_dummy_fork_task(struct task_struct* tsk)
97{
98 /* Default behavior: return false to demote to non-real-time task */
99 return false;
100}
101
102static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
103{
104}
105
106static void litmus_dummy_task_wake_up(struct task_struct *task)
107{
108}
109
110static void litmus_dummy_task_block(struct task_struct *task)
111{
112}
113
114static void litmus_dummy_task_exit(struct task_struct *task)
115{
116}
117
118static void litmus_dummy_task_cleanup(struct task_struct *task)
119{
120}
121
122static long litmus_dummy_complete_job(void)
123{
124 return -ENOSYS;
125}
126
127static long litmus_dummy_activate_plugin(void)
128{
129 return 0;
130}
131
132static long litmus_dummy_deactivate_plugin(void)
133{
134 return 0;
135}
136
137static long litmus_dummy_get_domain_proc_info(struct domain_proc_info **d)
138{
139 *d = NULL;
140 return 0;
141}
142
143static void litmus_dummy_synchronous_release_at(lt_t time_zero)
144{
145 /* ignore */
146}
147
148static long litmus_dummy_task_change_params(
149 struct task_struct *task,
150 struct rt_task *new_params)
151{
152 /* by default, do not allow changes to task parameters */
153 return -EBUSY;
154}
155
156#ifdef CONFIG_LITMUS_LOCKING
157
158static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
159 void* __user config)
160{
161 return -ENXIO;
162}
163
164#endif
165
166static long litmus_dummy_reservation_create(
167 int reservation_type,
168 void* __user config)
169{
170 return -ENOSYS;
171}
172
173static long litmus_dummy_reservation_destroy(unsigned int reservation_id, int cpu)
174{
175 return -ENOSYS;
176}
177
178/* The default scheduler plugin. It doesn't do anything and lets Linux do its
179 * job.
180 */
181struct sched_plugin linux_sched_plugin = {
182 .plugin_name = "Linux",
183 .task_new = litmus_dummy_task_new,
184 .task_exit = litmus_dummy_task_exit,
185 .task_wake_up = litmus_dummy_task_wake_up,
186 .task_block = litmus_dummy_task_block,
187 .complete_job = litmus_dummy_complete_job,
188 .schedule = litmus_dummy_schedule,
189 .finish_switch = litmus_dummy_finish_switch,
190 .activate_plugin = litmus_dummy_activate_plugin,
191 .deactivate_plugin = litmus_dummy_deactivate_plugin,
192 .get_domain_proc_info = litmus_dummy_get_domain_proc_info,
193 .synchronous_release_at = litmus_dummy_synchronous_release_at,
194#ifdef CONFIG_LITMUS_LOCKING
195 .allocate_lock = litmus_dummy_allocate_lock,
196#endif
197 .admit_task = litmus_dummy_admit_task
198};
199
200/*
201 * The reference to the current plugin that is used to schedule tasks within
202 * the system. It stores references to the actual function implementations.
203 * It should be initialized by calling "init_***_plugin()".
204 */
205struct sched_plugin *litmus = &linux_sched_plugin;
206
207/* the list of registered scheduling plugins */
208static LIST_HEAD(sched_plugins);
209static DEFINE_RAW_SPINLOCK(sched_plugins_lock);
210
211#define CHECK(func) {\
212 if (!plugin->func) \
213 plugin->func = litmus_dummy_ ## func;}
214
215/* FIXME: get reference to module */
216int register_sched_plugin(struct sched_plugin* plugin)
217{
218 printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
219 plugin->plugin_name);
220
221 /* make sure we don't trip over null pointers later */
222 CHECK(finish_switch);
223 CHECK(schedule);
224 CHECK(should_wait_for_stack);
225 CHECK(post_migration_validate);
226 CHECK(next_became_invalid);
227 CHECK(task_wake_up);
228 CHECK(task_exit);
229 CHECK(task_cleanup);
230 CHECK(task_block);
231 CHECK(task_new);
232 CHECK(task_change_params);
233 CHECK(complete_job);
234 CHECK(activate_plugin);
235 CHECK(deactivate_plugin);
236 CHECK(get_domain_proc_info);
237#ifdef CONFIG_LITMUS_LOCKING
238 CHECK(allocate_lock);
239#endif
240 CHECK(admit_task);
241 CHECK(fork_task);
242 CHECK(synchronous_release_at);
243 CHECK(reservation_destroy);
244 CHECK(reservation_create);
245
246 if (!plugin->wait_for_release_at)
247 plugin->wait_for_release_at = default_wait_for_release_at;
248
249 if (!plugin->current_budget)
250 plugin->current_budget = litmus_current_budget;
251
252 raw_spin_lock(&sched_plugins_lock);
253 list_add(&plugin->list, &sched_plugins);
254 raw_spin_unlock(&sched_plugins_lock);
255
256 return 0;
257}
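/*
 * Minimal usage sketch (illustrative only, not part of the original file):
 * a plugin only needs to fill in the callbacks it actually implements;
 * register_sched_plugin() patches every remaining NULL callback with the
 * corresponding litmus_dummy_* default via the CHECK() macro above.
 *
 *	static struct task_struct* demo_schedule(struct task_struct *prev)
 *	{
 *		sched_state_task_picked();
 *		return NULL;	// always yield to background work
 *	}
 *
 *	static struct sched_plugin demo_plugin = {
 *		.plugin_name = "DEMO",
 *		.schedule = demo_schedule,
 *	};
 *
 *	static int __init init_demo(void)
 *	{
 *		return register_sched_plugin(&demo_plugin);
 *	}
 *	module_init(init_demo);
 *
 * Note that the default admit_task() (litmus_dummy_admit_task) rejects all
 * tasks, so a useful plugin must at least provide its own admit_task().
 */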
258
259
260/* FIXME: reference counting, etc. */
261struct sched_plugin* find_sched_plugin(const char* name)
262{
263 struct list_head *pos;
264 struct sched_plugin *plugin;
265
266 raw_spin_lock(&sched_plugins_lock);
267 list_for_each(pos, &sched_plugins) {
268 plugin = list_entry(pos, struct sched_plugin, list);
269 if (!strcmp(plugin->plugin_name, name))
270 goto out_unlock;
271 }
272 plugin = NULL;
273
274out_unlock:
275 raw_spin_unlock(&sched_plugins_lock);
276 return plugin;
277}
278
279void print_sched_plugins(struct seq_file *m)
280{
281 struct list_head *pos;
282 struct sched_plugin *plugin;
283
284 raw_spin_lock(&sched_plugins_lock);
285 list_for_each(pos, &sched_plugins) {
286 plugin = list_entry(pos, struct sched_plugin, list);
287 seq_printf(m, "%s\n", plugin->plugin_name);
288 }
289 raw_spin_unlock(&sched_plugins_lock);
290}
diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
new file mode 100644
index 000000000000..0a3270346656
--- /dev/null
+++ b/litmus/sched_pres.c
@@ -0,0 +1,612 @@
1#include <linux/percpu.h>
2#include <linux/slab.h>
3#include <linux/module.h>
4#include <asm/uaccess.h>
5
6#include <litmus/sched_plugin.h>
7#include <litmus/preempt.h>
8#include <litmus/debug_trace.h>
9
10#include <litmus/litmus.h>
11#include <litmus/jobs.h>
12#include <litmus/budget.h>
13#include <litmus/litmus_proc.h>
14#include <litmus/sched_trace.h>
15
16#include <litmus/reservations/reservation.h>
17#include <litmus/reservations/alloc.h>
18
19struct pres_task_state {
20 struct reservation_client *client;
21 int cpu;
22 struct task_client res_info;
23};
24
25struct pres_cpu_state {
26 raw_spinlock_t lock;
27
28 struct sup_reservation_environment sup_env;
29 struct hrtimer timer;
30
31 int cpu;
32 struct task_struct* scheduled;
33};
34
35static DEFINE_PER_CPU(struct pres_cpu_state, pres_cpu_state);
36
37#define cpu_state_for(cpu_id) (&per_cpu(pres_cpu_state, cpu_id))
38#define local_cpu_state() (this_cpu_ptr(&pres_cpu_state))
39
40static struct pres_task_state* get_pres_state(struct task_struct *tsk)
41{
42 return (struct pres_task_state*) tsk_rt(tsk)->plugin_state;
43}
44
45static void task_departs(struct task_struct *tsk, int job_complete)
46{
47 struct pres_task_state* state = get_pres_state(tsk);
48 struct reservation* res;
49 struct reservation_client *client;
50
51 client = state->client;
52 res = client->reservation;
53
54 res->ops->client_departs(res, client, job_complete);
55 TRACE_TASK(tsk, "client_departs: removed from reservation R%d\n", res->id);
56}
57
58static void task_arrives(struct task_struct *tsk)
59{
60 struct pres_task_state* state = get_pres_state(tsk);
61 struct reservation* res;
62 struct reservation_client *client;
63
64 client = state->client;
65 res = client->reservation;
66
67 res->ops->client_arrives(res, client);
68 TRACE_TASK(tsk, "client_arrives: added to reservation R%d\n", res->id);
69}
70
71/* NOTE: drops state->lock */
72static void pres_update_timer_and_unlock(struct pres_cpu_state *state)
73{
74 int local;
75 lt_t update, now;
76
77 update = state->sup_env.next_scheduler_update;
78 now = state->sup_env.env.current_time;
79
80 /* Be sure we're actually running on the right core,
81	 * as pres_update_timer_and_unlock() is also called from pres_task_resume(),
82 * which might be called on any CPU when a thread resumes.
83 */
84 local = local_cpu_state() == state;
85
86 /* Must drop state lock before calling into hrtimer_start(), which
87 * may raise a softirq, which in turn may wake ksoftirqd. */
88 raw_spin_unlock(&state->lock);
89
90 if (update <= now) {
91 litmus_reschedule(state->cpu);
92 } else if (likely(local && update != SUP_NO_SCHEDULER_UPDATE)) {
93 /* Reprogram only if not already set correctly. */
94 if (!hrtimer_active(&state->timer) ||
95 ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
96 TRACE("canceling timer...\n");
97 hrtimer_cancel(&state->timer);
98 TRACE("setting scheduler timer for %llu\n", update);
99 hrtimer_start(&state->timer,
100 ns_to_ktime(update),
101 HRTIMER_MODE_ABS_PINNED);
102 if (update < litmus_clock()) {
103 /* uh oh, timer expired while trying to set it */
104 TRACE("timer expired during setting "
105 "update:%llu now:%llu actual:%llu\n",
106 update, now, litmus_clock());
107 /* The timer HW may not have been reprogrammed
108 * correctly; force rescheduling now. */
109 litmus_reschedule(state->cpu);
110 }
111 }
112 } else if (unlikely(!local && update != SUP_NO_SCHEDULER_UPDATE)) {
113 /* Poke remote core only if timer needs to be set earlier than
114 * it is currently set.
115 */
116 TRACE("pres_update_timer for remote CPU %d (update=%llu, "
117 "active:%d, set:%llu)\n",
118 state->cpu,
119 update,
120 hrtimer_active(&state->timer),
121 ktime_to_ns(hrtimer_get_expires(&state->timer)));
122 if (!hrtimer_active(&state->timer) ||
123 ktime_to_ns(hrtimer_get_expires(&state->timer)) > update) {
124 TRACE("poking CPU %d so that it can update its "
125 "scheduling timer (active:%d, set:%llu)\n",
126 state->cpu,
127 hrtimer_active(&state->timer),
128 ktime_to_ns(hrtimer_get_expires(&state->timer)));
129 litmus_reschedule(state->cpu);
130 }
131 }
132}
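/*
 * Three cases are handled above: an already-due update forces an immediate
 * reschedule; a future update on the local CPU reprograms the local hrtimer
 * (re-checking litmus_clock() to catch the race where the update time passes
 * while the timer is being programmed); and a future update for a remote CPU
 * only pokes that CPU if its timer is inactive or set later than needed.
 */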
133
134static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
135{
136 unsigned long flags;
137 enum hrtimer_restart restart = HRTIMER_NORESTART;
138 struct pres_cpu_state *state;
139 lt_t update, now;
140
141 state = container_of(timer, struct pres_cpu_state, timer);
142
143 /* The scheduling timer should only fire on the local CPU, because
144	 * otherwise deadlocks via hrtimer_cancel() are possible.
145 * Note: this does not interfere with dedicated interrupt handling, as
146 * even under dedicated interrupt handling scheduling timers for
147 * budget enforcement must occur locally on each CPU.
148 */
149 BUG_ON(state->cpu != raw_smp_processor_id());
150
151 raw_spin_lock_irqsave(&state->lock, flags);
152 sup_update_time(&state->sup_env, litmus_clock());
153
154 update = state->sup_env.next_scheduler_update;
155 now = state->sup_env.env.current_time;
156
157 TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d)\n",
158 now, update, state->cpu);
159
160 if (update <= now) {
161 litmus_reschedule_local();
162 } else if (update != SUP_NO_SCHEDULER_UPDATE) {
163 hrtimer_set_expires(timer, ns_to_ktime(update));
164 restart = HRTIMER_RESTART;
165 }
166
167 raw_spin_unlock_irqrestore(&state->lock, flags);
168
169 return restart;
170}
171
172static struct task_struct* pres_schedule(struct task_struct * prev)
173{
174 /* next == NULL means "schedule background work". */
175 struct pres_cpu_state *state = local_cpu_state();
176
177 raw_spin_lock(&state->lock);
178
179 BUG_ON(state->scheduled && state->scheduled != prev);
180 BUG_ON(state->scheduled && !is_realtime(prev));
181
182 /* update time */
183 state->sup_env.will_schedule = true;
184 sup_update_time(&state->sup_env, litmus_clock());
185
186 /* figure out what to schedule next */
187 state->scheduled = sup_dispatch(&state->sup_env);
188
189 /* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
190 sched_state_task_picked();
191
192 /* program scheduler timer */
193 state->sup_env.will_schedule = false;
194 /* NOTE: drops state->lock */
195 pres_update_timer_and_unlock(state);
196
197 if (prev != state->scheduled && is_realtime(prev))
198 TRACE_TASK(prev, "descheduled.\n");
199 if (state->scheduled)
200 TRACE_TASK(state->scheduled, "scheduled.\n");
201
202 return state->scheduled;
203}
204
205static void resume_legacy_task_model_updates(struct task_struct *tsk)
206{
207 lt_t now;
208 if (is_sporadic(tsk)) {
209 /* If this sporadic task was gone for a "long" time and woke up past
210 * its deadline, then give it a new budget by triggering a job
211 * release. This is purely cosmetic and has no effect on the
212 * P-RES scheduler. */
213
214 now = litmus_clock();
215 if (is_tardy(tsk, now)) {
216 inferred_sporadic_job_release_at(tsk, now);
217 }
218 }
219}
220
221
222/* Called when a task should be removed from the ready queue.
223 */
224static void pres_task_block(struct task_struct *tsk)
225{
226 unsigned long flags;
227 struct pres_task_state* tinfo = get_pres_state(tsk);
228 struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
229
230 TRACE_TASK(tsk, "thread suspends at %llu (state:%d, running:%d)\n",
231 litmus_clock(), tsk->state, is_current_running());
232
233 raw_spin_lock_irqsave(&state->lock, flags);
234 sup_update_time(&state->sup_env, litmus_clock());
235 task_departs(tsk, is_completed(tsk));
236 raw_spin_unlock_irqrestore(&state->lock, flags);
237}
238
239
240/* Called when the state of tsk changes back to TASK_RUNNING.
241 * We need to requeue the task.
242 */
243static void pres_task_resume(struct task_struct *tsk)
244{
245 unsigned long flags;
246 struct pres_task_state* tinfo = get_pres_state(tsk);
247 struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
248
249 TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());
250
251 raw_spin_lock_irqsave(&state->lock, flags);
252 /* Assumption: litmus_clock() is synchronized across cores,
253 * since we might not actually be executing on tinfo->cpu
254 * at the moment. */
255 sup_update_time(&state->sup_env, litmus_clock());
256 task_arrives(tsk);
257 /* NOTE: drops state->lock */
258 pres_update_timer_and_unlock(state);
259 local_irq_restore(flags);
260
261 resume_legacy_task_model_updates(tsk);
262}
263
264static long pres_admit_task(struct task_struct *tsk)
265{
266 long err = -EINVAL;
267 unsigned long flags;
268 struct reservation *res;
269 struct pres_cpu_state *state;
270 struct pres_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_ATOMIC);
271
272 if (!tinfo)
273 return -ENOMEM;
274
275 preempt_disable();
276
277 /* NOTE: this is obviously racy w.r.t. affinity changes since
278 * we are not holding any runqueue locks. */
279 if (tsk->nr_cpus_allowed != 1) {
280 printk(KERN_WARNING "%s/%d: task does not have "
281 "singleton affinity mask\n",
282 tsk->comm, tsk->pid);
283 state = cpu_state_for(task_cpu(tsk));
284 } else {
285 state = cpu_state_for(cpumask_first(&tsk->cpus_allowed));
286 }
287
288 TRACE_TASK(tsk, "on CPU %d, valid?:%d\n",
289 task_cpu(tsk), cpumask_test_cpu(task_cpu(tsk), &tsk->cpus_allowed));
290
291 raw_spin_lock_irqsave(&state->lock, flags);
292
293 res = sup_find_by_id(&state->sup_env, tsk_rt(tsk)->task_params.cpu);
294
295 /* found the appropriate reservation (or vCPU) */
296 if (res) {
297 task_client_init(&tinfo->res_info, tsk, res);
298 tinfo->cpu = state->cpu;
299 tinfo->client = &tinfo->res_info.client;
300 tsk_rt(tsk)->plugin_state = tinfo;
301 err = 0;
302
303 /* disable LITMUS^RT's per-thread budget enforcement */
304 tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
305 } else {
306 printk(KERN_WARNING "Could not find reservation %d on "
307 "core %d for task %s/%d\n",
308 tsk_rt(tsk)->task_params.cpu, state->cpu,
309 tsk->comm, tsk->pid);
310 }
311
312 raw_spin_unlock_irqrestore(&state->lock, flags);
313
314 preempt_enable();
315
316 if (err)
317 kfree(tinfo);
318
319 return err;
320}
321
322static void task_new_legacy_task_model_updates(struct task_struct *tsk)
323{
324 lt_t now = litmus_clock();
325
326 /* the first job exists starting as of right now */
327 release_at(tsk, now);
328 sched_trace_task_release(tsk);
329}
330
331static void pres_task_new(struct task_struct *tsk, int on_runqueue,
332 int is_running)
333{
334 unsigned long flags;
335 struct pres_task_state* tinfo = get_pres_state(tsk);
336 struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
337
338 TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
339 litmus_clock(), on_runqueue, is_running);
340
341 /* acquire the lock protecting the state and disable interrupts */
342 raw_spin_lock_irqsave(&state->lock, flags);
343
344 if (is_running) {
345 state->scheduled = tsk;
346 /* make sure this task should actually be running */
347 litmus_reschedule_local();
348 }
349
350 if (on_runqueue || is_running) {
351 /* Assumption: litmus_clock() is synchronized across cores
352 * [see comment in pres_task_resume()] */
353 sup_update_time(&state->sup_env, litmus_clock());
354 task_arrives(tsk);
355 /* NOTE: drops state->lock */
356 pres_update_timer_and_unlock(state);
357 local_irq_restore(flags);
358 } else
359 raw_spin_unlock_irqrestore(&state->lock, flags);
360
361 task_new_legacy_task_model_updates(tsk);
362}
363
364static bool pres_fork_task(struct task_struct *tsk)
365{
366 TRACE_CUR("is forking\n");
367 TRACE_TASK(tsk, "forked child rt:%d cpu:%d task_cpu:%d "
368 "wcet:%llu per:%llu\n",
369 is_realtime(tsk),
370 tsk_rt(tsk)->task_params.cpu,
371 task_cpu(tsk),
372 tsk_rt(tsk)->task_params.exec_cost,
373 tsk_rt(tsk)->task_params.period);
374
375 /* We always allow forking. */
376 /* The newly forked task will be in the same reservation. */
377 return true;
378}
379
380static void pres_task_exit(struct task_struct *tsk)
381{
382 unsigned long flags;
383 struct pres_task_state* tinfo = get_pres_state(tsk);
384 struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
385
386 raw_spin_lock_irqsave(&state->lock, flags);
387
388 TRACE_TASK(tsk, "task exits at %llu (present:%d sched:%d)\n",
389 litmus_clock(), is_present(tsk), state->scheduled == tsk);
390
391 if (state->scheduled == tsk)
392 state->scheduled = NULL;
393
394 /* remove from queues */
395 if (is_present(tsk)) {
396 /* Assumption: litmus_clock() is synchronized across cores
397 * [see comment in pres_task_resume()] */
398 sup_update_time(&state->sup_env, litmus_clock());
399 task_departs(tsk, 0);
400 /* NOTE: drops state->lock */
401 pres_update_timer_and_unlock(state);
402 local_irq_restore(flags);
403 } else
404 raw_spin_unlock_irqrestore(&state->lock, flags);
405
406 kfree(tsk_rt(tsk)->plugin_state);
407 tsk_rt(tsk)->plugin_state = NULL;
408}
409
410static void pres_current_budget(lt_t *used_so_far, lt_t *remaining)
411{
412 struct pres_task_state *tstate = get_pres_state(current);
413 struct pres_cpu_state *state;
414
415 /* FIXME: protect against concurrent task_exit() */
416
417 local_irq_disable();
418
419 state = cpu_state_for(tstate->cpu);
420
421 raw_spin_lock(&state->lock);
422
423 sup_update_time(&state->sup_env, litmus_clock());
424 if (remaining)
425 *remaining = tstate->client->reservation->cur_budget;
426 if (used_so_far)
427 *used_so_far = tstate->client->reservation->budget_consumed;
428 pres_update_timer_and_unlock(state);
429
430 local_irq_enable();
431}
432
433static long do_pres_reservation_create(
434 int res_type,
435 struct reservation_config *config)
436{
437 struct pres_cpu_state *state;
438 struct reservation* res;
439 struct reservation* new_res = NULL;
440 unsigned long flags;
441 long err;
442
443 /* Allocate before we grab a spin lock. */
444 switch (res_type) {
445 case PERIODIC_POLLING:
446 case SPORADIC_POLLING:
447 err = alloc_polling_reservation(res_type, config, &new_res);
448 break;
449
450 case TABLE_DRIVEN:
451 err = alloc_table_driven_reservation(config, &new_res);
452 break;
453
454 default:
455 err = -EINVAL;
456 break;
457 }
458
459 if (err)
460 return err;
461
462 state = cpu_state_for(config->cpu);
463 raw_spin_lock_irqsave(&state->lock, flags);
464
465 res = sup_find_by_id(&state->sup_env, config->id);
466 if (!res) {
467 sup_add_new_reservation(&state->sup_env, new_res);
468 err = config->id;
469 } else {
470 err = -EEXIST;
471 }
472
473 raw_spin_unlock_irqrestore(&state->lock, flags);
474
475 if (err < 0)
476 kfree(new_res);
477
478 return err;
479}
480
481static long pres_reservation_create(int res_type, void* __user _config)
482{
483 struct reservation_config config;
484
485 TRACE("Attempt to create reservation (%d)\n", res_type);
486
487 if (copy_from_user(&config, _config, sizeof(config)))
488 return -EFAULT;
489
490 if (config.cpu < 0 || !cpu_online(config.cpu)) {
491 printk(KERN_ERR "invalid polling reservation (%u): "
492 "CPU %d offline\n", config.id, config.cpu);
493 return -EINVAL;
494 }
495
496 return do_pres_reservation_create(res_type, &config);
497}
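/*
 * Note on the user-space contract visible here: reservation_config carries
 * at least the target CPU and a per-CPU-unique reservation ID (plus the
 * type-specific parameters consumed by the alloc_* helpers). The ID is what
 * tasks later reference through task_params.cpu, as matched by
 * sup_find_by_id() in pres_admit_task() above.
 */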
498
499static struct domain_proc_info pres_domain_proc_info;
500
501static long pres_get_domain_proc_info(struct domain_proc_info **ret)
502{
503 *ret = &pres_domain_proc_info;
504 return 0;
505}
506
507static void pres_setup_domain_proc(void)
508{
509 int i, cpu;
510 int num_rt_cpus = num_online_cpus();
511
512 struct cd_mapping *cpu_map, *domain_map;
513
514 memset(&pres_domain_proc_info, 0, sizeof(pres_domain_proc_info));
515 init_domain_proc_info(&pres_domain_proc_info, num_rt_cpus, num_rt_cpus);
516 pres_domain_proc_info.num_cpus = num_rt_cpus;
517 pres_domain_proc_info.num_domains = num_rt_cpus;
518
519 i = 0;
520 for_each_online_cpu(cpu) {
521 cpu_map = &pres_domain_proc_info.cpu_to_domains[i];
522 domain_map = &pres_domain_proc_info.domain_to_cpus[i];
523
524 cpu_map->id = cpu;
525 domain_map->id = i;
526 cpumask_set_cpu(i, cpu_map->mask);
527 cpumask_set_cpu(cpu, domain_map->mask);
528 ++i;
529 }
530}
531
532static long pres_activate_plugin(void)
533{
534 int cpu;
535 struct pres_cpu_state *state;
536
537 for_each_online_cpu(cpu) {
538 TRACE("Initializing CPU%d...\n", cpu);
539
540 state = cpu_state_for(cpu);
541
542 raw_spin_lock_init(&state->lock);
543 state->cpu = cpu;
544 state->scheduled = NULL;
545
546 sup_init(&state->sup_env);
547
548 hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
549 state->timer.function = on_scheduling_timer;
550 }
551
552 pres_setup_domain_proc();
553
554 return 0;
555}
556
557static long pres_deactivate_plugin(void)
558{
559 int cpu;
560 struct pres_cpu_state *state;
561 struct reservation *res;
562
563 for_each_online_cpu(cpu) {
564 state = cpu_state_for(cpu);
565 raw_spin_lock(&state->lock);
566
567 hrtimer_cancel(&state->timer);
568
569 /* Delete all reservations --- assumes struct reservation
570 * is prefix of containing struct. */
571
572 while (!list_empty(&state->sup_env.all_reservations)) {
573 res = list_first_entry(
574 &state->sup_env.all_reservations,
575 struct reservation, all_list);
576 list_del(&res->all_list);
577 if (res->ops->shutdown)
578 res->ops->shutdown(res);
579 kfree(res);
580 }
581
582 raw_spin_unlock(&state->lock);
583 }
584
585 destroy_domain_proc_info(&pres_domain_proc_info);
586 return 0;
587}
588
589static struct sched_plugin pres_plugin = {
590 .plugin_name = "P-RES",
591 .schedule = pres_schedule,
592 .task_block = pres_task_block,
593 .task_wake_up = pres_task_resume,
594 .admit_task = pres_admit_task,
595 .task_new = pres_task_new,
596 .fork_task = pres_fork_task,
597 .task_exit = pres_task_exit,
598 .complete_job = complete_job_oneshot,
599 .get_domain_proc_info = pres_get_domain_proc_info,
600 .activate_plugin = pres_activate_plugin,
601 .deactivate_plugin = pres_deactivate_plugin,
602 .reservation_create = pres_reservation_create,
603 .current_budget = pres_current_budget,
604};
605
606static int __init init_pres(void)
607{
608 return register_sched_plugin(&pres_plugin);
609}
610
611module_init(init_pres);
612
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
new file mode 100644
index 000000000000..4e60695578b5
--- /dev/null
+++ b/litmus/sched_psn_edf.c
@@ -0,0 +1,688 @@
1/*
 2 * litmus/sched_psn_edf.c
3 *
4 * Implementation of the PSN-EDF scheduler plugin.
5 * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
6 *
7 * Suspensions and non-preemptable sections are supported.
8 * Priority inheritance is not supported.
9 */
10
11#include <linux/percpu.h>
12#include <linux/sched.h>
13#include <linux/list.h>
14#include <linux/spinlock.h>
15#include <linux/module.h>
16
17#include <litmus/debug_trace.h>
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/budget.h>
22#include <litmus/np.h>
23#include <litmus/sched_plugin.h>
24#include <litmus/edf_common.h>
25#include <litmus/sched_trace.h>
26#include <litmus/trace.h>
27
28/* to set up domain/cpu mappings */
29#include <litmus/litmus_proc.h>
30
31typedef struct {
32 rt_domain_t domain;
33 int cpu;
34 struct task_struct* scheduled; /* only RT tasks */
35/*
36 * scheduling lock slock
37 * protects the domain and serializes scheduling decisions
38 */
39#define slock domain.ready_lock
40
41} psnedf_domain_t;
42
43DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
44
45#define local_edf (&(this_cpu_ptr(&psnedf_domains)->domain))
46#define local_pedf (this_cpu_ptr(&psnedf_domains))
47#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
48#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
49#define task_edf(task) remote_edf(get_partition(task))
50#define task_pedf(task) remote_pedf(get_partition(task))
51
52
53static void psnedf_domain_init(psnedf_domain_t* pedf,
54 check_resched_needed_t check,
55 release_jobs_t release,
56 int cpu)
57{
58 edf_domain_init(&pedf->domain, check, release);
59 pedf->cpu = cpu;
60 pedf->scheduled = NULL;
61}
62
63static void requeue(struct task_struct* t, rt_domain_t *edf)
64{
65 if (t->state != TASK_RUNNING)
66 TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
67
68 tsk_rt(t)->completed = 0;
69 if (is_early_releasing(t) || is_released(t, litmus_clock()))
70 __add_ready(edf, t);
71 else
72 add_release(edf, t); /* it has got to wait */
73}
74
75/* we assume the lock is being held */
76static void preempt(psnedf_domain_t *pedf)
77{
78 preempt_if_preemptable(pedf->scheduled, pedf->cpu);
79}
80
81#ifdef CONFIG_LITMUS_LOCKING
82
83static void boost_priority(struct task_struct* t)
84{
85 unsigned long flags;
86 psnedf_domain_t* pedf = task_pedf(t);
87 lt_t now;
88
89 raw_spin_lock_irqsave(&pedf->slock, flags);
90 now = litmus_clock();
91
92 TRACE_TASK(t, "priority boosted at %llu\n", now);
93
94 tsk_rt(t)->priority_boosted = 1;
95 tsk_rt(t)->boost_start_time = now;
96
97 if (pedf->scheduled != t) {
98 /* holder may be queued: first stop queue changes */
99 raw_spin_lock(&pedf->domain.release_lock);
100 if (is_queued(t) &&
101 /* If it is queued, then we need to re-order. */
102 bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) &&
103 /* If we bubbled to the top, then we need to check for preemptions. */
104 edf_preemption_needed(&pedf->domain, pedf->scheduled))
105 preempt(pedf);
106 raw_spin_unlock(&pedf->domain.release_lock);
107 } /* else: nothing to do since the job is not queued while scheduled */
108
109 raw_spin_unlock_irqrestore(&pedf->slock, flags);
110}
111
112static void unboost_priority(struct task_struct* t)
113{
114 unsigned long flags;
115 psnedf_domain_t* pedf = task_pedf(t);
116 lt_t now;
117
118 raw_spin_lock_irqsave(&pedf->slock, flags);
119 now = litmus_clock();
120
121 /* Assumption: this only happens when the job is scheduled.
122 * Exception: If t transitioned to non-real-time mode, we no longer
123 * care about it. */
124 BUG_ON(pedf->scheduled != t && is_realtime(t));
125
126 TRACE_TASK(t, "priority restored at %llu\n", now);
127
128 tsk_rt(t)->priority_boosted = 0;
129 tsk_rt(t)->boost_start_time = 0;
130
131 /* check if this changes anything */
132 if (edf_preemption_needed(&pedf->domain, pedf->scheduled))
133 preempt(pedf);
134
135 raw_spin_unlock_irqrestore(&pedf->slock, flags);
136}
137
138#endif
139
140static int psnedf_preempt_check(psnedf_domain_t *pedf)
141{
142 if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) {
143 preempt(pedf);
144 return 1;
145 } else
146 return 0;
147}
148
149/* This check is trivial in partitioned systems as we only have to consider
150 * the CPU of the partition.
151 */
152static int psnedf_check_resched(rt_domain_t *edf)
153{
154 psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
155
156 /* because this is a callback from rt_domain_t we already hold
157 * the necessary lock for the ready queue
158 */
159 return psnedf_preempt_check(pedf);
160}
161
162static void job_completion(struct task_struct* t, int forced)
163{
164 sched_trace_task_completion(t, forced);
165 TRACE_TASK(t, "job_completion(forced=%d).\n", forced);
166
167 tsk_rt(t)->completed = 0;
168 prepare_for_next_period(t);
169}
170
171static struct task_struct* psnedf_schedule(struct task_struct * prev)
172{
173 psnedf_domain_t* pedf = local_pedf;
174 rt_domain_t* edf = &pedf->domain;
175 struct task_struct* next;
176
177 int out_of_time, sleep, preempt,
178 np, exists, blocks, resched;
179
180 raw_spin_lock(&pedf->slock);
181
182	/* sanity checking
183	 * unlike under G-EDF, when a task exits (is dead),
184	 * pedf->scheduled may be NULL while prev _is_ a real-time task
185	 */
186 BUG_ON(pedf->scheduled && pedf->scheduled != prev);
187 BUG_ON(pedf->scheduled && !is_realtime(prev));
188
189 /* (0) Determine state */
190 exists = pedf->scheduled != NULL;
191 blocks = exists && !is_current_running();
192 out_of_time = exists && budget_enforced(pedf->scheduled)
193 && budget_exhausted(pedf->scheduled);
194 np = exists && is_np(pedf->scheduled);
195 sleep = exists && is_completed(pedf->scheduled);
196 preempt = edf_preemption_needed(edf, prev);
197
198	/* If we need to preempt, do so.
199 * The following checks set resched to 1 in case of special
200 * circumstances.
201 */
202 resched = preempt;
203
204	/* If a task blocks, we have no choice but to reschedule.
205 */
206 if (blocks)
207 resched = 1;
208
209 /* Request a sys_exit_np() call if we would like to preempt but cannot.
210 * Multiple calls to request_exit_np() don't hurt.
211 */
212 if (np && (out_of_time || preempt || sleep))
213 request_exit_np(pedf->scheduled);
214
215 /* Any task that is preemptable and either exhausts its execution
216 * budget or wants to sleep completes. We may have to reschedule after
217 * this.
218 */
219 if (!np && (out_of_time || sleep)) {
220 job_completion(pedf->scheduled, !sleep);
221 resched = 1;
222 }
223
224 /* The final scheduling decision. Do we need to switch for some reason?
225 * Switch if we are in RT mode and have no task or if we need to
226 * resched.
227 */
228 next = NULL;
229 if ((!np || blocks) && (resched || !exists)) {
230		/* When preempting a task that does not block,
231 * re-insert it into either the ready queue or the
232 * release queue (if it completed). requeue() picks
233 * the appropriate queue.
234 */
235 if (pedf->scheduled && !blocks)
236 requeue(pedf->scheduled, edf);
237 next = __take_ready(edf);
238 } else
239 /* Only override Linux scheduler if we have a real-time task
240 * scheduled that needs to continue.
241 */
242 if (exists)
243 next = prev;
244
245 if (next) {
246 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
247 } else {
248 TRACE("becoming idle at %llu\n", litmus_clock());
249 }
250
251 pedf->scheduled = next;
252 sched_state_task_picked();
253 raw_spin_unlock(&pedf->slock);
254
255 return next;
256}
257
258
259/* Prepare a task for running in RT mode
260 */
261static void psnedf_task_new(struct task_struct * t, int on_rq, int is_scheduled)
262{
263 rt_domain_t* edf = task_edf(t);
264 psnedf_domain_t* pedf = task_pedf(t);
265 unsigned long flags;
266
267 TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
268 t->rt_param.task_params.cpu);
269
270 /* setup job parameters */
271 release_at(t, litmus_clock());
272
273 /* The task should be running in the queue, otherwise signal
274 * code will try to wake it up with fatal consequences.
275 */
276 raw_spin_lock_irqsave(&pedf->slock, flags);
277 if (is_scheduled) {
278 /* there shouldn't be anything else scheduled at the time */
279 BUG_ON(pedf->scheduled);
280 pedf->scheduled = t;
281 } else {
282 /* !is_scheduled means it is not scheduled right now, but it
283 * does not mean that it is suspended. If it is not suspended,
284 * it still needs to be requeued. If it is suspended, there is
285 * nothing that we need to do as it will be handled by the
286 * wake_up() handler. */
287 if (on_rq) {
288 requeue(t, edf);
289 /* maybe we have to reschedule */
290 psnedf_preempt_check(pedf);
291 }
292 }
293 raw_spin_unlock_irqrestore(&pedf->slock, flags);
294}
295
296static void psnedf_task_wake_up(struct task_struct *task)
297{
298 unsigned long flags;
299 psnedf_domain_t* pedf = task_pedf(task);
300 rt_domain_t* edf = task_edf(task);
301 lt_t now;
302
303 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
304 raw_spin_lock_irqsave(&pedf->slock, flags);
305 BUG_ON(is_queued(task));
306 now = litmus_clock();
307 if (is_sporadic(task) && is_tardy(task, now)
308#ifdef CONFIG_LITMUS_LOCKING
309 /* We need to take suspensions because of semaphores into
310 * account! If a job resumes after being suspended due to acquiring
311 * a semaphore, it should never be treated as a new job release.
312 */
313 && !is_priority_boosted(task)
314#endif
315 ) {
316 inferred_sporadic_job_release_at(task, now);
317 }
318
319 /* Only add to ready queue if it is not the currently-scheduled
320 * task. This could be the case if a task was woken up concurrently
321 * on a remote CPU before the executing CPU got around to actually
322 * de-scheduling the task, i.e., wake_up() raced with schedule()
323 * and won.
324 */
325 if (pedf->scheduled != task) {
326 requeue(task, edf);
327 psnedf_preempt_check(pedf);
328 }
329
330 raw_spin_unlock_irqrestore(&pedf->slock, flags);
331 TRACE_TASK(task, "wake up done\n");
332}
333
334static void psnedf_task_block(struct task_struct *t)
335{
336 /* only running tasks can block, thus t is in no queue */
337 TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
338
339 BUG_ON(!is_realtime(t));
340 BUG_ON(is_queued(t));
341}
342
343static void psnedf_task_exit(struct task_struct * t)
344{
345 unsigned long flags;
346 psnedf_domain_t* pedf = task_pedf(t);
347 rt_domain_t* edf;
348
349 raw_spin_lock_irqsave(&pedf->slock, flags);
350 if (is_queued(t)) {
351 /* dequeue */
352 edf = task_edf(t);
353 remove(edf, t);
354 }
355 if (pedf->scheduled == t)
356 pedf->scheduled = NULL;
357
358 TRACE_TASK(t, "RIP, now reschedule\n");
359
360 preempt(pedf);
361 raw_spin_unlock_irqrestore(&pedf->slock, flags);
362}
363
364#ifdef CONFIG_LITMUS_LOCKING
365
366#include <litmus/fdso.h>
367#include <litmus/srp.h>
368
369/* ******************** SRP support ************************ */
370
371static unsigned int psnedf_get_srp_prio(struct task_struct* t)
372{
373 return get_rt_relative_deadline(t);
374}
375
376/* ******************** FMLP support ********************** */
377
378/* struct for semaphore with priority inheritance */
379struct fmlp_semaphore {
380 struct litmus_lock litmus_lock;
381
382 /* current resource holder */
383 struct task_struct *owner;
384
385 /* FIFO queue of waiting tasks */
386 wait_queue_head_t wait;
387};
388
389static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
390{
391 return container_of(lock, struct fmlp_semaphore, litmus_lock);
392}
393int psnedf_fmlp_lock(struct litmus_lock* l)
394{
395 struct task_struct* t = current;
396 struct fmlp_semaphore *sem = fmlp_from_lock(l);
397 wait_queue_t wait;
398 unsigned long flags;
399
400 if (!is_realtime(t))
401 return -EPERM;
402
403 /* prevent nested lock acquisition --- not supported by FMLP */
404 if (tsk_rt(t)->num_locks_held ||
405 tsk_rt(t)->num_local_locks_held)
406 return -EBUSY;
407
408 spin_lock_irqsave(&sem->wait.lock, flags);
409
410 if (sem->owner) {
411 /* resource is not free => must suspend and wait */
412
413 init_waitqueue_entry(&wait, t);
414
415 /* FIXME: interruptible would be nice some day */
416 set_task_state(t, TASK_UNINTERRUPTIBLE);
417
418 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
419
420 TS_LOCK_SUSPEND;
421
422 /* release lock before sleeping */
423 spin_unlock_irqrestore(&sem->wait.lock, flags);
424
425 /* We depend on the FIFO order. Thus, we don't need to recheck
426 * when we wake up; we are guaranteed to have the lock since
427 * there is only one wake up per release.
428 */
429
430 schedule();
431
432 TS_LOCK_RESUME;
433
434 /* Since we hold the lock, no other task will change
435 * ->owner. We can thus check it without acquiring the spin
436 * lock. */
437 BUG_ON(sem->owner != t);
438 } else {
439 /* it's ours now */
440 sem->owner = t;
441
442 /* mark the task as priority-boosted. */
443 boost_priority(t);
444
445 spin_unlock_irqrestore(&sem->wait.lock, flags);
446 }
447
448 tsk_rt(t)->num_locks_held++;
449
450 return 0;
451}
452
453int psnedf_fmlp_unlock(struct litmus_lock* l)
454{
455 struct task_struct *t = current, *next;
456 struct fmlp_semaphore *sem = fmlp_from_lock(l);
457 unsigned long flags;
458 int err = 0;
459
460 spin_lock_irqsave(&sem->wait.lock, flags);
461
462 if (sem->owner != t) {
463 err = -EINVAL;
464 goto out;
465 }
466
467 tsk_rt(t)->num_locks_held--;
468
469 /* we lose the benefit of priority boosting */
470
471 unboost_priority(t);
472
473 /* check if there are jobs waiting for this resource */
474 next = __waitqueue_remove_first(&sem->wait);
475 if (next) {
476 /* boost next job */
477 boost_priority(next);
478
479		/* next becomes the resource holder */
480 sem->owner = next;
481
482 /* wake up next */
483 wake_up_process(next);
484 } else
485 /* resource becomes available */
486 sem->owner = NULL;
487
488out:
489 spin_unlock_irqrestore(&sem->wait.lock, flags);
490 return err;
491}
492
493int psnedf_fmlp_close(struct litmus_lock* l)
494{
495 struct task_struct *t = current;
496 struct fmlp_semaphore *sem = fmlp_from_lock(l);
497 unsigned long flags;
498
499 int owner;
500
501 spin_lock_irqsave(&sem->wait.lock, flags);
502
503 owner = sem->owner == t;
504
505 spin_unlock_irqrestore(&sem->wait.lock, flags);
506
507 if (owner)
508 psnedf_fmlp_unlock(l);
509
510 return 0;
511}
512
513void psnedf_fmlp_free(struct litmus_lock* lock)
514{
515 kfree(fmlp_from_lock(lock));
516}
517
518static struct litmus_lock_ops psnedf_fmlp_lock_ops = {
519 .close = psnedf_fmlp_close,
520 .lock = psnedf_fmlp_lock,
521 .unlock = psnedf_fmlp_unlock,
522 .deallocate = psnedf_fmlp_free,
523};
524
525static struct litmus_lock* psnedf_new_fmlp(void)
526{
527 struct fmlp_semaphore* sem;
528
529 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
530 if (!sem)
531 return NULL;
532
533 sem->owner = NULL;
534 init_waitqueue_head(&sem->wait);
535 sem->litmus_lock.ops = &psnedf_fmlp_lock_ops;
536
537 return &sem->litmus_lock;
538}
539
540/* **** lock constructor **** */
541
542
543static long psnedf_allocate_lock(struct litmus_lock **lock, int type,
544 void* __user unused)
545{
546 int err = -ENXIO;
547 struct srp_semaphore* srp;
548
549 /* PSN-EDF currently supports the SRP for local resources and the FMLP
550 * for global resources. */
551 switch (type) {
552 case FMLP_SEM:
553 /* Flexible Multiprocessor Locking Protocol */
554 *lock = psnedf_new_fmlp();
555 if (*lock)
556 err = 0;
557 else
558 err = -ENOMEM;
559 break;
560
561 case SRP_SEM:
562 /* Baker's Stack Resource Policy */
563 srp = allocate_srp_semaphore();
564 if (srp) {
565 *lock = &srp->litmus_lock;
566 err = 0;
567 } else
568 err = -ENOMEM;
569 break;
570 };
571
572 return err;
573}
574
575#endif
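For reference, a task running under PSN-EDF reaches psnedf_allocate_lock() indirectly, through the FDSO layer. The following userspace fragment is a minimal sketch only, assuming the usual liblitmus wrappers (open_fmlp_sem(), litmus_lock(), litmus_unlock(), od_close()), which are not part of this patch:

/* Illustrative only: userspace use of an FMLP semaphore under PSN-EDF. */
#include <fcntl.h>
#include <unistd.h>
#include <litmus.h>   /* assumed liblitmus header providing the wrappers below */

static int use_fmlp_resource(const char *ns_file, int resource_id)
{
	int fd, od, ret;

	fd = open(ns_file, O_RDONLY | O_CREAT, 0666); /* shared object namespace */
	if (fd < 0)
		return fd;

	/* first open ends up in psnedf_allocate_lock(..., FMLP_SEM, ...) */
	od = open_fmlp_sem(fd, resource_id);
	if (od < 0)
		return od;

	ret = litmus_lock(od);    /* kernel: boost_priority() + FIFO wait queue */
	/* ... critical section ... */
	ret = litmus_unlock(od);  /* kernel: unboost_priority() + wake next waiter */

	od_close(od);
	close(fd);
	return ret;
}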
576
577static struct domain_proc_info psnedf_domain_proc_info;
578static long psnedf_get_domain_proc_info(struct domain_proc_info **ret)
579{
580 *ret = &psnedf_domain_proc_info;
581 return 0;
582}
583
584static void psnedf_setup_domain_proc(void)
585{
586 int i, cpu;
587 int release_master =
588#ifdef CONFIG_RELEASE_MASTER
589 atomic_read(&release_master_cpu);
590#else
591 NO_CPU;
592#endif
593 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
594 struct cd_mapping *cpu_map, *domain_map;
595
596 memset(&psnedf_domain_proc_info, 0, sizeof(psnedf_domain_proc_info));
597 init_domain_proc_info(&psnedf_domain_proc_info, num_rt_cpus, num_rt_cpus);
598 psnedf_domain_proc_info.num_cpus = num_rt_cpus;
599 psnedf_domain_proc_info.num_domains = num_rt_cpus;
600
601 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
602 if (cpu == release_master)
603 continue;
604 cpu_map = &psnedf_domain_proc_info.cpu_to_domains[i];
605 domain_map = &psnedf_domain_proc_info.domain_to_cpus[i];
606
607 cpu_map->id = cpu;
608 domain_map->id = i; /* enumerate w/o counting the release master */
609 cpumask_set_cpu(i, cpu_map->mask);
610 cpumask_set_cpu(cpu, domain_map->mask);
611 ++i;
612 }
613}
614
615static long psnedf_activate_plugin(void)
616{
617#ifdef CONFIG_RELEASE_MASTER
618 int cpu;
619
620 for_each_online_cpu(cpu) {
621 remote_edf(cpu)->release_master = atomic_read(&release_master_cpu);
622 }
623#endif
624
625#ifdef CONFIG_LITMUS_LOCKING
626 get_srp_prio = psnedf_get_srp_prio;
627#endif
628
629 psnedf_setup_domain_proc();
630
631 return 0;
632}
633
634static long psnedf_deactivate_plugin(void)
635{
636 destroy_domain_proc_info(&psnedf_domain_proc_info);
637 return 0;
638}
639
640static long psnedf_admit_task(struct task_struct* tsk)
641{
642 if (task_cpu(tsk) == tsk->rt_param.task_params.cpu
643#ifdef CONFIG_RELEASE_MASTER
644 /* don't allow tasks on release master CPU */
645 && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master
646#endif
647 )
648 return 0;
649 else
650 return -EINVAL;
651}
652
653/* Plugin object */
654static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
655 .plugin_name = "PSN-EDF",
656 .task_new = psnedf_task_new,
657 .complete_job = complete_job,
658 .task_exit = psnedf_task_exit,
659 .schedule = psnedf_schedule,
660 .task_wake_up = psnedf_task_wake_up,
661 .task_block = psnedf_task_block,
662 .admit_task = psnedf_admit_task,
663 .activate_plugin = psnedf_activate_plugin,
664 .deactivate_plugin = psnedf_deactivate_plugin,
665 .get_domain_proc_info = psnedf_get_domain_proc_info,
666#ifdef CONFIG_LITMUS_LOCKING
667 .allocate_lock = psnedf_allocate_lock,
668#endif
669};
670
671
672static int __init init_psn_edf(void)
673{
674 int i;
675
676	/* We do not really want to support CPU hotplug, do we? ;)
677	 * However, if we ever decide to support it,
678	 * we cannot rely on num_online_cpus() here.
679	 */
680 for (i = 0; i < num_online_cpus(); i++) {
681 psnedf_domain_init(remote_pedf(i),
682 psnedf_check_resched,
683 NULL, i);
684 }
685 return register_sched_plugin(&psn_edf_plugin);
686}
687
688module_init(init_psn_edf);
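Note that register_sched_plugin() only makes PSN-EDF selectable; psnedf_activate_plugin() runs when the plugin is switched in later. A minimal sketch, assuming the usual /proc/litmus/active_plugin control file (an assumption about the proc interface, not shown in this file):

/* Illustrative only: select PSN-EDF as the active plugin from userspace. */
#include <stdio.h>

static int activate_psn_edf(void)
{
	FILE *f = fopen("/proc/litmus/active_plugin", "w");
	if (!f)
		return -1;
	/* triggers psnedf_activate_plugin() once no real-time tasks are present */
	fprintf(f, "PSN-EDF");
	fclose(f);
	return 0;
}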
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
new file mode 100644
index 000000000000..a6088f16bb08
--- /dev/null
+++ b/litmus/sched_task_trace.c
@@ -0,0 +1,258 @@
1/*
2 * sched_task_trace.c -- record scheduling events to a byte stream
3 */
4
5#define NO_TASK_TRACE_DECLS
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/percpu.h>
10
11#include <litmus/ftdev.h>
12#include <litmus/litmus.h>
13
14#include <litmus/sched_trace.h>
15#include <litmus/feather_trace.h>
16#include <litmus/ftdev.h>
17
18#define NO_EVENTS (1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
19
20#define now() litmus_clock()
21
22struct local_buffer {
23 struct st_event_record record[NO_EVENTS];
24 char flag[NO_EVENTS];
25 struct ft_buffer ftbuf;
26};
27
28DEFINE_PER_CPU(struct local_buffer, st_event_buffer);
29
30static struct ftdev st_dev;
31
32static int st_dev_can_open(struct ftdev *dev, unsigned int cpu)
33{
34 return cpu_online(cpu) ? 0 : -ENODEV;
35}
36
37static int __init init_sched_task_trace(void)
38{
39 struct local_buffer* buf;
40 int i, ok = 0, err;
41 printk("Allocated %u sched_trace_xxx() events per CPU "
42 "(buffer size: %d bytes)\n",
43 NO_EVENTS, (int) sizeof(struct local_buffer));
44
45 err = ftdev_init(&st_dev, THIS_MODULE,
46 num_online_cpus(), "sched_trace");
47 if (err)
48 goto err_out;
49
50 for (i = 0; i < st_dev.minor_cnt; i++) {
51 buf = &per_cpu(st_event_buffer, i);
52 ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS,
53 sizeof(struct st_event_record),
54 buf->flag,
55 buf->record);
56 st_dev.minor[i].buf = &buf->ftbuf;
57 }
58 if (ok == st_dev.minor_cnt) {
59 st_dev.can_open = st_dev_can_open;
60 err = register_ftdev(&st_dev);
61 if (err)
62 goto err_dealloc;
63 } else {
64 err = -EINVAL;
65 goto err_dealloc;
66 }
67
68 return 0;
69
70err_dealloc:
71 ftdev_exit(&st_dev);
72err_out:
73 printk(KERN_WARNING "Could not register sched_trace module\n");
74 return err;
75}
76
77static void __exit exit_sched_task_trace(void)
78{
79 ftdev_exit(&st_dev);
80}
81
82module_init(init_sched_task_trace);
83module_exit(exit_sched_task_trace);
84
85
86static inline struct st_event_record* get_record(u8 type, struct task_struct* t)
87{
88 struct st_event_record* rec = NULL;
89 struct local_buffer* buf;
90
91 buf = &get_cpu_var(st_event_buffer);
92 if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) {
93 rec->hdr.type = type;
94 rec->hdr.cpu = smp_processor_id();
95 rec->hdr.pid = t ? t->pid : 0;
96 rec->hdr.job = t ? t->rt_param.job_params.job_no : 0;
97 } else {
98 put_cpu_var(st_event_buffer);
99 }
100 /* rec will be NULL if it failed */
101 return rec;
102}
103
104static inline void put_record(struct st_event_record* rec)
105{
106 struct local_buffer* buf;
107 /* don't use get_cpu_var() here, get_record() did that already for us */
108 buf = this_cpu_ptr(&st_event_buffer);
109 ft_buffer_finish_write(&buf->ftbuf, rec);
110 /* matches the get_cpu_var() in get_record() */
111 put_cpu_var(st_event_buffer);
112}
113
114feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task)
115{
116 struct task_struct *t = (struct task_struct*) _task;
117 struct st_event_record* rec = get_record(ST_NAME, t);
118 int i;
119 if (rec) {
120 for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++)
121 rec->data.name.cmd[i] = t->comm[i];
122 put_record(rec);
123 }
124}
125
126feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task)
127{
128 struct task_struct *t = (struct task_struct*) _task;
129 struct st_event_record* rec = get_record(ST_PARAM, t);
130 if (rec) {
131 rec->data.param.wcet = get_exec_cost(t);
132 rec->data.param.period = get_rt_period(t);
133 rec->data.param.phase = get_rt_phase(t);
134 rec->data.param.partition = get_partition(t);
135 rec->data.param.class = get_class(t);
136 put_record(rec);
137 }
138}
139
140feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task)
141{
142 struct task_struct *t = (struct task_struct*) _task;
143 struct st_event_record* rec = get_record(ST_RELEASE, t);
144 if (rec) {
145 rec->data.release.release = get_release(t);
146 rec->data.release.deadline = get_deadline(t);
147 put_record(rec);
148 }
149}
150
151/* skipped: st_assigned_data, we don't use it atm */
152
153feather_callback void do_sched_trace_task_switch_to(unsigned long id,
154 unsigned long _task)
155{
156 struct task_struct *t = (struct task_struct*) _task;
157 struct st_event_record* rec;
158 if (is_realtime(t)) {
159 rec = get_record(ST_SWITCH_TO, t);
160 if (rec) {
161 rec->data.switch_to.when = now();
162 rec->data.switch_to.exec_time = get_exec_time(t);
163 put_record(rec);
164 }
165 }
166}
167
168feather_callback void do_sched_trace_task_switch_away(unsigned long id,
169 unsigned long _task)
170{
171 struct task_struct *t = (struct task_struct*) _task;
172 struct st_event_record* rec;
173 if (is_realtime(t)) {
174 rec = get_record(ST_SWITCH_AWAY, t);
175 if (rec) {
176 rec->data.switch_away.when = now();
177 rec->data.switch_away.exec_time = get_exec_time(t);
178 put_record(rec);
179 }
180 }
181}
182
183feather_callback void do_sched_trace_task_completion(unsigned long id,
184 unsigned long _task,
185 unsigned long forced)
186{
187 struct task_struct *t = (struct task_struct*) _task;
188 struct st_event_record* rec = get_record(ST_COMPLETION, t);
189 if (rec) {
190 rec->data.completion.when = now();
191 rec->data.completion.forced = forced;
192 rec->data.completion.exec_time = get_exec_time(t);
193 put_record(rec);
194 }
195}
196
197feather_callback void do_sched_trace_last_suspension_as_completion(
198 unsigned long id,
199 unsigned long _task)
200{
201 struct task_struct *t = (struct task_struct*) _task;
202 struct st_event_record* rec = get_record(ST_COMPLETION, t);
203 if (rec) {
204 rec->data.completion.when
205 = tsk_rt(t)->job_params.last_suspension;
206 rec->data.completion.forced = 0;
207 rec->data.completion.exec_time = get_exec_time(t);
208 put_record(rec);
209 }
210}
211
212feather_callback void do_sched_trace_task_block(unsigned long id,
213 unsigned long _task)
214{
215 struct task_struct *t = (struct task_struct*) _task;
216 struct st_event_record* rec = get_record(ST_BLOCK, t);
217 if (rec) {
218 rec->data.block.when = now();
219 put_record(rec);
220 }
221}
222
223feather_callback void do_sched_trace_task_resume(unsigned long id,
224 unsigned long _task)
225{
226 struct task_struct *t = (struct task_struct*) _task;
227 struct st_event_record* rec = get_record(ST_RESUME, t);
228 if (rec) {
229 rec->data.resume.when = now();
230 put_record(rec);
231 }
232}
233
234feather_callback void do_sched_trace_sys_release(unsigned long id,
235 unsigned long _start)
236{
237 lt_t *start = (lt_t*) _start;
238 struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL);
239 if (rec) {
240 rec->data.sys_release.when = now();
241 rec->data.sys_release.release = *start;
242 put_record(rec);
243 }
244}
245
246feather_callback void do_sched_trace_action(unsigned long id,
247 unsigned long _task,
248 unsigned long action)
249{
250 struct task_struct *t = (struct task_struct*) _task;
251 struct st_event_record* rec = get_record(ST_ACTION, t);
252
253 if (rec) {
254 rec->data.action.when = now();
255 rec->data.action.action = action;
256 put_record(rec);
257 }
258}
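The records written above are consumed from userspace by reading raw st_event_record structs from the per-CPU ftdev minors. A sketch, assuming the device nodes appear as /dev/litmus/sched_traceN and that the kernel's record layout is visible to the reader (normally ft_tools/st_trace takes care of this):

/* Illustrative only: dump one CPU's sched_trace event stream. */
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <litmus/sched_trace.h>   /* struct st_event_record layout (assumed available) */

static void dump_cpu_events(int cpu)
{
	char path[64];
	struct st_event_record rec;
	int fd;

	snprintf(path, sizeof(path), "/dev/litmus/sched_trace%d", cpu);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return;

	while (read(fd, &rec, sizeof(rec)) == sizeof(rec))
		printf("cpu=%u pid=%u job=%u type=%u\n",
		       rec.hdr.cpu, rec.hdr.pid, rec.hdr.job, rec.hdr.type);

	close(fd);
}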
diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
new file mode 100644
index 000000000000..e8648f308ccd
--- /dev/null
+++ b/litmus/sched_trace.c
@@ -0,0 +1,251 @@
1/*
2 * sched_trace.c -- record scheduling events to a byte stream.
3 */
4#include <linux/spinlock.h>
5#include <linux/mutex.h>
6
7#include <linux/fs.h>
8#include <linux/slab.h>
9#include <linux/miscdevice.h>
10#include <asm/uaccess.h>
11#include <linux/module.h>
12#include <linux/sysrq.h>
13#include <linux/sched.h>
14#include <linux/kfifo.h>
15
16atomic_t __log_seq_no = ATOMIC_INIT(0);
17
18#define SCHED_TRACE_NAME "litmus/log"
19
20/* Compute size of TRACE() buffer */
21#define LITMUS_TRACE_BUF_SIZE (1 << CONFIG_SCHED_DEBUG_TRACE_SHIFT)
22
23/* Max length of one read from the buffer */
24#define MAX_READ_LEN (64 * 1024)
25
26/* Max length for one write --- by TRACE() --- to the buffer. This is used to
27 * allocate a per-cpu buffer for printf() formatting. */
28#define MSG_SIZE 255
29
30
31static DEFINE_MUTEX(reader_mutex);
32static atomic_t reader_cnt = ATOMIC_INIT(0);
33static DEFINE_KFIFO(debug_buffer, char, LITMUS_TRACE_BUF_SIZE);
34
35
36static DEFINE_RAW_SPINLOCK(log_buffer_lock);
37static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
38
39/*
40 * sched_trace_log_message - Write to the trace buffer (log_buffer)
41 *
42 * This is the only function accessing the log_buffer from inside the
43 * kernel for writing.
44 * Concurrent access to sched_trace_log_message must be serialized using
45 * log_buffer_lock.
46 * The maximum length of a formatted message is MSG_SIZE - 1 characters.
47 */
48void sched_trace_log_message(const char* fmt, ...)
49{
50 unsigned long flags;
51 va_list args;
52 size_t len;
53 char* buf;
54
55 if (!atomic_read(&reader_cnt))
56 /* early exit if nobody is listening */
57 return;
58
59 va_start(args, fmt);
60 local_irq_save(flags);
61
62 /* format message */
63 buf = this_cpu_ptr(fmt_buffer);
64 len = vscnprintf(buf, MSG_SIZE, fmt, args);
65
66 raw_spin_lock(&log_buffer_lock);
67 /* Don't copy the trailing null byte, we don't want null bytes in a
68 * text file.
69 */
70 kfifo_in(&debug_buffer, buf, len);
71 raw_spin_unlock(&log_buffer_lock);
72
73 local_irq_restore(flags);
74 va_end(args);
75}
76
77
78/*
79 * log_read - Read the trace buffer
80 *
81 * This function is called as a file operation from userspace.
82 * Readers can sleep. Access is serialized through reader_mutex
83 */
84static ssize_t log_read(struct file *filp,
85 char __user *to, size_t len,
86 loff_t *f_pos)
87{
88 /* we ignore f_pos, this is strictly sequential */
89
90 ssize_t error = -EINVAL;
91 char* mem;
92
93 if (mutex_lock_interruptible(&reader_mutex)) {
94 error = -ERESTARTSYS;
95 goto out;
96 }
97
98 if (len > MAX_READ_LEN)
99 len = MAX_READ_LEN;
100
101 mem = kmalloc(len, GFP_KERNEL);
102 if (!mem) {
103 error = -ENOMEM;
104 goto out_unlock;
105 }
106
107 error = kfifo_out(&debug_buffer, mem, len);
108 while (!error) {
109 set_current_state(TASK_INTERRUPTIBLE);
110 schedule_timeout(110);
111 if (signal_pending(current))
112 error = -ERESTARTSYS;
113 else
114 error = kfifo_out(&debug_buffer, mem, len);
115 }
116
117 if (error > 0 && copy_to_user(to, mem, error))
118 error = -EFAULT;
119
120 kfree(mem);
121 out_unlock:
122 mutex_unlock(&reader_mutex);
123 out:
124 return error;
125}
126
127/*
128 * Enable redirection of printk() messages to the trace buffer.
129 * Defined in kernel/printk.c
130 */
131extern int trace_override;
132extern int trace_recurse;
133
134/*
135 * log_open - open the global log message ring buffer.
136 */
137static int log_open(struct inode *in, struct file *filp)
138{
139 int error = -EINVAL;
140
141 if (mutex_lock_interruptible(&reader_mutex)) {
142 error = -ERESTARTSYS;
143 goto out;
144 }
145
146 atomic_inc(&reader_cnt);
147 error = 0;
148
149 printk(KERN_DEBUG
150 "sched_trace kfifo with buffer starting at: 0x%p\n",
151 debug_buffer.buf);
152
153 /* override printk() */
154 trace_override++;
155
156 mutex_unlock(&reader_mutex);
157 out:
158 return error;
159}
160
161static int log_release(struct inode *in, struct file *filp)
162{
163 int error = -EINVAL;
164
165 if (mutex_lock_interruptible(&reader_mutex)) {
166 error = -ERESTARTSYS;
167 goto out;
168 }
169
170 atomic_dec(&reader_cnt);
171
172 /* release printk() overriding */
173 trace_override--;
174
175 printk(KERN_DEBUG "sched_trace kfifo released\n");
176
177 mutex_unlock(&reader_mutex);
178 out:
179 return error;
180}
181
182/*
183 * log_fops - The file operations for accessing the global LITMUS log message
184 * buffer.
185 *
186 * Except for opening the device file it uses the same operations as trace_fops.
187 */
188static struct file_operations log_fops = {
189 .owner = THIS_MODULE,
190 .open = log_open,
191 .release = log_release,
192 .read = log_read,
193};
194
195static struct miscdevice litmus_log_dev = {
196 .name = SCHED_TRACE_NAME,
197 .minor = MISC_DYNAMIC_MINOR,
198 .fops = &log_fops,
199};
200
201#ifdef CONFIG_MAGIC_SYSRQ
202void dump_trace_buffer(int max)
203{
204 char line[80];
205 int len;
206 int count = 0;
207
208 /* potential, but very unlikely, race... */
209 trace_recurse = 1;
210 while ((max == 0 || count++ < max) &&
211	       (len = kfifo_out(&debug_buffer, line, sizeof(line) - 1)) > 0) {
212 line[len] = '\0';
213 printk("%s", line);
214 }
215 trace_recurse = 0;
216}
217
218static void sysrq_dump_trace_buffer(int key)
219{
220 dump_trace_buffer(100);
221}
222
223static struct sysrq_key_op sysrq_dump_trace_buffer_op = {
224 .handler = sysrq_dump_trace_buffer,
225 .help_msg = "dump-trace-buffer(Y)",
226 .action_msg = "writing content of TRACE() buffer",
227};
228#endif
229
230static int __init init_sched_trace(void)
231{
232 printk("Initializing TRACE() device\n");
233
234#ifdef CONFIG_MAGIC_SYSRQ
235 /* offer some debugging help */
236 if (!register_sysrq_key('y', &sysrq_dump_trace_buffer_op))
237 printk("Registered dump-trace-buffer(Y) magic sysrq.\n");
238 else
239 printk("Could not register dump-trace-buffer(Y) magic sysrq.\n");
240#endif
241
242 return misc_register(&litmus_log_dev);
243}
244
245static void __exit exit_sched_trace(void)
246{
247 misc_deregister(&litmus_log_dev);
248}
249
250module_init(init_sched_trace);
251module_exit(exit_sched_trace);
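The TRACE() log is consumed by simply reading the misc device registered above. A minimal reader sketch; the /dev/litmus/log path follows from SCHED_TRACE_NAME but the exact node name created by udev is an assumption:

/* Illustrative only: blocking reader for the TRACE() kfifo. */
#include <fcntl.h>
#include <unistd.h>

static void drain_trace_log(int out_fd)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/dev/litmus/log", O_RDONLY);  /* log_open(): bumps reader_cnt */

	if (fd < 0)
		return;
	/* log_read() polls in 110-jiffy intervals until data arrives */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		write(out_fd, buf, n);
	close(fd);  /* log_release(): drops reader_cnt, stops printk redirection */
}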
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 000000000000..7e3c057c0752
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,310 @@
1/* ************************************************************************** */
2/* STACK RESOURCE POLICY */
3/* ************************************************************************** */
4
5#include <linux/module.h>
6#include <asm/atomic.h>
7#include <linux/sched.h>
8#include <linux/wait.h>
9
10#include <litmus/litmus.h>
11#include <litmus/sched_plugin.h>
12#include <litmus/debug_trace.h>
13#include <litmus/fdso.h>
14#include <litmus/trace.h>
15
16
17#ifdef CONFIG_LITMUS_LOCKING
18
19#include <litmus/srp.h>
20
21srp_prioritization_t get_srp_prio;
22
23struct srp {
24 struct list_head ceiling;
25 wait_queue_head_t ceiling_blocked;
26};
27#define system_ceiling(srp) list2prio(srp->ceiling.next)
28#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
29
30#define UNDEF_SEM -2
31
32DEFINE_PER_CPU(struct srp, srp);
33
34DEFINE_PER_CPU(int, srp_objects_in_use);
35
36/* Initialize SRP semaphores at boot time. */
37static int __init srp_init(void)
38{
39 int i;
40
41 printk("Initializing SRP per-CPU ceilings...");
42 for (i = 0; i < NR_CPUS; i++) {
43 init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
44 INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
45 per_cpu(srp_objects_in_use, i) = 0;
46 }
47 printk(" done!\n");
48
49 return 0;
50}
51module_init(srp_init);
52
53/* SRP task priority comparison function. Smaller numeric values have higher
54 * priority, tie-break is PID. Special case: priority == 0 <=> no priority
55 */
56static int srp_higher_prio(struct srp_priority* first,
57 struct srp_priority* second)
58{
59 if (!first->priority)
60 return 0;
61 else
62 return !second->priority ||
63 first->priority < second->priority || (
64 first->priority == second->priority &&
65 first->pid < second->pid);
66}
67
68
69static int srp_exceeds_ceiling(struct task_struct* first,
70 struct srp* srp)
71{
72 struct srp_priority prio;
73
74 if (list_empty(&srp->ceiling))
75 return 1;
76 else {
77 prio.pid = first->pid;
78 prio.priority = get_srp_prio(first);
79 return srp_higher_prio(&prio, system_ceiling(srp)) ||
80 ceiling2sem(system_ceiling(srp))->owner == first;
81 }
82}
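To make the ceiling test concrete, here is a hypothetical kernel-context illustration; the values and the helper function are invented for this example. Under PSN-EDF, get_srp_prio() returns the relative deadline, so smaller values denote higher priority:

/* Hypothetical example, not part of the patch: uses the srp_priority fields above. */
static int __maybe_unused srp_ceiling_example(void)
{
	struct srp_priority ceiling  = { .priority = 10000000 /* 10 ms */, .pid = 100 };
	struct srp_priority tight_dl = { .priority =  5000000 /*  5 ms */, .pid = 200 };
	struct srp_priority loose_dl = { .priority = 25000000 /* 25 ms */, .pid = 300 };

	/* 1: a 5 ms relative deadline beats the 10 ms ceiling => task may proceed */
	int proceeds = srp_higher_prio(&tight_dl, &ceiling);
	/* 0: a 25 ms relative deadline does not => task is ceiling-blocked until
	 * the semaphore is released (unless it owns the semaphore at the top) */
	int blocked = !srp_higher_prio(&loose_dl, &ceiling);

	return proceeds && blocked;
}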
83
84static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
85{
86 struct list_head *pos;
87 if (in_list(&prio->list)) {
88 printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
89 "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
90 return;
91 }
92 list_for_each(pos, &srp->ceiling)
93 if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
94 __list_add(&prio->list, pos->prev, pos);
95 return;
96 }
97
98 list_add_tail(&prio->list, &srp->ceiling);
99}
100
101
102static int lock_srp_semaphore(struct litmus_lock* l)
103{
104 struct task_struct* t = current;
105 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
106
107 if (!is_realtime(t))
108 return -EPERM;
109
110 /* prevent acquisition of local locks in global critical sections */
111 if (tsk_rt(t)->num_locks_held)
112 return -EBUSY;
113
114 preempt_disable();
115
116 /* Update ceiling. */
117 srp_add_prio(this_cpu_ptr(&srp), &sem->ceiling);
118
119 /* SRP invariant: all resources available */
120 BUG_ON(sem->owner != NULL);
121
122 sem->owner = t;
123 TRACE_CUR("acquired srp 0x%p\n", sem);
124
125 tsk_rt(t)->num_local_locks_held++;
126
127 preempt_enable();
128
129 return 0;
130}
131
132static int unlock_srp_semaphore(struct litmus_lock* l)
133{
134 struct task_struct* t = current;
135 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
136 int err = 0;
137
138 preempt_disable();
139
140 if (sem->owner != t) {
141 err = -EINVAL;
142 } else {
143 /* The current owner should be executing on the correct CPU.
144 *
145 * If the owner transitioned out of RT mode or is exiting, then
146		 * it might have already been migrated away by the best-effort
147 * scheduler and we just have to deal with it. */
148 if (unlikely(!is_realtime(t) && sem->cpu != smp_processor_id())) {
149 TRACE_TASK(t, "SRP unlock cpu=%d, sem->cpu=%d\n",
150 smp_processor_id(), sem->cpu);
151 preempt_enable();
152 err = litmus_be_migrate_to(sem->cpu);
153 preempt_disable();
154 TRACE_TASK(t, "post-migrate: cpu=%d, sem->cpu=%d err=%d\n",
155 smp_processor_id(), sem->cpu, err);
156 }
157 BUG_ON(sem->cpu != smp_processor_id());
158 err = 0;
159
160 /* Determine new system priority ceiling for this CPU. */
161 BUG_ON(!in_list(&sem->ceiling.list));
162
163 list_del(&sem->ceiling.list);
164 sem->owner = NULL;
165
166 /* Wake tasks on this CPU, if they exceed current ceiling. */
167 TRACE_CUR("released srp 0x%p\n", sem);
168 wake_up_all(&this_cpu_ptr(&srp)->ceiling_blocked);
169
170 tsk_rt(t)->num_local_locks_held--;
171 }
172
173 preempt_enable();
174 return err;
175}
176
177static int open_srp_semaphore(struct litmus_lock* l, void* __user arg)
178{
179 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
180 int err = 0;
181 struct task_struct* t = current;
182 struct srp_priority t_prio;
183
184 if (!is_realtime(t))
185 return -EPERM;
186
187 TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
188
189 preempt_disable();
190
191 if (sem->owner != NULL)
192 err = -EBUSY;
193
194 if (err == 0) {
195 if (sem->cpu == UNDEF_SEM)
196 sem->cpu = get_partition(t);
197 else if (sem->cpu != get_partition(t))
198 err = -EPERM;
199 }
200
201 if (err == 0) {
202 t_prio.priority = get_srp_prio(t);
203 t_prio.pid = t->pid;
204 if (srp_higher_prio(&t_prio, &sem->ceiling)) {
205 sem->ceiling.priority = t_prio.priority;
206 sem->ceiling.pid = t_prio.pid;
207 }
208 }
209
210 preempt_enable();
211
212 return err;
213}
214
215static int close_srp_semaphore(struct litmus_lock* l)
216{
217 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
218 int err = 0;
219
220 preempt_disable();
221
222 if (sem->owner == current)
223 unlock_srp_semaphore(l);
224
225 preempt_enable();
226
227 return err;
228}
229
230static void deallocate_srp_semaphore(struct litmus_lock* l)
231{
232 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
233 raw_cpu_dec(srp_objects_in_use);
234 kfree(sem);
235}
236
237static struct litmus_lock_ops srp_lock_ops = {
238 .open = open_srp_semaphore,
239 .close = close_srp_semaphore,
240 .lock = lock_srp_semaphore,
241 .unlock = unlock_srp_semaphore,
242 .deallocate = deallocate_srp_semaphore,
243};
244
245struct srp_semaphore* allocate_srp_semaphore(void)
246{
247 struct srp_semaphore* sem;
248
249 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
250 if (!sem)
251 return NULL;
252
253 INIT_LIST_HEAD(&sem->ceiling.list);
254 sem->ceiling.priority = 0;
255 sem->cpu = UNDEF_SEM;
256 sem->owner = NULL;
257
258 sem->litmus_lock.ops = &srp_lock_ops;
259
260 raw_cpu_inc(srp_objects_in_use);
261 return sem;
262}
263
264static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
265 void *key)
266{
267 int cpu = smp_processor_id();
268 struct task_struct *tsk = wait->private;
269 if (cpu != get_partition(tsk))
270		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
271 get_partition(tsk));
272 else if (srp_exceeds_ceiling(tsk, this_cpu_ptr(&srp)))
273 return default_wake_function(wait, mode, sync, key);
274 return 0;
275}
276
277static void do_ceiling_block(struct task_struct *tsk)
278{
279 wait_queue_t wait = {
280 .private = tsk,
281 .func = srp_wake_up,
282 .task_list = {NULL, NULL}
283 };
284
285 tsk->state = TASK_UNINTERRUPTIBLE;
286 add_wait_queue(&this_cpu_ptr(&srp)->ceiling_blocked, &wait);
287 tsk->rt_param.srp_non_recurse = 1;
288 preempt_enable_no_resched();
289 schedule();
290 preempt_disable();
291 tsk->rt_param.srp_non_recurse = 0;
292 remove_wait_queue(&this_cpu_ptr(&srp)->ceiling_blocked, &wait);
293}
294
295/* Wait for current task priority to exceed system-wide priority ceiling.
296 */
297void __srp_ceiling_block(struct task_struct *cur)
298{
299 preempt_disable();
300 if (!srp_exceeds_ceiling(cur, this_cpu_ptr(&srp))) {
301 TRACE_CUR("is priority ceiling blocked.\n");
302 while (!srp_exceeds_ceiling(cur, this_cpu_ptr(&srp)))
303 do_ceiling_block(cur);
304 TRACE_CUR("finally exceeds system ceiling.\n");
305 } else
306 TRACE_CUR("is not priority ceiling blocked\n");
307 preempt_enable();
308}
309
310#endif
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 000000000000..123cefd68a36
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,153 @@
1/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
2 *
3 *
4 */
5
6#include <asm/atomic.h>
7#include <asm/uaccess.h>
8#include <linux/spinlock.h>
9#include <linux/list.h>
10#include <linux/sched.h>
11#include <linux/completion.h>
12
13#include <litmus/litmus.h>
14#include <litmus/sched_plugin.h>
15#include <litmus/jobs.h>
16
17#include <litmus/sched_trace.h>
18#include <litmus/debug_trace.h>
19
20struct ts_release_wait {
21 struct list_head list;
22 struct completion completion;
23 lt_t ts_release_time;
24};
25
26#define DECLARE_TS_RELEASE_WAIT(symb) \
27 struct ts_release_wait symb = \
28 { \
29 LIST_HEAD_INIT(symb.list), \
30 COMPLETION_INITIALIZER_ONSTACK(symb.completion), \
31 0 \
32 }
33
34static LIST_HEAD(task_release_list);
35static DEFINE_MUTEX(task_release_lock);
36
37static long do_wait_for_ts_release(void)
38{
39 DECLARE_TS_RELEASE_WAIT(wait);
40
41 long ret = -ERESTARTSYS;
42
43 if (mutex_lock_interruptible(&task_release_lock))
44 goto out;
45
46 list_add(&wait.list, &task_release_list);
47
48 mutex_unlock(&task_release_lock);
49
50 /* We are enqueued, now we wait for someone to wake us up. */
51 ret = wait_for_completion_interruptible(&wait.completion);
52
53 if (!ret) {
54 /* Completion succeeded, setup release time. */
55 ret = litmus->wait_for_release_at(
56 wait.ts_release_time + get_rt_phase(current));
57 } else {
58 /* We were interrupted, must cleanup list. */
59 mutex_lock(&task_release_lock);
60 if (!wait.completion.done)
61 list_del(&wait.list);
62 mutex_unlock(&task_release_lock);
63 }
64
65out:
66 return ret;
67}
68
69int count_tasks_waiting_for_release(void)
70{
71 int task_count = 0;
72 struct list_head *pos;
73
74 mutex_lock(&task_release_lock);
75
76 list_for_each(pos, &task_release_list) {
77 task_count++;
78 }
79
80 mutex_unlock(&task_release_lock);
81
82
83 return task_count;
84}
85
86static long do_release_ts(lt_t start)
87{
88 long task_count = 0;
89
90 struct list_head *pos, *safe;
91 struct ts_release_wait *wait;
92
93 if (mutex_lock_interruptible(&task_release_lock)) {
94 task_count = -ERESTARTSYS;
95 goto out;
96 }
97
98 TRACE("<<<<<< synchronous task system release >>>>>>\n");
99 sched_trace_sys_release(&start);
100 litmus->synchronous_release_at(start);
101
102 task_count = 0;
103 list_for_each_safe(pos, safe, &task_release_list) {
104 wait = (struct ts_release_wait*)
105 list_entry(pos, struct ts_release_wait, list);
106
107 task_count++;
108 wait->ts_release_time = start;
109 complete(&wait->completion);
110 }
111
112 /* clear stale list */
113 INIT_LIST_HEAD(&task_release_list);
114
115 mutex_unlock(&task_release_lock);
116
117out:
118 return task_count;
119}
120
121
122asmlinkage long sys_wait_for_ts_release(void)
123{
124 long ret = -EPERM;
125 struct task_struct *t = current;
126
127 if (is_realtime(t))
128 ret = do_wait_for_ts_release();
129
130 return ret;
131}
132
133#define ONE_MS 1000000ULL
134#define ONE_SECOND (ONE_MS * 1000)
135
136asmlinkage long sys_release_ts(lt_t __user *__when)
137{
138 long ret;
139 lt_t start_time;
140 lt_t now;
141
142 /* FIXME: check capabilities... */
143
144 ret = copy_from_user(&start_time, __when, sizeof(start_time));
145 if (ret == 0) {
146 now = litmus_clock();
147 if (lt_before(start_time, now))
148 start_time = now + ONE_SECOND;
149 ret = do_release_ts(start_time);
150 }
151
152 return ret;
153}
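From userspace, the synchronous-release protocol built on these two syscalls looks roughly as follows. This is a sketch assuming the liblitmus wrappers wait_for_ts_release() and release_ts() (or the release_ts command-line tool), which are not part of this patch:

/* Illustrative only: every RT task blocks, then one controller releases them all. */
#include <litmus.h>   /* assumed liblitmus header: lt_t, wait_for_ts_release(), release_ts() */

/* In each real-time task, after switching to real-time mode: */
static void wait_for_synchronous_start(void)
{
	/* blocks in sys_wait_for_ts_release() until do_release_ts() completes us;
	 * the first job is then released at start_time + the task's phase */
	wait_for_ts_release();
}

/* In a separate controller process: */
static void start_task_system(void)
{
	lt_t when = 0;   /* a time in the past => kernel picks "now + one second" */
	release_ts(&when);
}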
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 000000000000..eeb54a26104b
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,575 @@
1#include <linux/sched.h>
2#include <linux/module.h>
3#include <linux/uaccess.h>
4
5#include <litmus/ftdev.h>
6#include <litmus/litmus.h>
7#include <litmus/trace.h>
8
9/******************************************************************************/
10/* Allocation */
11/******************************************************************************/
12
13static struct ftdev cpu_overhead_dev;
14static struct ftdev msg_overhead_dev;
15
16#define cpu_trace_ts_buf(cpu) cpu_overhead_dev.minor[(cpu)].buf
17#define msg_trace_ts_buf(cpu) msg_overhead_dev.minor[(cpu)].buf
18
19DEFINE_PER_CPU(unsigned int, local_irq_count);
20DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, msg_irq_count);
21
22static DEFINE_PER_CPU(unsigned int, cpu_ts_seq_no);
23static DEFINE_PER_CPU(atomic_t, msg_ts_seq_no);
24
25static int64_t cycle_offset[NR_CPUS][NR_CPUS];
26
27void ft_irq_fired(void)
28{
29 /* Only called with preemptions disabled. */
30 /* local counter => not atomic, trace points disable interrupts */
31 this_cpu_inc(local_irq_count);
32 /* counter for messages => read remotely */
33 atomic_inc(this_cpu_ptr(&msg_irq_count));
34
35 if (has_control_page(current))
36 get_control_page(current)->irq_count++;
37}
38
39static inline unsigned int snapshot_local_irqs(void)
40{
41 return this_cpu_xchg(local_irq_count, 0);
42}
43
44static inline unsigned int snapshot_msg_irq_for(int cpu)
45{
46 return atomic_xchg(&per_cpu(msg_irq_count, cpu), 0);
47}
48
49static inline unsigned int snapshot_msg_irq_locally(void)
50{
51 return atomic_xchg(raw_cpu_ptr(&msg_irq_count), 0);
52}
53
54static inline void save_irq_flags(struct timestamp *ts, unsigned int irq_count)
55{
56 /* Store how many interrupts occurred. */
57 ts->irq_count = irq_count;
58 /* Extra flag because ts->irq_count overflows quickly. */
59 ts->irq_flag = irq_count > 0;
60}
61
62#define NO_IRQ_COUNT 0
63#define LOCAL_IRQ_COUNT 1
64#define REMOTE_IRQ_COUNT 2
65
66#define DO_NOT_RECORD_TIMESTAMP 0
67#define RECORD_LOCAL_TIMESTAMP 1
68#define RECORD_OFFSET_TIMESTAMP 2
69
70static inline void __write_record(
71 uint8_t event,
72 uint8_t type,
73 uint16_t pid_fragment,
74 unsigned int irq_count,
75 int record_irq,
76 int hide_irq,
77 uint64_t timestamp,
78 int record_timestamp,
79
80 int only_single_writer,
81 int is_cpu_timestamp,
82 int local_cpu,
83 uint8_t other_cpu)
84{
85 unsigned long flags;
86 unsigned int seq_no;
87 struct timestamp *ts;
88 int cpu;
89 struct ft_buffer* buf;
90
91 /* Avoid preemptions while recording the timestamp. This reduces the
92 * number of "out of order" timestamps in the stream and makes
93 * post-processing easier. */
94
95 local_irq_save(flags);
96
97 if (local_cpu)
98 cpu = smp_processor_id();
99 else
100 cpu = other_cpu;
101
102 /* resolved during function inlining */
103 if (is_cpu_timestamp) {
104 seq_no = __this_cpu_inc_return(cpu_ts_seq_no);
105 buf = cpu_trace_ts_buf(cpu);
106 } else {
107 seq_no = atomic_fetch_inc(&per_cpu(msg_ts_seq_no, cpu));
108 buf = msg_trace_ts_buf(cpu);
109 }
110
111 /* If buf is non-NULL here, then the buffer cannot be deallocated until
112 * we turn interrupts on again. This is because free_timestamp_buffer()
113 * indirectly causes TLB invalidations due to modifications of the
114 * kernel address space, namely via vfree() in free_ft_buffer(), which
115 * cannot be processed until we turn on interrupts again.
116 */
117
118 if (buf &&
119 (only_single_writer /* resolved during function inlining */
120 ? ft_buffer_start_single_write(buf, (void**) &ts)
121 : ft_buffer_start_write(buf, (void**) &ts))) {
122 ts->event = event;
123 ts->seq_no = seq_no;
124
125 ts->task_type = type;
126 ts->pid = pid_fragment;
127
128 ts->cpu = cpu;
129
130 switch (record_irq) {
131 case LOCAL_IRQ_COUNT:
132 if (is_cpu_timestamp)
133 irq_count = snapshot_local_irqs();
134 else
135 irq_count = snapshot_msg_irq_locally();
136 break;
137 case REMOTE_IRQ_COUNT:
138 irq_count = snapshot_msg_irq_for(other_cpu);
139 break;
140 case NO_IRQ_COUNT:
141 /* fall through */
142 default:
143 /* do nothing */
144 break;
145 }
146
147 save_irq_flags(ts, irq_count - hide_irq);
148
149 if (record_timestamp)
150 timestamp = ft_timestamp();
151 if (record_timestamp == RECORD_OFFSET_TIMESTAMP)
152 timestamp += cycle_offset[smp_processor_id()][cpu];
153
154 ts->timestamp = timestamp;
155 ft_buffer_finish_write(buf, ts);
156 }
157
158 local_irq_restore(flags);
159}
160
161
162static inline void write_cpu_timestamp(
163 uint8_t event,
164 uint8_t type,
165 uint16_t pid_fragment,
166 unsigned int irq_count,
167 int record_irq,
168 int hide_irq,
169 uint64_t timestamp,
170 int record_timestamp)
171{
172 __write_record(event, type,
173 pid_fragment,
174 irq_count, record_irq, hide_irq,
175 timestamp, record_timestamp,
176 1 /* only_single_writer */,
177 1 /* is_cpu_timestamp */,
178 1 /* local_cpu */,
179 0xff /* other_cpu */);
180}
181
182static inline void save_msg_timestamp(
183 uint8_t event,
184 int hide_irq)
185{
186 struct task_struct *t = current;
187 __write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
188 t->pid,
189 0, LOCAL_IRQ_COUNT, hide_irq,
190 0, RECORD_LOCAL_TIMESTAMP,
191 0 /* only_single_writer */,
192 0 /* is_cpu_timestamp */,
193 1 /* local_cpu */,
194 0xff /* other_cpu */);
195}
196
197static inline void save_remote_msg_timestamp(
198 uint8_t event,
199 uint8_t remote_cpu)
200{
201 struct task_struct *t = current;
202 __write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
203 t->pid,
204 0, REMOTE_IRQ_COUNT, 0,
205 0, RECORD_OFFSET_TIMESTAMP,
206 0 /* only_single_writer */,
207 0 /* is_cpu_timestamp */,
208 0 /* local_cpu */,
209 remote_cpu);
210}
211
212feather_callback void save_cpu_timestamp_def(unsigned long event,
213 unsigned long type)
214{
215 write_cpu_timestamp(event, type,
216 current->pid,
217 0, LOCAL_IRQ_COUNT, 0,
218 0, RECORD_LOCAL_TIMESTAMP);
219}
220
221feather_callback void save_cpu_timestamp_task(unsigned long event,
222 unsigned long t_ptr)
223{
224 struct task_struct *t = (struct task_struct *) t_ptr;
225 int rt = is_realtime(t);
226
227 write_cpu_timestamp(event, rt ? TSK_RT : TSK_BE,
228 t->pid,
229 0, LOCAL_IRQ_COUNT, 0,
230 0, RECORD_LOCAL_TIMESTAMP);
231}
232
233/* fake timestamp to user-reported time */
234feather_callback void save_cpu_timestamp_time(unsigned long event,
235 unsigned long ptr)
236{
237 uint64_t* time = (uint64_t*) ptr;
238
239 write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
240 current->pid,
241 0, LOCAL_IRQ_COUNT, 0,
242 *time, DO_NOT_RECORD_TIMESTAMP);
243}
244
245/* Record user-reported IRQ count */
246feather_callback void save_cpu_timestamp_irq(unsigned long event,
247 unsigned long irq_counter_ptr)
248{
249 uint64_t* irqs = (uint64_t*) irq_counter_ptr;
250
251 write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
252 current->pid,
253 *irqs, NO_IRQ_COUNT, 0,
254 0, RECORD_LOCAL_TIMESTAMP);
255}
256
257feather_callback void save_cpu_task_latency(unsigned long event,
258 unsigned long when_ptr)
259{
260 lt_t now = litmus_clock();
261 lt_t *when = (lt_t*) when_ptr;
262 lt_t delta = now - *when;
263
264 write_cpu_timestamp(event, TSK_RT,
265 0,
266 0, LOCAL_IRQ_COUNT, 0,
267 delta, DO_NOT_RECORD_TIMESTAMP);
268}
269
270/* Record to remote trace buffer */
271feather_callback void msg_sent_to(unsigned long event, unsigned long to)
272{
273 save_remote_msg_timestamp(event, to);
274}
275
276/* Record to local trace buffer */
277feather_callback void msg_sent_local(unsigned long event)
278{
279 save_msg_timestamp(event, 0);
280}
281
282/* Suppresses one IRQ from the irq count. Used by TS_SEND_RESCHED_END, which is
283 * called from within an interrupt that is expected. */
284feather_callback void msg_received_local(unsigned long event)
285{
286 save_msg_timestamp(event, 1);
287}
288
289/* Record to remote trace buffer */
290feather_callback void msg_received_from(unsigned long event, unsigned long from)
291{
292 save_remote_msg_timestamp(event, from);
293}
294
295static void __add_timestamp_user(struct timestamp *pre_recorded)
296{
297 unsigned long flags;
298 unsigned int seq_no;
299 struct timestamp *ts;
300 struct ft_buffer* buf;
301 int cpu;
302
303 local_irq_save(flags);
304
305 cpu = smp_processor_id();
306 buf = cpu_trace_ts_buf(cpu);
307
308 seq_no = __this_cpu_inc_return(cpu_ts_seq_no);
309 if (buf && ft_buffer_start_single_write(buf, (void**) &ts)) {
310 *ts = *pre_recorded;
311 ts->seq_no = seq_no;
312 ts->cpu = raw_smp_processor_id();
313 save_irq_flags(ts, snapshot_local_irqs());
314 ft_buffer_finish_write(buf, ts);
315 }
316
317 local_irq_restore(flags);
318}
319
320/******************************************************************************/
321/* DEVICE FILE DRIVER */
322/******************************************************************************/
323
324struct calibrate_info {
325 atomic_t ready;
326
327 uint64_t cycle_count;
328};
329
330static void calibrate_helper(void *_info)
331{
332 struct calibrate_info *info = _info;
333 /* check in with master */
334 atomic_inc(&info->ready);
335
336 /* wait for master to signal start */
337 while (atomic_read(&info->ready))
338 cpu_relax();
339
340 /* report time stamp */
341 info->cycle_count = ft_timestamp();
342
343 /* tell master that we are done */
344 atomic_inc(&info->ready);
345}
346
347
348static int64_t calibrate_cpu(int cpu)
349{
350 uint64_t cycles;
351 struct calibrate_info info;
352 unsigned long flags;
353 int64_t delta;
354
355 atomic_set(&info.ready, 0);
356 info.cycle_count = 0;
357 smp_wmb();
358
359 smp_call_function_single(cpu, calibrate_helper, &info, 0);
360
361 /* wait for helper to become active */
362 while (!atomic_read(&info.ready))
363 cpu_relax();
364
365 /* avoid interrupt interference */
366 local_irq_save(flags);
367
368 /* take measurement */
369 atomic_set(&info.ready, 0);
370 smp_wmb();
371 cycles = ft_timestamp();
372
373 /* wait for helper reading */
374 while (!atomic_read(&info.ready))
375 cpu_relax();
376
377 /* positive offset: the other guy is ahead of us */
378 delta = (int64_t) info.cycle_count;
379 delta -= (int64_t) cycles;
380
381 local_irq_restore(flags);
382
383 return delta;
384}
385
386#define NUM_SAMPLES 10
387
388static long calibrate_tsc_offsets(struct ftdev* ftdev, unsigned int idx,
389 unsigned long uarg)
390{
391 int cpu, self, i;
392 int64_t delta, sample;
393
394 preempt_disable();
395 self = smp_processor_id();
396
397 if (uarg)
398 printk(KERN_INFO "Feather-Trace: determining TSC offsets for P%d\n", self);
399
400 for_each_online_cpu(cpu)
401 if (cpu != self) {
402 delta = calibrate_cpu(cpu);
403 for (i = 1; i < NUM_SAMPLES; i++) {
404 sample = calibrate_cpu(cpu);
405 delta = sample < delta ? sample : delta;
406 }
407
408 cycle_offset[self][cpu] = delta;
409
410 if (uarg)
411 printk(KERN_INFO "Feather-Trace: TSC offset for P%d->P%d is %lld cycles.\n",
412 self, cpu, cycle_offset[self][cpu]);
413 }
414
415 preempt_enable();
416 return 0;
417}
418
419#define NO_TIMESTAMPS (2 << CONFIG_SCHED_OVERHEAD_TRACE_SHIFT)
420
421static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
422{
423 unsigned int count = NO_TIMESTAMPS;
424
425 /* An overhead-tracing timestamp should be exactly 16 bytes long. */
426 BUILD_BUG_ON(sizeof(struct timestamp) != 16);
427
428 while (count && !ftdev->minor[idx].buf) {
429 printk("time stamp buffer: trying to allocate %u time stamps for minor=%u.\n", count, idx);
430 ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
431 count /= 2;
432 }
433 return ftdev->minor[idx].buf ? 0 : -ENOMEM;
434}
435
436static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
437{
438 struct ft_buffer* tmp = ftdev->minor[idx].buf;
439 smp_rmb();
440 ftdev->minor[idx].buf = NULL;
441 /* Make sure all cores have actually seen buf == NULL before
442 * yanking out the mappings from underneath them. */
443 smp_wmb();
444 free_ft_buffer(tmp);
445}
446
447static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len,
448 const char __user *from)
449{
450 ssize_t consumed = 0;
451 struct timestamp ts;
452
453 /* don't give us partial timestamps */
454 if (len % sizeof(ts))
455 return -EINVAL;
456
457 while (len >= sizeof(ts)) {
458 if (copy_from_user(&ts, from, sizeof(ts))) {
459 consumed = -EFAULT;
460 goto out;
461 }
462 len -= sizeof(ts);
463 from += sizeof(ts);
464 consumed += sizeof(ts);
465
466 /* Note: this always adds to the buffer of the CPU-local
467 * device, not necessarily to the device that the system call
468 * was invoked on. This is admittedly a bit ugly, but requiring
469 * tasks to only write to the appropriate device would make
470 * tracing from userspace under global and clustered scheduling
471 * exceedingly difficult. Writing to remote buffers would
472 * require to not use ft_buffer_start_single_write(), which we
473 * want to do to reduce the number of atomic ops in the common
474 * case (which is the recording of CPU-local scheduling
475 * overheads).
476 */
477 __add_timestamp_user(&ts);
478 }
479
480out:
481 return consumed;
482}
483
484static int __init init_cpu_ft_overhead_trace(void)
485{
486 int err, cpu;
487
488 printk("Initializing Feather-Trace per-cpu overhead tracing device.\n");
489 err = ftdev_init(&cpu_overhead_dev, THIS_MODULE,
490 num_online_cpus(), "ft_cpu_trace");
491 if (err)
492 goto err_out;
493
494 cpu_overhead_dev.alloc = alloc_timestamp_buffer;
495 cpu_overhead_dev.free = free_timestamp_buffer;
496 cpu_overhead_dev.write = write_timestamp_from_user;
497
498 err = register_ftdev(&cpu_overhead_dev);
499 if (err)
500 goto err_dealloc;
501
502 for (cpu = 0; cpu < NR_CPUS; cpu++) {
503 per_cpu(cpu_ts_seq_no, cpu) = 0;
504 }
505
506 return 0;
507
508err_dealloc:
509 ftdev_exit(&cpu_overhead_dev);
510err_out:
511 printk(KERN_WARNING "Could not register per-cpu ft_trace device.\n");
512 return err;
513}
514
515static int __init init_msg_ft_overhead_trace(void)
516{
517 int err, cpu;
518
519 printk("Initializing Feather-Trace per-cpu message overhead tracing device.\n");
520 err = ftdev_init(&msg_overhead_dev, THIS_MODULE,
521 num_online_cpus(), "ft_msg_trace");
522 if (err)
523 goto err_out;
524
525 msg_overhead_dev.alloc = alloc_timestamp_buffer;
526 msg_overhead_dev.free = free_timestamp_buffer;
527 msg_overhead_dev.calibrate = calibrate_tsc_offsets;
528
529 err = register_ftdev(&msg_overhead_dev);
530 if (err)
531 goto err_dealloc;
532
533 for (cpu = 0; cpu < NR_CPUS; cpu++) {
534 atomic_set(&per_cpu(msg_ts_seq_no, cpu), 0);
535 }
536
537 return 0;
538
539err_dealloc:
540 ftdev_exit(&msg_overhead_dev);
541err_out:
542 printk(KERN_WARNING "Could not register message ft_trace device.\n");
543 return err;
544}
545
546
547static int __init init_ft_overhead_trace(void)
548{
549 int err, i, j;
550
551 for (i = 0; i < NR_CPUS; i++)
552 for (j = 0; j < NR_CPUS; j++)
553 cycle_offset[i][j] = 0;
554
555 err = init_cpu_ft_overhead_trace();
556 if (err)
557 return err;
558
559 err = init_msg_ft_overhead_trace();
560 if (err){
561 ftdev_exit(&cpu_overhead_dev);
562 return err;
563 }
564
565 return 0;
566}
567
568static void __exit exit_ft_overhead_trace(void)
569{
570 ftdev_exit(&cpu_overhead_dev);
571 ftdev_exit(&msg_overhead_dev);
572}
573
574module_init(init_ft_overhead_trace);
575module_exit(exit_ft_overhead_trace);
diff --git a/litmus/uncachedev.c b/litmus/uncachedev.c
new file mode 100644
index 000000000000..06a6a7c17983
--- /dev/null
+++ b/litmus/uncachedev.c
@@ -0,0 +1,102 @@
1#include <linux/sched.h>
2#include <linux/kernel.h>
3#include <linux/mm.h>
4#include <linux/fs.h>
5#include <linux/errno.h>
6#include <linux/highmem.h>
7#include <asm/page.h>
8#include <linux/miscdevice.h>
9#include <linux/module.h>
10
11#include <litmus/litmus.h>
12
13/* device for allocating pages not cached by the CPU */
14
15#define UNCACHE_NAME "litmus/uncache"
16
17void litmus_uncache_vm_open(struct vm_area_struct *vma)
18{
19}
20
21void litmus_uncache_vm_close(struct vm_area_struct *vma)
22{
23}
24
25int litmus_uncache_vm_fault(struct vm_area_struct* vma,
26 struct vm_fault* vmf)
27{
28 /* modeled after SG DMA video4linux, but without DMA. */
29 /* (see drivers/media/video/videobuf-dma-sg.c) */
30 struct page *page;
31
32 page = alloc_page(GFP_USER);
33 if (!page)
34 return VM_FAULT_OOM;
35
36 clear_user_highpage(page, (unsigned long)vmf->virtual_address);
37 vmf->page = page;
38
39 return 0;
40}
41
42static struct vm_operations_struct litmus_uncache_vm_ops = {
43 .open = litmus_uncache_vm_open,
44 .close = litmus_uncache_vm_close,
45 .fault = litmus_uncache_vm_fault,
46};
47
48static int litmus_uncache_mmap(struct file* filp, struct vm_area_struct* vma)
49{
50 /* first make sure mapper knows what he's doing */
51
52 /* you can only map the "first" page */
53 if (vma->vm_pgoff != 0)
54 return -EINVAL;
55
56 /* you can't share it with anyone */
57 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
58 return -EINVAL;
59
60 /* cannot be expanded, and is not a "normal" page. */
61 vma->vm_flags |= VM_DONTEXPAND;
62
63 /* noncached pages are not explicitly locked in memory (for now). */
64 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
65
66 vma->vm_ops = &litmus_uncache_vm_ops;
67
68 return 0;
69}
70
71static struct file_operations litmus_uncache_fops = {
72 .owner = THIS_MODULE,
73 .mmap = litmus_uncache_mmap,
74};
75
76static struct miscdevice litmus_uncache_dev = {
77 .name = UNCACHE_NAME,
78 .minor = MISC_DYNAMIC_MINOR,
79 .fops = &litmus_uncache_fops,
80	/* pages are not locked in memory, so there is no reason why
81	   anyone should not be allowed to allocate uncached pages */
82 .mode = (S_IRUGO | S_IWUGO),
83};
84
85static int __init init_litmus_uncache_dev(void)
86{
87 int err;
88
89 printk("Initializing LITMUS^RT uncache device.\n");
90 err = misc_register(&litmus_uncache_dev);
91 if (err)
92 printk("Could not allocate %s device (%d).\n", UNCACHE_NAME, err);
93 return err;
94}
95
96static void __exit exit_litmus_uncache_dev(void)
97{
98 misc_deregister(&litmus_uncache_dev);
99}
100
101module_init(init_litmus_uncache_dev);
102module_exit(exit_litmus_uncache_dev);
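A userspace sketch of how this device is meant to be used; the /dev/litmus/uncache path is assumed from UNCACHE_NAME, and the mapping constraints mirror litmus_uncache_mmap() above (offset 0, private mapping only):

/* Illustrative only: obtain one uncached page via mmap(). */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static void *map_uncached_page(void)
{
	void *mem;
	int fd = open("/dev/litmus/uncache", O_RDWR);

	if (fd < 0)
		return NULL;
	/* must be MAP_PRIVATE with offset 0; litmus_uncache_mmap() rejects anything else */
	mem = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ | PROT_WRITE,
		   MAP_PRIVATE, fd, 0);
	close(fd);  /* the mapping stays valid after closing the fd */
	return mem == MAP_FAILED ? NULL : mem;
}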