author    Bjoern Brandenburg <bbb@mpi-sws.org>  2013-06-25 01:27:07 -0400
committer Bjoern Brandenburg <bbb@mpi-sws.org>  2013-08-07 03:46:49 -0400
commit    543810eb67bea9c3046ecb58388493bca39fe796 (patch)
tree      cf65010367e53dfbd3e39a9eb6e89dacf92348f3
parent    1412c8b72e192a14b8dd620f58a75f55a5490783 (diff)
Add LITMUS^RT core implementation
This patch adds the core of LITMUS^RT:

  - library functionality (heaps, rt_domain, prioritization, etc.)
  - budget enforcement logic
  - job management
  - system call backends
  - virtual devices (control page, etc.)
  - scheduler plugin API (and dummy plugin)

This code compiles, but is not yet integrated with the rest of Linux.
-rw-r--r--  include/litmus/affinity.h      |  80
-rw-r--r--  include/litmus/bheap.h         |  77
-rw-r--r--  include/litmus/binheap.h       | 205
-rw-r--r--  include/litmus/budget.h        |  36
-rw-r--r--  include/litmus/clustered.h     |  44
-rw-r--r--  include/litmus/edf_common.h    |  25
-rw-r--r--  include/litmus/fdso.h          |  77
-rw-r--r--  include/litmus/fp_common.h     | 105
-rw-r--r--  include/litmus/fpmath.h        | 147
-rw-r--r--  include/litmus/jobs.h          |   8
-rw-r--r--  include/litmus/litmus.h        | 255
-rw-r--r--  include/litmus/litmus_proc.h   |  25
-rw-r--r--  include/litmus/locking.h       |  28
-rw-r--r--  include/litmus/preempt.h       | 164
-rw-r--r--  include/litmus/rt_domain.h     | 182
-rw-r--r--  include/litmus/rt_param.h      |  12
-rw-r--r--  include/litmus/sched_plugin.h  | 114
-rw-r--r--  include/litmus/srp.h           |  28
-rw-r--r--  include/litmus/unistd_32.h     |  21
-rw-r--r--  include/litmus/unistd_64.h     |  33
-rw-r--r--  include/litmus/wait.h          |  57
-rw-r--r--  kernel/sched/litmus.c          | 350
-rw-r--r--  litmus/Kconfig                 | 165
-rw-r--r--  litmus/Makefile                |  19
-rw-r--r--  litmus/affinity.c              |  41
-rw-r--r--  litmus/bheap.c                 | 316
-rw-r--r--  litmus/binheap.c               | 387
-rw-r--r--  litmus/budget.c                | 113
-rw-r--r--  litmus/clustered.c             | 111
-rw-r--r--  litmus/ctrldev.c               | 160
-rw-r--r--  litmus/edf_common.c            | 200
-rw-r--r--  litmus/fdso.c                  | 307
-rw-r--r--  litmus/fp_common.c             | 119
-rw-r--r--  litmus/jobs.c                  |  55
-rw-r--r--  litmus/litmus.c                | 576
-rw-r--r--  litmus/litmus_proc.c           | 407
-rw-r--r--  litmus/locking.c               | 188
-rw-r--r--  litmus/preempt.c               | 137
-rw-r--r--  litmus/rt_domain.c             | 348
-rw-r--r--  litmus/sched_plugin.c          | 224
-rw-r--r--  litmus/srp.c                   | 305
-rw-r--r--  litmus/sync.c                  | 166
42 files changed, 6411 insertions, 6 deletions
diff --git a/include/litmus/affinity.h b/include/litmus/affinity.h
new file mode 100644
index 000000000000..ca2e442eb547
--- /dev/null
+++ b/include/litmus/affinity.h
@@ -0,0 +1,80 @@
1#ifndef __LITMUS_AFFINITY_H
2#define __LITMUS_AFFINITY_H
3
4#include <linux/cpumask.h>
5
6/*
7 L1 (instr) = depth 0
8 L1 (data) = depth 1
9 L2 = depth 2
10 L3 = depth 3
11 */
12#define NUM_CACHE_LEVELS 4
13
14struct neighborhood
15{
16 unsigned int size[NUM_CACHE_LEVELS];
17 cpumask_var_t neighbors[NUM_CACHE_LEVELS];
18};
19
20/* topology info is stored redundantly in a big array for fast lookups */
21extern struct neighborhood neigh_info[NR_CPUS];
22
23void init_topology(void); /* called by Litmus module's _init_litmus() */
24
25/* Works like:
26void get_nearest_available_cpu(
27 cpu_entry_t **nearest,
28 cpu_entry_t *start,
29 cpu_entry_t *entries,
30 int release_master)
31
32Set release_master = NO_CPU for no Release Master.
33
34We use a macro here to exploit the fact that C-EDF and G-EDF
35have similar structures for their cpu_entry_t structs, even though
36they do not share a common base-struct. The macro allows us to
37avoid code duplication.
38
39TODO: Factor out the job-to-processor linking from C/G-EDF into
40a reusable "processor mapping". (See B.B.'s RTSS'09 paper &
41dissertation.)
42 */
43#define get_nearest_available_cpu(nearest, start, entries, release_master) \
44{ \
45 (nearest) = NULL; \
46 if (!(start)->linked) { \
47 (nearest) = (start); \
48 } else { \
49 int __level; \
50 int __cpu; \
51 int __release_master = ((release_master) == NO_CPU) ? -1 : (release_master); \
52 struct neighborhood *__neighbors = &neigh_info[(start)->cpu]; \
53 \
54 for (__level = 0; (__level < NUM_CACHE_LEVELS) && !(nearest); ++__level) { \
55 if (__neighbors->size[__level] > 1) { \
56 for_each_cpu(__cpu, __neighbors->neighbors[__level]) { \
57 if (__cpu != __release_master) { \
58 cpu_entry_t *__entry = &per_cpu((entries), __cpu); \
59 if (!__entry->linked) { \
60 (nearest) = __entry; \
61 break; \
62 } \
63 } \
64 } \
65 } else if (__neighbors->size[__level] == 0) { \
66 break; \
67 } \
68 } \
69 } \
70 \
71 if ((nearest)) { \
72 TRACE("P%d is closest available CPU to P%d\n", \
73 (nearest)->cpu, (start)->cpu); \
74 } else { \
75 TRACE("Could not find an available CPU close to P%d\n", \
76 (start)->cpu); \
77 } \
78}
79
80#endif
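
As a usage illustration only (not part of this patch), a global plugin with a per-CPU cpu_entry_t carrying the .cpu and .linked fields the macro expects might invoke it roughly as below; cpu_entry_t, demo_cpu_entries, and demo_find_target are hypothetical names, and TRACE() is assumed to come from litmus/debug_trace.h:

typedef struct {
        int cpu;
        struct task_struct *linked;
} cpu_entry_t;

static DEFINE_PER_CPU(cpu_entry_t, demo_cpu_entries);

static cpu_entry_t *demo_find_target(cpu_entry_t *preferred)
{
        cpu_entry_t *target;

        /* expands in place; leaves NULL in 'target' if no idle CPU shares
         * a cache level with 'preferred' */
        get_nearest_available_cpu(target, preferred, demo_cpu_entries, NO_CPU);
        return target;
}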
diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h
new file mode 100644
index 000000000000..cf4864a498d8
--- /dev/null
+++ b/include/litmus/bheap.h
@@ -0,0 +1,77 @@
1/* bheap.h -- Binomial Heaps
2 *
3 * (c) 2008, 2009 Bjoern Brandenburg
4 */
5
6#ifndef BHEAP_H
7#define BHEAP_H
8
9#define NOT_IN_HEAP UINT_MAX
10
11struct bheap_node {
12 struct bheap_node* parent;
13 struct bheap_node* next;
14 struct bheap_node* child;
15
16 unsigned int degree;
17 void* value;
18 struct bheap_node** ref;
19};
20
21struct bheap {
22 struct bheap_node* head;
23 /* We cache the minimum of the heap.
24 * This speeds up repeated peek operations.
25 */
26 struct bheap_node* min;
27};
28
29typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b);
30
31void bheap_init(struct bheap* heap);
32void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value);
33
34static inline int bheap_node_in_heap(struct bheap_node* h)
35{
36 return h->degree != NOT_IN_HEAP;
37}
38
39static inline int bheap_empty(struct bheap* heap)
40{
41 return heap->head == NULL && heap->min == NULL;
42}
43
44/* insert (and reinitialize) a node into the heap */
45void bheap_insert(bheap_prio_t higher_prio,
46 struct bheap* heap,
47 struct bheap_node* node);
48
49/* merge addition into target */
50void bheap_union(bheap_prio_t higher_prio,
51 struct bheap* target,
52 struct bheap* addition);
53
54struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
55 struct bheap* heap);
56
57struct bheap_node* bheap_take(bheap_prio_t higher_prio,
58 struct bheap* heap);
59
60void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap);
61int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node);
62
63void bheap_delete(bheap_prio_t higher_prio,
64 struct bheap* heap,
65 struct bheap_node* node);
66
67/* allocate from memcache */
68struct bheap_node* bheap_node_alloc(int gfp_flags);
69void bheap_node_free(struct bheap_node* hn);
70
71/* allocate a heap node for value and insert into the heap */
72int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
73 void* value, int gfp_flags);
74
75void* bheap_take_del(bheap_prio_t higher_prio,
76 struct bheap* heap);
77#endif
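
A minimal usage sketch (illustrative only, not part of the patch): the caller owns the heap nodes, binds each one to a value, and supplies its own comparator; struct demo_job and the demo_* helpers are hypothetical:

struct demo_job {
        unsigned long long deadline;
        struct bheap_node *hn;  /* owned node, e.g. from bheap_node_alloc() */
};

/* "higher priority" here means earlier deadline */
static int demo_order(struct bheap_node *a, struct bheap_node *b)
{
        struct demo_job *ja = a->value, *jb = b->value;
        return ja->deadline < jb->deadline;
}

static void demo_enqueue(struct bheap *h, struct demo_job *j)
{
        bheap_node_init(&j->hn, j);        /* (re)initialize and bind the value */
        bheap_insert(demo_order, h, j->hn);
}

static struct demo_job *demo_dequeue(struct bheap *h)
{
        struct bheap_node *hn = bheap_take(demo_order, h);
        return hn ? hn->value : NULL;      /* highest-priority value, or NULL */
}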
diff --git a/include/litmus/binheap.h b/include/litmus/binheap.h
new file mode 100644
index 000000000000..1cf364701da8
--- /dev/null
+++ b/include/litmus/binheap.h
@@ -0,0 +1,205 @@
1#ifndef LITMUS_BINARY_HEAP_H
2#define LITMUS_BINARY_HEAP_H
3
4#include <linux/kernel.h>
5
6/**
7 * Simple binary heap with add, arbitrary delete, delete_root, and top
8 * operations.
9 *
10 * Style meant to conform with list.h.
11 *
12 * Motivation: Linux's prio_heap.h is of fixed size. Litmus's binomial
13 * heap may be overkill (and perhaps not general enough) for some applications.
14 *
15 * Note: In order to make node swaps fast, a node inserted with a data pointer
16 * may not always hold said data pointer. This is similar to the binomial heap
17 * implementation. This does make node deletion tricky since we have to
18 * (1) locate the node that holds the data pointer to delete, and (2) the
19 * node that was originally inserted with said data pointer. These have to be
20 * coalesced into a single node before removal (see usage of
21 * __binheap_safe_swap()). We have to track node references to accomplish this.
22 */
23
24struct binheap_node {
25 void *data;
26 struct binheap_node *parent;
27 struct binheap_node *left;
28 struct binheap_node *right;
29
30 /* pointer to binheap_node that holds *data for which this binheap_node
31 * was originally inserted. (*data "owns" this node)
32 */
33 struct binheap_node *ref;
34 struct binheap_node **ref_ptr;
35};
36
37/**
38 * Signature of comparator function. Assumed 'less-than' (min-heap).
39 * Pass in 'greater-than' for max-heap.
40 *
41 * TODO: Consider macro-based implementation that allows comparator to be
42 * inlined (similar to Linux red/black tree) for greater efficiency.
43 */
44typedef int (*binheap_order_t)(struct binheap_node *a,
45 struct binheap_node *b);
46
47
48struct binheap {
49 struct binheap_node *root;
50
51 /* pointer to node to take next inserted child */
52 struct binheap_node *next;
53
54 /* pointer to last node in complete binary tree */
55 struct binheap_node *last;
56
57 /* comparator function pointer */
58 binheap_order_t compare;
59};
60
61
62/* Initialized heap nodes not in a heap have parent
63 * set to BINHEAP_POISON.
64 */
65#define BINHEAP_POISON ((void*)(0xdeadbeef))
66
67
68/**
69 * binheap_entry - get the struct for this heap node.
70 * Only valid when called upon heap nodes other than the root handle.
71 * @ptr: the heap node.
72 * @type: the type of struct pointed to by binheap_node::data.
73 * @member: unused.
74 */
75#define binheap_entry(ptr, type, member) \
76((type *)((ptr)->data))
77
78/**
79 * binheap_node_container - get the struct that contains this node.
80 * Only valid when called upon heap nodes other than the root handle.
81 * @ptr: the heap node.
82 * @type: the type of struct the node is embedded in.
83 * @member: the name of the binheap_struct within the (type) struct.
84 */
85#define binheap_node_container(ptr, type, member) \
86container_of((ptr), type, member)
87
88/**
89 * binheap_top_entry - get the struct for the node at the top of the heap.
90 * Only valid when called upon the heap handle node.
91 * @ptr: the special heap-handle node.
92 * @type: the type of the struct the head is embedded in.
93 * @member: the name of the binheap_struct within the (type) struct.
94 */
95#define binheap_top_entry(ptr, type, member) \
96binheap_entry((ptr)->root, type, member)
97
98/**
99 * binheap_delete_root - remove the root element from the heap.
100 * @handle: handle to the heap.
101 * @type: the type of the struct the head is embedded in.
102 * @member: the name of the binheap_struct within the (type) struct.
103 */
104#define binheap_delete_root(handle, type, member) \
105__binheap_delete_root((handle), &((type *)((handle)->root->data))->member)
106
107/**
108 * binheap_delete - remove an arbitrary element from the heap.
109 * @to_delete: pointer to node to be removed.
110 * @handle: handle to the heap.
111 */
112#define binheap_delete(to_delete, handle) \
113__binheap_delete((to_delete), (handle))
114
115/**
116 * binheap_add - insert an element to the heap
117 * new_node: node to add.
118 * @handle: handle to the heap.
119 * @type: the type of the struct the head is embedded in.
120 * @member: the name of the binheap_struct within the (type) struct.
121 */
122#define binheap_add(new_node, handle, type, member) \
123__binheap_add((new_node), (handle), container_of((new_node), type, member))
124
125/**
126 * binheap_decrease - re-eval the position of a node (based upon its
127 * original data pointer).
128 * @handle: handle to the heap.
129 * @orig_node: node that was associated with the data pointer
130 * (whose value has changed) when said pointer was
131 * added to the heap.
132 */
133#define binheap_decrease(orig_node, handle) \
134__binheap_decrease((orig_node), (handle))
135
136#define BINHEAP_NODE_INIT() { NULL, BINHEAP_POISON, NULL, NULL , NULL, NULL}
137
138#define BINHEAP_NODE(name) \
139 struct binheap_node name = BINHEAP_NODE_INIT()
140
141
142static inline void INIT_BINHEAP_NODE(struct binheap_node *n)
143{
144 n->data = NULL;
145 n->parent = BINHEAP_POISON;
146 n->left = NULL;
147 n->right = NULL;
148 n->ref = NULL;
149 n->ref_ptr = NULL;
150}
151
152static inline void INIT_BINHEAP_HANDLE(struct binheap *handle,
153 binheap_order_t compare)
154{
155 handle->root = NULL;
156 handle->next = NULL;
157 handle->last = NULL;
158 handle->compare = compare;
159}
160
161/* Returns true if binheap is empty. */
162static inline int binheap_empty(struct binheap *handle)
163{
164 return(handle->root == NULL);
165}
166
167/* Returns true if binheap node is in a heap. */
168static inline int binheap_is_in_heap(struct binheap_node *node)
169{
170 return (node->parent != BINHEAP_POISON);
171}
172
173/* Returns true if binheap node is in given heap. */
174int binheap_is_in_this_heap(struct binheap_node *node, struct binheap* heap);
175
176/* Add a node to a heap */
177void __binheap_add(struct binheap_node *new_node,
178 struct binheap *handle,
179 void *data);
180
181/**
182 * Removes the root node from the heap. The node is removed after coalescing
183 * the binheap_node with its original data pointer at the root of the tree.
184 *
185 * The 'last' node in the tree is then swapped up to the root and bubbled
186 * down.
187 */
188void __binheap_delete_root(struct binheap *handle,
189 struct binheap_node *container);
190
191/**
192 * Delete an arbitrary node. Bubble node to delete up to the root,
193 * and then delete the root.
194 */
195void __binheap_delete(struct binheap_node *node_to_delete,
196 struct binheap *handle);
197
198/**
199 * Bubble up a node whose pointer has decreased in value.
200 */
201void __binheap_decrease(struct binheap_node *orig_node,
202 struct binheap *handle);
203
204
205#endif
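
A usage sketch (illustrative only, not part of the patch): the node is embedded list.h-style and the handle is initialized with INIT_BINHEAP_HANDLE(h, demo_less); struct demo_item and the demo_* names are hypothetical:

struct demo_item {
        int prio;
        struct binheap_node node;       /* embedded heap node */
};

/* min-heap on prio */
static int demo_less(struct binheap_node *a, struct binheap_node *b)
{
        return binheap_entry(a, struct demo_item, node)->prio <
               binheap_entry(b, struct demo_item, node)->prio;
}

static void demo(struct binheap *h, struct demo_item *it)
{
        struct demo_item *top;

        INIT_BINHEAP_NODE(&it->node);
        binheap_add(&it->node, h, struct demo_item, node);

        top = binheap_top_entry(h, struct demo_item, node);  /* smallest prio */
        binheap_delete_root(h, struct demo_item, node);      /* removes 'top' */
        (void)top;
}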
diff --git a/include/litmus/budget.h b/include/litmus/budget.h
new file mode 100644
index 000000000000..a766781b87d4
--- /dev/null
+++ b/include/litmus/budget.h
@@ -0,0 +1,36 @@
1#ifndef _LITMUS_BUDGET_H_
2#define _LITMUS_BUDGET_H_
3
4/* Update the per-processor enforcement timer (arm/reprogram/cancel) for
5 * the next task. */
6void update_enforcement_timer(struct task_struct* t);
7
8inline static int budget_exhausted(struct task_struct* t)
9{
10 return get_exec_time(t) >= get_exec_cost(t);
11}
12
13inline static lt_t budget_remaining(struct task_struct* t)
14{
15 if (!budget_exhausted(t))
16 return get_exec_cost(t) - get_exec_time(t);
17 else
18 /* avoid overflow */
19 return 0;
20}
21
22#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
23
24#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
25 == PRECISE_ENFORCEMENT)
26
27static inline int requeue_preempted_job(struct task_struct* t)
28{
29 /* Add task to ready queue only if not subject to budget enforcement or
30 * if the job has budget remaining. t may be NULL.
31 */
32 return t && !is_completed(t) && !tsk_rt(t)->dont_requeue
33 && (!budget_exhausted(t) || !budget_enforced(t));
34}
35
36#endif
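
A sketch (illustrative only, not part of the patch) of how a plugin might combine these helpers when a job is preempted; add_ready() is the rt_domain helper, TRACE_TASK() comes from the debug tracing header, and demo_job_preempted is a hypothetical name:

static void demo_job_preempted(rt_domain_t *dom, struct task_struct *t)
{
        if (requeue_preempted_job(t))
                /* still eligible: not complete, not flagged dont_requeue,
                 * and either no enforcement or budget remaining */
                add_ready(dom, t);
        else if (t && budget_enforced(t) && budget_exhausted(t))
                TRACE_TASK(t, "not requeued: budget exhausted\n");
}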
diff --git a/include/litmus/clustered.h b/include/litmus/clustered.h
new file mode 100644
index 000000000000..0c18dcb15e6c
--- /dev/null
+++ b/include/litmus/clustered.h
@@ -0,0 +1,44 @@
1#ifndef CLUSTERED_H
2#define CLUSTERED_H
3
4/* Which cache level should be used to group CPUs into clusters?
5 * GLOBAL_CLUSTER means that all CPUs form a single cluster (just like under
6 * global scheduling).
7 */
8enum cache_level {
9 GLOBAL_CLUSTER = 0,
10 L1_CLUSTER = 1,
11 L2_CLUSTER = 2,
12 L3_CLUSTER = 3
13};
14
15int parse_cache_level(const char *str, enum cache_level *level);
16const char* cache_level_name(enum cache_level level);
17
18/* expose a cache level in a /proc dir */
19struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
20 enum cache_level* level);
21
22
23
24struct scheduling_cluster {
25 unsigned int id;
26 /* list of CPUs that are part of this cluster */
27 struct list_head cpus;
28};
29
30struct cluster_cpu {
31 unsigned int id; /* which CPU is this? */
32 struct list_head cluster_list; /* List of the CPUs in this cluster. */
33 struct scheduling_cluster* cluster; /* The cluster that this CPU belongs to. */
34};
35
36int get_cluster_size(enum cache_level level);
37
38int assign_cpus_to_clusters(enum cache_level level,
39 struct scheduling_cluster* clusters[],
40 unsigned int num_clusters,
41 struct cluster_cpu* cpus[],
42 unsigned int num_cpus);
43
44#endif
diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
new file mode 100644
index 000000000000..bbaf22ea7f12
--- /dev/null
+++ b/include/litmus/edf_common.h
@@ -0,0 +1,25 @@
1/*
2 * EDF common data structures and utility functions shared by all EDF
3 * based scheduler plugins
4 */
5
6/* CLEANUP: Add comments and make it less messy.
7 *
8 */
9
10#ifndef __UNC_EDF_COMMON_H__
11#define __UNC_EDF_COMMON_H__
12
13#include <litmus/rt_domain.h>
14
15void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
16 release_jobs_t release);
17
18int edf_higher_prio(struct task_struct* first,
19 struct task_struct* second);
20
21int edf_ready_order(struct bheap_node* a, struct bheap_node* b);
22
23int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
24
25#endif
diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
new file mode 100644
index 000000000000..f2115b83f1e4
--- /dev/null
+++ b/include/litmus/fdso.h
@@ -0,0 +1,77 @@
1/* fdso.h - file descriptor attached shared objects
2 *
3 * (c) 2007 B. Brandenburg, LITMUS^RT project
4 */
5
6#ifndef _LINUX_FDSO_H_
7#define _LINUX_FDSO_H_
8
9#include <linux/list.h>
10#include <asm/atomic.h>
11
12#include <linux/fs.h>
13#include <linux/slab.h>
14
15#define MAX_OBJECT_DESCRIPTORS 85
16
17typedef enum {
18 MIN_OBJ_TYPE = 0,
19
20 FMLP_SEM = 0,
21 SRP_SEM = 1,
22
23 MPCP_SEM = 2,
24 MPCP_VS_SEM = 3,
25 DPCP_SEM = 4,
26
27 PCP_SEM = 5,
28
29 MAX_OBJ_TYPE = 5
30} obj_type_t;
31
32struct inode_obj_id {
33 struct list_head list;
34 atomic_t count;
35 struct inode* inode;
36
37 obj_type_t type;
38 void* obj;
39 unsigned int id;
40};
41
42struct fdso_ops;
43
44struct od_table_entry {
45 unsigned int used;
46
47 struct inode_obj_id* obj;
48 const struct fdso_ops* class;
49};
50
51struct fdso_ops {
52 int (*create)(void** obj_ref, obj_type_t type, void* __user);
53 void (*destroy)(obj_type_t type, void*);
54 int (*open) (struct od_table_entry*, void* __user);
55 int (*close) (struct od_table_entry*);
56};
57
58/* translate a userspace supplied od into the raw table entry
59 * returns NULL if od is invalid
60 */
61struct od_table_entry* get_entry_for_od(int od);
62
63/* translate a userspace supplied od into the associated object
64 * returns NULL if od is invalid
65 */
66static inline void* od_lookup(int od, obj_type_t type)
67{
68 struct od_table_entry* e = get_entry_for_od(od);
69 return e && e->obj->type == type ? e->obj->obj : NULL;
70}
71
72#define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM))
73#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
74#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID))
75
76
77#endif
diff --git a/include/litmus/fp_common.h b/include/litmus/fp_common.h
new file mode 100644
index 000000000000..19356c0fa6c1
--- /dev/null
+++ b/include/litmus/fp_common.h
@@ -0,0 +1,105 @@
1/* Fixed-priority scheduler support.
2 */
3
4#ifndef __FP_COMMON_H__
5#define __FP_COMMON_H__
6
7#include <litmus/rt_domain.h>
8
9#include <asm/bitops.h>
10
11
12void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
13 release_jobs_t release);
14
15int fp_higher_prio(struct task_struct* first,
16 struct task_struct* second);
17
18int fp_ready_order(struct bheap_node* a, struct bheap_node* b);
19
20#define FP_PRIO_BIT_WORDS (LITMUS_MAX_PRIORITY / BITS_PER_LONG)
21
22#if (LITMUS_MAX_PRIORITY % BITS_PER_LONG)
23#error LITMUS_MAX_PRIORITY must be a multiple of BITS_PER_LONG
24#endif
25
26/* bitmask-indexed priority queue */
27struct fp_prio_queue {
28 unsigned long bitmask[FP_PRIO_BIT_WORDS];
29 struct bheap queue[LITMUS_MAX_PRIORITY];
30};
31
32void fp_prio_queue_init(struct fp_prio_queue* q);
33
34static inline void fpq_set(struct fp_prio_queue* q, unsigned int index)
35{
36 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
37 __set_bit(index % BITS_PER_LONG, word);
38}
39
40static inline void fpq_clear(struct fp_prio_queue* q, unsigned int index)
41{
42 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
43 __clear_bit(index % BITS_PER_LONG, word);
44}
45
46static inline unsigned int fpq_find(struct fp_prio_queue* q)
47{
48 int i;
49
50 /* loop optimizer should unroll this */
51 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
52 if (q->bitmask[i])
53 return __ffs(q->bitmask[i]) + i * BITS_PER_LONG;
54
55 return LITMUS_MAX_PRIORITY; /* nothing found */
56}
57
58static inline void fp_prio_add(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
59{
60 BUG_ON(index >= LITMUS_MAX_PRIORITY);
61 BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
62
63 fpq_set(q, index);
64 bheap_insert(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
65}
66
67static inline void fp_prio_remove(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
68{
69 BUG_ON(!is_queued(t));
70
71 bheap_delete(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
72 if (likely(bheap_empty(&q->queue[index])))
73 fpq_clear(q, index);
74}
75
76static inline struct task_struct* fp_prio_peek(struct fp_prio_queue* q)
77{
78 unsigned int idx = fpq_find(q);
79 struct bheap_node* hn;
80
81 if (idx < LITMUS_MAX_PRIORITY) {
82 hn = bheap_peek(fp_ready_order, &q->queue[idx]);
83 return bheap2task(hn);
84 } else
85 return NULL;
86}
87
88static inline struct task_struct* fp_prio_take(struct fp_prio_queue* q)
89{
90 unsigned int idx = fpq_find(q);
91 struct bheap_node* hn;
92
93 if (idx < LITMUS_MAX_PRIORITY) {
94 hn = bheap_take(fp_ready_order, &q->queue[idx]);
95 if (likely(bheap_empty(&q->queue[idx])))
96 fpq_clear(q, idx);
97 return bheap2task(hn);
98 } else
99 return NULL;
100}
101
102int fp_preemption_needed(struct fp_prio_queue* q, struct task_struct *t);
103
104
105#endif
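
A usage sketch (illustrative only, not part of the patch) for a partitioned fixed-priority plugin; demo_ready is assumed to have been set up once with fp_prio_queue_init(), and get_priority() is the task_params accessor from litmus.h:

static struct fp_prio_queue demo_ready;

static void demo_requeue(struct task_struct *t)
{
        fp_prio_add(&demo_ready, t, get_priority(t));
}

static struct task_struct *demo_pick_next(void)
{
        /* highest-priority ready task, or NULL if the bitmask is empty */
        return fp_prio_take(&demo_ready);
}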
diff --git a/include/litmus/fpmath.h b/include/litmus/fpmath.h
new file mode 100644
index 000000000000..642de98542c8
--- /dev/null
+++ b/include/litmus/fpmath.h
@@ -0,0 +1,147 @@
1#ifndef __FP_MATH_H__
2#define __FP_MATH_H__
3
4#include <linux/math64.h>
5
6#ifndef __KERNEL__
7#include <stdint.h>
8#define abs(x) (((x) < 0) ? -(x) : (x))
9#endif
10
11// Use 64-bit because we want to track things at the nanosecond scale.
12// This can lead to very large numbers.
13typedef int64_t fpbuf_t;
14typedef struct
15{
16 fpbuf_t val;
17} fp_t;
18
19#define FP_SHIFT 10
20#define ROUND_BIT (FP_SHIFT - 1)
21
22#define _fp(x) ((fp_t) {x})
23
24#ifdef __KERNEL__
25static const fp_t LITMUS_FP_ZERO = {.val = 0};
26static const fp_t LITMUS_FP_ONE = {.val = (1 << FP_SHIFT)};
27#endif
28
29static inline fp_t FP(fpbuf_t x)
30{
31 return _fp(((fpbuf_t) x) << FP_SHIFT);
32}
33
34/* divide two integers to obtain a fixed point value */
35static inline fp_t _frac(fpbuf_t a, fpbuf_t b)
36{
37 return _fp(div64_s64(FP(a).val, (b)));
38}
39
40static inline fpbuf_t _point(fp_t x)
41{
42 return (x.val % (1 << FP_SHIFT));
43
44}
45
46#define fp2str(x) x.val
47/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */
48#define _FP_ "%ld/1024"
49
50static inline fpbuf_t _floor(fp_t x)
51{
52 return x.val >> FP_SHIFT;
53}
54
55/* FIXME: negative rounding */
56static inline fpbuf_t _round(fp_t x)
57{
58 return _floor(x) + ((x.val >> ROUND_BIT) & 1);
59}
60
61/* multiply two fixed point values */
62static inline fp_t _mul(fp_t a, fp_t b)
63{
64 return _fp((a.val * b.val) >> FP_SHIFT);
65}
66
67static inline fp_t _div(fp_t a, fp_t b)
68{
69#if !defined(__KERNEL__) && !defined(unlikely)
70#define unlikely(x) (x)
71#define DO_UNDEF_UNLIKELY
72#endif
73 /* try not to overflow */
74 if (unlikely( a.val > (2l << ((sizeof(fpbuf_t)*8) - FP_SHIFT)) ))
75 return _fp((a.val / b.val) << FP_SHIFT);
76 else
77 return _fp((a.val << FP_SHIFT) / b.val);
78#ifdef DO_UNDEF_UNLIKELY
79#undef unlikely
80#undef DO_UNDEF_UNLIKELY
81#endif
82}
83
84static inline fp_t _add(fp_t a, fp_t b)
85{
86 return _fp(a.val + b.val);
87}
88
89static inline fp_t _sub(fp_t a, fp_t b)
90{
91 return _fp(a.val - b.val);
92}
93
94static inline fp_t _neg(fp_t x)
95{
96 return _fp(-x.val);
97}
98
99static inline fp_t _abs(fp_t x)
100{
101 return _fp(abs(x.val));
102}
103
104/* works the same as casting float/double to integer */
105static inline fpbuf_t _fp_to_integer(fp_t x)
106{
107 return _floor(_abs(x)) * ((x.val > 0) ? 1 : -1);
108}
109
110static inline fp_t _integer_to_fp(fpbuf_t x)
111{
112 return _frac(x,1);
113}
114
115static inline int _leq(fp_t a, fp_t b)
116{
117 return a.val <= b.val;
118}
119
120static inline int _geq(fp_t a, fp_t b)
121{
122 return a.val >= b.val;
123}
124
125static inline int _lt(fp_t a, fp_t b)
126{
127 return a.val < b.val;
128}
129
130static inline int _gt(fp_t a, fp_t b)
131{
132 return a.val > b.val;
133}
134
135static inline int _eq(fp_t a, fp_t b)
136{
137 return a.val == b.val;
138}
139
140static inline fp_t _max(fp_t a, fp_t b)
141{
142 if (a.val < b.val)
143 return b;
144 else
145 return a;
146}
147#endif
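
For reference (not part of the patch), the Q10 fixed-point helpers compose as follows; results are only approximate because values are truncated to 1/1024 steps:

static void demo_fp(void)
{
        fp_t util   = _frac(3, 10);             /* 0.3 -> 307/1024              */
        fp_t scaled = _mul(util, FP(200));      /* ~60 (307*200/1024 = 59.96)   */
        fpbuf_t n   = _fp_to_integer(scaled);   /* truncates toward zero: 59    */

        (void)n;
}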
diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
new file mode 100644
index 000000000000..e1507d4314b8
--- /dev/null
+++ b/include/litmus/jobs.h
@@ -0,0 +1,8 @@
1#ifndef __LITMUS_JOBS_H__
2#define __LITMUS_JOBS_H__
3
4void prepare_for_next_period(struct task_struct *t);
5void release_at(struct task_struct *t, lt_t start);
6long complete_job(void);
7
8#endif
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
index c87863c9b231..e35c38c4c0a2 100644
--- a/include/litmus/litmus.h
+++ b/include/litmus/litmus.h
@@ -6,7 +6,41 @@
6#ifndef _LINUX_LITMUS_H_ 6#ifndef _LINUX_LITMUS_H_
7#define _LINUX_LITMUS_H_ 7#define _LINUX_LITMUS_H_
8 8
9#include <litmus/debug_trace.h>
10
11#ifdef CONFIG_RELEASE_MASTER
12extern atomic_t release_master_cpu;
13#endif
14
15/* in_list - is a given list_head queued on some list?
16 */
17static inline int in_list(struct list_head* list)
18{
19 return !( /* case 1: deleted */
20 (list->next == LIST_POISON1 &&
21 list->prev == LIST_POISON2)
22 ||
23 /* case 2: initialized */
24 (list->next == list &&
25 list->prev == list)
26 );
27}
28
29struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
30
31#define NO_CPU 0xffffffff
32
33void litmus_fork(struct task_struct *tsk);
34void litmus_exec(void);
35/* clean up real-time state of a task */
36void exit_litmus(struct task_struct *dead_tsk);
37
38long litmus_admit_task(struct task_struct *tsk);
39void litmus_exit_task(struct task_struct *tsk);
40
9#define is_realtime(t) ((t)->policy == SCHED_LITMUS) 41#define is_realtime(t) ((t)->policy == SCHED_LITMUS)
42#define rt_transition_pending(t) \
43 ((t)->rt_param.transition_pending)
10 44
11#define tsk_rt(t) (&(t)->rt_param) 45#define tsk_rt(t) (&(t)->rt_param)
12 46
@@ -28,6 +62,7 @@
28#define get_partition(t) (tsk_rt(t)->task_params.cpu) 62#define get_partition(t) (tsk_rt(t)->task_params.cpu)
29#define get_priority(t) (tsk_rt(t)->task_params.priority) 63#define get_priority(t) (tsk_rt(t)->task_params.priority)
30#define get_class(t) (tsk_rt(t)->task_params.cls) 64#define get_class(t) (tsk_rt(t)->task_params.cls)
65#define get_release_policy(t) (tsk_rt(t)->task_params.release_policy)
31 66
32/* job_param macros */ 67/* job_param macros */
33#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time) 68#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time)
@@ -35,6 +70,15 @@
35#define get_release(t) (tsk_rt(t)->job_params.release) 70#define get_release(t) (tsk_rt(t)->job_params.release)
36#define get_lateness(t) (tsk_rt(t)->job_params.lateness) 71#define get_lateness(t) (tsk_rt(t)->job_params.lateness)
37 72
73/* release policy macros */
74#define is_periodic(t) (get_release_policy(t) == TASK_PERIODIC)
75#define is_sporadic(t) (get_release_policy(t) == TASK_SPORADIC)
76#ifdef CONFIG_ALLOW_EARLY_RELEASE
77#define is_early_releasing(t) (get_release_policy(t) == TASK_EARLY)
78#else
79#define is_early_releasing(t) (0)
80#endif
81
38#define is_hrt(t) \ 82#define is_hrt(t) \
39 (tsk_rt(t)->task_params.cls == RT_CLASS_HARD) 83 (tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
40#define is_srt(t) \ 84#define is_srt(t) \
@@ -48,6 +92,192 @@ static inline lt_t litmus_clock(void)
48 return ktime_to_ns(ktime_get()); 92 return ktime_to_ns(ktime_get());
49} 93}
50 94
95/* A macro to convert from nanoseconds to ktime_t. */
96#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t)
97
98#define get_domain(t) (tsk_rt(t)->domain)
99
100/* Honor the flag in the preempt_count variable that is set
101 * when scheduling is in progress.
102 */
103#define is_running(t) \
104 ((t)->state == TASK_RUNNING || \
105 task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
106
107#define is_blocked(t) \
108 (!is_running(t))
109#define is_released(t, now) \
110 (lt_before_eq(get_release(t), now))
111#define is_tardy(t, now) \
112 (lt_before_eq(tsk_rt(t)->job_params.deadline, now))
113
114/* real-time comparison macros */
115#define earlier_deadline(a, b) (lt_before(\
116 (a)->rt_param.job_params.deadline,\
117 (b)->rt_param.job_params.deadline))
118#define earlier_release(a, b) (lt_before(\
119 (a)->rt_param.job_params.release,\
120 (b)->rt_param.job_params.release))
121
122void preempt_if_preemptable(struct task_struct* t, int on_cpu);
123
124#ifdef CONFIG_LITMUS_LOCKING
125void srp_ceiling_block(void);
126#else
127#define srp_ceiling_block() /* nothing */
128#endif
129
130#define bheap2task(hn) ((struct task_struct*) hn->value)
131
132#ifdef CONFIG_NP_SECTION
133
134static inline int is_kernel_np(struct task_struct *t)
135{
136 return tsk_rt(t)->kernel_np;
137}
138
139static inline int is_user_np(struct task_struct *t)
140{
141 return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->sched.np.flag : 0;
142}
143
144static inline void request_exit_np(struct task_struct *t)
145{
146 if (is_user_np(t)) {
147 /* Set the flag that tells user space to call
148 * into the kernel at the end of a critical section. */
149 if (likely(tsk_rt(t)->ctrl_page)) {
150 TRACE_TASK(t, "setting delayed_preemption flag\n");
151 tsk_rt(t)->ctrl_page->sched.np.preempt = 1;
152 }
153 }
154}
155
156static inline void make_np(struct task_struct *t)
157{
158 tsk_rt(t)->kernel_np++;
159}
160
161/* Caller should check if preemption is necessary when
162 * the function return 0.
163 */
164static inline int take_np(struct task_struct *t)
165{
166 return --tsk_rt(t)->kernel_np;
167}
168
169/* returns 0 if remote CPU needs an IPI to preempt, 1 if no IPI is required */
170static inline int request_exit_np_atomic(struct task_struct *t)
171{
172 union np_flag old, new;
173
174 if (tsk_rt(t)->ctrl_page) {
175 old.raw = tsk_rt(t)->ctrl_page->sched.raw;
176 if (old.np.flag == 0) {
177 /* no longer non-preemptive */
178 return 0;
179 } else if (old.np.preempt) {
180 /* already set, nothing for us to do */
181 return 1;
182 } else {
183 /* non-preemptive and flag not set */
184 new.raw = old.raw;
185 new.np.preempt = 1;
186 /* if we get old back, then we atomically set the flag */
187 return cmpxchg(&tsk_rt(t)->ctrl_page->sched.raw, old.raw, new.raw) == old.raw;
188 /* If we raced with a concurrent change, then so be
189 * it. Deliver it by IPI. We don't want an unbounded
190 * retry loop here since tasks might exploit that to
191 * keep the kernel busy indefinitely. */
192 }
193 } else
194 return 0;
195}
196
197#else
198
199static inline int is_kernel_np(struct task_struct* t)
200{
201 return 0;
202}
203
204static inline int is_user_np(struct task_struct* t)
205{
206 return 0;
207}
208
209static inline void request_exit_np(struct task_struct *t)
210{
211 /* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */
212 BUG();
213}
214
215static inline int request_exit_np_atomic(struct task_struct *t)
216{
217 return 0;
218}
219
220#endif
221
222static inline void clear_exit_np(struct task_struct *t)
223{
224 if (likely(tsk_rt(t)->ctrl_page))
225 tsk_rt(t)->ctrl_page->sched.np.preempt = 0;
226}
227
228static inline int is_np(struct task_struct *t)
229{
230#ifdef CONFIG_SCHED_DEBUG_TRACE
231 int kernel, user;
232 kernel = is_kernel_np(t);
233 user = is_user_np(t);
234 if (kernel || user)
235 TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n",
236
237 kernel, user);
238 return kernel || user;
239#else
240 return unlikely(is_kernel_np(t) || is_user_np(t));
241#endif
242}
243
244static inline int is_present(struct task_struct* t)
245{
246 return t && tsk_rt(t)->present;
247}
248
249static inline int is_completed(struct task_struct* t)
250{
251 return t && tsk_rt(t)->completed;
252}
253
254
255/* make the unit explicit */
256typedef unsigned long quanta_t;
257
258enum round {
259 FLOOR,
260 CEIL
261};
262
263
264/* Tick period is used to convert ns-specified execution
265 * costs and periods into tick-based equivalents.
266 */
267extern ktime_t tick_period;
268
269static inline quanta_t time2quanta(lt_t time, enum round round)
270{
271 s64 quantum_length = ktime_to_ns(tick_period);
272
273 if (do_div(time, quantum_length) && round == CEIL)
274 time++;
275 return (quanta_t) time;
276}
277
278/* By how much is cpu staggered behind CPU 0? */
279u64 cpu_stagger_offset(int cpu);
280
51static inline struct control_page* get_control_page(struct task_struct *t) 281static inline struct control_page* get_control_page(struct task_struct *t)
52{ 282{
53 return tsk_rt(t)->ctrl_page; 283 return tsk_rt(t)->ctrl_page;
@@ -58,4 +288,29 @@ static inline int has_control_page(struct task_struct* t)
58 return tsk_rt(t)->ctrl_page != NULL; 288 return tsk_rt(t)->ctrl_page != NULL;
59} 289}
60 290
291
292#ifdef CONFIG_SCHED_OVERHEAD_TRACE
293
294#define TS_SYSCALL_IN_START \
295 if (has_control_page(current)) { \
296 __TS_SYSCALL_IN_START(&get_control_page(current)->ts_syscall_start); \
297 }
298
299#define TS_SYSCALL_IN_END \
300 if (has_control_page(current)) { \
301 uint64_t irqs; \
302 local_irq_disable(); \
303 irqs = get_control_page(current)->irq_count - \
304 get_control_page(current)->irq_syscall_start; \
305 __TS_SYSCALL_IN_END(&irqs); \
306 local_irq_enable(); \
307 }
308
309#else
310
311#define TS_SYSCALL_IN_START
312#define TS_SYSCALL_IN_END
313
314#endif
315
61#endif 316#endif
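
As a small illustration (not part of the patch), time2quanta() converts a nanosecond quantity into scheduling quanta; with CEIL and a 1 ms tick_period, a 2.5 ms cost would yield 3 quanta. demo_cost_in_quanta is a hypothetical helper:

static quanta_t demo_cost_in_quanta(lt_t exec_cost_ns)
{
        /* round up so the reserved quanta cover the full cost */
        return time2quanta(exec_cost_ns, CEIL);
}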
diff --git a/include/litmus/litmus_proc.h b/include/litmus/litmus_proc.h
new file mode 100644
index 000000000000..6800e725d48c
--- /dev/null
+++ b/include/litmus/litmus_proc.h
@@ -0,0 +1,25 @@
1#include <litmus/sched_plugin.h>
2#include <linux/proc_fs.h>
3
4int __init init_litmus_proc(void);
5void exit_litmus_proc(void);
6
7/*
8 * On success, returns 0 and sets the pointer to the location of the new
9 * proc dir entry, otherwise returns an error code and sets pde to NULL.
10 */
11long make_plugin_proc_dir(struct sched_plugin* plugin,
12 struct proc_dir_entry** pde);
13
14/*
15 * Plugins should deallocate all child proc directory entries before
16 * calling this, to avoid memory leaks.
17 */
18void remove_plugin_proc_dir(struct sched_plugin* plugin);
19
20
21/* Copy at most ksize-1 bytes from ubuf into kbuf, null-terminate kbuf, and
22 * remove a '\n' if present. Returns the number of bytes that were read or
23 * -EFAULT. */
24int copy_and_chomp(char *kbuf, unsigned long ksize,
25 __user const char* ubuf, unsigned long ulength);
diff --git a/include/litmus/locking.h b/include/litmus/locking.h
new file mode 100644
index 000000000000..4d7b870cb443
--- /dev/null
+++ b/include/litmus/locking.h
@@ -0,0 +1,28 @@
1#ifndef LITMUS_LOCKING_H
2#define LITMUS_LOCKING_H
3
4struct litmus_lock_ops;
5
6/* Generic base struct for LITMUS^RT userspace semaphores.
7 * This structure should be embedded in protocol-specific semaphores.
8 */
9struct litmus_lock {
10 struct litmus_lock_ops *ops;
11 int type;
12};
13
14struct litmus_lock_ops {
15 /* Current task tries to obtain / drop a reference to a lock.
16 * Optional methods, allowed by default. */
17 int (*open)(struct litmus_lock*, void* __user);
18 int (*close)(struct litmus_lock*);
19
20 /* Current tries to lock/unlock this lock (mandatory methods). */
21 int (*lock)(struct litmus_lock*);
22 int (*unlock)(struct litmus_lock*);
23
24 /* The lock is no longer being referenced (mandatory method). */
25 void (*deallocate)(struct litmus_lock*);
26};
27
28#endif
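
A protocol implementation embeds struct litmus_lock and supplies an ops table; a minimal sketch (illustrative only, the demo_* functions are hypothetical and shown as prototypes):

struct demo_semaphore {
        struct litmus_lock litmus_lock;  /* generic part */
        struct task_struct *owner;
};

static int  demo_lock(struct litmus_lock *l);
static int  demo_unlock(struct litmus_lock *l);
static void demo_free(struct litmus_lock *l);

static struct litmus_lock_ops demo_ops = {
        /* .open/.close omitted: optional, allowed by default */
        .lock       = demo_lock,
        .unlock     = demo_unlock,
        .deallocate = demo_free,
};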
diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h
new file mode 100644
index 000000000000..4fd108a45333
--- /dev/null
+++ b/include/litmus/preempt.h
@@ -0,0 +1,164 @@
1#ifndef LITMUS_PREEMPT_H
2#define LITMUS_PREEMPT_H
3
4#include <linux/types.h>
5#include <linux/cache.h>
6#include <linux/percpu.h>
7#include <asm/atomic.h>
8
9#include <litmus/debug_trace.h>
10
11DECLARE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
12
13#ifdef CONFIG_PREEMPT_STATE_TRACE
14const char* sched_state_name(int s);
15#define TRACE_STATE(fmt, args...) TRACE("SCHED_STATE " fmt, args)
16#else
17#define TRACE_STATE(fmt, args...) /* ignore */
18#endif
19
20#define VERIFY_SCHED_STATE(x) \
21 do { int __s = get_sched_state(); \
22 if ((__s & (x)) == 0) \
23 TRACE_STATE("INVALID s=0x%x (%s) not " \
24 "in 0x%x (%s) [%s]\n", \
25 __s, sched_state_name(__s), \
26 (x), #x, __FUNCTION__); \
27 } while (0);
28
29#define TRACE_SCHED_STATE_CHANGE(x, y, cpu) \
30 TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n", \
31 cpu, (x), sched_state_name(x), \
32 (y), sched_state_name(y))
33
34
35typedef enum scheduling_state {
36 TASK_SCHEDULED = (1 << 0), /* The currently scheduled task is the one that
37 * should be scheduled, and the processor does not
38 * plan to invoke schedule(). */
39 SHOULD_SCHEDULE = (1 << 1), /* A remote processor has determined that the
40 * processor should reschedule, but this has not
41 * been communicated yet (IPI still pending). */
42 WILL_SCHEDULE = (1 << 2), /* The processor has noticed that it has to
43 * reschedule and will do so shortly. */
44 TASK_PICKED = (1 << 3), /* The processor is currently executing schedule(),
45 * has selected a new task to schedule, but has not
46 * yet performed the actual context switch. */
47 PICKED_WRONG_TASK = (1 << 4), /* The processor has not yet performed the context
48 * switch, but a remote processor has already
49 * determined that a higher-priority task became
50 * eligible after the task was picked. */
51} sched_state_t;
52
53static inline sched_state_t get_sched_state_on(int cpu)
54{
55 return atomic_read(&per_cpu(resched_state, cpu));
56}
57
58static inline sched_state_t get_sched_state(void)
59{
60 return atomic_read(&__get_cpu_var(resched_state));
61}
62
63static inline int is_in_sched_state(int possible_states)
64{
65 return get_sched_state() & possible_states;
66}
67
68static inline int cpu_is_in_sched_state(int cpu, int possible_states)
69{
70 return get_sched_state_on(cpu) & possible_states;
71}
72
73static inline void set_sched_state(sched_state_t s)
74{
75 TRACE_SCHED_STATE_CHANGE(get_sched_state(), s, smp_processor_id());
76 atomic_set(&__get_cpu_var(resched_state), s);
77}
78
79static inline int sched_state_transition(sched_state_t from, sched_state_t to)
80{
81 sched_state_t old_state;
82
83 old_state = atomic_cmpxchg(&__get_cpu_var(resched_state), from, to);
84 if (old_state == from) {
85 TRACE_SCHED_STATE_CHANGE(from, to, smp_processor_id());
86 return 1;
87 } else
88 return 0;
89}
90
91static inline int sched_state_transition_on(int cpu,
92 sched_state_t from,
93 sched_state_t to)
94{
95 sched_state_t old_state;
96
97 old_state = atomic_cmpxchg(&per_cpu(resched_state, cpu), from, to);
98 if (old_state == from) {
99 TRACE_SCHED_STATE_CHANGE(from, to, cpu);
100 return 1;
101 } else
102 return 0;
103}
104
105/* Plugins must call this function after they have decided which job to
106 * schedule next. IMPORTANT: this function must be called while still holding
107 * the lock that is used to serialize scheduling decisions.
108 *
109 * (Ideally, we would like to use runqueue locks for this purpose, but that
110 * would lead to deadlocks with the migration code.)
111 */
112static inline void sched_state_task_picked(void)
113{
114 VERIFY_SCHED_STATE(WILL_SCHEDULE);
115
116 /* WILL_SCHEDULE has only a local transition => simple store is ok */
117 set_sched_state(TASK_PICKED);
118}
119
120static inline void sched_state_entered_schedule(void)
121{
122 /* Update state for the case that we entered schedule() not due to
123 * set_tsk_need_resched() */
124 set_sched_state(WILL_SCHEDULE);
125}
126
127/* Called by schedule() to check if the scheduling decision is still valid
128 * after a context switch. Returns 1 if the CPU needs to reschedule. */
129static inline int sched_state_validate_switch(void)
130{
131 int left_state_ok = 0;
132
133 VERIFY_SCHED_STATE(PICKED_WRONG_TASK | TASK_PICKED);
134
135 if (is_in_sched_state(TASK_PICKED)) {
136 /* Might be good; let's try to transition out of this
137 * state. This must be done atomically since remote processors
138 * may try to change the state, too. */
139 left_state_ok = sched_state_transition(TASK_PICKED, TASK_SCHEDULED);
140 }
141
142 if (!left_state_ok) {
143 /* We raced with a higher-priority task arrival => not
144 * valid. The CPU needs to reschedule. */
145 set_sched_state(WILL_SCHEDULE);
146 return 1;
147 } else
148 return 0;
149}
150
151/* State transition events. See litmus/preempt.c for details. */
152void sched_state_will_schedule(struct task_struct* tsk);
153void sched_state_ipi(void);
154/* Cause a CPU (remote or local) to reschedule. */
155void litmus_reschedule(int cpu);
156void litmus_reschedule_local(void);
157
158#ifdef CONFIG_DEBUG_KERNEL
159void sched_state_plugin_check(void);
160#else
161#define sched_state_plugin_check() /* no check */
162#endif
163
164#endif
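
Seen from a plugin's schedule() callback, the contract documented above looks roughly like this (sketch only; demo_lock and the job-picking step are hypothetical placeholders):

static DEFINE_RAW_SPINLOCK(demo_lock);

static struct task_struct *demo_schedule(struct task_struct *prev)
{
        struct task_struct *next;

        raw_spin_lock(&demo_lock);
        next = NULL;                    /* ... pick highest-priority ready job ... */
        sched_state_task_picked();      /* must happen under the scheduling lock */
        raw_spin_unlock(&demo_lock);

        return next;
}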
diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
new file mode 100644
index 000000000000..ac249292e866
--- /dev/null
+++ b/include/litmus/rt_domain.h
@@ -0,0 +1,182 @@
1/* CLEANUP: Add comments and make it less messy.
2 *
3 */
4
5#ifndef __UNC_RT_DOMAIN_H__
6#define __UNC_RT_DOMAIN_H__
7
8#include <litmus/bheap.h>
9
10#define RELEASE_QUEUE_SLOTS 127 /* prime */
11
12struct _rt_domain;
13
14typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
15typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks);
16
17struct release_queue {
18 /* each slot maintains a list of release heaps sorted
19 * by release time */
20 struct list_head slot[RELEASE_QUEUE_SLOTS];
21};
22
23typedef struct _rt_domain {
24 /* runnable rt tasks are in here */
25 raw_spinlock_t ready_lock;
26 struct bheap ready_queue;
27
28 /* real-time tasks waiting for release are in here */
29 raw_spinlock_t release_lock;
30 struct release_queue release_queue;
31
32#ifdef CONFIG_RELEASE_MASTER
33 int release_master;
34#endif
35
36 /* for moving tasks to the release queue */
37 raw_spinlock_t tobe_lock;
38 struct list_head tobe_released;
39
40 /* how do we check if we need to kick another CPU? */
41 check_resched_needed_t check_resched;
42
43 /* how do we release jobs? */
44 release_jobs_t release_jobs;
45
46 /* how are tasks ordered in the ready queue? */
47 bheap_prio_t order;
48} rt_domain_t;
49
50struct release_heap {
51 /* list_head for per-time-slot list */
52 struct list_head list;
53 lt_t release_time;
54 /* all tasks to be released at release_time */
55 struct bheap heap;
56 /* used to trigger the release */
57 struct hrtimer timer;
58
59#ifdef CONFIG_RELEASE_MASTER
60 /* used to delegate releases */
61 struct hrtimer_start_on_info info;
62#endif
63 /* required for the timer callback */
64 rt_domain_t* dom;
65};
66
67
68static inline struct task_struct* __next_ready(rt_domain_t* rt)
69{
70 struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue);
71 if (hn)
72 return bheap2task(hn);
73 else
74 return NULL;
75}
76
77void rt_domain_init(rt_domain_t *rt, bheap_prio_t order,
78 check_resched_needed_t check,
 79 release_jobs_t release);
80
81void __add_ready(rt_domain_t* rt, struct task_struct *new);
82void __merge_ready(rt_domain_t* rt, struct bheap *tasks);
83void __add_release(rt_domain_t* rt, struct task_struct *task);
84
85static inline struct task_struct* __take_ready(rt_domain_t* rt)
86{
87 struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue);
88 if (hn)
89 return bheap2task(hn);
90 else
91 return NULL;
92}
93
94static inline struct task_struct* __peek_ready(rt_domain_t* rt)
95{
96 struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue);
97 if (hn)
98 return bheap2task(hn);
99 else
100 return NULL;
101}
102
103static inline int is_queued(struct task_struct *t)
104{
105 BUG_ON(!tsk_rt(t)->heap_node);
106 return bheap_node_in_heap(tsk_rt(t)->heap_node);
107}
108
109static inline void remove(rt_domain_t* rt, struct task_struct *t)
110{
111 bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node);
112}
113
114static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
115{
116 unsigned long flags;
117 /* first we need the write lock for rt_ready_queue */
118 raw_spin_lock_irqsave(&rt->ready_lock, flags);
119 __add_ready(rt, new);
120 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
121}
122
123static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks)
124{
125 unsigned long flags;
126 raw_spin_lock_irqsave(&rt->ready_lock, flags);
127 __merge_ready(rt, tasks);
128 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
129}
130
131static inline struct task_struct* take_ready(rt_domain_t* rt)
132{
133 unsigned long flags;
134 struct task_struct* ret;
135 /* first we need the write lock for rt_ready_queue */
136 raw_spin_lock_irqsave(&rt->ready_lock, flags);
137 ret = __take_ready(rt);
138 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
139 return ret;
140}
141
142
143static inline void add_release(rt_domain_t* rt, struct task_struct *task)
144{
145 unsigned long flags;
146 raw_spin_lock_irqsave(&rt->tobe_lock, flags);
147 __add_release(rt, task);
148 raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
149}
150
151#ifdef CONFIG_RELEASE_MASTER
152void __add_release_on(rt_domain_t* rt, struct task_struct *task,
153 int target_cpu);
154
155static inline void add_release_on(rt_domain_t* rt,
156 struct task_struct *task,
157 int target_cpu)
158{
159 unsigned long flags;
160 raw_spin_lock_irqsave(&rt->tobe_lock, flags);
161 __add_release_on(rt, task, target_cpu);
162 raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
163}
164#endif
165
166static inline int __jobs_pending(rt_domain_t* rt)
167{
168 return !bheap_empty(&rt->ready_queue);
169}
170
171static inline int jobs_pending(rt_domain_t* rt)
172{
173 unsigned long flags;
174 int ret;
175 /* first we need the write lock for rt_ready_queue */
176 raw_spin_lock_irqsave(&rt->ready_lock, flags);
177 ret = !bheap_empty(&rt->ready_queue);
178 raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
179 return ret;
180}
181
182#endif
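
A typical setup/usage sketch (illustrative only, not part of the patch); the demo_* callbacks are hypothetical placeholders for a plugin's ordering, resched-check, and release logic, and the release-time helpers come from litmus.h:

static int  demo_ready_order(struct bheap_node *a, struct bheap_node *b);
static int  demo_check_resched(rt_domain_t *rt);
static void demo_release_jobs(rt_domain_t *rt, struct bheap *tasks);

static rt_domain_t demo_domain;

static void demo_init(void)
{
        rt_domain_init(&demo_domain, demo_ready_order,
                       demo_check_resched, demo_release_jobs);
}

static void demo_requeue(struct task_struct *t)
{
        if (is_early_releasing(t) || is_released(t, litmus_clock()))
                add_ready(&demo_domain, t);     /* eligible now */
        else
                add_release(&demo_domain, t);   /* wait for its release time */
}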
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index a1fed7653377..138799fbaad7 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -84,12 +84,12 @@ struct rt_task {
84}; 84};
85 85
86union np_flag { 86union np_flag {
87 uint64_t raw; 87 uint32_t raw;
88 struct { 88 struct {
89 /* Is the task currently in a non-preemptive section? */ 89 /* Is the task currently in a non-preemptive section? */
90 uint64_t flag:31; 90 uint32_t flag:31;
91 /* Should the task call into the scheduler? */ 91 /* Should the task call into the scheduler? */
92 uint64_t preempt:1; 92 uint32_t preempt:1;
93 } np; 93 } np;
94}; 94};
95 95
@@ -110,10 +110,10 @@ union np_flag {
110struct control_page { 110struct control_page {
111 /* This flag is used by userspace to communicate non-preemptive 111 /* This flag is used by userspace to communicate non-preemptive
112 * sections. */ 112 * sections. */
113 volatile union np_flag sched; 113 volatile __attribute__ ((aligned (8))) union np_flag sched;
114 114
115 volatile uint64_t irq_count; /* Incremented by the kernel each time an IRQ is 115 /* Incremented by the kernel each time an IRQ is handled. */
116 * handled. */ 116 volatile __attribute__ ((aligned (8))) uint64_t irq_count;
117 117
118 /* Locking overhead tracing: userspace records here the time stamp 118 /* Locking overhead tracing: userspace records here the time stamp
119 * and IRQ counter prior to starting the system call. */ 119 * and IRQ counter prior to starting the system call. */
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
new file mode 100644
index 000000000000..0f2fe90123db
--- /dev/null
+++ b/include/litmus/sched_plugin.h
@@ -0,0 +1,114 @@
1/*
2 * Definition of the scheduler plugin interface.
3 *
4 */
5#ifndef _LINUX_SCHED_PLUGIN_H_
6#define _LINUX_SCHED_PLUGIN_H_
7
8#include <linux/sched.h>
9
10#ifdef CONFIG_LITMUS_LOCKING
11#include <litmus/locking.h>
12#endif
13
14/************************ setup/tear down ********************/
15
16typedef long (*activate_plugin_t) (void);
17typedef long (*deactivate_plugin_t) (void);
18
19
20
21/********************* scheduler invocation ******************/
22
23/* Plugin-specific realtime tick handler */
24typedef void (*scheduler_tick_t) (struct task_struct *cur);
25/* The plugin's main scheduling-decision function */
26typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
27/* Clean up after the task switch has occurred.
28 * This function is called after every (even non-rt) task switch.
29 */
30typedef void (*finish_switch_t)(struct task_struct *prev);
31
32
33/********************* task state changes ********************/
34
35/* Called to setup a new real-time task.
36 * Release the first job, enqueue, etc.
37 * Task may already be running.
38 */
39typedef void (*task_new_t) (struct task_struct *task,
40 int on_rq,
41 int running);
42
43/* Called to re-introduce a task after blocking.
44 * Can potentially be called multiple times.
45 */
46typedef void (*task_wake_up_t) (struct task_struct *task);
47/* Called to notify the plugin of a blocking real-time task. It will
48 * only be called for real-time tasks and before schedule() is called. */
49typedef void (*task_block_t) (struct task_struct *task);
50/* Called when a real-time task exits or changes to a different scheduling
51 * class.
52 * Free any allocated resources
53 */
54typedef void (*task_exit_t) (struct task_struct *);
55
56#ifdef CONFIG_LITMUS_LOCKING
57/* Called when the current task attempts to create a new lock of a given
58 * protocol type. */
59typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type,
60 void* __user config);
61#endif
62
63
64/********************* sys call backends ********************/
65/* This function causes the caller to sleep until the next release */
66typedef long (*complete_job_t) (void);
67
68typedef long (*admit_task_t)(struct task_struct* tsk);
69
70typedef void (*release_at_t)(struct task_struct *t, lt_t start);
71
72struct sched_plugin {
73 struct list_head list;
74 /* basic info */
75 char *plugin_name;
76
77 /* setup */
78 activate_plugin_t activate_plugin;
79 deactivate_plugin_t deactivate_plugin;
80
81 /* scheduler invocation */
82 scheduler_tick_t tick;
83 schedule_t schedule;
84 finish_switch_t finish_switch;
85
86 /* syscall backend */
87 complete_job_t complete_job;
88 release_at_t release_at;
89
90 /* task state changes */
91 admit_task_t admit_task;
92
93 task_new_t task_new;
94 task_wake_up_t task_wake_up;
95 task_block_t task_block;
96 task_exit_t task_exit;
97
98#ifdef CONFIG_LITMUS_LOCKING
99 /* locking protocols */
100 allocate_lock_t allocate_lock;
101#endif
102} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
103
104
105extern struct sched_plugin *litmus;
106
107int register_sched_plugin(struct sched_plugin* plugin);
108struct sched_plugin* find_sched_plugin(const char* name);
109void print_sched_plugins(struct seq_file *m);
110
111
112extern struct sched_plugin linux_sched_plugin;
113
114#endif
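
A skeleton of a plugin declaration and registration (sketch only, not part of the patch); the demo_* callbacks are hypothetical prototypes, complete_job() is the generic backend from litmus/jobs.h, and registration via module_init() mirrors how in-tree plugins typically hook in:

static struct task_struct *demo_schedule(struct task_struct *prev);
static void demo_task_new(struct task_struct *t, int on_rq, int running);
static void demo_task_wake_up(struct task_struct *t);
static void demo_task_block(struct task_struct *t);
static void demo_task_exit(struct task_struct *t);
static long demo_admit_task(struct task_struct *t);

static struct sched_plugin demo_plugin = {
        .plugin_name  = "DEMO",
        .schedule     = demo_schedule,
        .task_new     = demo_task_new,
        .task_wake_up = demo_task_wake_up,
        .task_block   = demo_task_block,
        .task_exit    = demo_task_exit,
        .admit_task   = demo_admit_task,
        .complete_job = complete_job,
};

static int __init init_demo(void)
{
        return register_sched_plugin(&demo_plugin);
}
module_init(init_demo);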
diff --git a/include/litmus/srp.h b/include/litmus/srp.h
new file mode 100644
index 000000000000..c9a4552b2bf3
--- /dev/null
+++ b/include/litmus/srp.h
@@ -0,0 +1,28 @@
1#ifndef LITMUS_SRP_H
2#define LITMUS_SRP_H
3
4struct srp_semaphore;
5
6struct srp_priority {
7 struct list_head list;
8 unsigned int priority;
9 pid_t pid;
10};
11#define list2prio(l) list_entry(l, struct srp_priority, list)
12
13/* struct for uniprocessor SRP "semaphore" */
14struct srp_semaphore {
15 struct litmus_lock litmus_lock;
16 struct srp_priority ceiling;
17 struct task_struct* owner;
18 int cpu; /* cpu associated with this "semaphore" and resource */
19};
20
21/* map a task to its SRP preemption level priority */
22typedef unsigned int (*srp_prioritization_t)(struct task_struct* t);
23/* Must be updated by each plugin that uses SRP.*/
24extern srp_prioritization_t get_srp_prio;
25
26struct srp_semaphore* allocate_srp_semaphore(void);
27
28#endif
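
Sketch only (not part of the patch): a plugin that supports SRP points get_srp_prio at its own preemption-level mapping, typically from its activate_plugin() callback; the demo_* names are hypothetical, and a fixed-priority plugin could simply reuse the task priority:

/* map tasks to SRP preemption levels; here simply the fixed priority */
static unsigned int demo_srp_prio(struct task_struct *t)
{
        return get_priority(t);
}

static long demo_activate_plugin(void)
{
        get_srp_prio = demo_srp_prio;
        return 0;
}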
diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
new file mode 100644
index 000000000000..94264c27d9ac
--- /dev/null
+++ b/include/litmus/unistd_32.h
@@ -0,0 +1,21 @@
1/*
2 * included from arch/x86/include/asm/unistd_32.h
3 *
4 * LITMUS^RT syscalls with "relative" numbers
5 */
6#define __LSC(x) (__NR_LITMUS + x)
7
8#define __NR_set_rt_task_param __LSC(0)
9#define __NR_get_rt_task_param __LSC(1)
10#define __NR_complete_job __LSC(2)
11#define __NR_od_open __LSC(3)
12#define __NR_od_close __LSC(4)
13#define __NR_litmus_lock __LSC(5)
14#define __NR_litmus_unlock __LSC(6)
15#define __NR_query_job_no __LSC(7)
16#define __NR_wait_for_job_release __LSC(8)
17#define __NR_wait_for_ts_release __LSC(9)
18#define __NR_release_ts __LSC(10)
19#define __NR_null_call __LSC(11)
20
21#define NR_litmus_syscalls 12
diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
new file mode 100644
index 000000000000..d5ced0d2642c
--- /dev/null
+++ b/include/litmus/unistd_64.h
@@ -0,0 +1,33 @@
1/*
2 * included from arch/x86/include/asm/unistd_64.h
3 *
4 * LITMUS^RT syscalls with "relative" numbers
5 */
6#define __LSC(x) (__NR_LITMUS + x)
7
8#define __NR_set_rt_task_param __LSC(0)
9__SYSCALL(__NR_set_rt_task_param, sys_set_rt_task_param)
10#define __NR_get_rt_task_param __LSC(1)
11__SYSCALL(__NR_get_rt_task_param, sys_get_rt_task_param)
12#define __NR_complete_job __LSC(2)
13__SYSCALL(__NR_complete_job, sys_complete_job)
14#define __NR_od_open __LSC(3)
15__SYSCALL(__NR_od_open, sys_od_open)
16#define __NR_od_close __LSC(4)
17__SYSCALL(__NR_od_close, sys_od_close)
18#define __NR_litmus_lock __LSC(5)
19__SYSCALL(__NR_litmus_lock, sys_litmus_lock)
20#define __NR_litmus_unlock __LSC(6)
21__SYSCALL(__NR_litmus_unlock, sys_litmus_unlock)
22#define __NR_query_job_no __LSC(7)
23__SYSCALL(__NR_query_job_no, sys_query_job_no)
24#define __NR_wait_for_job_release __LSC(8)
25__SYSCALL(__NR_wait_for_job_release, sys_wait_for_job_release)
26#define __NR_wait_for_ts_release __LSC(9)
27__SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
28#define __NR_release_ts __LSC(10)
29__SYSCALL(__NR_release_ts, sys_release_ts)
30#define __NR_null_call __LSC(11)
31__SYSCALL(__NR_null_call, sys_null_call)
32
33#define NR_litmus_syscalls 12
diff --git a/include/litmus/wait.h b/include/litmus/wait.h
new file mode 100644
index 000000000000..ce1347c355f8
--- /dev/null
+++ b/include/litmus/wait.h
@@ -0,0 +1,57 @@
1#ifndef _LITMUS_WAIT_H_
2#define _LITMUS_WAIT_H_
3
4struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
5
6/* wrap regular wait_queue_t head */
7struct __prio_wait_queue {
8 wait_queue_t wq;
9
10 /* some priority point */
11 lt_t priority;
12 /* break ties in priority by lower tie_breaker */
13 unsigned int tie_breaker;
14};
15
16typedef struct __prio_wait_queue prio_wait_queue_t;
17
18static inline void init_prio_waitqueue_entry(prio_wait_queue_t *pwq,
19 struct task_struct* t,
20 lt_t priority)
21{
22 init_waitqueue_entry(&pwq->wq, t);
23 pwq->priority = priority;
24 pwq->tie_breaker = 0;
25}
26
27static inline void init_prio_waitqueue_entry_tie(prio_wait_queue_t *pwq,
28 struct task_struct* t,
29 lt_t priority,
30 unsigned int tie_breaker)
31{
32 init_waitqueue_entry(&pwq->wq, t);
33 pwq->priority = priority;
34 pwq->tie_breaker = tie_breaker;
35}
36
37unsigned int __add_wait_queue_prio_exclusive(
38 wait_queue_head_t* head,
39 prio_wait_queue_t *new);
40
41static inline unsigned int add_wait_queue_prio_exclusive(
42 wait_queue_head_t* head,
43 prio_wait_queue_t *new)
44{
45 unsigned long flags;
46 unsigned int passed;
47
48 spin_lock_irqsave(&head->lock, flags);
49 passed = __add_wait_queue_prio_exclusive(head, new);
50
51 spin_unlock_irqrestore(&head->lock, flags);
52
53 return passed;
54}
55
56
57#endif
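
A usage sketch (illustrative only, not part of the patch): enqueueing the current task on a priority-ordered, exclusive wait queue; demo_wait_prio is hypothetical, and the caller is assumed to drop its own lock before calling schedule():

static void demo_wait_prio(wait_queue_head_t *wq, lt_t prio_point,
                           unsigned int seq_no)
{
        prio_wait_queue_t entry;

        /* order by priority point, break ties by lower sequence number */
        init_prio_waitqueue_entry_tie(&entry, current, prio_point, seq_no);

        set_current_state(TASK_UNINTERRUPTIBLE);
        add_wait_queue_prio_exclusive(wq, &entry);

        /* ... the caller now releases its lock and calls schedule(); the waker
         * later removes the head entry via __waitqueue_remove_first() ... */
}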
diff --git a/kernel/sched/litmus.c b/kernel/sched/litmus.c
new file mode 100644
index 000000000000..59428036e2c8
--- /dev/null
+++ b/kernel/sched/litmus.c
@@ -0,0 +1,350 @@
1/* This file is included from kernel/sched.c */
2
3#include "sched.h"
4
5#include <litmus/trace.h>
6#include <litmus/sched_trace.h>
7
8#include <litmus/litmus.h>
9#include <litmus/budget.h>
10#include <litmus/sched_plugin.h>
11#include <litmus/preempt.h>
12
13static void update_time_litmus(struct rq *rq, struct task_struct *p)
14{
15 u64 delta = rq->clock - p->se.exec_start;
16 if (unlikely((s64)delta < 0))
17 delta = 0;
18 /* per job counter */
19 p->rt_param.job_params.exec_time += delta;
20 /* task counter */
21 p->se.sum_exec_runtime += delta;
22 /* sched_clock() */
23 p->se.exec_start = rq->clock;
24 cpuacct_charge(p, delta);
25}
26
27static void double_rq_lock(struct rq *rq1, struct rq *rq2);
28static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
29
30/*
31 * litmus_tick() gets called by scheduler_tick() at HZ frequency.
32 * Interrupts are disabled.
33 */
34void litmus_tick(struct rq *rq, struct task_struct *p)
35{
36 TS_PLUGIN_TICK_START;
37
38 if (is_realtime(p))
39 update_time_litmus(rq, p);
40
41 /* plugin tick */
42 litmus->tick(p);
43
44 TS_PLUGIN_TICK_END;
45
46 return;
47}
48
49static struct task_struct *
50litmus_schedule(struct rq *rq, struct task_struct *prev)
51{
52 struct task_struct *next;
53
54#ifdef CONFIG_SMP
55 struct rq* other_rq;
56 long was_running;
57 lt_t _maybe_deadlock = 0;
58#endif
59
60 /* let the plugin schedule */
61 next = litmus->schedule(prev);
62
63 sched_state_plugin_check();
64
65#ifdef CONFIG_SMP
66 /* check if a global plugin pulled a task from a different RQ */
67 if (next && task_rq(next) != rq) {
68 /* we need to migrate the task */
69 other_rq = task_rq(next);
70 TRACE_TASK(next, "migrate from %d\n", other_rq->cpu);
71
72 /* while we drop the lock, the prev task could change its
73 * state
74 */
75 was_running = is_running(prev);
76 mb();
77 raw_spin_unlock(&rq->lock);
78
79 /* Don't race with a concurrent switch. This could deadlock in
80 * the case of cross or circular migrations. It's the job of
81 * the plugin to make sure that doesn't happen.
82 */
83 TRACE_TASK(next, "stack_in_use=%d\n",
84 next->rt_param.stack_in_use);
85 if (next->rt_param.stack_in_use != NO_CPU) {
86 TRACE_TASK(next, "waiting to deschedule\n");
87 _maybe_deadlock = litmus_clock();
88 }
89 while (next->rt_param.stack_in_use != NO_CPU) {
90 cpu_relax();
91 mb();
92 if (next->rt_param.stack_in_use == NO_CPU)
93 TRACE_TASK(next,"descheduled. Proceeding.\n");
94
95 if (lt_before(_maybe_deadlock + 1000000000L,
96 litmus_clock())) {
97 /* We've been spinning for 1s.
98 * Something can't be right!
99 * Let's abandon the task and bail out; at least
100 * we will have debug info instead of a hard
101 * deadlock.
102 */
103#ifdef CONFIG_BUG_ON_MIGRATION_DEADLOCK
104 BUG();
105#else
106 TRACE_TASK(next,"stack too long in use. "
107 "Deadlock?\n");
108 next = NULL;
109
110 /* bail out */
111 raw_spin_lock(&rq->lock);
112 return next;
113#endif
114 }
115 }
116#ifdef __ARCH_WANT_UNLOCKED_CTXSW
117 if (next->on_cpu)
118 TRACE_TASK(next, "waiting for !oncpu");
119 while (next->on_cpu) {
120 cpu_relax();
121 mb();
122 }
123#endif
124 double_rq_lock(rq, other_rq);
125 mb();
126 if (is_realtime(prev) && is_running(prev) != was_running) {
127 TRACE_TASK(prev,
128 "state changed while we dropped"
129 " the lock: is_running=%d, was_running=%d\n",
130 is_running(prev), was_running);
131 if (is_running(prev) && !was_running) {
132				 * The prev task became unblocked; we need to
133				 * simulate the normal sequence of events for the
134				 * scheduler plugins.
135 */
136 litmus->task_block(prev);
137 litmus->task_wake_up(prev);
138 }
139 }
140
141 set_task_cpu(next, smp_processor_id());
142
143 /* DEBUG: now that we have the lock we need to make sure a
144 * couple of things still hold:
145 * - it is still a real-time task
146 * - it is still runnable (could have been stopped)
147 * If either is violated, then the active plugin is
148 * doing something wrong.
149 */
150 if (!is_realtime(next) || !is_running(next)) {
151 /* BAD BAD BAD */
152 TRACE_TASK(next,"BAD: migration invariant FAILED: "
153 "rt=%d running=%d\n",
154 is_realtime(next),
155 is_running(next));
156 /* drop the task */
157 next = NULL;
158 }
159 /* release the other CPU's runqueue, but keep ours */
160 raw_spin_unlock(&other_rq->lock);
161 }
162#endif
163
164 if (next) {
165#ifdef CONFIG_SMP
166 next->rt_param.stack_in_use = rq->cpu;
167#else
168 next->rt_param.stack_in_use = 0;
169#endif
170 next->se.exec_start = rq->clock;
171 }
172
173 update_enforcement_timer(next);
174 return next;
175}
176
177static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
178 int flags)
179{
180 if (flags & ENQUEUE_WAKEUP) {
181 sched_trace_task_resume(p);
182 tsk_rt(p)->present = 1;
183 /* LITMUS^RT plugins need to update the state
184 * _before_ making it available in global structures.
185 * Linux gets away with being lazy about the task state
186 * update. We can't do that, hence we update the task
187 * state already here.
188 *
189 * WARNING: this needs to be re-evaluated when porting
190 * to newer kernel versions.
191 */
192 p->state = TASK_RUNNING;
193 litmus->task_wake_up(p);
194
195 rq->litmus.nr_running++;
196 } else
197 TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
198}
199
200static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
201 int flags)
202{
203 if (flags & DEQUEUE_SLEEP) {
204 litmus->task_block(p);
205 tsk_rt(p)->present = 0;
206 sched_trace_task_block(p);
207
208 rq->litmus.nr_running--;
209 } else
210 TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
211}
212
213static void yield_task_litmus(struct rq *rq)
214{
215 TS_SYSCALL_IN_START;
216 TS_SYSCALL_IN_END;
217
218 BUG_ON(rq->curr != current);
219 /* sched_yield() is called to trigger delayed preemptions.
220 * Thus, mark the current task as needing to be rescheduled.
221 * This will cause the scheduler plugin to be invoked, which can
222 * then determine if a preemption is still required.
223 */
224 clear_exit_np(current);
225 litmus_reschedule_local();
226
227 TS_SYSCALL_OUT_START;
228}
229
230/* Plugins are responsible for this.
231 */
232static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
233{
234}
235
236static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
237{
238}
239
240#ifdef CONFIG_SMP
241static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev)
242{
243 update_time_litmus(rq, prev);
244 if (!is_running(prev))
245 tsk_rt(prev)->present = 0;
246}
247#endif
248
249/* pick_next_task_litmus() - wrapper around litmus_schedule()
250 *
251 * Returns the next task to be scheduled.
252 */
253static struct task_struct *pick_next_task_litmus(struct rq *rq)
254{
255 /* get the to-be-switched-out task (prev) */
256 struct task_struct *prev = rq->litmus.prev;
257 struct task_struct *next;
258
259 /* if not called from schedule() but from somewhere
260 * else (e.g., migration), return now!
261 */
262 if(!rq->litmus.prev)
263 return NULL;
264
265 rq->litmus.prev = NULL;
266
267 TS_PLUGIN_SCHED_START;
268 next = litmus_schedule(rq, prev);
269 TS_PLUGIN_SCHED_END;
270
271 return next;
272}
273
274static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
275{
276	/* nothing to do; tick-related work is done by litmus_tick() */
277 return;
278}
279
280static void switched_to_litmus(struct rq *rq, struct task_struct *p)
281{
282}
283
284static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
285 int oldprio)
286{
287}
288
289unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p)
290{
291 /* return infinity */
292 return 0;
293}
294
295/* This is called when a task becomes a real-time task, either due to a SCHED_*
296 * class transition or due to PI mutex inheritance. We don't handle Linux PI
297 * mutex inheritance yet (and probably never will). Use LITMUS^RT-provided
298 * synchronization primitives instead.
299 */
300static void set_curr_task_litmus(struct rq *rq)
301{
302 rq->curr->se.exec_start = rq->clock;
303}
304
305
306#ifdef CONFIG_SMP
307/* execve tries to rebalance the task in this scheduling domain.
308 * We don't care about the scheduling domain; this can get called from
309 * exec, fork, and wakeup.
310 */
311static int
312select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
313{
314 /* preemption is already disabled.
315 * We don't want to change cpu here
316 */
317 return task_cpu(p);
318}
319#endif
320
321const struct sched_class litmus_sched_class = {
322	/* Since commit 34f971f6, the stop/migrate worker threads have a class
323	 * of their own, which is the highest-priority class. We don't support
324	 * CPU hotplug or CPU throttling, which allows LITMUS^RT to use up to
325	 * 1.0 CPU capacity.
326 */
327 .next = &rt_sched_class,
328 .enqueue_task = enqueue_task_litmus,
329 .dequeue_task = dequeue_task_litmus,
330 .yield_task = yield_task_litmus,
331
332 .check_preempt_curr = check_preempt_curr_litmus,
333
334 .pick_next_task = pick_next_task_litmus,
335 .put_prev_task = put_prev_task_litmus,
336
337#ifdef CONFIG_SMP
338 .select_task_rq = select_task_rq_litmus,
339
340 .pre_schedule = pre_schedule_litmus,
341#endif
342
343 .set_curr_task = set_curr_task_litmus,
344 .task_tick = task_tick_litmus,
345
346 .get_rr_interval = get_rr_interval_litmus,
347
348 .prio_changed = prio_changed_litmus,
349 .switched_to = switched_to_litmus,
350};
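
[Editor's note] The class above delegates all real decisions to the active plugin through litmus->schedule(), ->tick(), ->task_wake_up(), and ->task_block(). A minimal sketch of a plugin that satisfies those callbacks while never picking a real-time task; the struct sched_plugin field names, register_sched_plugin(), and sched_state_task_picked() are assumptions taken from the plugin API headers, which are not part of this hunk.

/* Sketch only: a do-nothing plugin matching the callbacks invoked above. */
#include <linux/module.h>
#include <litmus/sched_plugin.h>
#include <litmus/preempt.h>

static struct task_struct *demo_schedule(struct task_struct *prev)
{
	/* never pick a real-time task; background (Linux) scheduling only.
	 * sched_state_task_picked() keeps sched_state_plugin_check() happy. */
	sched_state_task_picked();
	return NULL;
}

static void demo_tick(struct task_struct *t)         { }
static void demo_task_wake_up(struct task_struct *t) { }
static void demo_task_block(struct task_struct *t)   { }

static struct sched_plugin demo_plugin = {
	.plugin_name	= "DEMO",
	.schedule	= demo_schedule,
	.tick		= demo_tick,
	.task_wake_up	= demo_task_wake_up,
	.task_block	= demo_task_block,
};

static int __init init_demo_plugin(void)
{
	return register_sched_plugin(&demo_plugin);
}
module_init(init_demo_plugin);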
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 5408ef6b159b..32c18c6eb58d 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -1,5 +1,156 @@
1menu "LITMUS^RT" 1menu "LITMUS^RT"
2 2
3menu "Scheduling"
4
5config RELEASE_MASTER
6 bool "Release-master Support"
7 depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
8 default n
9 help
10 Allow one processor to act as a dedicated interrupt processor
11 that services all timer interrupts, but that does not schedule
12 real-time tasks. See RTSS'09 paper for details
13 (http://www.cs.unc.edu/~anderson/papers.html).
14
15
16config BUG_ON_MIGRATION_DEADLOCK
17 bool "Panic on suspected migration deadlock"
18 default y
19 help
20 This is a debugging option. The LITMUS^RT migration support code for
21 global scheduling contains a simple heuristic to detect when the
22 system deadlocks due to circular stack dependencies.
23
24 For example, such a deadlock exists if CPU 0 waits for task A's stack
25 to become available while using task B's stack, and CPU 1 waits for
26 task B's stack to become available while using task A's stack. Such
27 a situation can arise in (buggy) global scheduling plugins.
28
29	  With this option enabled, such a scenario will result in a BUG().
30 You can turn off this option when debugging on real hardware (e.g.,
31 to rescue traces, etc. that would be hard to get after a panic).
32
33 Only turn this off if you really know what you are doing. If this
34 BUG() triggers, the scheduler is broken and turning off this option
35 won't fix it.
36
37
38endmenu
39
40menu "Real-Time Synchronization"
41
42config NP_SECTION
43 bool "Non-preemptive section support"
44 default y
45 help
46 Allow tasks to become non-preemptable.
47 Note that plugins still need to explicitly support non-preemptivity.
48 Currently, only the GSN-EDF, PSN-EDF, and P-FP plugins have such support.
49
50 This is required to support locking protocols such as the FMLP.
51 If disabled, all tasks will be considered preemptable at all times.
52
53config LITMUS_LOCKING
54 bool "Support for real-time locking protocols"
55 depends on NP_SECTION
56 default y
57 help
58 Enable LITMUS^RT's multiprocessor real-time locking protocols with
59	  predictable maximum blocking times.
60
61 Say Yes if you want to include locking protocols such as the FMLP and
62 Baker's SRP.
63
64endmenu
65
66menu "Performance Enhancements"
67
68config SCHED_CPU_AFFINITY
69 bool "Local Migration Affinity"
70 depends on X86 && SYSFS
71 default y
72 help
73	  Rescheduled tasks prefer CPUs near their previously used CPU.
74 This may improve cache performance through possible preservation of
75 cache affinity, at the expense of (slightly) more involved scheduling
76 logic.
77
78 Warning: May make bugs harder to find since tasks may migrate less often.
79
80 NOTES:
81 * Feature is not utilized by PFair/PD^2.
82
83 Say Yes if unsure.
84
85config ALLOW_EARLY_RELEASE
86 bool "Allow Early Releasing"
87 default y
88 help
89 Allow tasks to release jobs early (while still maintaining job
90 precedence constraints). Only supported by EDF schedulers. Early
91 releasing must be explicitly requested by real-time tasks via
92	  the task_params passed to sys_set_rt_task_param().
93
94 Early releasing can improve job response times while maintaining
95 real-time correctness. However, it can easily peg your CPUs
96 since tasks never suspend to wait for their next job. As such, early
97 releasing is really only useful in the context of implementing
98 bandwidth servers, interrupt handling threads, or short-lived
99 computations.
100
101 Beware that early releasing may affect real-time analysis
102 if using locking protocols or I/O.
103
104 Say Yes if unsure.
105
106choice
107 prompt "EDF Tie-Break Behavior"
108 default EDF_TIE_BREAK_LATENESS_NORM
109 help
110 Allows the configuration of tie-breaking behavior when the deadlines
111 of two EDF-scheduled tasks are equal.
112
113 config EDF_TIE_BREAK_LATENESS
114 bool "Lateness-based Tie Break"
115 help
116 Break ties between two jobs, A and B, based upon the lateness of their
117 prior jobs. The job with the greatest lateness has priority. Note that
118 lateness has a negative value if the prior job finished before its
119 deadline.
120
121 config EDF_TIE_BREAK_LATENESS_NORM
122 bool "Normalized Lateness-based Tie Break"
123 help
124 Break ties between two jobs, A and B, based upon the lateness, normalized
125 by relative deadline, of their prior jobs. The job with the greatest
126 normalized lateness has priority. Note that lateness has a negative value
127 if the prior job finished before its deadline.
128
129	    Normalized lateness tie-breaks are likely desirable over non-normalized
130 tie-breaks if the execution times and/or relative deadlines of tasks in a
131 task set vary greatly.
132
133 config EDF_TIE_BREAK_HASH
134 bool "Hash-based Tie Breaks"
135 help
136 Break ties between two jobs, A and B, with equal deadlines by using a
137 uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job
138	    A has a ~50% chance of winning a given tie-break.
139
140 config EDF_PID_TIE_BREAK
141 bool "PID-based Tie Breaks"
142 help
143	    Break ties based upon OS-assigned thread IDs. Use this option if
144	    required by the algorithm's real-time analysis or if per-task
145	    response-time jitter must be minimized.
146
147 NOTES:
148	    * This tie-breaking method was the default in LITMUS^RT 2012.2 and earlier.
149
150endchoice
151
152endmenu
153
3menu "Tracing" 154menu "Tracing"
4 155
5config FEATHER_TRACE 156config FEATHER_TRACE
@@ -154,6 +305,20 @@ config SCHED_DEBUG_TRACE_CALLER
154 305
155 If unsure, say No. 306 If unsure, say No.
156 307
308config PREEMPT_STATE_TRACE
309 bool "Trace preemption state machine transitions"
310 depends on SCHED_DEBUG_TRACE && DEBUG_KERNEL
311 default n
312 help
313 With this option enabled, each CPU will log when it transitions
314 states in the preemption state machine. This state machine is
315 used to determine how to react to IPIs (avoid races with in-flight IPIs).
316
317 Warning: this creates a lot of information in the debug trace. Only
318 recommended when you are debugging preemption-related races.
319
320 If unsure, say No.
321
157endmenu 322endmenu
158 323
159endmenu 324endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
index 6318f1c6fac8..fcb4d7533327 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -2,6 +2,25 @@
2# Makefile for LITMUS^RT 2# Makefile for LITMUS^RT
3# 3#
4 4
5obj-y = sched_plugin.o litmus.o \
6 preempt.o \
7 litmus_proc.o \
8 budget.o \
9 clustered.o \
10 jobs.o \
11 sync.o \
12 rt_domain.o \
13 edf_common.o \
14 fp_common.o \
15 fdso.o \
16 locking.o \
17 srp.o \
18 bheap.o \
19 binheap.o \
20 ctrldev.o
21
22obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
23
5obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o 24obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
6obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o 25obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
7obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o 26obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
diff --git a/litmus/affinity.c b/litmus/affinity.c
new file mode 100644
index 000000000000..a5b437b80433
--- /dev/null
+++ b/litmus/affinity.c
@@ -0,0 +1,41 @@
1#include <linux/cpu.h>
2
3#include <litmus/affinity.h>
4
5struct neighborhood neigh_info[NR_CPUS];
6
7/* called by _init_litmus() */
8void init_topology(void) {
9 int cpu;
10 int i;
11 int chk;
12 int depth = num_cache_leaves;
13
14 if (depth > NUM_CACHE_LEVELS)
15 depth = NUM_CACHE_LEVELS;
16
17 for_each_online_cpu(cpu) {
18 for (i = 0; i < depth; ++i) {
19 chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i);
20 if (chk) {
21 /* failed */
22 neigh_info[cpu].size[i] = 0;
23 } else {
24 /* size = num bits in mask */
25 neigh_info[cpu].size[i] =
26 cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]);
27 }
28 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
29 cpu, neigh_info[cpu].size[i], i,
30 *cpumask_bits(neigh_info[cpu].neighbors[i]));
31 }
32
33 /* set data for non-existent levels */
34 for (; i < NUM_CACHE_LEVELS; ++i) {
35 neigh_info[cpu].size[i] = 0;
36
37 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
38 cpu, neigh_info[cpu].size[i], i, 0lu);
39 }
40 }
41}
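
[Editor's note] The neigh_info table built above is consulted by the get_nearest_available_cpu() macro to prefer CPUs that share a cache with a task's previous CPU. A minimal sketch of walking the table directly, from the innermost to the outermost cache level; the helper itself is illustrative and not part of the patch.

/* Sketch only: report the first cache level at which two CPUs are
 * neighbors, or NUM_CACHE_LEVELS if they share no recorded level. */
#include <linux/cpumask.h>
#include <litmus/affinity.h>

static int cache_distance(int cpu_a, int cpu_b)
{
	int level;

	for (level = 0; level < NUM_CACHE_LEVELS; level++) {
		if (neigh_info[cpu_a].size[level] &&
		    cpumask_test_cpu(cpu_b, neigh_info[cpu_a].neighbors[level]))
			return level;
	}
	return NUM_CACHE_LEVELS;
}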
diff --git a/litmus/bheap.c b/litmus/bheap.c
new file mode 100644
index 000000000000..2707e0122b6d
--- /dev/null
+++ b/litmus/bheap.c
@@ -0,0 +1,316 @@
1#include <linux/bug.h>
2#include <linux/kernel.h>
3#include <litmus/bheap.h>
4
5void bheap_init(struct bheap* heap)
6{
7 heap->head = NULL;
8 heap->min = NULL;
9}
10
11void bheap_node_init(struct bheap_node** _h, void* value)
12{
13 struct bheap_node* h = *_h;
14 h->parent = NULL;
15 h->next = NULL;
16 h->child = NULL;
17 h->degree = NOT_IN_HEAP;
18 h->value = value;
19 h->ref = _h;
20}
21
22
23/* make child a subtree of root */
24static void __bheap_link(struct bheap_node* root,
25 struct bheap_node* child)
26{
27 child->parent = root;
28 child->next = root->child;
29 root->child = child;
30 root->degree++;
31}
32
33/* merge root lists */
34static struct bheap_node* __bheap_merge(struct bheap_node* a,
35 struct bheap_node* b)
36{
37 struct bheap_node* head = NULL;
38 struct bheap_node** pos = &head;
39
40 while (a && b) {
41 if (a->degree < b->degree) {
42 *pos = a;
43 a = a->next;
44 } else {
45 *pos = b;
46 b = b->next;
47 }
48 pos = &(*pos)->next;
49 }
50 if (a)
51 *pos = a;
52 else
53 *pos = b;
54 return head;
55}
56
57/* reverse a linked list of nodes. also clears parent pointer */
58static struct bheap_node* __bheap_reverse(struct bheap_node* h)
59{
60 struct bheap_node* tail = NULL;
61 struct bheap_node* next;
62
63 if (!h)
64 return h;
65
66 h->parent = NULL;
67 while (h->next) {
68 next = h->next;
69 h->next = tail;
70 tail = h;
71 h = next;
72 h->parent = NULL;
73 }
74 h->next = tail;
75 return h;
76}
77
78static void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
79 struct bheap_node** prev, struct bheap_node** node)
80{
81 struct bheap_node *_prev, *cur;
82 *prev = NULL;
83
84 if (!heap->head) {
85 *node = NULL;
86 return;
87 }
88
89 *node = heap->head;
90 _prev = heap->head;
91 cur = heap->head->next;
92 while (cur) {
93 if (higher_prio(cur, *node)) {
94 *node = cur;
95 *prev = _prev;
96 }
97 _prev = cur;
98 cur = cur->next;
99 }
100}
101
102static void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
103 struct bheap_node* h2)
104{
105 struct bheap_node* h1;
106 struct bheap_node *prev, *x, *next;
107 if (!h2)
108 return;
109 h1 = heap->head;
110 if (!h1) {
111 heap->head = h2;
112 return;
113 }
114 h1 = __bheap_merge(h1, h2);
115 prev = NULL;
116 x = h1;
117 next = x->next;
118 while (next) {
119 if (x->degree != next->degree ||
120 (next->next && next->next->degree == x->degree)) {
121 /* nothing to do, advance */
122 prev = x;
123 x = next;
124 } else if (higher_prio(x, next)) {
125 /* x becomes the root of next */
126 x->next = next->next;
127 __bheap_link(x, next);
128 } else {
129 /* next becomes the root of x */
130 if (prev)
131 prev->next = next;
132 else
133 h1 = next;
134 __bheap_link(next, x);
135 x = next;
136 }
137 next = x->next;
138 }
139 heap->head = h1;
140}
141
142static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
143 struct bheap* heap)
144{
145 struct bheap_node *prev, *node;
146 __bheap_min(higher_prio, heap, &prev, &node);
147 if (!node)
148 return NULL;
149 if (prev)
150 prev->next = node->next;
151 else
152 heap->head = node->next;
153 __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
154 return node;
155}
156
157/* insert (and reinitialize) a node into the heap */
158void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
159 struct bheap_node* node)
160{
161 struct bheap_node *min;
162 node->child = NULL;
163 node->parent = NULL;
164 node->next = NULL;
165 node->degree = 0;
166 if (heap->min && higher_prio(node, heap->min)) {
167 /* swap min cache */
168 min = heap->min;
169 min->child = NULL;
170 min->parent = NULL;
171 min->next = NULL;
172 min->degree = 0;
173 __bheap_union(higher_prio, heap, min);
174 heap->min = node;
175 } else
176 __bheap_union(higher_prio, heap, node);
177}
178
179void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
180{
181 struct bheap_node* min;
182 if (heap->min) {
183 min = heap->min;
184 heap->min = NULL;
185 bheap_insert(higher_prio, heap, min);
186 }
187}
188
189/* merge addition into target */
190void bheap_union(bheap_prio_t higher_prio,
191 struct bheap* target, struct bheap* addition)
192{
193 /* first insert any cached minima, if necessary */
194 bheap_uncache_min(higher_prio, target);
195 bheap_uncache_min(higher_prio, addition);
196 __bheap_union(higher_prio, target, addition->head);
197 /* this is a destructive merge */
198 addition->head = NULL;
199}
200
201struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
202 struct bheap* heap)
203{
204 if (!heap->min)
205 heap->min = __bheap_extract_min(higher_prio, heap);
206 return heap->min;
207}
208
209struct bheap_node* bheap_take(bheap_prio_t higher_prio,
210 struct bheap* heap)
211{
212 struct bheap_node *node;
213 if (!heap->min)
214 heap->min = __bheap_extract_min(higher_prio, heap);
215 node = heap->min;
216 heap->min = NULL;
217 if (node)
218 node->degree = NOT_IN_HEAP;
219 return node;
220}
221
222int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
223{
224 struct bheap_node *parent;
225 struct bheap_node** tmp_ref;
226 void* tmp;
227
228 /* bubble up */
229 parent = node->parent;
230 while (parent && higher_prio(node, parent)) {
231 /* swap parent and node */
232 tmp = parent->value;
233 parent->value = node->value;
234 node->value = tmp;
235 /* swap references */
236 *(parent->ref) = node;
237 *(node->ref) = parent;
238 tmp_ref = parent->ref;
239 parent->ref = node->ref;
240 node->ref = tmp_ref;
241 /* step up */
242 node = parent;
243 parent = node->parent;
244 }
245
246 return parent != NULL;
247}
248
249void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
250 struct bheap_node* node)
251{
252 struct bheap_node *parent, *prev, *pos;
253 struct bheap_node** tmp_ref;
254 void* tmp;
255
256 if (heap->min != node) {
257 /* bubble up */
258 parent = node->parent;
259 while (parent) {
260 /* swap parent and node */
261 tmp = parent->value;
262 parent->value = node->value;
263 node->value = tmp;
264 /* swap references */
265 *(parent->ref) = node;
266 *(node->ref) = parent;
267 tmp_ref = parent->ref;
268 parent->ref = node->ref;
269 node->ref = tmp_ref;
270 /* step up */
271 node = parent;
272 parent = node->parent;
273 }
274 /* now delete:
275 * first find prev */
276 prev = NULL;
277 pos = heap->head;
278 while (pos != node) {
279 BUG_ON(!pos); /* fell off the list -> deleted from wrong heap */
280 prev = pos;
281 pos = pos->next;
282 }
283 /* we have prev, now remove node */
284 if (prev)
285 prev->next = node->next;
286 else
287 heap->head = node->next;
288 __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
289 } else
290 heap->min = NULL;
291 node->degree = NOT_IN_HEAP;
292}
293
294/* allocate a heap node for value and insert into the heap */
295int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
296 void* value, int gfp_flags)
297{
298 struct bheap_node* hn = bheap_node_alloc(gfp_flags);
299 if (likely(hn)) {
300 bheap_node_init(&hn, value);
301 bheap_insert(higher_prio, heap, hn);
302 }
303 return hn != NULL;
304}
305
306void* bheap_take_del(bheap_prio_t higher_prio,
307 struct bheap* heap)
308{
309 struct bheap_node* hn = bheap_take(higher_prio, heap);
310 void* ret = NULL;
311 if (hn) {
312 ret = hn->value;
313 bheap_node_free(hn);
314 }
315 return ret;
316}
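
[Editor's note] Callers supply a bheap_prio_t comparison over the opaque value pointers and then use bheap_insert()/bheap_take(). A usage sketch with a made-up element type; the kmalloc-based allocation is illustrative (the patch normally allocates bheap_nodes from a slab cache via bheap_node_alloc(), defined elsewhere).

/* Sketch only: min-heap of integers built on the binomial heap API above. */
#include <linux/errno.h>
#include <linux/slab.h>
#include <litmus/bheap.h>

struct int_item {
	int		   key;
	struct bheap_node *hn;
};

/* bheap_prio_t: return true if 'a' has higher priority (smaller key) */
static int int_higher_prio(struct bheap_node *a, struct bheap_node *b)
{
	return ((struct int_item *)a->value)->key <
	       ((struct int_item *)b->value)->key;
}

static int int_heap_push(struct bheap *heap, struct int_item *it)
{
	it->hn = kmalloc(sizeof(*it->hn), GFP_ATOMIC);
	if (!it->hn)
		return -ENOMEM;
	bheap_node_init(&it->hn, it);
	bheap_insert(int_higher_prio, heap, it->hn);
	return 0;
}

static struct int_item *int_heap_pop(struct bheap *heap)
{
	struct bheap_node *hn = bheap_take(int_higher_prio, heap);
	struct int_item *it = hn ? hn->value : NULL;

	kfree(hn);
	return it;
}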
diff --git a/litmus/binheap.c b/litmus/binheap.c
new file mode 100644
index 000000000000..d3ab34b92096
--- /dev/null
+++ b/litmus/binheap.c
@@ -0,0 +1,387 @@
1#include <litmus/binheap.h>
2
3/* Returns true if the root ancestor of node is the root of the given heap. */
4int binheap_is_in_this_heap(struct binheap_node *node,
5 struct binheap* heap)
6{
7 if(!binheap_is_in_heap(node)) {
8 return 0;
9 }
10
11 while(node->parent != NULL) {
12 node = node->parent;
13 }
14
15 return (node == heap->root);
16}
17
18
19/* Update the node reference pointers. Same logic as Litmus binomial heap. */
20static void __update_ref(struct binheap_node *parent,
21 struct binheap_node *child)
22{
23 *(parent->ref_ptr) = child;
24 *(child->ref_ptr) = parent;
25
26 swap(parent->ref_ptr, child->ref_ptr);
27}
28
29
30/* Swaps data between two nodes. */
31static void __binheap_swap(struct binheap_node *parent,
32 struct binheap_node *child)
33{
34 swap(parent->data, child->data);
35 __update_ref(parent, child);
36}
37
38
39/* Swaps memory and data between two nodes. Actual nodes swap instead of
40 * just data. Needed when we delete nodes from the heap.
41 */
42static void __binheap_swap_safe(struct binheap *handle,
43 struct binheap_node *a,
44 struct binheap_node *b)
45{
46 swap(a->data, b->data);
47 __update_ref(a, b);
48
49 if((a->parent != NULL) && (a->parent == b->parent)) {
50 /* special case: shared parent */
51 swap(a->parent->left, a->parent->right);
52 }
53 else {
54 /* Update pointers to swap parents. */
55
56 if(a->parent) {
57 if(a == a->parent->left) {
58 a->parent->left = b;
59 }
60 else {
61 a->parent->right = b;
62 }
63 }
64
65 if(b->parent) {
66 if(b == b->parent->left) {
67 b->parent->left = a;
68 }
69 else {
70 b->parent->right = a;
71 }
72 }
73
74 swap(a->parent, b->parent);
75 }
76
77 /* swap children */
78
79 if(a->left) {
80 a->left->parent = b;
81
82 if(a->right) {
83 a->right->parent = b;
84 }
85 }
86
87 if(b->left) {
88 b->left->parent = a;
89
90 if(b->right) {
91 b->right->parent = a;
92 }
93 }
94
95 swap(a->left, b->left);
96 swap(a->right, b->right);
97
98
99 /* update next/last/root pointers */
100
101 if(a == handle->next) {
102 handle->next = b;
103 }
104 else if(b == handle->next) {
105 handle->next = a;
106 }
107
108 if(a == handle->last) {
109 handle->last = b;
110 }
111 else if(b == handle->last) {
112 handle->last = a;
113 }
114
115 if(a == handle->root) {
116 handle->root = b;
117 }
118 else if(b == handle->root) {
119 handle->root = a;
120 }
121}
122
123
124/**
125 * Update the pointer to the last node in the complete binary tree.
126 * Called internally after the root node has been deleted.
127 */
128static void __binheap_update_last(struct binheap *handle)
129{
130 struct binheap_node *temp = handle->last;
131
132 /* find a "bend" in the tree. */
133 while(temp->parent && (temp == temp->parent->left)) {
134 temp = temp->parent;
135 }
136
137 /* step over to sibling if we're not at root */
138 if(temp->parent != NULL) {
139 temp = temp->parent->left;
140 }
141
142 /* now travel right as far as possible. */
143 while(temp->right != NULL) {
144 temp = temp->right;
145 }
146
147 /* take one step to the left if we're not at the bottom-most level. */
148 if(temp->left != NULL) {
149 temp = temp->left;
150 }
151
152 handle->last = temp;
153}
154
155
156/**
157 * Update the pointer to the node that will take the next inserted node.
158 * Called internally after a node has been inserted.
159 */
160static void __binheap_update_next(struct binheap *handle)
161{
162 struct binheap_node *temp = handle->next;
163
164 /* find a "bend" in the tree. */
165 while(temp->parent && (temp == temp->parent->right)) {
166 temp = temp->parent;
167 }
168
169 /* step over to sibling if we're not at root */
170 if(temp->parent != NULL) {
171 temp = temp->parent->right;
172 }
173
174 /* now travel left as far as possible. */
175 while(temp->left != NULL) {
176 temp = temp->left;
177 }
178
179 handle->next = temp;
180}
181
182
183
184/* bubble node up towards root */
185static void __binheap_bubble_up(struct binheap *handle,
186 struct binheap_node *node)
187{
188 /* let BINHEAP_POISON data bubble to the top */
189
190 while((node->parent != NULL) &&
191 ((node->data == BINHEAP_POISON) ||
192 handle->compare(node, node->parent))) {
193 __binheap_swap(node->parent, node);
194 node = node->parent;
195 }
196}
197
198
199/* bubble node down, swapping with min-child */
200static void __binheap_bubble_down(struct binheap *handle)
201{
202 struct binheap_node *node = handle->root;
203
204 while(node->left != NULL) {
205 if(node->right && handle->compare(node->right, node->left)) {
206 if(handle->compare(node->right, node)) {
207 __binheap_swap(node, node->right);
208 node = node->right;
209 }
210 else {
211 break;
212 }
213 }
214 else {
215 if(handle->compare(node->left, node)) {
216 __binheap_swap(node, node->left);
217 node = node->left;
218 }
219 else {
220 break;
221 }
222 }
223 }
224}
225
226
227void __binheap_add(struct binheap_node *new_node,
228 struct binheap *handle,
229 void *data)
230{
231 new_node->data = data;
232 new_node->ref = new_node;
233 new_node->ref_ptr = &(new_node->ref);
234
235 if(!binheap_empty(handle)) {
236 /* insert left side first */
237 if(handle->next->left == NULL) {
238 handle->next->left = new_node;
239 new_node->parent = handle->next;
240 new_node->left = NULL;
241 new_node->right = NULL;
242
243 handle->last = new_node;
244
245 __binheap_bubble_up(handle, new_node);
246 }
247 else {
248 /* left occupied. insert right. */
249 handle->next->right = new_node;
250 new_node->parent = handle->next;
251 new_node->left = NULL;
252 new_node->right = NULL;
253
254 handle->last = new_node;
255
256 __binheap_update_next(handle);
257 __binheap_bubble_up(handle, new_node);
258 }
259 }
260 else {
261 /* first node in heap */
262
263 new_node->parent = NULL;
264 new_node->left = NULL;
265 new_node->right = NULL;
266
267 handle->root = new_node;
268 handle->next = new_node;
269 handle->last = new_node;
270 }
271}
272
273
274/**
275 * Removes the root node from the heap. The node is removed after coalescing
276 * the binheap_node with its original data pointer at the root of the tree.
277 *
278 * The 'last' node in the tree is then swapped up to the root and bubbled
279 * down.
280 */
281void __binheap_delete_root(struct binheap *handle,
282 struct binheap_node *container)
283{
284 struct binheap_node *root = handle->root;
285
286 if(root != container) {
287 /* coalesce */
288 __binheap_swap_safe(handle, root, container);
289 root = container;
290 }
291
292 if(handle->last != root) {
293 /* swap 'last' node up to root and bubble it down. */
294
295 struct binheap_node *to_move = handle->last;
296
297 if(to_move->parent != root) {
298 handle->next = to_move->parent;
299
300 if(handle->next->right == to_move) {
301 /* disconnect from parent */
302 to_move->parent->right = NULL;
303 handle->last = handle->next->left;
304 }
305 else {
306 /* find new 'last' before we disconnect */
307 __binheap_update_last(handle);
308
309 /* disconnect from parent */
310 to_move->parent->left = NULL;
311 }
312 }
313 else {
314 /* 'last' is direct child of root */
315
316 handle->next = to_move;
317
318 if(to_move == to_move->parent->right) {
319 to_move->parent->right = NULL;
320 handle->last = to_move->parent->left;
321 }
322 else {
323 to_move->parent->left = NULL;
324 handle->last = to_move;
325 }
326 }
327 to_move->parent = NULL;
328
329 /* reconnect as root. We can't just swap data ptrs since root node
330 * may be freed after this function returns.
331 */
332 to_move->left = root->left;
333 to_move->right = root->right;
334 if(to_move->left != NULL) {
335 to_move->left->parent = to_move;
336 }
337 if(to_move->right != NULL) {
338 to_move->right->parent = to_move;
339 }
340
341 handle->root = to_move;
342
343 /* bubble down */
344 __binheap_bubble_down(handle);
345 }
346 else {
347 /* removing last node in tree */
348 handle->root = NULL;
349 handle->next = NULL;
350 handle->last = NULL;
351 }
352
353 /* mark as removed */
354 container->parent = BINHEAP_POISON;
355}
356
357
358/**
359 * Delete an arbitrary node. Bubble node to delete up to the root,
360 * and then delete to root.
361 */
362void __binheap_delete(struct binheap_node *node_to_delete,
363 struct binheap *handle)
364{
365 struct binheap_node *target = node_to_delete->ref;
366 void *temp_data = target->data;
367
368	/* temporarily set data to BINHEAP_POISON to allow the node to bubble up to the top. */
369 target->data = BINHEAP_POISON;
370
371 __binheap_bubble_up(handle, target);
372 __binheap_delete_root(handle, node_to_delete);
373
374 node_to_delete->data = temp_data; /* restore node data pointer */
375}
376
377
378/**
379 * Bubble up a node whose pointer has decreased in value.
380 */
381void __binheap_decrease(struct binheap_node *orig_node,
382 struct binheap *handle)
383{
384 struct binheap_node *target = orig_node->ref;
385
386 __binheap_bubble_up(handle, target);
387}
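
[Editor's note] Heap order is determined entirely by the handle's compare callback over binheap_node pointers; node->data points at the caller's own element. A small sketch of such a callback and a manually initialized handle; the element struct is made up, and any handle fields beyond root/next/last/compare (declared in binheap.h, not shown here) are left zero-initialized.

/* Sketch only: order elements by an embedded deadline. */
#include <litmus/binheap.h>

struct demo_elem {
	unsigned long long deadline;
};

/* return true if 'a' should be closer to the root than 'b' */
static int demo_earlier_deadline(struct binheap_node *a,
				 struct binheap_node *b)
{
	struct demo_elem *ea = a->data;
	struct demo_elem *eb = b->data;

	return ea->deadline < eb->deadline;
}

static struct binheap demo_heap = {
	.root    = NULL,
	.next    = NULL,
	.last    = NULL,
	.compare = demo_earlier_deadline,
};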
diff --git a/litmus/budget.c b/litmus/budget.c
new file mode 100644
index 000000000000..f7712be29adb
--- /dev/null
+++ b/litmus/budget.c
@@ -0,0 +1,113 @@
1#include <linux/sched.h>
2#include <linux/percpu.h>
3#include <linux/hrtimer.h>
4
5#include <litmus/litmus.h>
6#include <litmus/preempt.h>
7
8#include <litmus/budget.h>
9
10struct enforcement_timer {
11 /* The enforcement timer is used to accurately police
12 * slice budgets. */
13 struct hrtimer timer;
14 int armed;
15};
16
17DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
18
19static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
20{
21 struct enforcement_timer* et = container_of(timer,
22 struct enforcement_timer,
23 timer);
24 unsigned long flags;
25
26 local_irq_save(flags);
27 TRACE("enforcement timer fired.\n");
28 et->armed = 0;
29 /* activate scheduler */
30 litmus_reschedule_local();
31 local_irq_restore(flags);
32
33 return HRTIMER_NORESTART;
34}
35
36/* assumes called with IRQs off */
37static void cancel_enforcement_timer(struct enforcement_timer* et)
38{
39 int ret;
40
41 TRACE("cancelling enforcement timer.\n");
42
43 /* Since interrupts are disabled and et->armed is only
44 * modified locally, we do not need any locks.
45 */
46
47 if (et->armed) {
48 ret = hrtimer_try_to_cancel(&et->timer);
49 /* Should never be inactive. */
50 BUG_ON(ret == 0);
51 /* Should never be running concurrently. */
52 BUG_ON(ret == -1);
53
54 et->armed = 0;
55 }
56}
57
58/* assumes called with IRQs off */
59static void arm_enforcement_timer(struct enforcement_timer* et,
60 struct task_struct* t)
61{
62 lt_t when_to_fire;
63 TRACE_TASK(t, "arming enforcement timer.\n");
64
65 /* Calling this when there is no budget left for the task
66 * makes no sense, unless the task is non-preemptive. */
67 BUG_ON(budget_exhausted(t) && (!is_np(t)));
68
69 /* __hrtimer_start_range_ns() cancels the timer
70 * anyway, so we don't have to check whether it is still armed */
71
72 if (likely(!is_np(t))) {
73 when_to_fire = litmus_clock() + budget_remaining(t);
74 __hrtimer_start_range_ns(&et->timer,
75 ns_to_ktime(when_to_fire),
76 0 /* delta */,
77 HRTIMER_MODE_ABS_PINNED,
78 0 /* no wakeup */);
79 et->armed = 1;
80 }
81}
82
83
84/* expects to be called with IRQs off */
85void update_enforcement_timer(struct task_struct* t)
86{
87 struct enforcement_timer* et = &__get_cpu_var(budget_timer);
88
89 if (t && budget_precisely_enforced(t)) {
90 /* Make sure we call into the scheduler when this budget
91 * expires. */
92 arm_enforcement_timer(et, t);
93 } else if (et->armed) {
94 /* Make sure we don't cause unnecessary interrupts. */
95 cancel_enforcement_timer(et);
96 }
97}
98
99
100static int __init init_budget_enforcement(void)
101{
102 int cpu;
103 struct enforcement_timer* et;
104
105 for (cpu = 0; cpu < NR_CPUS; cpu++) {
106 et = &per_cpu(budget_timer, cpu);
107 hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
108 et->timer.function = on_enforcement_timeout;
109 }
110 return 0;
111}
112
113module_init(init_budget_enforcement);
diff --git a/litmus/clustered.c b/litmus/clustered.c
new file mode 100644
index 000000000000..979fac6bebb7
--- /dev/null
+++ b/litmus/clustered.c
@@ -0,0 +1,111 @@
1#include <linux/gfp.h>
2#include <linux/cpumask.h>
3#include <linux/list.h>
4
5#include <litmus/clustered.h>
6
7#if !defined(CONFIG_X86) || !defined(CONFIG_SYSFS)
8/* fake get_shared_cpu_map() on non-x86 architectures */
9
10int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
11{
12 if (index != 1)
13 return 1;
14 else {
15 /* Fake L1: CPU is all by itself. */
16 cpumask_clear(mask);
17 cpumask_set_cpu(cpu, mask);
18 return 0;
19 }
20}
21
22#endif
23
24int get_cluster_size(enum cache_level level)
25{
26 cpumask_var_t mask;
27 int ok;
28 int num_cpus;
29
30 if (level == GLOBAL_CLUSTER)
31 return num_online_cpus();
32 else {
33 if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
34 return -ENOMEM;
35 /* assumes CPU 0 is representative of all CPUs */
36 ok = get_shared_cpu_map(mask, 0, level);
37 /* ok == 0 means we got the map; otherwise it's an invalid cache level */
38 if (ok == 0)
39 num_cpus = cpumask_weight(mask);
40 free_cpumask_var(mask);
41
42 if (ok == 0)
43 return num_cpus;
44 else
45 return -EINVAL;
46 }
47}
48
49int assign_cpus_to_clusters(enum cache_level level,
50 struct scheduling_cluster* clusters[],
51 unsigned int num_clusters,
52 struct cluster_cpu* cpus[],
53 unsigned int num_cpus)
54{
55 cpumask_var_t mask;
56 unsigned int i, free_cluster = 0, low_cpu;
57 int err = 0;
58
59 if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
60 return -ENOMEM;
61
62 /* clear cluster pointers */
63 for (i = 0; i < num_cpus; i++) {
64 cpus[i]->id = i;
65 cpus[i]->cluster = NULL;
66 }
67
68 /* initialize clusters */
69 for (i = 0; i < num_clusters; i++) {
70 clusters[i]->id = i;
71 INIT_LIST_HEAD(&clusters[i]->cpus);
72 }
73
74	/* Assign each CPU. Two assumptions are made:
75 * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask).
76 * 2) All cpus that belong to some cluster are online.
77 */
78 for_each_online_cpu(i) {
79 /* get lowest-id CPU in cluster */
80 if (level != GLOBAL_CLUSTER) {
81 err = get_shared_cpu_map(mask, cpus[i]->id, level);
82 if (err != 0) {
83 /* ugh... wrong cache level? Either caller screwed up
84 * or the CPU topology is weird. */
85 printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n",
86 level, err);
87 err = -EINVAL;
88 goto out;
89 }
90 low_cpu = cpumask_first(mask);
91 } else
92 low_cpu = 0;
93 if (low_cpu == i) {
94 /* caller must provide an appropriate number of clusters */
95 BUG_ON(free_cluster >= num_clusters);
96
97 /* create new cluster */
98 cpus[i]->cluster = clusters[free_cluster++];
99 } else {
100 /* low_cpu points to the right cluster
101 * Assumption: low_cpu is actually online and was processed earlier. */
102 cpus[i]->cluster = cpus[low_cpu]->cluster;
103 }
104 /* enqueue in cpus list */
105 list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);
106		printk(KERN_INFO "Assigning CPU%u to cluster %u.\n", i, cpus[i]->cluster->id);
107 }
108out:
109 free_cpumask_var(mask);
110 return err;
111}
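
[Editor's note] A sketch of how a clustered plugin could size and populate its clusters at activation time using the two helpers above. The wrapper-free allocation, on-stack pointer arrays, and omitted error cleanup are simplifications for illustration; real plugins embed scheduling_cluster/cluster_cpu in their own per-cluster and per-CPU state.

/* Sketch only: build clusters for a given cache level. */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <litmus/clustered.h>

static int demo_setup_clusters(enum cache_level level)
{
	struct scheduling_cluster *clusters[NR_CPUS];	/* on-stack for brevity */
	struct cluster_cpu *cpus[NR_CPUS];
	int cluster_size, num_clusters, num_cpus, i;

	cluster_size = get_cluster_size(level);
	if (cluster_size <= 0)
		return -EINVAL;

	num_cpus = num_online_cpus();
	num_clusters = num_cpus / cluster_size;

	/* allocation-failure handling omitted in this sketch */
	for (i = 0; i < num_clusters; i++)
		clusters[i] = kzalloc(sizeof(*clusters[i]), GFP_ATOMIC);
	for (i = 0; i < num_cpus; i++)
		cpus[i] = kzalloc(sizeof(*cpus[i]), GFP_ATOMIC);

	return assign_cpus_to_clusters(level, clusters, num_clusters,
				       cpus, num_cpus);
}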
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
new file mode 100644
index 000000000000..3e2ac2b3fbe8
--- /dev/null
+++ b/litmus/ctrldev.c
@@ -0,0 +1,160 @@
1#include <linux/sched.h>
2#include <linux/mm.h>
3#include <linux/fs.h>
4#include <linux/miscdevice.h>
5#include <linux/module.h>
6
7#include <litmus/litmus.h>
8
9/* only one page for now, but we might want to add a RO version at some point */
10
11#define CTRL_NAME "litmus/ctrl"
12
13/* allocate t->rt_param.ctrl_page*/
14static int alloc_ctrl_page(struct task_struct *t)
15{
16 int err = 0;
17
18 /* only allocate if the task doesn't have one yet */
19 if (!tsk_rt(t)->ctrl_page) {
20 tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
21 if (!tsk_rt(t)->ctrl_page)
22 err = -ENOMEM;
23 /* will get de-allocated in task teardown */
24 TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
25 tsk_rt(t)->ctrl_page);
26 }
27 return err;
28}
29
30static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
31{
32 int err;
33
34 struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
35
36 TRACE_CUR(CTRL_NAME
37 ": mapping %p (pfn:%lx) to 0x%lx (prot:%lx)\n",
38 tsk_rt(t)->ctrl_page,page_to_pfn(ctrl), vma->vm_start,
39 vma->vm_page_prot);
40
41 /* Map it into the vma. */
42 err = vm_insert_page(vma, vma->vm_start, ctrl);
43
44 if (err)
45 TRACE_CUR(CTRL_NAME ": vm_insert_page() failed (%d)\n", err);
46
47 return err;
48}
49
50static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
51{
52 TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__,
53 vma->vm_flags, vma->vm_page_prot);
54
55 TRACE_CUR(CTRL_NAME
56 ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
57 (void*) vma->vm_start, (void*) vma->vm_end, vma,
58 vma->vm_private_data);
59}
60
61static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
62 struct vm_fault* vmf)
63{
64 TRACE_CUR("%s flags=0x%x (off:%ld)\n", __FUNCTION__,
65 vma->vm_flags, vmf->pgoff);
66
67 /* This function should never be called, since all pages should have
68 * been mapped by mmap() already. */
69 WARN_ONCE(1, "Page faults should be impossible in the control page\n");
70
71 return VM_FAULT_SIGBUS;
72}
73
74static struct vm_operations_struct litmus_ctrl_vm_ops = {
75 .close = litmus_ctrl_vm_close,
76 .fault = litmus_ctrl_vm_fault,
77};
78
79static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
80{
81 int err = 0;
82
83 /* first make sure mapper knows what he's doing */
84
85 /* you can only get one page */
86 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
87 return -EINVAL;
88
89 /* you can only map the "first" page */
90 if (vma->vm_pgoff != 0)
91 return -EINVAL;
92
93 /* you can't share it with anyone */
94 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
95 return -EINVAL;
96
97 vma->vm_ops = &litmus_ctrl_vm_ops;
98 /* This mapping should not be kept across forks,
99 * cannot be expanded, and is not a "normal" page. */
100 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_IO;
101
102 /* We don't want the first write access to trigger a "minor" page fault
103 * to mark the page as dirty. This is transient, private memory, we
104 * don't care if it was touched or not. __S011 means RW access, but not
105 * execute, and avoids copy-on-write behavior.
106 * See protection_map in mmap.c. */
107 vma->vm_page_prot = __S011;
108
109 err = alloc_ctrl_page(current);
110 if (!err)
111 err = map_ctrl_page(current, vma);
112
113 TRACE_CUR("%s flags=0x%x prot=0x%lx\n",
114 __FUNCTION__, vma->vm_flags, vma->vm_page_prot);
115
116 return err;
117}
118
119static struct file_operations litmus_ctrl_fops = {
120 .owner = THIS_MODULE,
121 .mmap = litmus_ctrl_mmap,
122};
123
124static struct miscdevice litmus_ctrl_dev = {
125 .name = CTRL_NAME,
126 .minor = MISC_DYNAMIC_MINOR,
127 .fops = &litmus_ctrl_fops,
128};
129
130static int __init init_litmus_ctrl_dev(void)
131{
132 int err;
133
134 BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
135
136 BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t));
137
138 BUILD_BUG_ON(offsetof(struct control_page, sched.raw)
139 != LITMUS_CP_OFFSET_SCHED);
140 BUILD_BUG_ON(offsetof(struct control_page, irq_count)
141 != LITMUS_CP_OFFSET_IRQ_COUNT);
142 BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start)
143 != LITMUS_CP_OFFSET_TS_SC_START);
144 BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start)
145 != LITMUS_CP_OFFSET_IRQ_SC_START);
146
147 printk("Initializing LITMUS^RT control device.\n");
148 err = misc_register(&litmus_ctrl_dev);
149 if (err)
150 printk("Could not allocate %s device (%d).\n", CTRL_NAME, err);
151 return err;
152}
153
154static void __exit exit_litmus_ctrl_dev(void)
155{
156 misc_deregister(&litmus_ctrl_dev);
157}
158
159module_init(init_litmus_ctrl_dev);
160module_exit(exit_litmus_ctrl_dev);
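
[Editor's note] From user space, the control page is obtained by mapping exactly one page of the misc device (named "litmus/ctrl", so typically /dev/litmus/ctrl) with a private mapping, matching the checks in litmus_ctrl_mmap(). A minimal user-space sketch; the control page layout is defined in rt_param.h and is treated as opaque here.

/* User-space sketch only: map the LITMUS^RT control page. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *ctrl;
	int fd = open("/dev/litmus/ctrl", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* exactly one page, offset 0, private: anything else is rejected
	 * with -EINVAL by litmus_ctrl_mmap() */
	ctrl = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (ctrl == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	printf("control page mapped at %p\n", ctrl);
	munmap(ctrl, page);
	close(fd);
	return 0;
}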
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 000000000000..5aca2934a7b5
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,200 @@
1/*
2 * kernel/edf_common.c
3 *
4 * Common functions for EDF based scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15#include <litmus/edf_common.h>
16
17#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM
18#include <litmus/fpmath.h>
19#endif
20
21#ifdef CONFIG_EDF_TIE_BREAK_HASH
22#include <linux/hash.h>
23static inline long edf_hash(struct task_struct *t)
24{
25 /* pid is 32 bits, so normally we would shove that into the
26	 * upper 32 bits and put the job number in the bottom
27	 * and hash the 64-bit number with hash_64(). Sadly,
28	 * in testing, hash_64() doesn't distribute keys where the
29 * upper bits are close together (as would be the case with
30 * pids) and job numbers are equal (as would be the case with
31 * synchronous task sets with all relative deadlines equal).
32 *
33 * A 2006 Linux patch proposed the following solution
34 * (but for some reason it wasn't accepted...).
35 *
36 * At least this workaround works for 32-bit systems as well.
37 */
38 return hash_32(hash_32((u32)tsk_rt(t)->job_params.job_no, 32) ^ t->pid, 32);
39}
40#endif
41
42
43/* edf_higher_prio - returns true if first has a higher EDF priority
44 * than second. Deadline ties are broken by the configured tie-break policy.
45 *
46 * both first and second may be NULL
47 */
48int edf_higher_prio(struct task_struct* first,
49 struct task_struct* second)
50{
51 struct task_struct *first_task = first;
52 struct task_struct *second_task = second;
53
54 /* There is no point in comparing a task to itself. */
55 if (first && first == second) {
56 TRACE_TASK(first,
57 "WARNING: pointless edf priority comparison.\n");
58 return 0;
59 }
60
61
62 /* check for NULL tasks */
63 if (!first || !second)
64 return first && !second;
65
66#ifdef CONFIG_LITMUS_LOCKING
67
68 /* Check for inherited priorities. Change task
69 * used for comparison in such a case.
70 */
71 if (unlikely(first->rt_param.inh_task))
72 first_task = first->rt_param.inh_task;
73 if (unlikely(second->rt_param.inh_task))
74 second_task = second->rt_param.inh_task;
75
76 /* Check for priority boosting. Tie-break by start of boosting.
77 */
78 if (unlikely(is_priority_boosted(first_task))) {
79 /* first_task is boosted, how about second_task? */
80 if (!is_priority_boosted(second_task) ||
81 lt_before(get_boost_start(first_task),
82 get_boost_start(second_task)))
83 return 1;
84 else
85 return 0;
86 } else if (unlikely(is_priority_boosted(second_task)))
87 /* second_task is boosted, first is not*/
88 return 0;
89
90#endif
91
92 if (earlier_deadline(first_task, second_task)) {
93 return 1;
94 }
95 else if (get_deadline(first_task) == get_deadline(second_task)) {
96		/* Need to tie-break. Each method must set pid_break to indicate
97		 * whether the tie should fall through to the PID-based tie-break below.
98 */
99 int pid_break;
100
101
102#if defined(CONFIG_EDF_TIE_BREAK_LATENESS)
103 /* Tie break by lateness. Jobs with greater lateness get
104 * priority. This should spread tardiness across all tasks,
105 * especially in task sets where all tasks have the same
106 * period and relative deadlines.
107 */
108 if (get_lateness(first_task) > get_lateness(second_task)) {
109 return 1;
110 }
111 pid_break = (get_lateness(first_task) == get_lateness(second_task));
112
113
114#elif defined(CONFIG_EDF_TIE_BREAK_LATENESS_NORM)
115 /* Tie break by lateness, normalized by relative deadline. Jobs with
116 * greater normalized lateness get priority.
117 *
118 * Note: Considered using the algebraically equivalent
119 * lateness(first)*relative_deadline(second) >
120 lateness(second)*relative_deadline(first)
121 * to avoid fixed-point math, but values are prone to overflow if inputs
122 * are on the order of several seconds, even in 64-bit.
123 */
124 fp_t fnorm = _frac(get_lateness(first_task),
125 get_rt_relative_deadline(first_task));
126 fp_t snorm = _frac(get_lateness(second_task),
127 get_rt_relative_deadline(second_task));
128 if (_gt(fnorm, snorm)) {
129 return 1;
130 }
131 pid_break = _eq(fnorm, snorm);
132
133
134#elif defined(CONFIG_EDF_TIE_BREAK_HASH)
135		/* Tie break by comparing hashes of the (pid, job#) tuple. There should be
136 * a 50% chance that first_task has a higher priority than second_task.
137 */
138 long fhash = edf_hash(first_task);
139 long shash = edf_hash(second_task);
140 if (fhash < shash) {
141 return 1;
142 }
143 pid_break = (fhash == shash);
144#else
145
146
147 /* CONFIG_EDF_PID_TIE_BREAK */
148		pid_break = 1; /* fall through to tie-break by PID */
149#endif
150
151 /* Tie break by pid */
152 if(pid_break) {
153 if (first_task->pid < second_task->pid) {
154 return 1;
155 }
156 else if (first_task->pid == second_task->pid) {
157 /* If the PIDs are the same then the task with the
158 * inherited priority wins.
159 */
160 if (!second->rt_param.inh_task) {
161 return 1;
162 }
163 }
164 }
165 }
166 return 0; /* fall-through. prio(second_task) > prio(first_task) */
167}
168
169int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
170{
171 return edf_higher_prio(bheap2task(a), bheap2task(b));
172}
173
174void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
175 release_jobs_t release)
176{
177 rt_domain_init(rt, edf_ready_order, resched, release);
178}
179
180/* need_to_preempt - check whether the task t needs to be preempted
181 * call only with irqs disabled and with ready_lock acquired
182 * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
183 */
184int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
185{
186 /* we need the read lock for edf_ready_queue */
187 /* no need to preempt if there is nothing pending */
188 if (!__jobs_pending(rt))
189 return 0;
190 /* we need to reschedule if t doesn't exist */
191 if (!t)
192 return 1;
193
194 /* NOTE: We cannot check for non-preemptibility since we
195 * don't know what address space we're currently in.
196 */
197
198 /* make sure to get non-rt stuff out of the way */
199 return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t);
200}
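
[Editor's note] A plugin typically pairs edf_preemption_needed() with the preemption infrastructure: after a release or completion changes the ready queue, it checks whether the currently scheduled task must be preempted and, if so, forces a reschedule. A minimal sketch under the assumptions stated in the comment above (ready lock held, IRQs off); the helper itself is illustrative.

/* Sketch only: typical use of edf_preemption_needed() on the local CPU
 * of a partitioned plugin.  'scheduled' is the task currently running. */
#include <litmus/edf_common.h>
#include <litmus/preempt.h>
#include <litmus/rt_domain.h>

static void demo_check_for_preemption(rt_domain_t *edf,
				      struct task_struct *scheduled)
{
	if (edf_preemption_needed(edf, scheduled))
		/* force the local CPU back into the scheduler so that the
		 * plugin's schedule() callback can pick the new job */
		litmus_reschedule_local();
}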
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 000000000000..c4b450be4509
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,307 @@
1/* fdso.c - file descriptor attached shared objects
2 *
3 * (c) 2007 B. Brandenburg, LITMUS^RT project
4 *
5 * Notes:
6 *   - object descriptor (OD) tables are not cloned during a fork.
7 * - objects are created on-demand, and freed after the last reference
8 * is dropped.
9 * - for now, object types are hard coded.
10 * - As long as we have live objects, we keep a reference to the inode.
11 */
12
13#include <linux/errno.h>
14#include <linux/sched.h>
15#include <linux/mutex.h>
16#include <linux/file.h>
17#include <asm/uaccess.h>
18
19#include <litmus/fdso.h>
20
21extern struct fdso_ops generic_lock_ops;
22
23static const struct fdso_ops* fdso_ops[] = {
24 &generic_lock_ops, /* FMLP_SEM */
25 &generic_lock_ops, /* SRP_SEM */
26 &generic_lock_ops, /* MPCP_SEM */
27 &generic_lock_ops, /* MPCP_VS_SEM */
28 &generic_lock_ops, /* DPCP_SEM */
29 &generic_lock_ops, /* PCP_SEM */
30};
31
32static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
33{
34 BUILD_BUG_ON(ARRAY_SIZE(fdso_ops) != MAX_OBJ_TYPE + 1);
35
36 if (fdso_ops[type]->create)
37 return fdso_ops[type]->create(obj_ref, type, config);
38 else
39 return -EINVAL;
40}
41
42static void fdso_destroy(obj_type_t type, void* obj)
43{
44 fdso_ops[type]->destroy(type, obj);
45}
46
47static int fdso_open(struct od_table_entry* entry, void* __user config)
48{
49 if (fdso_ops[entry->obj->type]->open)
50 return fdso_ops[entry->obj->type]->open(entry, config);
51 else
52 return 0;
53}
54
55static int fdso_close(struct od_table_entry* entry)
56{
57 if (fdso_ops[entry->obj->type]->close)
58 return fdso_ops[entry->obj->type]->close(entry);
59 else
60 return 0;
61}
62
63/* inode must be locked already */
64static int alloc_inode_obj(struct inode_obj_id** obj_ref,
65 struct inode* inode,
66 obj_type_t type,
67 unsigned int id,
68 void* __user config)
69{
70 struct inode_obj_id* obj;
71 void* raw_obj;
72 int err;
73
74 obj = kmalloc(sizeof(*obj), GFP_KERNEL);
75 if (!obj) {
76 return -ENOMEM;
77 }
78
79 err = fdso_create(&raw_obj, type, config);
80 if (err != 0) {
81 kfree(obj);
82 return err;
83 }
84
85 INIT_LIST_HEAD(&obj->list);
86 atomic_set(&obj->count, 1);
87 obj->type = type;
88 obj->id = id;
89 obj->obj = raw_obj;
90 obj->inode = inode;
91
92 list_add(&obj->list, &inode->i_obj_list);
93 atomic_inc(&inode->i_count);
94
95 printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
96
97 *obj_ref = obj;
98 return 0;
99}
100
101/* inode must be locked already */
102static struct inode_obj_id* get_inode_obj(struct inode* inode,
103 obj_type_t type,
104 unsigned int id)
105{
106 struct list_head* pos;
107 struct inode_obj_id* obj = NULL;
108
109 list_for_each(pos, &inode->i_obj_list) {
110 obj = list_entry(pos, struct inode_obj_id, list);
111 if (obj->id == id && obj->type == type) {
112 atomic_inc(&obj->count);
113 return obj;
114 }
115 }
116 printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
117 return NULL;
118}
119
120
121static void put_inode_obj(struct inode_obj_id* obj)
122{
123 struct inode* inode;
124 int let_go = 0;
125
126 inode = obj->inode;
127 if (atomic_dec_and_test(&obj->count)) {
128
129 mutex_lock(&inode->i_obj_mutex);
130 /* no new references can be obtained */
131 if (!atomic_read(&obj->count)) {
132 list_del(&obj->list);
133 fdso_destroy(obj->type, obj->obj);
134 kfree(obj);
135 let_go = 1;
136 }
137 mutex_unlock(&inode->i_obj_mutex);
138 if (let_go)
139 iput(inode);
140 }
141}
142
143static struct od_table_entry* get_od_entry(struct task_struct* t)
144{
145 struct od_table_entry* table;
146 int i;
147
148
149 table = t->od_table;
150 if (!table) {
151 table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS,
152 GFP_KERNEL);
153 t->od_table = table;
154 }
155
156 for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
157 if (!table[i].used) {
158 table[i].used = 1;
159 return table + i;
160 }
161 return NULL;
162}
163
164static int put_od_entry(struct od_table_entry* od)
165{
166 put_inode_obj(od->obj);
167 od->used = 0;
168 return 0;
169}
170
171static long close_od_entry(struct od_table_entry *od)
172{
173 long ret;
174
175 /* Give the class a chance to reject the close. */
176 ret = fdso_close(od);
177 if (ret == 0)
178 ret = put_od_entry(od);
179
180 return ret;
181}
182
183void exit_od_table(struct task_struct* t)
184{
185 int i;
186
187 if (t->od_table) {
188 for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
189 if (t->od_table[i].used)
190 close_od_entry(t->od_table + i);
191 kfree(t->od_table);
192 t->od_table = NULL;
193 }
194}
195
196static int do_sys_od_open(struct file* file, obj_type_t type, int id,
197 void* __user config)
198{
199 int idx = 0, err = 0;
200 struct inode* inode;
201 struct inode_obj_id* obj = NULL;
202 struct od_table_entry* entry;
203
204 inode = file->f_dentry->d_inode;
205
206 entry = get_od_entry(current);
207 if (!entry)
208 return -ENOMEM;
209
210 mutex_lock(&inode->i_obj_mutex);
211 obj = get_inode_obj(inode, type, id);
212 if (!obj)
213 err = alloc_inode_obj(&obj, inode, type, id, config);
214 if (err != 0) {
215 obj = NULL;
216 idx = err;
217 entry->used = 0;
218 } else {
219 entry->obj = obj;
220 entry->class = fdso_ops[type];
221 idx = entry - current->od_table;
222 }
223
224 mutex_unlock(&inode->i_obj_mutex);
225
226 /* open only if creation succeeded */
227 if (!err)
228 err = fdso_open(entry, config);
229 if (err < 0) {
230 /* The class rejected the open call.
231 * We need to clean up and tell user space.
232 */
233 if (obj)
234 put_od_entry(entry);
235 idx = err;
236 }
237
238 return idx;
239}
240
241
242struct od_table_entry* get_entry_for_od(int od)
243{
244 struct task_struct *t = current;
245
246 if (!t->od_table)
247 return NULL;
248 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
249 return NULL;
250 if (!t->od_table[od].used)
251 return NULL;
252 return t->od_table + od;
253}
254
255
256asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
257{
258 int ret = 0;
259 struct file* file;
260
261 /*
262 1) get file from fd, get inode from file
263 2) lock inode
264 3) try to lookup object
265 4) if not present create and enqueue object, inc inode refcnt
266 5) increment refcnt of object
267 6) alloc od_table_entry, setup ptrs
268 7) unlock inode
269 8) return offset in od_table as OD
270 */
271
272 if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
273 ret = -EINVAL;
274 goto out;
275 }
276
277 file = fget(fd);
278 if (!file) {
279 ret = -EBADF;
280 goto out;
281 }
282
283 ret = do_sys_od_open(file, type, obj_id, config);
284
285 fput(file);
286
287out:
288 return ret;
289}
290
291
292asmlinkage long sys_od_close(int od)
293{
294 int ret = -EINVAL;
295 struct task_struct *t = current;
296
297 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
298 return ret;
299
300 if (!t->od_table || !t->od_table[od].used)
301 return ret;
302
303
304 ret = close_od_entry(t->od_table + od);
305
306 return ret;
307}
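
For illustration only (not part of the applied diff): a minimal stand-alone C sketch of the descriptor-table pattern used by get_od_entry()/put_od_entry() above — hand out the index of the first unused slot and recycle it on close. It deliberately ignores the inode-object reference counting that the real code handles with atomic_inc()/atomic_dec_and_test(); all demo_* names and MAX_DEMO_ODS are invented for this sketch.

#include <stdio.h>

#define MAX_DEMO_ODS 16   /* invented constant; the kernel uses MAX_OBJECT_DESCRIPTORS */

struct demo_od_entry {
	int used;
	void *obj;
};

static struct demo_od_entry demo_od_table[MAX_DEMO_ODS];

/* Mirror of get_od_entry()/do_sys_od_open(): return the index of the
 * first free slot, or -1 if the table is full. */
static int demo_od_open(void *obj)
{
	int i;
	for (i = 0; i < MAX_DEMO_ODS; i++)
		if (!demo_od_table[i].used) {
			demo_od_table[i].used = 1;
			demo_od_table[i].obj = obj;
			return i;
		}
	return -1;
}

/* Mirror of put_od_entry(): mark the slot as free again. */
static void demo_od_close(int od)
{
	if (od >= 0 && od < MAX_DEMO_ODS)
		demo_od_table[od].used = 0;
}

int main(void)
{
	int x = 42;
	int od = demo_od_open(&x);
	printf("opened od=%d\n", od);   /* prints 0 on first use */
	demo_od_close(od);
	return 0;
}
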
diff --git a/litmus/fp_common.c b/litmus/fp_common.c
new file mode 100644
index 000000000000..964a4729deff
--- /dev/null
+++ b/litmus/fp_common.c
@@ -0,0 +1,119 @@
1/*
2 * litmus/fp_common.c
3 *
4 * Common functions for fixed-priority scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15#include <litmus/fp_common.h>
16
17/* fp_higher_prio - returns true if first has a higher static priority
18 * than second. Ties are broken by PID.
19 *
20 * both first and second may be NULL
21 */
22int fp_higher_prio(struct task_struct* first,
23 struct task_struct* second)
24{
25 struct task_struct *first_task = first;
26 struct task_struct *second_task = second;
27
28 /* There is no point in comparing a task to itself. */
29 if (unlikely(first && first == second)) {
30 TRACE_TASK(first,
31 "WARNING: pointless FP priority comparison.\n");
32 return 0;
33 }
34
35
36 /* check for NULL tasks */
37 if (!first || !second)
38 return first && !second;
39
40 if (!is_realtime(second_task))
41 return 1;
42
43#ifdef CONFIG_LITMUS_LOCKING
44
45 /* Check for inherited priorities. Change task
46 * used for comparison in such a case.
47 */
48 if (unlikely(first->rt_param.inh_task))
49 first_task = first->rt_param.inh_task;
50 if (unlikely(second->rt_param.inh_task))
51 second_task = second->rt_param.inh_task;
52
53 /* Check for priority boosting. Tie-break by start of boosting.
54 */
55 if (unlikely(is_priority_boosted(first_task))) {
56 /* first_task is boosted, how about second_task? */
57 if (is_priority_boosted(second_task))
58 /* break by priority point */
59 return lt_before(get_boost_start(first_task),
60 get_boost_start(second_task));
61 else
62 /* priority boosting wins. */
63 return 1;
64 } else if (unlikely(is_priority_boosted(second_task)))
65 /* second_task is boosted, first is not*/
66 return 0;
67
68#endif
69
70 /* Comparisons to itself are not expected; priority inheritance
71 * should also not cause this to happen. */
72 BUG_ON(first_task == second_task);
73
74 if (get_priority(first_task) < get_priority(second_task))
75 return 1;
76 else if (get_priority(first_task) == get_priority(second_task))
77 /* Break by PID. */
78 return first_task->pid < second_task->pid;
79 else
80 return 0;
81}
82
83int fp_ready_order(struct bheap_node* a, struct bheap_node* b)
84{
85 return fp_higher_prio(bheap2task(a), bheap2task(b));
86}
87
88void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
89 release_jobs_t release)
90{
91 rt_domain_init(rt, fp_ready_order, resched, release);
92}
93
94/* need_to_preempt - check whether the task t needs to be preempted
95 */
96int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t)
97{
98 struct task_struct *pending;
99
100 pending = fp_prio_peek(q);
101
102 if (!pending)
103 return 0;
104 if (!t)
105 return 1;
106
107 /* make sure to get non-rt stuff out of the way */
108 return !is_realtime(t) || fp_higher_prio(pending, t);
109}
110
111void fp_prio_queue_init(struct fp_prio_queue* q)
112{
113 int i;
114
115 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
116 q->bitmask[i] = 0;
117 for (i = 0; i < LITMUS_MAX_PRIORITY; i++)
118 bheap_init(&q->queue[i]);
119}
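
For illustration only (not part of the applied diff): a minimal sketch of the core ordering implemented by fp_higher_prio() above — lower numeric priority wins, ties broken by PID. It omits the NULL checks, priority inheritance, and priority boosting that the real function handles; the demo_* names are invented.

#include <stdio.h>

/* Simplified stand-in for the kernel's task descriptor; the real code
 * compares get_priority() values and falls back to the PID. */
struct demo_task {
	unsigned int priority;  /* lower value = higher priority */
	int pid;
};

/* Mirrors the core of fp_higher_prio(): lower numeric priority wins,
 * ties are broken in favour of the smaller PID. */
static int demo_fp_higher_prio(const struct demo_task *a, const struct demo_task *b)
{
	if (a->priority != b->priority)
		return a->priority < b->priority;
	return a->pid < b->pid;
}

int main(void)
{
	struct demo_task t1 = { .priority = 2, .pid = 100 };
	struct demo_task t2 = { .priority = 2, .pid = 101 };
	printf("%d\n", demo_fp_higher_prio(&t1, &t2)); /* 1: same priority, smaller PID wins */
	return 0;
}
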
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 000000000000..89a0810415fa
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,55 @@
1/* litmus/jobs.c - common job control code
2 */
3
4#include <linux/sched.h>
5
6#include <litmus/litmus.h>
7#include <litmus/jobs.h>
8
9static inline void setup_release(struct task_struct *t, lt_t release)
10{
11 /* prepare next release */
12 t->rt_param.job_params.release = release;
13 t->rt_param.job_params.deadline = release + get_rt_relative_deadline(t);
14 t->rt_param.job_params.exec_time = 0;
15
16 /* update job sequence number */
17 t->rt_param.job_params.job_no++;
18}
19
20void prepare_for_next_period(struct task_struct *t)
21{
22 BUG_ON(!t);
23
24 /* Record lateness before we set up the next job's
25 * release and deadline. Lateness may be negative.
26 */
27 t->rt_param.job_params.lateness =
28 (long long)litmus_clock() -
29 (long long)t->rt_param.job_params.deadline;
30
31 setup_release(t, get_release(t) + get_rt_period(t));
32 tsk_rt(t)->dont_requeue = 0;
33}
34
35void release_at(struct task_struct *t, lt_t start)
36{
37 BUG_ON(!t);
38 setup_release(t, start);
39 tsk_rt(t)->completed = 0;
40}
41
42
43/*
44 * Deactivate current task until the beginning of the next period.
45 */
46long complete_job(void)
47{
48	/* Mark that we do not execute anymore */
49 tsk_rt(current)->completed = 1;
50	/* Call schedule(); it will return when a new job arrives.
51	 * It also takes care of preparing for the next release.
52	 */
53 schedule();
54 return 0;
55}
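
For illustration only (not part of the applied diff): the release arithmetic performed by setup_release()/prepare_for_next_period() above as a stand-alone sketch — the next release is one period after the previous one, and the absolute deadline is the release plus the relative deadline. prepare_for_next_period() additionally records lateness = now - previous deadline, which may be negative. The demo_* names and the 10 ms / 5 ms values are invented for this sketch.

#include <stdio.h>

typedef unsigned long long lt_t;  /* nanoseconds, as in the kernel's lt_t */

struct demo_job {
	lt_t release;
	lt_t deadline;
	unsigned int job_no;
};

/* Mirrors setup_release(): advance by one period, re-derive the deadline,
 * and bump the job sequence number. */
static void demo_next_period(struct demo_job *j, lt_t period, lt_t rel_deadline)
{
	j->release += period;
	j->deadline = j->release + rel_deadline;
	j->job_no++;
}

int main(void)
{
	struct demo_job j = { .release = 0, .deadline = 5000000, .job_no = 1 };
	demo_next_period(&j, 10000000ULL, 5000000ULL);  /* period 10 ms, rel. deadline 5 ms */
	printf("job %u: release=%llu deadline=%llu\n", j.job_no, j.release, j.deadline);
	return 0;
}
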
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 000000000000..7417a8fbda74
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,576 @@
1/*
2 * litmus.c -- Implementation of the LITMUS syscalls,
3 * the LITMUS initialization code,
4 * and the procfs interface.
5 */
6#include <asm/uaccess.h>
7#include <linux/uaccess.h>
8#include <linux/sysrq.h>
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/reboot.h>
13#include <linux/stop_machine.h>
14
15#include <litmus/litmus.h>
16#include <litmus/bheap.h>
17#include <litmus/trace.h>
18#include <litmus/rt_domain.h>
19#include <litmus/litmus_proc.h>
20#include <litmus/sched_trace.h>
21
22#ifdef CONFIG_SCHED_CPU_AFFINITY
23#include <litmus/affinity.h>
24#endif
25
26/* Number of RT tasks that exist in the system */
27atomic_t rt_task_count = ATOMIC_INIT(0);
28
29#ifdef CONFIG_RELEASE_MASTER
30/* current master CPU for handling timer IRQs */
31atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
32#endif
33
34static struct kmem_cache * bheap_node_cache;
35extern struct kmem_cache * release_heap_cache;
36
37struct bheap_node* bheap_node_alloc(int gfp_flags)
38{
39 return kmem_cache_alloc(bheap_node_cache, gfp_flags);
40}
41
42void bheap_node_free(struct bheap_node* hn)
43{
44 kmem_cache_free(bheap_node_cache, hn);
45}
46
47struct release_heap* release_heap_alloc(int gfp_flags);
48void release_heap_free(struct release_heap* rh);
49
50/*
51 * sys_set_task_rt_param
52 * @pid: Pid of the task which scheduling parameters must be changed
53 * @param: New real-time extension parameters such as the execution cost and
54 * period
55 * Syscall for manipulating a task's rt extension params
56 * Returns EINVAL if param is NULL or pid is negative.
57 *	   EFAULT if param could not be copied from user space.
58 *	   ESRCH  if pid does not correspond to a valid task.
59 *	   EINVAL if either period or execution cost is <= 0.
60 *	   EBUSY  if the task is already a real-time task.
61 *	   0 if successful.
62 *
63 * Only non-real-time tasks may be configured with this system call
64 * to avoid races with the scheduler. In practice, this means that a
65 * task's parameters must be set _before_ calling sys_prepare_rt_task()
66 *
67 * find_task_by_vpid() assumes that we are in the same namespace as the
68 * target.
69 */
70asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
71{
72 struct rt_task tp;
73 struct task_struct *target;
74 int retval = -EINVAL;
75
76 printk("Setting up rt task parameters for process %d.\n", pid);
77
78 if (pid < 0 || param == 0) {
79 goto out;
80 }
81 if (copy_from_user(&tp, param, sizeof(tp))) {
82 retval = -EFAULT;
83 goto out;
84 }
85
86 /* Task search and manipulation must be protected */
87 read_lock_irq(&tasklist_lock);
88 if (!(target = find_task_by_vpid(pid))) {
89 retval = -ESRCH;
90 goto out_unlock;
91 }
92
93 if (is_realtime(target)) {
94 /* The task is already a real-time task.
95		 * We cannot allow parameter changes at this point.
96 */
97 retval = -EBUSY;
98 goto out_unlock;
99 }
100
101 /* set relative deadline to be implicit if left unspecified */
102 if (tp.relative_deadline == 0)
103 tp.relative_deadline = tp.period;
104
105 if (tp.exec_cost <= 0)
106 goto out_unlock;
107 if (tp.period <= 0)
108 goto out_unlock;
109 if (!cpu_online(tp.cpu))
110 goto out_unlock;
111 if (min(tp.relative_deadline, tp.period) < tp.exec_cost) /*density check*/
112 {
113 printk(KERN_INFO "litmus: real-time task %d rejected "
114 "because task density > 1.0\n", pid);
115 goto out_unlock;
116 }
117 if (tp.cls != RT_CLASS_HARD &&
118 tp.cls != RT_CLASS_SOFT &&
119 tp.cls != RT_CLASS_BEST_EFFORT)
120 {
121 printk(KERN_INFO "litmus: real-time task %d rejected "
122 "because its class is invalid\n", pid);
123 goto out_unlock;
124 }
125 if (tp.budget_policy != NO_ENFORCEMENT &&
126 tp.budget_policy != QUANTUM_ENFORCEMENT &&
127 tp.budget_policy != PRECISE_ENFORCEMENT)
128 {
129 printk(KERN_INFO "litmus: real-time task %d rejected "
130 "because unsupported budget enforcement policy "
131 "specified (%d)\n",
132 pid, tp.budget_policy);
133 goto out_unlock;
134 }
135
136 target->rt_param.task_params = tp;
137
138 retval = 0;
139 out_unlock:
140 read_unlock_irq(&tasklist_lock);
141 out:
142 return retval;
143}
144
145/*
146 * Getter of task's RT params
147 * returns EINVAL if param is NULL or pid is invalid
148 * returns ESRCH if pid does not correspond to a valid task
149 * returns EFAULT if copying of parameters has failed.
150 *
151 * find_task_by_vpid() assumes that we are in the same namespace as the
152 * target.
153 */
154asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
155{
156 int retval = -EINVAL;
157 struct task_struct *source;
158 struct rt_task lp;
159 if (param == 0 || pid < 0)
160 goto out;
161 read_lock(&tasklist_lock);
162 if (!(source = find_task_by_vpid(pid))) {
163 retval = -ESRCH;
164 goto out_unlock;
165 }
166 lp = source->rt_param.task_params;
167 read_unlock(&tasklist_lock);
168 /* Do copying outside the lock */
169 retval =
170 copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
171 return retval;
172 out_unlock:
173 read_unlock(&tasklist_lock);
174 out:
175 return retval;
176
177}
178
179/*
180 * This is the crucial function for the periodic task implementation.
181 * It checks whether the task is periodic, whether this kind of sleep
182 * is permitted, and calls the plugin-specific sleep, which puts the
183 * task into a wait array.
184 * returns 0 on successful wakeup
185 * returns EPERM if current conditions do not permit such sleep
186 * returns EINVAL if current task is not able to go to sleep
187 */
188asmlinkage long sys_complete_job(void)
189{
190 int retval = -EPERM;
191 if (!is_realtime(current)) {
192 retval = -EINVAL;
193 goto out;
194 }
195	/* A task with a negative or zero period cannot sleep */
196 if (get_rt_period(current) <= 0) {
197 retval = -EINVAL;
198 goto out;
199 }
200 /* The plugin has to put the task into an
201 * appropriate queue and call schedule
202 */
203 retval = litmus->complete_job();
204 out:
205 return retval;
206}
207
208/* This is an "improved" version of sys_complete_job that
209 * addresses the problem of unintentionally missing a job after
210 * an overrun.
211 *
212 * returns 0 on successful wakeup
213 * returns EPERM if current conditions do not permit such sleep
214 * returns EINVAL if current task is not able to go to sleep
215 */
216asmlinkage long sys_wait_for_job_release(unsigned int job)
217{
218 int retval = -EPERM;
219 if (!is_realtime(current)) {
220 retval = -EINVAL;
221 goto out;
222 }
223
224	/* A task with a negative or zero period cannot sleep */
225 if (get_rt_period(current) <= 0) {
226 retval = -EINVAL;
227 goto out;
228 }
229
230 retval = 0;
231
232 /* first wait until we have "reached" the desired job
233 *
234 * This implementation has at least two problems:
235 *
236 * 1) It doesn't gracefully handle the wrap around of
237 * job_no. Since LITMUS is a prototype, this is not much
238 * of a problem right now.
239 *
240 * 2) It is theoretically racy if a job release occurs
241 * between checking job_no and calling sleep_next_period().
242	 * A proper solution would require adding another callback
243 * in the plugin structure and testing the condition with
244 * interrupts disabled.
245 *
246 * FIXME: At least problem 2 should be taken care of eventually.
247 */
248 while (!retval && job > current->rt_param.job_params.job_no)
249 /* If the last job overran then job <= job_no and we
250 * don't send the task to sleep.
251 */
252 retval = litmus->complete_job();
253 out:
254 return retval;
255}
256
257/* This is a helper syscall to query the current job sequence number.
258 *
259 * returns 0 on successful query
260 * returns EPERM if task is not a real-time task.
261 * returns EFAULT if &job is not a valid pointer.
262 */
263asmlinkage long sys_query_job_no(unsigned int __user *job)
264{
265 int retval = -EPERM;
266 if (is_realtime(current))
267 retval = put_user(current->rt_param.job_params.job_no, job);
268
269 return retval;
270}
271
272/* sys_null_call() is only used for determining raw system call
273 * overheads (kernel entry, kernel exit). It has no useful side effects.
274 * If ts is non-NULL, then the current Feather-Trace time is recorded.
275 */
276asmlinkage long sys_null_call(cycles_t __user *ts)
277{
278 long ret = 0;
279 cycles_t now;
280
281 if (ts) {
282 now = get_cycles();
283 ret = put_user(now, ts);
284 }
285
286 return ret;
287}
288
289/* p is a real-time task. Re-init its state as a best-effort task. */
290static void reinit_litmus_state(struct task_struct* p, int restore)
291{
292 struct rt_task user_config = {};
293 void* ctrl_page = NULL;
294
295 if (restore) {
296		/* Save the user-space provided configuration data
297		 * and the allocated control page. */
298 user_config = p->rt_param.task_params;
299 ctrl_page = p->rt_param.ctrl_page;
300 }
301
302 /* We probably should not be inheriting any task's priority
303 * at this point in time.
304 */
305 WARN_ON(p->rt_param.inh_task);
306
307 /* Cleanup everything else. */
308 memset(&p->rt_param, 0, sizeof(p->rt_param));
309
310 /* Restore preserved fields. */
311 if (restore) {
312 p->rt_param.task_params = user_config;
313 p->rt_param.ctrl_page = ctrl_page;
314 }
315}
316
317long litmus_admit_task(struct task_struct* tsk)
318{
319 long retval = 0;
320
321 BUG_ON(is_realtime(tsk));
322
323 tsk_rt(tsk)->heap_node = NULL;
324 tsk_rt(tsk)->rel_heap = NULL;
325
326 if (get_rt_relative_deadline(tsk) == 0 ||
327 get_exec_cost(tsk) >
328 min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
329 TRACE_TASK(tsk,
330 "litmus admit: invalid task parameters "
331 "(e = %lu, p = %lu, d = %lu)\n",
332 get_exec_cost(tsk), get_rt_period(tsk),
333 get_rt_relative_deadline(tsk));
334 retval = -EINVAL;
335 goto out;
336 }
337
338 if (!cpu_online(get_partition(tsk))) {
339 TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
340 get_partition(tsk));
341 retval = -EINVAL;
342 goto out;
343 }
344
345 INIT_LIST_HEAD(&tsk_rt(tsk)->list);
346
347 /* allocate heap node for this task */
348 tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
349 tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
350
351 if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
352 printk(KERN_WARNING "litmus: no more heap node memory!?\n");
353
354 retval = -ENOMEM;
355 goto out;
356 } else {
357 bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
358 }
359
360 preempt_disable();
361
362 retval = litmus->admit_task(tsk);
363
364 if (!retval) {
365 sched_trace_task_name(tsk);
366 sched_trace_task_param(tsk);
367 atomic_inc(&rt_task_count);
368 }
369
370 preempt_enable();
371
372out:
373 if (retval) {
374 bheap_node_free(tsk_rt(tsk)->heap_node);
375 release_heap_free(tsk_rt(tsk)->rel_heap);
376 }
377 return retval;
378}
379
380void litmus_exit_task(struct task_struct* tsk)
381{
382 if (is_realtime(tsk)) {
383 sched_trace_task_completion(tsk, 1);
384
385 litmus->task_exit(tsk);
386
387 BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
388 bheap_node_free(tsk_rt(tsk)->heap_node);
389 release_heap_free(tsk_rt(tsk)->rel_heap);
390
391 atomic_dec(&rt_task_count);
392 reinit_litmus_state(tsk, 1);
393 }
394}
395
396static int do_plugin_switch(void *_plugin)
397{
398 int ret;
399 struct sched_plugin* plugin = _plugin;
400
401 /* don't switch if there are active real-time tasks */
402 if (atomic_read(&rt_task_count) == 0) {
403 ret = litmus->deactivate_plugin();
404 if (0 != ret)
405 goto out;
406 ret = plugin->activate_plugin();
407 if (0 != ret) {
408 printk(KERN_INFO "Can't activate %s (%d).\n",
409 plugin->plugin_name, ret);
410 plugin = &linux_sched_plugin;
411 }
412 printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
413 litmus = plugin;
414 } else
415 ret = -EBUSY;
416out:
417 return ret;
418}
419
420/* Switching a plugin in use is tricky.
421 * We must watch out that no real-time tasks exist
422 * (and that none is created in parallel) and that the plugin is not
423 * currently in use on any processor (in theory).
424 */
425int switch_sched_plugin(struct sched_plugin* plugin)
426{
427 BUG_ON(!plugin);
428
429 if (atomic_read(&rt_task_count) == 0)
430 return stop_machine(do_plugin_switch, plugin, NULL);
431 else
432 return -EBUSY;
433}
434
435/* Called upon fork.
436 * p is the newly forked task.
437 */
438void litmus_fork(struct task_struct* p)
439{
440 if (is_realtime(p)) {
441 /* clean out any litmus related state, don't preserve anything */
442 reinit_litmus_state(p, 0);
443 /* Don't let the child be a real-time task. */
444 p->sched_reset_on_fork = 1;
445 } else
446 /* non-rt tasks might have ctrl_page set */
447 tsk_rt(p)->ctrl_page = NULL;
448
449 /* od tables are never inherited across a fork */
450 p->od_table = NULL;
451}
452
453/* Called upon execve().
454 * current is doing the exec.
455 * Don't let address space specific stuff leak.
456 */
457void litmus_exec(void)
458{
459 struct task_struct* p = current;
460
461 if (is_realtime(p)) {
462 WARN_ON(p->rt_param.inh_task);
463 if (tsk_rt(p)->ctrl_page) {
464 free_page((unsigned long) tsk_rt(p)->ctrl_page);
465 tsk_rt(p)->ctrl_page = NULL;
466 }
467 }
468}
469
470void exit_litmus(struct task_struct *dead_tsk)
471{
472	/* We also allow non-RT tasks to
473	 * allocate control pages so that
474	 * measurements can include non-RT tasks.
475	 * Hence, check in any case whether the
476	 * page needs to be freed.
477	 */
478 if (tsk_rt(dead_tsk)->ctrl_page) {
479 TRACE_TASK(dead_tsk,
480 "freeing ctrl_page %p\n",
481 tsk_rt(dead_tsk)->ctrl_page);
482 free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
483 }
484
485 /* main cleanup only for RT tasks */
486 if (is_realtime(dead_tsk))
487 litmus_exit_task(dead_tsk);
488}
489
490
491#ifdef CONFIG_MAGIC_SYSRQ
492int sys_kill(int pid, int sig);
493
494static void sysrq_handle_kill_rt_tasks(int key)
495{
496 struct task_struct *t;
497 read_lock(&tasklist_lock);
498 for_each_process(t) {
499 if (is_realtime(t)) {
500 sys_kill(t->pid, SIGKILL);
501 }
502 }
503 read_unlock(&tasklist_lock);
504}
505
506static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
507 .handler = sysrq_handle_kill_rt_tasks,
508 .help_msg = "quit-rt-tasks(X)",
509 .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks",
510};
511#endif
512
513extern struct sched_plugin linux_sched_plugin;
514
515static int litmus_shutdown_nb(struct notifier_block *unused1,
516 unsigned long unused2, void *unused3)
517{
518 /* Attempt to switch back to regular Linux scheduling.
519 * Forces the active plugin to clean up.
520 */
521 if (litmus != &linux_sched_plugin) {
522 int ret = switch_sched_plugin(&linux_sched_plugin);
523 if (ret) {
524 printk("Auto-shutdown of active Litmus plugin failed.\n");
525 }
526 }
527 return NOTIFY_DONE;
528}
529
530static struct notifier_block shutdown_notifier = {
531 .notifier_call = litmus_shutdown_nb,
532};
533
534static int __init _init_litmus(void)
535{
536 /* Common initializers,
537 * mode change lock is used to enforce single mode change
538 * operation.
539 */
540 printk("Starting LITMUS^RT kernel\n");
541
542 register_sched_plugin(&linux_sched_plugin);
543
544 bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC);
545 release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
546
547#ifdef CONFIG_MAGIC_SYSRQ
548 /* offer some debugging help */
549 if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
550 printk("Registered kill rt tasks magic sysrq.\n");
551 else
552 printk("Could not register kill rt tasks magic sysrq.\n");
553#endif
554
555 init_litmus_proc();
556
557#ifdef CONFIG_SCHED_CPU_AFFINITY
558 init_topology();
559#endif
560
561 register_reboot_notifier(&shutdown_notifier);
562
563 return 0;
564}
565
566static void _exit_litmus(void)
567{
568 unregister_reboot_notifier(&shutdown_notifier);
569
570 exit_litmus_proc();
571 kmem_cache_destroy(bheap_node_cache);
572 kmem_cache_destroy(release_heap_cache);
573}
574
575module_init(_init_litmus);
576module_exit(_exit_litmus);
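
For illustration only (not part of the applied diff): a compact sketch of the admission checks performed by sys_set_rt_task_param() and litmus_admit_task() above — cost and period must be positive, an unspecified relative deadline defaults to the period, and the cost must not exceed min(relative deadline, period) ("density" at most 1). The class, budget-policy, and cpu_online() checks are omitted; demo_params_ok is an invented name.

#include <stdio.h>

/* Returns 1 iff the parameter set would pass the density check. */
static int demo_params_ok(unsigned long long exec_cost,
			  unsigned long long period,
			  unsigned long long rel_deadline)
{
	if (rel_deadline == 0)
		rel_deadline = period;          /* implicit deadline */
	if (exec_cost == 0 || period == 0)
		return 0;
	if (exec_cost > (rel_deadline < period ? rel_deadline : period))
		return 0;                       /* density > 1.0 */
	return 1;
}

int main(void)
{
	printf("%d\n", demo_params_ok(3, 10, 0));  /* 1: fits with implicit deadline */
	printf("%d\n", demo_params_ok(7, 10, 5));  /* 0: cost exceeds relative deadline */
	return 0;
}
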
diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c
new file mode 100644
index 000000000000..1ebf1277f5d3
--- /dev/null
+++ b/litmus/litmus_proc.c
@@ -0,0 +1,407 @@
1/*
2 * litmus_proc.c -- Implementation of the /proc/litmus directory tree.
3 */
4
5#include <linux/sched.h>
6#include <linux/uaccess.h>
7#include <linux/seq_file.h>
8
9#include <litmus/litmus.h>
10#include <litmus/litmus_proc.h>
11
12#include <litmus/clustered.h>
13
14/* in litmus/litmus.c */
15extern atomic_t rt_task_count;
16
17static struct proc_dir_entry *litmus_dir = NULL,
18 *curr_file = NULL,
19 *stat_file = NULL,
20 *plugs_dir = NULL,
21#ifdef CONFIG_RELEASE_MASTER
22 *release_master_file = NULL,
23#endif
24 *plugs_file = NULL;
25
26/* in litmus/sync.c */
27int count_tasks_waiting_for_release(void);
28
29static int litmus_stats_proc_show(struct seq_file *m, void *v)
30{
31 seq_printf(m,
32 "real-time tasks = %d\n"
33 "ready for release = %d\n",
34 atomic_read(&rt_task_count),
35 count_tasks_waiting_for_release());
36 return 0;
37}
38
39static int litmus_stats_proc_open(struct inode *inode, struct file *file)
40{
41 return single_open(file, litmus_stats_proc_show, PDE_DATA(inode));
42}
43
44static const struct file_operations litmus_stats_proc_fops = {
45 .open = litmus_stats_proc_open,
46 .read = seq_read,
47 .llseek = seq_lseek,
48 .release = single_release,
49};
50
51
52static int litmus_loaded_proc_show(struct seq_file *m, void *v)
53{
54 print_sched_plugins(m);
55 return 0;
56}
57
58static int litmus_loaded_proc_open(struct inode *inode, struct file *file)
59{
60 return single_open(file, litmus_loaded_proc_show, PDE_DATA(inode));
61}
62
63static const struct file_operations litmus_loaded_proc_fops = {
64 .open = litmus_loaded_proc_open,
65 .read = seq_read,
66 .llseek = seq_lseek,
67 .release = single_release,
68};
69
70
71
72
73/* in litmus/litmus.c */
74int switch_sched_plugin(struct sched_plugin*);
75
76static ssize_t litmus_active_proc_write(struct file *file,
77 const char __user *buffer, size_t count,
78 loff_t *ppos)
79{
80 char name[65];
81 struct sched_plugin* found;
82 ssize_t ret = -EINVAL;
83 int err;
84
85
86 ret = copy_and_chomp(name, sizeof(name), buffer, count);
87 if (ret < 0)
88 return ret;
89
90 found = find_sched_plugin(name);
91
92 if (found) {
93 err = switch_sched_plugin(found);
94 if (err) {
95 printk(KERN_INFO "Could not switch plugin: %d\n", err);
96 ret = err;
97 }
98 } else {
99 printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
100 ret = -ESRCH;
101 }
102
103 return ret;
104}
105
106static int litmus_active_proc_show(struct seq_file *m, void *v)
107{
108 seq_printf(m, "%s\n", litmus->plugin_name);
109 return 0;
110}
111
112static int litmus_active_proc_open(struct inode *inode, struct file *file)
113{
114 return single_open(file, litmus_active_proc_show, PDE_DATA(inode));
115}
116
117static const struct file_operations litmus_active_proc_fops = {
118 .open = litmus_active_proc_open,
119 .read = seq_read,
120 .llseek = seq_lseek,
121 .release = single_release,
122 .write = litmus_active_proc_write,
123};
124
125
126#ifdef CONFIG_RELEASE_MASTER
127static ssize_t litmus_release_master_proc_write(
128 struct file *file,
129 const char __user *buffer, size_t count,
130 loff_t *ppos)
131{
132 int cpu, err, online = 0;
133 char msg[64];
134 ssize_t len;
135
136 len = copy_and_chomp(msg, sizeof(msg), buffer, count);
137
138 if (len < 0)
139 return len;
140
141 if (strcmp(msg, "NO_CPU") == 0)
142 atomic_set(&release_master_cpu, NO_CPU);
143 else {
144 err = sscanf(msg, "%d", &cpu);
145 if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
146 atomic_set(&release_master_cpu, cpu);
147 } else {
148 TRACE("invalid release master: '%s' "
149 "(err:%d cpu:%d online:%d)\n",
150 msg, err, cpu, online);
151 len = -EINVAL;
152 }
153 }
154 return len;
155}
156
157static int litmus_release_master_proc_show(struct seq_file *m, void *v)
158{
159 int master;
160 master = atomic_read(&release_master_cpu);
161 if (master == NO_CPU)
162 seq_printf(m, "NO_CPU\n");
163 else
164 seq_printf(m, "%d\n", master);
165 return 0;
166}
167
168static int litmus_release_master_proc_open(struct inode *inode, struct file *file)
169{
170 return single_open(file, litmus_release_master_proc_show, PDE_DATA(inode));
171}
172
173static const struct file_operations litmus_release_master_proc_fops = {
174 .open = litmus_release_master_proc_open,
175 .read = seq_read,
176 .llseek = seq_lseek,
177 .release = single_release,
178 .write = litmus_release_master_proc_write,
179};
180#endif
181
182int __init init_litmus_proc(void)
183{
184 litmus_dir = proc_mkdir("litmus", NULL);
185 if (!litmus_dir) {
186 printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
187 return -ENOMEM;
188 }
189
190 curr_file = proc_create("active_plugin", 0644, litmus_dir,
191 &litmus_active_proc_fops);
192
193 if (!curr_file) {
194 printk(KERN_ERR "Could not allocate active_plugin "
195 "procfs entry.\n");
196 return -ENOMEM;
197 }
198
199#ifdef CONFIG_RELEASE_MASTER
200 release_master_file = proc_create("release_master", 0644, litmus_dir,
201 &litmus_release_master_proc_fops);
202 if (!release_master_file) {
203 printk(KERN_ERR "Could not allocate release_master "
204 "procfs entry.\n");
205 return -ENOMEM;
206 }
207#endif
208
209 stat_file = proc_create("stats", 0444, litmus_dir, &litmus_stats_proc_fops);
210
211 plugs_dir = proc_mkdir("plugins", litmus_dir);
212 if (!plugs_dir){
213 printk(KERN_ERR "Could not allocate plugins directory "
214 "procfs entry.\n");
215 return -ENOMEM;
216 }
217
218 plugs_file = proc_create("loaded", 0444, plugs_dir,
219 &litmus_loaded_proc_fops);
220
221 return 0;
222}
223
224void exit_litmus_proc(void)
225{
226 if (plugs_file)
227 remove_proc_entry("loaded", plugs_dir);
228 if (plugs_dir)
229 remove_proc_entry("plugins", litmus_dir);
230 if (stat_file)
231 remove_proc_entry("stats", litmus_dir);
232 if (curr_file)
233 remove_proc_entry("active_plugin", litmus_dir);
234#ifdef CONFIG_RELEASE_MASTER
235 if (release_master_file)
236 remove_proc_entry("release_master", litmus_dir);
237#endif
238 if (litmus_dir)
239 remove_proc_entry("litmus", NULL);
240}
241
242long make_plugin_proc_dir(struct sched_plugin* plugin,
243 struct proc_dir_entry** pde_in)
244{
245 struct proc_dir_entry *pde_new = NULL;
246 long rv;
247
248 if (!plugin || !plugin->plugin_name){
249 printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
250 __func__);
251 rv = -EINVAL;
252 goto out_no_pde;
253 }
254
255 if (!plugs_dir){
256 printk(KERN_ERR "Could not make plugin sub-directory, because "
257 "/proc/litmus/plugins does not exist.\n");
258 rv = -ENOENT;
259 goto out_no_pde;
260 }
261
262 pde_new = proc_mkdir(plugin->plugin_name, plugs_dir);
263 if (!pde_new){
264 printk(KERN_ERR "Could not make plugin sub-directory: "
265			"out of memory?\n");
266 rv = -ENOMEM;
267 goto out_no_pde;
268 }
269
270 rv = 0;
271 *pde_in = pde_new;
272 goto out_ok;
273
274out_no_pde:
275 *pde_in = NULL;
276out_ok:
277 return rv;
278}
279
280void remove_plugin_proc_dir(struct sched_plugin* plugin)
281{
282 if (!plugin || !plugin->plugin_name){
283 printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
284 __func__);
285 return;
286 }
287 remove_proc_entry(plugin->plugin_name, plugs_dir);
288}
289
290
291
292/* misc. I/O helper functions */
293
294int copy_and_chomp(char *kbuf, unsigned long ksize,
295 __user const char* ubuf, unsigned long ulength)
296{
297 /* caller must provide buffer space */
298 BUG_ON(!ksize);
299
300 ksize--; /* leave space for null byte */
301
302 if (ksize > ulength)
303 ksize = ulength;
304
305 if(copy_from_user(kbuf, ubuf, ksize))
306 return -EFAULT;
307
308 kbuf[ksize] = '\0';
309
310 /* chomp kbuf */
311 if (ksize > 0 && kbuf[ksize - 1] == '\n')
312 kbuf[ksize - 1] = '\0';
313
314 return ksize;
315}
316
317/* helper functions for clustered plugins */
318static const char* cache_level_names[] = {
319 "ALL",
320 "L1",
321 "L2",
322 "L3",
323};
324
325int parse_cache_level(const char *cache_name, enum cache_level *level)
326{
327 int err = -EINVAL;
328 int i;
329 /* do a quick and dirty comparison to find the cluster size */
330 for (i = GLOBAL_CLUSTER; i <= L3_CLUSTER; i++)
331 if (!strcmp(cache_name, cache_level_names[i])) {
332 *level = (enum cache_level) i;
333 err = 0;
334 break;
335 }
336 return err;
337}
338
339const char* cache_level_name(enum cache_level level)
340{
341 int idx = level;
342
343 if (idx >= GLOBAL_CLUSTER && idx <= L3_CLUSTER)
344 return cache_level_names[idx];
345 else
346 return "INVALID";
347}
348
349
350
351
352/* proc file interface to configure the cluster size */
353
354static ssize_t litmus_cluster_proc_write(struct file *file,
355 const char __user *buffer, size_t count,
356 loff_t *ppos)
357{
358 enum cache_level *level = (enum cache_level *) PDE_DATA(file_inode(file));
359 ssize_t len;
360 char cache_name[8];
361
362 len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count);
363
364 if (len > 0 && parse_cache_level(cache_name, level)) {
365 printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
366 len = -EINVAL;
367 }
368
369 return len;
370}
371
372static int litmus_cluster_proc_show(struct seq_file *m, void *v)
373{
374 enum cache_level *level = (enum cache_level *) m->private;
375
376 seq_printf(m, "%s\n", cache_level_name(*level));
377 return 0;
378}
379
380static int litmus_cluster_proc_open(struct inode *inode, struct file *file)
381{
382 return single_open(file, litmus_cluster_proc_show, PDE_DATA(inode));
383}
384
385static const struct file_operations litmus_cluster_proc_fops = {
386 .open = litmus_cluster_proc_open,
387 .read = seq_read,
388 .llseek = seq_lseek,
389 .release = single_release,
390 .write = litmus_cluster_proc_write,
391};
392
393struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
394 enum cache_level* level)
395{
396 struct proc_dir_entry* cluster_file;
397
398
399 cluster_file = proc_create_data("cluster", 0644, parent,
400 &litmus_cluster_proc_fops,
401 (void *) level);
402 if (!cluster_file) {
403 printk(KERN_ERR
404		       "Could not create cluster procfs entry.\n");
405 }
406 return cluster_file;
407}
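
For illustration only (not part of the applied diff): a user-space sketch of how the /proc/litmus/active_plugin file created above could be used to switch plugins. The plugin name "GSN-EDF" is only an example (no plugins are added by this patch), the program needs sufficient privileges, and the write is rejected with -EBUSY while real-time tasks exist.

#include <stdio.h>

int main(void)
{
	char buf[65];
	FILE *f = fopen("/proc/litmus/active_plugin", "w");

	if (!f) {
		perror("open active_plugin");
		return 1;
	}
	fputs("GSN-EDF\n", f);        /* trailing newline is chomped by copy_and_chomp() */
	if (fclose(f) != 0) {
		perror("switch plugin");
		return 1;
	}

	f = fopen("/proc/litmus/active_plugin", "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("active plugin: %s", buf);
	if (f)
		fclose(f);
	return 0;
}
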
diff --git a/litmus/locking.c b/litmus/locking.c
new file mode 100644
index 000000000000..43d9aece2e74
--- /dev/null
+++ b/litmus/locking.c
@@ -0,0 +1,188 @@
1#include <linux/sched.h>
2#include <litmus/litmus.h>
3#include <litmus/fdso.h>
4
5#ifdef CONFIG_LITMUS_LOCKING
6
7#include <linux/sched.h>
8#include <litmus/litmus.h>
9#include <litmus/sched_plugin.h>
10#include <litmus/trace.h>
11#include <litmus/wait.h>
12
13static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
14static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
15static int close_generic_lock(struct od_table_entry* entry);
16static void destroy_generic_lock(obj_type_t type, void* sem);
17
18struct fdso_ops generic_lock_ops = {
19 .create = create_generic_lock,
20 .open = open_generic_lock,
21 .close = close_generic_lock,
22 .destroy = destroy_generic_lock
23};
24
25static inline bool is_lock(struct od_table_entry* entry)
26{
27 return entry->class == &generic_lock_ops;
28}
29
30static inline struct litmus_lock* get_lock(struct od_table_entry* entry)
31{
32 BUG_ON(!is_lock(entry));
33 return (struct litmus_lock*) entry->obj->obj;
34}
35
36static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg)
37{
38 struct litmus_lock* lock;
39 int err;
40
41 err = litmus->allocate_lock(&lock, type, arg);
42 if (err == 0)
43 *obj_ref = lock;
44 return err;
45}
46
47static int open_generic_lock(struct od_table_entry* entry, void* __user arg)
48{
49 struct litmus_lock* lock = get_lock(entry);
50 if (lock->ops->open)
51 return lock->ops->open(lock, arg);
52 else
53 return 0; /* default: any task can open it */
54}
55
56static int close_generic_lock(struct od_table_entry* entry)
57{
58 struct litmus_lock* lock = get_lock(entry);
59 if (lock->ops->close)
60 return lock->ops->close(lock);
61 else
62 return 0; /* default: closing succeeds */
63}
64
65static void destroy_generic_lock(obj_type_t type, void* obj)
66{
67 struct litmus_lock* lock = (struct litmus_lock*) obj;
68 lock->ops->deallocate(lock);
69}
70
71asmlinkage long sys_litmus_lock(int lock_od)
72{
73 long err = -EINVAL;
74 struct od_table_entry* entry;
75 struct litmus_lock* l;
76
77 TS_SYSCALL_IN_START;
78
79 TS_SYSCALL_IN_END;
80
81 TS_LOCK_START;
82
83 entry = get_entry_for_od(lock_od);
84 if (entry && is_lock(entry)) {
85 l = get_lock(entry);
86 TRACE_CUR("attempts to lock 0x%p\n", l);
87 err = l->ops->lock(l);
88 }
89
90	/* Note: the task may have been suspended or preempted in between! Take
91 * this into account when computing overheads. */
92 TS_LOCK_END;
93
94 TS_SYSCALL_OUT_START;
95
96 return err;
97}
98
99asmlinkage long sys_litmus_unlock(int lock_od)
100{
101 long err = -EINVAL;
102 struct od_table_entry* entry;
103 struct litmus_lock* l;
104
105 TS_SYSCALL_IN_START;
106
107 TS_SYSCALL_IN_END;
108
109 TS_UNLOCK_START;
110
111 entry = get_entry_for_od(lock_od);
112 if (entry && is_lock(entry)) {
113 l = get_lock(entry);
114 TRACE_CUR("attempts to unlock 0x%p\n", l);
115 err = l->ops->unlock(l);
116 }
117
118	/* Note: the task may have been preempted in between! Take this into
119 * account when computing overheads. */
120 TS_UNLOCK_END;
121
122 TS_SYSCALL_OUT_START;
123
124 return err;
125}
126
127struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
128{
129 wait_queue_t* q;
130 struct task_struct* t = NULL;
131
132 if (waitqueue_active(wq)) {
133 q = list_entry(wq->task_list.next,
134 wait_queue_t, task_list);
135 t = (struct task_struct*) q->private;
136 __remove_wait_queue(wq, q);
137 }
138 return(t);
139}
140
141unsigned int __add_wait_queue_prio_exclusive(
142 wait_queue_head_t* head,
143 prio_wait_queue_t *new)
144{
145 struct list_head *pos;
146 unsigned int passed = 0;
147
148 new->wq.flags |= WQ_FLAG_EXCLUSIVE;
149
150 /* find a spot where the new entry is less than the next */
151 list_for_each(pos, &head->task_list) {
152 prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t,
153 wq.task_list);
154
155 if (unlikely(lt_before(new->priority, queued->priority) ||
156 (new->priority == queued->priority &&
157 new->tie_breaker < queued->tie_breaker))) {
158			/* new has higher priority than pos; insert before pos */
159 __list_add(&new->wq.task_list, pos->prev, pos);
160 goto out;
161 }
162 passed++;
163 }
164
165	/* If we get to this point, either the list is empty or every
166	 * queued element is less than new.
167	 * Let's add new to the end. */
168 list_add_tail(&new->wq.task_list, &head->task_list);
169out:
170 return passed;
171}
172
173
174#else
175
176struct fdso_ops generic_lock_ops = {};
177
178asmlinkage long sys_litmus_lock(int sem_od)
179{
180 return -ENOSYS;
181}
182
183asmlinkage long sys_litmus_unlock(int sem_od)
184{
185 return -ENOSYS;
186}
187
188#endif
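
For illustration only (not part of the applied diff): a stand-alone sketch of the sorted insertion performed by __add_wait_queue_prio_exclusive() above — walk the queue, insert the new element before the first entry with a numerically larger priority, and report how many entries were passed. It ignores the tie_breaker field and keeps equal priorities in FIFO order; a plain singly linked list stands in for the kernel wait queue, and the demo_* names are invented.

#include <stdio.h>

struct demo_waiter {
	unsigned long long priority;   /* smaller = higher priority */
	struct demo_waiter *next;
};

/* Insert 'new' in priority order and return how many entries were passed. */
static unsigned int demo_enqueue(struct demo_waiter **head, struct demo_waiter *new)
{
	unsigned int passed = 0;
	while (*head && (*head)->priority <= new->priority) {
		head = &(*head)->next;
		passed++;
	}
	new->next = *head;
	*head = new;
	return passed;
}

int main(void)
{
	struct demo_waiter a = { .priority = 10 }, b = { .priority = 30 }, c = { .priority = 20 };
	struct demo_waiter *q = NULL;

	demo_enqueue(&q, &a);
	demo_enqueue(&q, &b);
	printf("passed %u entries\n", demo_enqueue(&q, &c));  /* passed 1 (only 'a') */
	return 0;
}
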
diff --git a/litmus/preempt.c b/litmus/preempt.c
new file mode 100644
index 000000000000..6be2f26728b8
--- /dev/null
+++ b/litmus/preempt.c
@@ -0,0 +1,137 @@
1#include <linux/sched.h>
2
3#include <litmus/litmus.h>
4#include <litmus/preempt.h>
5#include <litmus/trace.h>
6
7/* The rescheduling state of each processor.
8 */
9DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
10
11void sched_state_will_schedule(struct task_struct* tsk)
12{
13 /* Litmus hack: we only care about processor-local invocations of
14 * set_tsk_need_resched(). We can't reliably set the flag remotely
15 * since it might race with other updates to the scheduling state. We
16 * can't rely on the runqueue lock protecting updates to the sched
17 * state since processors do not acquire the runqueue locks for all
18 * updates to the sched state (to avoid acquiring two runqueue locks at
19 * the same time). Further, if tsk is residing on a remote processor,
20 * then that processor doesn't actually know yet that it is going to
21 * reschedule; it still must receive an IPI (unless a local invocation
22 * races).
23 */
24 if (likely(task_cpu(tsk) == smp_processor_id())) {
25 VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE);
26 if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK))
27 set_sched_state(PICKED_WRONG_TASK);
28 else
29 set_sched_state(WILL_SCHEDULE);
30 } else
31 /* Litmus tasks should never be subject to a remote
32 * set_tsk_need_resched(). */
33 BUG_ON(is_realtime(tsk));
34#ifdef CONFIG_PREEMPT_STATE_TRACE
35 TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
36 __builtin_return_address(0));
37#endif
38}
39
40/* Called by the IPI handler after another CPU called smp_send_resched(). */
41void sched_state_ipi(void)
42{
43 /* If the IPI was slow, we might be in any state right now. The IPI is
44 * only meaningful if we are in SHOULD_SCHEDULE. */
45 if (is_in_sched_state(SHOULD_SCHEDULE)) {
46 /* Cause scheduler to be invoked.
47 * This will cause a transition to WILL_SCHEDULE. */
48 set_tsk_need_resched(current);
49 TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
50 current->comm, current->pid);
51 TS_SEND_RESCHED_END;
52 } else {
53 /* ignore */
54 TRACE_STATE("ignoring IPI in state %x (%s)\n",
55 get_sched_state(),
56 sched_state_name(get_sched_state()));
57 }
58}
59
60/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must
61 * hold the lock that is used to serialize scheduling decisions. */
62void litmus_reschedule(int cpu)
63{
64 int picked_transition_ok = 0;
65 int scheduled_transition_ok = 0;
66
67 /* The (remote) CPU could be in any state. */
68
69 /* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU
70 * is not aware of the need to reschedule at this point. */
71
72 /* is a context switch in progress? */
73 if (cpu_is_in_sched_state(cpu, TASK_PICKED))
74 picked_transition_ok = sched_state_transition_on(
75 cpu, TASK_PICKED, PICKED_WRONG_TASK);
76
77 if (!picked_transition_ok &&
78 cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) {
79 /* We either raced with the end of the context switch, or the
80 * CPU was in TASK_SCHEDULED anyway. */
81 scheduled_transition_ok = sched_state_transition_on(
82 cpu, TASK_SCHEDULED, SHOULD_SCHEDULE);
83 }
84
85 /* If the CPU was in state TASK_SCHEDULED, then we need to cause the
86 * scheduler to be invoked. */
87 if (scheduled_transition_ok) {
88 if (smp_processor_id() == cpu)
89 set_tsk_need_resched(current);
90 else {
91 TS_SEND_RESCHED_START(cpu);
92 smp_send_reschedule(cpu);
93 }
94 }
95
96 TRACE_STATE("%s picked-ok:%d sched-ok:%d\n",
97 __FUNCTION__,
98 picked_transition_ok,
99 scheduled_transition_ok);
100}
101
102void litmus_reschedule_local(void)
103{
104 if (is_in_sched_state(TASK_PICKED))
105 set_sched_state(PICKED_WRONG_TASK);
106 else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) {
107 set_sched_state(WILL_SCHEDULE);
108 set_tsk_need_resched(current);
109 }
110}
111
112#ifdef CONFIG_DEBUG_KERNEL
113
114void sched_state_plugin_check(void)
115{
116 if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) {
117		TRACE("!!!! plugin did not call sched_state_task_picked()! "
118 "Calling sched_state_task_picked() is mandatory---fix this.\n");
119 set_sched_state(TASK_PICKED);
120 }
121}
122
123#define NAME_CHECK(x) case x: return #x
124const char* sched_state_name(int s)
125{
126 switch (s) {
127 NAME_CHECK(TASK_SCHEDULED);
128 NAME_CHECK(SHOULD_SCHEDULE);
129 NAME_CHECK(WILL_SCHEDULE);
130 NAME_CHECK(TASK_PICKED);
131 NAME_CHECK(PICKED_WRONG_TASK);
132 default:
133 return "UNKNOWN";
134 };
135}
136
137#endif
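
For illustration only (not part of the applied diff): a toy model of the per-CPU rescheduling states driven by litmus_reschedule() and sched_state_will_schedule() above. The real transitions go through sched_state_transition_on() and the atomic resched_state variable (declared in preempt.h, not shown in this hunk); here a plain variable and invented DEMO_* names stand in just to show the flow.

#include <stdio.h>

enum demo_sched_state {
	DEMO_TASK_SCHEDULED,
	DEMO_SHOULD_SCHEDULE,
	DEMO_WILL_SCHEDULE,
	DEMO_TASK_PICKED,
	DEMO_PICKED_WRONG_TASK,
};

/* Transition only if the current state matches 'from'; returns 1 on success. */
static int demo_transition(enum demo_sched_state *state,
			   enum demo_sched_state from,
			   enum demo_sched_state to)
{
	if (*state == from) {
		*state = to;
		return 1;
	}
	return 0;
}

int main(void)
{
	enum demo_sched_state cpu_state = DEMO_TASK_SCHEDULED;

	/* A remote CPU asks us to reschedule: TASK_SCHEDULED -> SHOULD_SCHEDULE. */
	demo_transition(&cpu_state, DEMO_TASK_SCHEDULED, DEMO_SHOULD_SCHEDULE);
	/* The local scheduler is invoked: SHOULD_SCHEDULE -> WILL_SCHEDULE. */
	demo_transition(&cpu_state, DEMO_SHOULD_SCHEDULE, DEMO_WILL_SCHEDULE);

	printf("state=%d (expect %d)\n", cpu_state, DEMO_WILL_SCHEDULE);
	return 0;
}
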
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 000000000000..bfadf1284291
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,348 @@
1/*
2 * litmus/rt_domain.c
3 *
4 * LITMUS real-time infrastructure. This file contains the
5 * functions that manipulate RT domains. RT domains are an abstraction
6 * of a ready queue and a release queue.
7 */
8
9#include <linux/percpu.h>
10#include <linux/sched.h>
11#include <linux/list.h>
12#include <linux/slab.h>
13
14#include <litmus/litmus.h>
15#include <litmus/sched_plugin.h>
16#include <litmus/sched_trace.h>
17
18#include <litmus/rt_domain.h>
19
20#include <litmus/trace.h>
21
22#include <litmus/bheap.h>
23
24/* Uncomment when debugging timer races... */
25#if 0
26#define VTRACE_TASK TRACE_TASK
27#define VTRACE TRACE
28#else
29#define VTRACE_TASK(t, fmt, args...) /* shut up */
30#define VTRACE(fmt, args...) /* be quiet already */
31#endif
32
33static int dummy_resched(rt_domain_t *rt)
34{
35 return 0;
36}
37
38static int dummy_order(struct bheap_node* a, struct bheap_node* b)
39{
40 return 0;
41}
42
43/* default implementation: use default lock */
44static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
45{
46 merge_ready(rt, tasks);
47}
48
49static unsigned int time2slot(lt_t time)
50{
51 return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS;
52}
53
54static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
55{
56 unsigned long flags;
57 struct release_heap* rh;
58 rh = container_of(timer, struct release_heap, timer);
59
60 TS_RELEASE_LATENCY(rh->release_time);
61
62 VTRACE("on_release_timer(0x%p) starts.\n", timer);
63
64 TS_RELEASE_START;
65
66
67 raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
68 VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
69 /* remove from release queue */
70 list_del(&rh->list);
71 raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags);
72 VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock);
73
74 /* call release callback */
75 rh->dom->release_jobs(rh->dom, &rh->heap);
76 /* WARNING: rh can be referenced from other CPUs from now on. */
77
78 TS_RELEASE_END;
79
80 VTRACE("on_release_timer(0x%p) ends.\n", timer);
81
82 return HRTIMER_NORESTART;
83}
84
85/* allocated in litmus.c */
86struct kmem_cache * release_heap_cache;
87
88struct release_heap* release_heap_alloc(int gfp_flags)
89{
90 struct release_heap* rh;
91 rh= kmem_cache_alloc(release_heap_cache, gfp_flags);
92 if (rh) {
93 /* initialize timer */
94 hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
95 rh->timer.function = on_release_timer;
96 }
97 return rh;
98}
99
100void release_heap_free(struct release_heap* rh)
101{
102 /* make sure timer is no longer in use */
103 hrtimer_cancel(&rh->timer);
104 kmem_cache_free(release_heap_cache, rh);
105}
106
107/* Caller must hold release lock.
108 * Will return the heap for the given release time. If no such heap
109 * exists prior to the invocation, it will be created.
110 */
111static struct release_heap* get_release_heap(rt_domain_t *rt,
112 struct task_struct* t,
113 int use_task_heap)
114{
115 struct list_head* pos;
116 struct release_heap* heap = NULL;
117 struct release_heap* rh;
118 lt_t release_time = get_release(t);
119 unsigned int slot = time2slot(release_time);
120
121 /* initialize pos for the case that the list is empty */
122 pos = rt->release_queue.slot[slot].next;
123 list_for_each(pos, &rt->release_queue.slot[slot]) {
124 rh = list_entry(pos, struct release_heap, list);
125 if (release_time == rh->release_time) {
126 /* perfect match -- this happens on hyperperiod
127 * boundaries
128 */
129 heap = rh;
130 break;
131 } else if (lt_before(release_time, rh->release_time)) {
132 /* we need to insert a new node since rh is
133 * already in the future
134 */
135 break;
136 }
137 }
138 if (!heap && use_task_heap) {
139 /* use pre-allocated release heap */
140 rh = tsk_rt(t)->rel_heap;
141
142 rh->dom = rt;
143 rh->release_time = release_time;
144
145 /* add to release queue */
146 list_add(&rh->list, pos->prev);
147 heap = rh;
148 }
149 return heap;
150}
151
152static void reinit_release_heap(struct task_struct* t)
153{
154 struct release_heap* rh;
155
156 /* use pre-allocated release heap */
157 rh = tsk_rt(t)->rel_heap;
158
159 /* Make sure it is safe to use. The timer callback could still
160 * be executing on another CPU; hrtimer_cancel() will wait
161 * until the timer callback has completed. However, under no
162 * circumstances should the timer be active (= yet to be
163 * triggered).
164 *
165 * WARNING: If the CPU still holds the release_lock at this point,
166 * deadlock may occur!
167 */
168 BUG_ON(hrtimer_cancel(&rh->timer));
169
170 /* initialize */
171 bheap_init(&rh->heap);
172#ifdef CONFIG_RELEASE_MASTER
173 atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE);
174#endif
175}
176/* arm_release_timer() - start local release timer or trigger
177 * remote timer (pull timer)
178 *
179 * Called by add_release() with:
180 * - tobe_lock taken
181 * - IRQ disabled
182 */
183#ifdef CONFIG_RELEASE_MASTER
184#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU)
185static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu)
186#else
187static void arm_release_timer(rt_domain_t *_rt)
188#endif
189{
190 rt_domain_t *rt = _rt;
191 struct list_head list;
192 struct list_head *pos, *safe;
193 struct task_struct* t;
194 struct release_heap* rh;
195
196 VTRACE("arm_release_timer() at %llu\n", litmus_clock());
197 list_replace_init(&rt->tobe_released, &list);
198
199 list_for_each_safe(pos, safe, &list) {
200		/* pick task off the work list */
201 t = list_entry(pos, struct task_struct, rt_param.list);
202 sched_trace_task_release(t);
203 list_del(pos);
204
205 /* put into release heap while holding release_lock */
206 raw_spin_lock(&rt->release_lock);
207 VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock);
208
209 rh = get_release_heap(rt, t, 0);
210 if (!rh) {
211 /* need to use our own, but drop lock first */
212 raw_spin_unlock(&rt->release_lock);
213 VTRACE_TASK(t, "Dropped release_lock 0x%p\n",
214 &rt->release_lock);
215
216 reinit_release_heap(t);
217 VTRACE_TASK(t, "release_heap ready\n");
218
219 raw_spin_lock(&rt->release_lock);
220 VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n",
221 &rt->release_lock);
222
223 rh = get_release_heap(rt, t, 1);
224 }
225 bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node);
226 VTRACE_TASK(t, "arm_release_timer(): added to release heap\n");
227
228 raw_spin_unlock(&rt->release_lock);
229 VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock);
230
231 /* To avoid arming the timer multiple times, we only let the
232 * owner do the arming (which is the "first" task to reference
233 * this release_heap anyway).
234 */
235 if (rh == tsk_rt(t)->rel_heap) {
236 VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer);
237 /* we cannot arm the timer using hrtimer_start()
238 * as it may deadlock on rq->lock
239 *
240 * PINNED mode is ok on both local and remote CPU
241 */
242#ifdef CONFIG_RELEASE_MASTER
243 if (rt->release_master == NO_CPU &&
244 target_cpu == NO_CPU)
245#endif
246 __hrtimer_start_range_ns(&rh->timer,
247 ns_to_ktime(rh->release_time),
248 0, HRTIMER_MODE_ABS_PINNED, 0);
249#ifdef CONFIG_RELEASE_MASTER
250 else
251 hrtimer_start_on(
252 /* target_cpu overrides release master */
253 (target_cpu != NO_CPU ?
254 target_cpu : rt->release_master),
255 &rh->info, &rh->timer,
256 ns_to_ktime(rh->release_time),
257 HRTIMER_MODE_ABS_PINNED);
258#endif
259 } else
260 VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer);
261 }
262}
263
264void rt_domain_init(rt_domain_t *rt,
265 bheap_prio_t order,
266 check_resched_needed_t check,
267 release_jobs_t release
268 )
269{
270 int i;
271
272 BUG_ON(!rt);
273 if (!check)
274 check = dummy_resched;
275 if (!release)
276 release = default_release_jobs;
277 if (!order)
278 order = dummy_order;
279
280#ifdef CONFIG_RELEASE_MASTER
281 rt->release_master = NO_CPU;
282#endif
283
284 bheap_init(&rt->ready_queue);
285 INIT_LIST_HEAD(&rt->tobe_released);
286 for (i = 0; i < RELEASE_QUEUE_SLOTS; i++)
287 INIT_LIST_HEAD(&rt->release_queue.slot[i]);
288
289 raw_spin_lock_init(&rt->ready_lock);
290 raw_spin_lock_init(&rt->release_lock);
291 raw_spin_lock_init(&rt->tobe_lock);
292
293 rt->check_resched = check;
294 rt->release_jobs = release;
295 rt->order = order;
296}
297
298/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
299 * @new: the newly released task
300 */
301void __add_ready(rt_domain_t* rt, struct task_struct *new)
302{
303 TRACE("rt: adding %s/%d (%llu, %llu, %llu) rel=%llu "
304 "to ready queue at %llu\n",
305 new->comm, new->pid,
306 get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new),
307 get_release(new), litmus_clock());
308
309 BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
310
311 bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
312 rt->check_resched(rt);
313}
314
315/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable.
316 * @tasks - the newly released tasks
317 */
318void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
319{
320 bheap_union(rt->order, &rt->ready_queue, tasks);
321 rt->check_resched(rt);
322}
323
324
325#ifdef CONFIG_RELEASE_MASTER
326void __add_release_on(rt_domain_t* rt, struct task_struct *task,
327 int target_cpu)
328{
329 TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n",
330 get_release(task), target_cpu);
331 list_add(&tsk_rt(task)->list, &rt->tobe_released);
332 task->rt_param.domain = rt;
333
334 arm_release_timer_on(rt, target_cpu);
335}
336#endif
337
338/* add_release - add a real-time task to the rt release queue.
339 * @task: the sleeping task
340 */
341void __add_release(rt_domain_t* rt, struct task_struct *task)
342{
343 TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
344 list_add(&tsk_rt(task)->list, &rt->tobe_released);
345 task->rt_param.domain = rt;
346
347 arm_release_timer(rt);
348}
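
For illustration only (not part of the applied diff): a sketch of the bucketing done by time2slot() above — release times are converted to quanta and hashed into a fixed number of release-queue slots, so releases sharing a time (e.g. on hyperperiod boundaries) land in the same list. The 127-slot count and 1 ms quantum are assumed values for this sketch; the real RELEASE_QUEUE_SLOTS and quantum length come from rt_domain.h and the configured tick, which are not shown in this hunk.

#include <stdio.h>

#define DEMO_RELEASE_QUEUE_SLOTS 127   /* assumed; see RELEASE_QUEUE_SLOTS in rt_domain.h */
#define DEMO_QUANTUM_NS 1000000ULL     /* assumed 1 ms quantum */

/* Mirrors time2slot(): quantize the time, then hash it into a slot. */
static unsigned int demo_time2slot(unsigned long long time_ns)
{
	return (unsigned int)((time_ns / DEMO_QUANTUM_NS) % DEMO_RELEASE_QUEUE_SLOTS);
}

int main(void)
{
	unsigned long long t = 250ULL * DEMO_QUANTUM_NS;

	printf("slot(%llu) = %u\n", t, demo_time2slot(t));  /* 250 % 127 = 123 */
	/* Times that differ by a full table's worth of quanta collide in one slot. */
	printf("collision: %d\n",
	       demo_time2slot(t) ==
	       demo_time2slot(t + (unsigned long long)DEMO_RELEASE_QUEUE_SLOTS * DEMO_QUANTUM_NS));
	return 0;
}
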
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 000000000000..c4747e0ef2ab
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,224 @@
1/* sched_plugin.c -- core infrastructure for the scheduler plugin system
2 *
3 * This file includes the initialization of the plugin system, the no-op Linux
4 * scheduler plugin, some dummy functions, and some helper functions.
5 */
6
7#include <linux/list.h>
8#include <linux/spinlock.h>
9#include <linux/sched.h>
10#include <linux/seq_file.h>
11
12#include <litmus/litmus.h>
13#include <litmus/sched_plugin.h>
14#include <litmus/preempt.h>
15#include <litmus/jobs.h>
16
17/*
18 * Generic function to trigger preemption on either local or remote cpu
19 * from scheduler plugins. The key feature is that this function is
20 * non-preemptive section aware and does not invoke the scheduler / send
21 * IPIs if the to-be-preempted task is actually non-preemptive.
22 */
23void preempt_if_preemptable(struct task_struct* t, int cpu)
24{
25 /* t is the real-time task executing on CPU on_cpu If t is NULL, then
26 * on_cpu is currently scheduling background work.
27 */
28
29 int reschedule = 0;
30
31 if (!t)
32 /* move non-real-time task out of the way */
33 reschedule = 1;
34 else {
35 if (smp_processor_id() == cpu) {
36 /* local CPU case */
37 /* check if we need to poke userspace */
38 if (is_user_np(t))
39 /* Yes, poke it. This doesn't have to be atomic since
40 * the task is definitely not executing. */
41 request_exit_np(t);
42 else if (!is_kernel_np(t))
43 /* only if we are allowed to preempt the
44 * currently-executing task */
45 reschedule = 1;
46 } else {
47 /* Remote CPU case. Only notify if it's not a kernel
48 * NP section and if we didn't set the userspace
49 * flag. */
50 reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
51 }
52 }
53 if (likely(reschedule))
54 litmus_reschedule(cpu);
55}
56
57
58/*************************************************************
59 * Dummy plugin functions *
60 *************************************************************/
61
62static void litmus_dummy_finish_switch(struct task_struct * prev)
63{
64}
65
66static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
67{
68 sched_state_task_picked();
69 return NULL;
70}
71
72static void litmus_dummy_tick(struct task_struct* tsk)
73{
74}
75
76static long litmus_dummy_admit_task(struct task_struct* tsk)
77{
78 printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
79 tsk->comm, tsk->pid);
80 return -EINVAL;
81}
82
83static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
84{
85}
86
87static void litmus_dummy_task_wake_up(struct task_struct *task)
88{
89}
90
91static void litmus_dummy_task_block(struct task_struct *task)
92{
93}
94
95static void litmus_dummy_task_exit(struct task_struct *task)
96{
97}
98
99static long litmus_dummy_complete_job(void)
100{
101 return -ENOSYS;
102}
103
104static long litmus_dummy_activate_plugin(void)
105{
106 return 0;
107}
108
109static long litmus_dummy_deactivate_plugin(void)
110{
111 return 0;
112}
113
114#ifdef CONFIG_LITMUS_LOCKING
115
116static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
117 void* __user config)
118{
119 return -ENXIO;
120}
121
122#endif
123
124
125/* The default scheduler plugin. It doesn't do anything and lets Linux do its
126 * job.
127 */
128struct sched_plugin linux_sched_plugin = {
129 .plugin_name = "Linux",
130 .tick = litmus_dummy_tick,
131 .task_new = litmus_dummy_task_new,
132 .task_exit = litmus_dummy_task_exit,
133 .task_wake_up = litmus_dummy_task_wake_up,
134 .task_block = litmus_dummy_task_block,
135 .complete_job = litmus_dummy_complete_job,
136 .schedule = litmus_dummy_schedule,
137 .finish_switch = litmus_dummy_finish_switch,
138 .activate_plugin = litmus_dummy_activate_plugin,
139 .deactivate_plugin = litmus_dummy_deactivate_plugin,
140#ifdef CONFIG_LITMUS_LOCKING
141 .allocate_lock = litmus_dummy_allocate_lock,
142#endif
143 .admit_task = litmus_dummy_admit_task
144};
145
146/*
147 * The reference to the current plugin that is used to schedule tasks
148 * within the system. It stores references to the actual function
149 * implementations and should be initialized by calling "init_***_plugin()".
150 */
151struct sched_plugin *litmus = &linux_sched_plugin;
152
153/* the list of registered scheduling plugins */
154static LIST_HEAD(sched_plugins);
155static DEFINE_RAW_SPINLOCK(sched_plugins_lock);
156
157#define CHECK(func) {\
158 if (!plugin->func) \
159 plugin->func = litmus_dummy_ ## func;}
160
161/* FIXME: get reference to module */
162int register_sched_plugin(struct sched_plugin* plugin)
163{
164 printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
165 plugin->plugin_name);
166
167 /* make sure we don't trip over null pointers later */
168 CHECK(finish_switch);
169 CHECK(schedule);
170 CHECK(tick);
171 CHECK(task_wake_up);
172 CHECK(task_exit);
173 CHECK(task_block);
174 CHECK(task_new);
175 CHECK(complete_job);
176 CHECK(activate_plugin);
177 CHECK(deactivate_plugin);
178#ifdef CONFIG_LITMUS_LOCKING
179 CHECK(allocate_lock);
180#endif
181 CHECK(admit_task);
182
183 if (!plugin->release_at)
184 plugin->release_at = release_at;
185
186 raw_spin_lock(&sched_plugins_lock);
187 list_add(&plugin->list, &sched_plugins);
188 raw_spin_unlock(&sched_plugins_lock);
189
190 return 0;
191}
192
193
194/* FIXME: reference counting, etc. */
195struct sched_plugin* find_sched_plugin(const char* name)
196{
197 struct list_head *pos;
198 struct sched_plugin *plugin;
199
200 raw_spin_lock(&sched_plugins_lock);
201 list_for_each(pos, &sched_plugins) {
202 plugin = list_entry(pos, struct sched_plugin, list);
203 if (!strcmp(plugin->plugin_name, name))
204 goto out_unlock;
205 }
206 plugin = NULL;
207
208out_unlock:
209 raw_spin_unlock(&sched_plugins_lock);
210 return plugin;
211}
212
213void print_sched_plugins(struct seq_file *m)
214{
215 struct list_head *pos;
216 struct sched_plugin *plugin;
217
218 raw_spin_lock(&sched_plugins_lock);
219 list_for_each(pos, &sched_plugins) {
220 plugin = list_entry(pos, struct sched_plugin, list);
221 seq_printf(m, "%s\n", plugin->plugin_name);
222 }
223 raw_spin_unlock(&sched_plugins_lock);
224}
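
To make the effect of the CHECK() backfilling concrete, here is a minimal sketch (hypothetical, not part of this patch) of a plugin that implements only admit_task; every callback it leaves NULL is replaced with the corresponding litmus_dummy_* stub by register_sched_plugin(). The "DEMO" plugin name and init_demo_plugin() are made up for illustration.

#include <linux/module.h>
#include <linux/sched.h>

#include <litmus/sched_plugin.h>

/* Hypothetical "DEMO" plugin: rejects all tasks and relies on the
 * litmus_dummy_* callbacks for everything else. */
static long demo_admit_task(struct task_struct *tsk)
{
	return -EINVAL;
}

static struct sched_plugin demo_plugin = {
	.plugin_name = "DEMO",
	.admit_task  = demo_admit_task,
	/* .schedule, .task_new, ... are deliberately left NULL; CHECK()
	 * in register_sched_plugin() fills them in with dummies. */
};

static int __init init_demo_plugin(void)
{
	return register_sched_plugin(&demo_plugin);
}
module_init(init_demo_plugin);
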
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 000000000000..c88dbf2f580f
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,305 @@
1/* ************************************************************************** */
2/* STACK RESOURCE POLICY */
3/* ************************************************************************** */
4
5#include <asm/atomic.h>
6#include <linux/sched.h>
7#include <linux/wait.h>
8
9#include <litmus/litmus.h>
10#include <litmus/sched_plugin.h>
11#include <litmus/fdso.h>
12#include <litmus/trace.h>
13
14
15#ifdef CONFIG_LITMUS_LOCKING
16
17#include <litmus/srp.h>
18
19srp_prioritization_t get_srp_prio;
20
21struct srp {
22 struct list_head ceiling;
23 wait_queue_head_t ceiling_blocked;
24};
25#define system_ceiling(srp) list2prio(srp->ceiling.next)
26#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
27
28#define UNDEF_SEM -2
29
30atomic_t srp_objects_in_use = ATOMIC_INIT(0);
31
32DEFINE_PER_CPU(struct srp, srp);
33
34/* Initialize SRP per-CPU ceiling state at boot time. */
35static int __init srp_init(void)
36{
37 int i;
38
39	printk(KERN_INFO "Initializing SRP per-CPU ceilings...");
40 for (i = 0; i < NR_CPUS; i++) {
41 init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
42 INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
43 }
44	printk(KERN_CONT " done!\n");
45
46 return 0;
47}
48module_init(srp_init);
49
50/* SRP task priority comparison function. Smaller numeric values have higher
51 * priority; ties are broken by PID. Special case: priority == 0 <=> no priority.
52 */
53static int srp_higher_prio(struct srp_priority* first,
54 struct srp_priority* second)
55{
56 if (!first->priority)
57 return 0;
58 else
59 return !second->priority ||
60 first->priority < second->priority || (
61 first->priority == second->priority &&
62 first->pid < second->pid);
63}
64
65
66static int srp_exceeds_ceiling(struct task_struct* first,
67 struct srp* srp)
68{
69 struct srp_priority prio;
70
71 if (list_empty(&srp->ceiling))
72 return 1;
73 else {
74 prio.pid = first->pid;
75 prio.priority = get_srp_prio(first);
76 return srp_higher_prio(&prio, system_ceiling(srp)) ||
77 ceiling2sem(system_ceiling(srp))->owner == first;
78 }
79}
80
81static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
82{
83 struct list_head *pos;
84 if (in_list(&prio->list)) {
85 printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
86 "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
87 return;
88 }
89 list_for_each(pos, &srp->ceiling)
90 if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
91 __list_add(&prio->list, pos->prev, pos);
92 return;
93 }
94
95 list_add_tail(&prio->list, &srp->ceiling);
96}
97
98
99static int lock_srp_semaphore(struct litmus_lock* l)
100{
101 struct task_struct* t = current;
102 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
103
104 if (!is_realtime(t))
105 return -EPERM;
106
107 /* prevent acquisition of local locks in global critical sections */
108 if (tsk_rt(t)->num_locks_held)
109 return -EBUSY;
110
111 preempt_disable();
112
113 /* Update ceiling. */
114 srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
115
116 /* SRP invariant: all resources available */
117 BUG_ON(sem->owner != NULL);
118
119 sem->owner = t;
120 TRACE_CUR("acquired srp 0x%p\n", sem);
121
122 tsk_rt(t)->num_local_locks_held++;
123
124 preempt_enable();
125
126 return 0;
127}
128
129static int unlock_srp_semaphore(struct litmus_lock* l)
130{
131 struct task_struct* t = current;
132 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
133 int err = 0;
134
135 preempt_disable();
136
137 if (sem->owner != t) {
138 err = -EINVAL;
139 } else {
140 /* Determine new system priority ceiling for this CPU. */
141 BUG_ON(!in_list(&sem->ceiling.list));
142
143 list_del(&sem->ceiling.list);
144 sem->owner = NULL;
145
146 /* Wake tasks on this CPU, if they exceed current ceiling. */
147 TRACE_CUR("released srp 0x%p\n", sem);
148 wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
149
150 tsk_rt(t)->num_local_locks_held--;
151 }
152
153 preempt_enable();
154 return err;
155}
156
157static int open_srp_semaphore(struct litmus_lock* l, void* __user arg)
158{
159 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
160 int err = 0;
161 struct task_struct* t = current;
162 struct srp_priority t_prio;
163
164 if (!is_realtime(t))
165 return -EPERM;
166
167 TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
168
169 preempt_disable();
170
171 if (sem->owner != NULL)
172 err = -EBUSY;
173
174 if (err == 0) {
175 if (sem->cpu == UNDEF_SEM)
176 sem->cpu = get_partition(t);
177 else if (sem->cpu != get_partition(t))
178 err = -EPERM;
179 }
180
181 if (err == 0) {
182 t_prio.priority = get_srp_prio(t);
183 t_prio.pid = t->pid;
184 if (srp_higher_prio(&t_prio, &sem->ceiling)) {
185 sem->ceiling.priority = t_prio.priority;
186 sem->ceiling.pid = t_prio.pid;
187 }
188 }
189
190 preempt_enable();
191
192 return err;
193}
194
195static int close_srp_semaphore(struct litmus_lock* l)
196{
197 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
198 int err = 0;
199
200 preempt_disable();
201
202 if (sem->owner == current)
203 unlock_srp_semaphore(l);
204
205 preempt_enable();
206
207 return err;
208}
209
210static void deallocate_srp_semaphore(struct litmus_lock* l)
211{
212 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
213 atomic_dec(&srp_objects_in_use);
214 kfree(sem);
215}
216
217static struct litmus_lock_ops srp_lock_ops = {
218 .open = open_srp_semaphore,
219 .close = close_srp_semaphore,
220 .lock = lock_srp_semaphore,
221 .unlock = unlock_srp_semaphore,
222 .deallocate = deallocate_srp_semaphore,
223};
224
225struct srp_semaphore* allocate_srp_semaphore(void)
226{
227 struct srp_semaphore* sem;
228
229 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
230 if (!sem)
231 return NULL;
232
233 INIT_LIST_HEAD(&sem->ceiling.list);
234 sem->ceiling.priority = 0;
235 sem->cpu = UNDEF_SEM;
236 sem->owner = NULL;
237
238 sem->litmus_lock.ops = &srp_lock_ops;
239
240 atomic_inc(&srp_objects_in_use);
241 return sem;
242}
243
244static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
245 void *key)
246{
247 int cpu = smp_processor_id();
248 struct task_struct *tsk = wait->private;
249 if (cpu != get_partition(tsk))
250		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
251 get_partition(tsk));
252 else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
253 return default_wake_function(wait, mode, sync, key);
254 return 0;
255}
256
257static void do_ceiling_block(struct task_struct *tsk)
258{
259 wait_queue_t wait = {
260 .private = tsk,
261 .func = srp_wake_up,
262 .task_list = {NULL, NULL}
263 };
264
265 tsk->state = TASK_UNINTERRUPTIBLE;
266 add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
267 tsk->rt_param.srp_non_recurse = 1;
268 preempt_enable_no_resched();
269 schedule();
270 preempt_disable();
271 tsk->rt_param.srp_non_recurse = 0;
272 remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
273}
274
275/* Wait for current task priority to exceed system-wide priority ceiling.
276 * FIXME: the hotpath should be inline.
277 */
278void srp_ceiling_block(void)
279{
280 struct task_struct *tsk = current;
281
282	/* Only applies to real-time tasks; the branch hint assumes an RT caller. */
283 if (unlikely(!is_realtime(tsk)))
284 return;
285
286 /* Avoid recursive ceiling blocking. */
287 if (unlikely(tsk->rt_param.srp_non_recurse))
288 return;
289
290 /* Bail out early if there aren't any SRP resources around. */
291 if (likely(!atomic_read(&srp_objects_in_use)))
292 return;
293
294 preempt_disable();
295 if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
296 TRACE_CUR("is priority ceiling blocked.\n");
297 while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
298 do_ceiling_block(tsk);
299 TRACE_CUR("finally exceeds system ceiling.\n");
300 } else
301 TRACE_CUR("is not priority ceiling blocked\n");
302 preempt_enable();
303}
304
305#endif
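
As a side note, the priority ordering that srp_higher_prio() and srp_exceeds_ceiling() rely on can be exercised in isolation. The sketch below is plain userspace C (not kernel code); the local struct merely mirrors the priority/pid fields used by the kernel comparator and the values are made up.

#include <stdio.h>

struct prio { unsigned int priority; int pid; };

/* Same ordering as srp_higher_prio(): smaller values win, 0 means
 * "no priority", PIDs break ties. */
static int higher_prio(const struct prio *a, const struct prio *b)
{
	if (!a->priority)
		return 0;			/* "no priority" never wins */
	return !b->priority ||
	       a->priority < b->priority ||
	       (a->priority == b->priority && a->pid < b->pid);
}

int main(void)
{
	struct prio a = { .priority = 10, .pid = 100 };
	struct prio b = { .priority = 20, .pid =  50 };
	struct prio c = { .priority = 10, .pid =  50 };
	struct prio n = { .priority =  0, .pid =   1 };

	printf("a > b: %d\n", higher_prio(&a, &b));	/* 1: 10 beats 20    */
	printf("c > a: %d\n", higher_prio(&c, &a));	/* 1: PID tie-break  */
	printf("n > a: %d\n", higher_prio(&n, &a));	/* 0: no priority    */
	return 0;
}
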
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 000000000000..61a95463e4d2
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,166 @@
1/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
2 *
3 *
4 */
5
6#include <asm/atomic.h>
7#include <asm/uaccess.h>
8#include <linux/spinlock.h>
9#include <linux/list.h>
10#include <linux/sched.h>
11#include <linux/completion.h>
12
13#include <litmus/litmus.h>
14#include <litmus/sched_plugin.h>
15#include <litmus/jobs.h>
16
17#include <litmus/sched_trace.h>
18
19struct ts_release_wait {
20 struct list_head list;
21 struct completion completion;
22 lt_t ts_release_time;
23};
24
25#define DECLARE_TS_RELEASE_WAIT(symb) \
26 struct ts_release_wait symb = \
27 { \
28 LIST_HEAD_INIT(symb.list), \
29 COMPLETION_INITIALIZER_ONSTACK(symb.completion), \
30 0 \
31 }
32
33static LIST_HEAD(task_release_list);
34static DEFINE_MUTEX(task_release_lock);
35
36static long do_wait_for_ts_release(void)
37{
38 DECLARE_TS_RELEASE_WAIT(wait);
39
40 long ret = -ERESTARTSYS;
41
42 if (mutex_lock_interruptible(&task_release_lock))
43 goto out;
44
45 list_add(&wait.list, &task_release_list);
46
47 mutex_unlock(&task_release_lock);
48
49	/* We are enqueued; now we wait for someone to wake us up. */
50 ret = wait_for_completion_interruptible(&wait.completion);
51
52 if (!ret) {
53 /* Setting this flag before releasing ensures that this CPU
54 * will be the next CPU to requeue the task on a ready or
55 * release queue. Cleared by prepare_for_next_period()
56 */
57 tsk_rt(current)->dont_requeue = 1;
58
59		/* Completion succeeded; set up the release time. complete_job()
60 * will indirectly cause the period to be added to the next
61 * release time, so subtract it here. */
62 litmus->release_at(current, wait.ts_release_time
63 + current->rt_param.task_params.phase
64 - current->rt_param.task_params.period);
65
66		/* Advance to the next job --- when complete_job() returns, the
67		 * first job has been released. Since we patched up the release
68		 * time, this occurs when all tasks synchronously release their
69		 * first job. */
70 ret = complete_job();
71 } else {
72 /* We were interrupted, must cleanup list. */
73 mutex_lock(&task_release_lock);
74 if (!wait.completion.done)
75 list_del(&wait.list);
76 mutex_unlock(&task_release_lock);
77 }
78
79out:
80 return ret;
81}
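
The release-time patch-up above is simple arithmetic: release_at() is handed ts_release_time + phase - period, and the subsequent complete_job() adds one period back, so the task's first job is released exactly at ts_release_time + phase. A standalone sketch of that arithmetic (plain userspace C with made-up nanosecond values, not part of this patch):

#include <stdio.h>

typedef unsigned long long lt_t;	/* stand-in for the kernel's lt_t */

int main(void)
{
	lt_t ts_release = 1000000000ULL;	/* synchronous release point */
	lt_t phase      =   20000000ULL;	/* task offset               */
	lt_t period     =  100000000ULL;

	/* value handed to litmus->release_at() in do_wait_for_ts_release() */
	lt_t patched = ts_release + phase - period;

	/* complete_job() advances the job by one period */
	lt_t first_job_release = patched + period;

	printf("first job released at %llu (= ts_release + phase)\n",
	       first_job_release);
	return 0;
}
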
82
83int count_tasks_waiting_for_release(void)
84{
85 int task_count = 0;
86 struct list_head *pos;
87
88 mutex_lock(&task_release_lock);
89
90 list_for_each(pos, &task_release_list) {
91 task_count++;
92 }
93
94 mutex_unlock(&task_release_lock);
95
96
97 return task_count;
98}
99
100static long do_release_ts(lt_t start)
101{
102 long task_count = 0;
103
104 struct list_head *pos, *safe;
105 struct ts_release_wait *wait;
106
107 if (mutex_lock_interruptible(&task_release_lock)) {
108 task_count = -ERESTARTSYS;
109 goto out;
110 }
111
112 TRACE("<<<<<< synchronous task system release >>>>>>\n");
113 sched_trace_sys_release(&start);
114
115 task_count = 0;
116 list_for_each_safe(pos, safe, &task_release_list) {
117 wait = (struct ts_release_wait*)
118 list_entry(pos, struct ts_release_wait, list);
119
120 task_count++;
121 wait->ts_release_time = start;
122 complete(&wait->completion);
123 }
124
125 /* clear stale list */
126 INIT_LIST_HEAD(&task_release_list);
127
128 mutex_unlock(&task_release_lock);
129
130out:
131 return task_count;
132}
133
134
135asmlinkage long sys_wait_for_ts_release(void)
136{
137 long ret = -EPERM;
138 struct task_struct *t = current;
139
140 if (is_realtime(t))
141 ret = do_wait_for_ts_release();
142
143 return ret;
144}
145
146#define ONE_MS 1000000
147
148asmlinkage long sys_release_ts(lt_t __user *__delay)
149{
150 long ret;
151 lt_t delay;
152 lt_t start_time;
153
154 /* FIXME: check capabilities... */
155
156	ret = copy_from_user(&delay, __delay, sizeof(delay)) ? -EFAULT : 0;
157 if (ret == 0) {
158		/* truncate to a whole-millisecond boundary */
159 start_time = litmus_clock();
160 do_div(start_time, ONE_MS);
161 start_time *= ONE_MS;
162 ret = do_release_ts(start_time + delay);
163 }
164
165 return ret;
166}
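
Finally, the do_div() sequence in sys_release_ts() truncates the current time to a whole millisecond before the user-supplied delay is added. The same arithmetic in a standalone userspace sketch (made-up values, not part of this patch):

#include <stdio.h>

#define ONE_MS 1000000ULL		/* nanoseconds per millisecond */

int main(void)
{
	unsigned long long now   = 123456789ULL; /* stand-in for litmus_clock() */
	unsigned long long delay =   5 * ONE_MS; /* user-supplied delay         */

	/* same effect as do_div(start_time, ONE_MS); start_time *= ONE_MS; */
	unsigned long long start = (now / ONE_MS) * ONE_MS;

	printf("release at %llu (truncated from %llu, plus delay %llu)\n",
	       start + delay, now, delay);
	return 0;
}
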