Merge branch 'master' into wip-merge-2.6.34

Simple merge between master and 2.6.34 with conflicts resolved. This commit does not compile, the following main problems are still unresolved: - spinlock -> raw_spinlock API changes - kfifo API changes - sched_class API changes Conflicts: Makefile arch/x86/include/asm/hw_irq.h arch/x86/include/asm/unistd_32.h arch/x86/kernel/syscall_table_32.S include/linux/hrtimer.h kernel/sched.c kernel/sched_fair.c
author: Andrea Bastoni <bastoni@cs.unc.edu> 2010-05-29 23:35:01 -0400
committer: Andrea Bastoni <bastoni@cs.unc.edu> 2010-05-29 23:35:01 -0400
commit: 6ffc1fee98c4b995eb3a0285f4f8fb467cb0306e (patch)
tree: 69a05892a41e7f7400fa598ee0bdf8027c8f0fd6 /litmus
parent: e40152ee1e1c7a63f4777791863215e3faa37a86 (diff)
parent: 7c1ff4c544dd650cceff3cd69a04bcba60856678 (diff)
23 files changed, 7401 insertions, 0 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 000000000000..874794f64af1
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,85 @@
+menu "LITMUS^RT"
+menu "Real-Time Synchronization"
+config NP_SECTION
+        bool "Non-preemptive section support"
+        default n
+        help
+          Allow tasks to become non-preemptable.
+          Note that plugins still need to explicitly support non-preemptivity.
+          Currently, only GSN-EDF and PSN-EDF have such support.
+          This is required to support the FMLP.
+          If disabled, all tasks will be considered preemptable at all times.
+config SRP
+        bool "Stack Resource Policy (SRP)"
+        default n
+        help
+          Include support for Baker's Stack Resource Policy.
+          Say Yes if you want FMLP local long critical section
+          synchronization support.
+config FMLP
+        bool "FMLP support"
+        depends on NP_SECTION
+        default n
+        help
+          Include support for deterministic multiprocessor real-time
+          synchronization.
+          Say Yes if you want FMLP long critical section
+          synchronization support.
+endmenu
+menu "Tracing"
+config FEATHER_TRACE
+        bool "Feather-Trace Infrastructure"
+        default y
+        help
+          Feather-Trace basic tracing infrastructure. Includes device file
+          driver and instrumentation point support.
+config SCHED_TASK_TRACE
+        bool "Trace real-time tasks"
+        depends on FEATHER_TRACE
+        default y
+        help
+          Include support for the sched_trace_XXX() tracing functions. This
+          allows the collection of real-time task events such as job
+          completions, job releases, early completions, etc. This results in a
+          small overhead in the scheduling code. Disable if the overhead is not
+          acceptable (e.g., benchmarking).
+          Say Yes for debugging.
+          Say No for overhead tracing.
+config SCHED_OVERHEAD_TRACE
+        bool "Record timestamps for overhead measurements"
+        depends on FEATHER_TRACE
+        default n
+        help
+          Export event stream for overhead tracing.
+          Say Yes for overhead tracing.
+config SCHED_DEBUG_TRACE
+        bool "TRACE() debugging"
+        default y
+        help
+          Include support for sched_trace_log_message(), which is used to
+          implement TRACE(). If disabled, no TRACE() messages will be included
+          in the kernel, and no overheads due to debugging statements will be
+          incurred by the scheduler. Disable if the overhead is not acceptable
+          (e.g. benchmarking).
+          Say Yes for debugging.
+          Say No for overhead tracing.
+endmenu
+endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 000000000000..0cc33e8bee51
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,23 @@
+#
+# Makefile for LITMUS^RT
+#
+obj-y     = sched_plugin.o litmus.o \
+            jobs.o \
+            sync.o \
+            rt_domain.o \
+            edf_common.o \
+            fdso.o \
+            srp.o \
+            fmlp.o \
+            bheap.o \
+            ctrldev.o \
+            sched_gsn_edf.o \
+            sched_psn_edf.o \
+            sched_cedf.o \
+            sched_pfair.o
+obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
+obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
+obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
+obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
diff --git a/litmus/bheap.c b/litmus/bheap.c
new file mode 100644
index 000000000000..528af97f18a6
--- /dev/null
+++ b/litmus/bheap.c
@@ -0,0 +1,314 @@
+#include "linux/kernel.h"
+#include "litmus/bheap.h"
+/* Reset a heap to the empty state: no root list and no cached minimum. */
+void bheap_init(struct bheap* heap)
+{
+        heap->min  = NULL;
+        heap->head = NULL;
+}
+/* Prepare the node that *_h points to for use: clear all tree links, mark
+ * it as not being part of a heap, and attach the payload. The address of
+ * the caller's pointer (_h) is stored as the node's back-reference. */
+void bheap_node_init(struct bheap_node** _h, void* value)
+{
+        struct bheap_node* n = *_h;
+        n->ref    = _h;
+        n->value  = value;
+        n->degree = NOT_IN_HEAP;
+        n->child  = NULL;
+        n->next   = NULL;
+        n->parent = NULL;
+}
+/* Make child the first subtree of root and increment root's degree.
+ * Only child->parent, child->next, root->child and root->degree are
+ * written; root's sibling link is left untouched. */
+static void __bheap_link(struct bheap_node* root,
+                        struct bheap_node* child)
+{
+        child->parent = root;
+        /* the previous first child becomes child's sibling */
+        child->next   = root->child;
+        root->child   = child;
+        root->degree++;
+}
+/* Merge two root lists into one list ordered by non-decreasing degree.
+ * On equal degree the node from b is taken first. Returns the head of
+ * the merged list (NULL if both inputs are NULL). */
+static  struct bheap_node* __bheap_merge(struct bheap_node* a,
+                                             struct bheap_node* b)
+{
+        struct bheap_node* merged = NULL;
+        struct bheap_node** tail = &merged;
+        struct bheap_node** src;
+        while (a && b) {
+                /* pick the list whose front node goes next */
+                src   = (a->degree < b->degree) ? &a : &b;
+                *tail = *src;
+                *src  = (*src)->next;
+                tail  = &(*tail)->next;
+        }
+        /* append whatever is left of the non-exhausted list */
+        *tail = a ? a : b;
+        return merged;
+}
+/* Reverse a sibling list in place and clear the parent pointer of every
+ * node on it. Returns the new head (the former last node), or NULL if
+ * the list was empty. */
+static  struct bheap_node* __bheap_reverse(struct bheap_node* h)
+{
+        struct bheap_node* reversed = NULL;
+        struct bheap_node* following;
+        while (h) {
+                following = h->next;
+                h->parent = NULL;
+                h->next   = reversed;
+                reversed  = h;
+                h         = following;
+        }
+        return reversed;
+}
+/* Scan the root list for the highest-priority root. On return *node is
+ * that root (NULL for an empty root list) and *prev is its predecessor
+ * in the root list (NULL when *node is the first root). */
+static  void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
+                              struct bheap_node** prev, struct bheap_node** node)
+{
+        struct bheap_node *before, *it;
+        *prev = NULL;
+        *node = heap->head;
+        if (!heap->head)
+                return;
+        before = heap->head;
+        for (it = heap->head->next; it; it = it->next) {
+                if (higher_prio(it, *node)) {
+                        *node = it;
+                        *prev = before;
+                }
+                before = it;
+        }
+}
+/* Merge the root list h2 into heap's root list and then consolidate the
+ * result so that adjacent roots of equal degree are linked into a single
+ * tree. higher_prio decides which of two equal-degree roots becomes the
+ * parent. Does nothing if h2 is NULL; simply adopts h2 if the heap has no
+ * root list yet. The cached minimum (heap->min) is not touched here. */
+static  void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
+                                struct bheap_node* h2)
+{
+        struct bheap_node* h1;
+        struct bheap_node *prev, *x, *next;
+        if (!h2)
+                return;
+        h1 = heap->head;
+        if (!h1) {
+                heap->head = h2;
+                return;
+        }
+        /* degree-ordered merge of both root lists */
+        h1 = __bheap_merge(h1, h2);
+        /* walk the merged list with a (prev, x, next) window */
+        prev = NULL;
+        x    = h1;
+        next = x->next;
+        while (next) {
+                if (x->degree != next->degree ||
+                    (next->next && next->next->degree == x->degree)) {
+                        /* nothing to do, advance: either the degrees
+                         * differ, or three roots in a row share a degree
+                         * and the last two get linked in a later step */
+                        prev = x;
+                        x    = next;
+                } else if (higher_prio(x, next)) {
+                        /* x becomes the root of next */
+                        x->next = next->next;
+                        __bheap_link(x, next);
+                } else {
+                        /* next becomes the root of x */
+                        if (prev)
+                                prev->next = next;
+                        else
+                                h1 = next;
+                        __bheap_link(next, x);
+                        x = next;
+                }
+                next = x->next;
+        }
+        heap->head = h1;
+}
+/* Remove the highest-priority root from heap's root list and return it,
+ * or return NULL if the root list is empty. The removed node's children
+ * are reversed (which also clears their parent pointers) and merged back
+ * into the heap. The returned node's own link fields are not reset. */
+static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
+                                            struct bheap* heap)
+{
+        struct bheap_node *prev, *node;
+        __bheap_min(higher_prio, heap, &prev, &node);
+        if (!node)
+                return NULL;
+        /* unlink node from the root list */
+        if (prev)
+                prev->next = node->next;
+        else
+                heap->head = node->next;
+        __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
+        return node;
+}
+/* Insert (and reinitialize) a node into the heap.
+ *
+ * The node's link fields are reset and its degree set to 0 before use.
+ * If the heap currently has a cached minimum and the new node has higher
+ * priority than it, the new node takes over the cache slot and the old
+ * cached minimum is pushed back into the root list instead; otherwise
+ * the new node is merged into the root list directly. */
+void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
+                 struct bheap_node* node)
+{
+        struct bheap_node *min;
+        node->child  = NULL;
+        node->parent = NULL;
+        node->next   = NULL;
+        node->degree = 0;
+        if (heap->min && higher_prio(node, heap->min)) {
+                /* swap min cache */
+                min = heap->min;
+                /* turn the old cached node into a single-node tree */
+                min->child  = NULL;
+                min->parent = NULL;
+                min->next   = NULL;
+                min->degree = 0;
+                __bheap_union(higher_prio, heap, min);
+                heap->min   = node;
+        } else
+                __bheap_union(higher_prio, heap, node);
+}
+/* If the heap holds a cached minimum, clear the cache slot and re-insert
+ * that node through bheap_insert(). No-op when nothing is cached. */
+void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
+{
+        struct bheap_node* cached = heap->min;
+        if (!cached)
+                return;
+        heap->min = NULL;
+        bheap_insert(higher_prio, heap, cached);
+}
+/* Merge addition into target.
+ *
+ * Both heaps first get their cached minimum (if any) pushed back into
+ * their root lists, then addition's root list is merged into target.
+ * Afterwards addition's root list is empty. */
+void bheap_union(bheap_prio_t higher_prio,
+                struct bheap* target, struct bheap* addition)
+{
+        /* first insert any cached minima, if necessary */
+        bheap_uncache_min(higher_prio, target);
+        bheap_uncache_min(higher_prio, addition);
+        __bheap_union(higher_prio, target, addition->head);
+        /* this is a destructive merge */
+        addition->head = NULL;
+}
+/* Return the highest-priority node without handing it out, or NULL if
+ * the heap is empty. If no minimum is cached yet, it is extracted from
+ * the root list and parked in heap->min, so a peek does modify the heap
+ * structure. */
+struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
+                            struct bheap* heap)
+{
+        if (!heap->min)
+                heap->min = __bheap_extract_min(higher_prio, heap);
+        return heap->min;
+}
+/* Remove and return the highest-priority node, or NULL if the heap is
+ * empty. A cached minimum is used if present; otherwise the node is
+ * extracted from the root list. The cache slot is left empty and the
+ * returned node is marked NOT_IN_HEAP. */
+struct bheap_node* bheap_take(bheap_prio_t higher_prio,
+                            struct bheap* heap)
+{
+        struct bheap_node *taken = heap->min;
+        if (!taken)
+                taken = __bheap_extract_min(higher_prio, heap);
+        heap->min = NULL;
+        if (taken)
+                taken->degree = NOT_IN_HEAP;
+        return taken;
+}
+/* Restore heap order after the priority of node's value was raised.
+ *
+ * The value is bubbled up towards the root of its tree for as long as it
+ * has higher priority than its parent. Nodes are not relinked; instead
+ * the value pointers and the back-references (ref) of node and parent
+ * are exchanged, and the external pointers that the refs designate are
+ * updated to follow their values.
+ *
+ * Returns non-zero if bubbling stopped underneath some parent, and zero
+ * if the value ended up in a node without a parent. */
+int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
+{
+        struct bheap_node  *parent;
+        struct bheap_node** tmp_ref;
+        void* tmp;
+        /* bubble up */
+        parent = node->parent;
+        while (parent && higher_prio(node, parent)) {
+                /* swap parent and node */
+                tmp           = parent->value;
+                parent->value = node->value;
+                node->value   = tmp;
+                /* swap references: each external pointer must now name
+                 * the node that holds its value after the swap */
+                *(parent->ref) = node;
+                *(node->ref)   = parent;
+                tmp_ref        = parent->ref;
+                parent->ref    = node->ref;
+                node->ref      = tmp_ref;
+                /* step up */
+                node   = parent;
+                parent = node->parent;
+        }
+        return parent != NULL;
+}
+/* Remove node's value from the heap.
+ *
+ * If node is the cached minimum, the cache slot is simply cleared.
+ * Otherwise the value is bubbled unconditionally to the root of its tree
+ * (swapping values and back-references with each ancestor, as in
+ * bheap_decrease()), that root is unlinked from the root list, and its
+ * children are merged back into the heap.
+ *
+ * Because values are swapped, the node object that finally gets marked
+ * NOT_IN_HEAP is the former root of the tree, which by then holds the
+ * deleted value.
+ *
+ * The root-list search does not check for the end of the list, so node
+ * must actually be part of this heap. */
+void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
+                 struct bheap_node* node)
+{
+        struct bheap_node *parent, *prev, *pos;
+        struct bheap_node** tmp_ref;
+        void* tmp;
+        if (heap->min != node) {
+                /* bubble up */
+                parent = node->parent;
+                while (parent) {
+                        /* swap parent and node */
+                        tmp           = parent->value;
+                        parent->value = node->value;
+                        node->value   = tmp;
+                        /* swap references */
+                        *(parent->ref) = node;
+                        *(node->ref)   = parent;
+                        tmp_ref        = parent->ref;
+                        parent->ref    = node->ref;
+                        node->ref      = tmp_ref;
+                        /* step up */
+                        node   = parent;
+                        parent = node->parent;
+                }
+                /* now delete:
+                 * first find prev */
+                prev = NULL;
+                pos  = heap->head;
+                while (pos != node) {
+                        prev = pos;
+                        pos  = pos->next;
+                }
+                /* we have prev, now remove node */
+                if (prev)
+                        prev->next = node->next;
+                else
+                        heap->head = node->next;
+                __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
+        } else
+                heap->min = NULL;
+        node->degree = NOT_IN_HEAP;
+}
+/* Allocate a heap node for value and insert it into the heap.
+ *
+ * Returns non-zero on success and zero if the node allocation failed.
+ *
+ * NOTE(review): bheap_node_init() stores the pointer it is given as the
+ * node's back-reference (ref). Here that is the address of the local
+ * variable hn, which is no longer valid once this function returns.
+ * bheap_decrease() and bheap_delete() write through ref, so nodes created
+ * here look unsafe to use with those two functions -- confirm with the
+ * callers before relying on that combination. */
+int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
+             void* value, int gfp_flags)
+{
+        struct bheap_node* hn = bheap_node_alloc(gfp_flags);
+        if (likely(hn)) {
+                bheap_node_init(&hn, value);
+                bheap_insert(higher_prio, heap, hn);
+        }
+        return hn != NULL;
+}
+/* Remove the highest-priority node from the heap, free the node and
+ * return the value it carried. Returns NULL if the heap was empty. */
+void* bheap_take_del(bheap_prio_t higher_prio,
+                    struct bheap* heap)
+{
+        struct bheap_node* taken = bheap_take(higher_prio, heap);
+        void* value;
+        if (!taken)
+                return NULL;
+        value = taken->value;
+        bheap_node_free(taken);
+        return value;
+}
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
new file mode 100644
index 000000000000..6677a67cc945
--- /dev/null
+++ b/litmus/ctrldev.c
@@ -0,0 +1,150 @@
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <litmus/litmus.h>
+/* only one page for now, but we might want to add a RO version at some point */
+#define CTRL_NAME        "litmus/ctrl"
+/* Make sure task t has a control page (t->rt_param.ctrl_page).
+ *
+ * A zeroed page is allocated only if the task has none yet.
+ * Returns 0 on success (including the already-allocated case) and
+ * -ENOMEM if the allocation failed. */
+static int alloc_ctrl_page(struct task_struct *t)
+{
+        /* nothing to do if the task already owns a page */
+        if (tsk_rt(t)->ctrl_page)
+                return 0;
+        /* will get de-allocated in task teardown */
+        tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
+        TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
+                   tsk_rt(t)->ctrl_page);
+        return tsk_rt(t)->ctrl_page ? 0 : -ENOMEM;
+}
+/* Map task t's control page at the start of vma.
+ *
+ * Takes an extra reference on the page and maps it with PAGE_SHARED
+ * protection via remap_pfn_range(). If the mapping fails, the extra
+ * reference is dropped again so that the page is not pinned by a mapping
+ * that does not exist.
+ *
+ * Returns 0 on success or the error reported by remap_pfn_range(). */
+static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
+{
+        int err;
+        unsigned long pfn;
+        struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
+        /* Increase ref count on behalf of the mapping. */
+        get_page(ctrl);
+        /* compute page frame number */
+        pfn = page_to_pfn(ctrl);
+        /* vm_page_prot is a pgprot_t, not an integer: unwrap it for %lx */
+        TRACE_CUR(CTRL_NAME
+                  ": mapping %p (pfn:%lx, %lx) to 0x%lx (prot:%lx)\n",
+                  tsk_rt(t)->ctrl_page, pfn, page_to_pfn(ctrl), vma->vm_start,
+                  (unsigned long) pgprot_val(vma->vm_page_prot));
+        /* Map it into the vma. Make sure to use PAGE_SHARED, otherwise
+         * userspace actually gets a copy-on-write page. */
+        err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, PAGE_SHARED);
+        if (err) {
+                TRACE_CUR(CTRL_NAME ": remap_pfn_range() failed (%d)\n", err);
+                /* no mapping was set up; undo the get_page() above */
+                put_page(ctrl);
+        }
+        return err;
+}
+/* vm_operations close hook for the control-page mapping. It only emits
+ * trace output describing the vma that is being closed. */
+static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
+{
+        /* vm_flags is an unsigned long and vm_page_prot is a pgprot_t, so
+         * both are printed via %lx (the latter unwrapped by pgprot_val()) */
+        TRACE_CUR("%s flags=0x%lx prot=0x%lx\n", __FUNCTION__,
+                  vma->vm_flags,
+                  (unsigned long) pgprot_val(vma->vm_page_prot));
+        /* exactly one argument per %p conversion */
+        TRACE_CUR(CTRL_NAME
+                  ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
+                  (void*) vma->vm_start, (void*) vma->vm_end, vma,
+                  vma->vm_private_data);
+}
+/* vm_operations fault hook for the control-page mapping.
+ *
+ * The single page is mapped eagerly in mmap(), so a fault on this vma is
+ * not expected. The fault is traced and rejected.
+ *
+ * Always returns VM_FAULT_SIGBUS. */
+static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
+                                      struct vm_fault* vmf)
+{
+        /* This function should never be called, since
+         * all pages should have been mapped by mmap()
+         * already. */
+        /* vm_flags is an unsigned long, hence %lx */
+        TRACE_CUR("%s flags=0x%lx\n", __FUNCTION__, vma->vm_flags);
+        /* nope, you only get one page */
+        return VM_FAULT_SIGBUS;
+}
+/* VMA callbacks installed on every control-page mapping by
+ * litmus_ctrl_mmap(). */
+static struct vm_operations_struct litmus_ctrl_vm_ops = {
+        .close = litmus_ctrl_vm_close,
+        .fault = litmus_ctrl_vm_fault,
+};
+/* mmap() handler of the control device.
+ *
+ * Accepts only a private mapping of exactly one page at page offset 0.
+ * The calling task's control page is allocated on first use and mapped
+ * into the vma. The mapping is marked so that it is neither copied on
+ * fork nor expandable.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported mapping request, or
+ * the error from allocating/mapping the page. */
+static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
+{
+        int err = 0;
+        /* first make sure mapper knows what he's doing */
+        /* you can only get one page */
+        if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+                return -EINVAL;
+        /* you can only map the "first" page */
+        if (vma->vm_pgoff != 0)
+                return -EINVAL;
+        /* you can't share it with anyone */
+        if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+                return -EINVAL;
+        vma->vm_ops = &litmus_ctrl_vm_ops;
+        /* this mapping should not be kept across forks,
+         * and cannot be expanded */
+        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+        err = alloc_ctrl_page(current);
+        if (!err)
+                err = map_ctrl_page(current, vma);
+        /* vm_flags is an unsigned long and vm_page_prot is a pgprot_t, so
+         * both are printed via %lx (the latter unwrapped by pgprot_val()) */
+        TRACE_CUR("%s flags=0x%lx prot=0x%lx\n",
+                  __FUNCTION__, vma->vm_flags,
+                  (unsigned long) pgprot_val(vma->vm_page_prot));
+        return err;
+}
+/* File operations of the control device; mmap() is the only operation
+ * provided. */
+static struct file_operations litmus_ctrl_fops = {
+        .owner = THIS_MODULE,
+        .mmap  = litmus_ctrl_mmap,
+};
+/* Misc-device descriptor for the control device: named CTRL_NAME, with a
+ * dynamically assigned minor number. */
+static struct miscdevice litmus_ctrl_dev = {
+        .name  = CTRL_NAME,
+        .minor = MISC_DYNAMIC_MINOR,
+        .fops  = &litmus_ctrl_fops,
+};
+/* Module init: register the control misc device.
+ *
+ * The build fails if struct control_page does not fit into one page.
+ * Returns the result of misc_register(). */
+static int __init init_litmus_ctrl_dev(void)
+{
+        int ret;
+        BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
+        printk("Initializing LITMUS^RT control device.\n");
+        ret = misc_register(&litmus_ctrl_dev);
+        if (ret != 0)
+                printk("Could not allocate %s device (%d).\n", CTRL_NAME, ret);
+        return ret;
+}
+/* Module exit: unregister the control misc device. */
+static void __exit exit_litmus_ctrl_dev(void)
+{
+        misc_deregister(&litmus_ctrl_dev);
+}
+module_init(init_litmus_ctrl_dev);
+module_exit(exit_litmus_ctrl_dev);
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 000000000000..06daec66c984
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,102 @@
+/*
+ * kernel/edf_common.c
+ *
+ * Common functions for EDF based scheduler.
+ */
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/edf_common.h>
+/* edf_higher_prio -  returns true if first has a higher EDF priority
+ *                    than second. Deadline ties are broken by PID.
+ *
+ * both first and second may be NULL
+ *
+ * If a task has an inherited priority (rt_param.inh_task is set), the
+ * task it inherits from is used for the comparison instead.
+ * Comparing a non-NULL task with itself traces a warning and returns 0.
+ * A NULL first never has higher priority; a non-NULL first always beats
+ * a second that is NULL or not a real-time task.
+ */
+int edf_higher_prio(struct task_struct* first,
+                    struct task_struct* second)
+{
+        struct task_struct *first_task = first;
+        struct task_struct *second_task = second;
+        /* There is no point in comparing a task to itself. */
+        if (first && first == second) {
+                TRACE_TASK(first,
+                           "WARNING: pointless edf priority comparison.\n");
+                return 0;
+        }
+        /* Check for inherited priorities. Change task
+         * used for comparison in such a case.
+         */
+        if (first && first->rt_param.inh_task)
+                first_task = first->rt_param.inh_task;
+        if (second && second->rt_param.inh_task)
+                second_task = second->rt_param.inh_task;
+        return
+                /* it has to exist in order to have higher priority */
+                first_task && (
+                /* does the second task exist and is it a real-time task?  If
+                 * not, the first task (which is a RT task) has higher
+                 * priority.
+                 */
+                !second_task || !is_realtime(second_task)  ||
+                /* is the deadline of the first task earlier?
+                 * Then it has higher priority.
+                 */
+                earlier_deadline(first_task, second_task) ||
+                /* Do we have a deadline tie?
+                 * Then break by PID.
+                 */
+                (get_deadline(first_task) == get_deadline(second_task) &&
+                (first_task->pid < second_task->pid ||
+                /* If the PIDs are the same then the task with the inherited
+                 * priority wins. (second is known to be non-NULL here,
+                 * because second_task is non-NULL.)
+                 */
+                (first_task->pid == second_task->pid &&
+                 !second->rt_param.inh_task))));
+}
+/* Heap ordering callback: applies edf_higher_prio() to the tasks that
+ * the two heap nodes refer to (obtained via bheap2task()). */
+int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+        return edf_higher_prio(bheap2task(a), bheap2task(b));
+}
+/* Initialize rt as an EDF domain: a thin wrapper around rt_domain_init()
+ * that supplies edf_ready_order as the ordering function and passes the
+ * resched and release callbacks through unchanged. */
+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+                      release_jobs_t release)
+{
+        rt_domain_init(rt,  edf_ready_order, resched, release);
+}
+/* edf_preemption_needed - check whether the task t needs to be preempted
+ *                   call only with irqs disabled and with  ready_lock acquired
+ *                   THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
+ *
+ * Returns 0 if no job is pending in rt; 1 if t is NULL or not a
+ * real-time task; otherwise whether the next ready job has higher EDF
+ * priority than t.
+ */
+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
+{
+        /* nothing pending means nothing could preempt */
+        if (!__jobs_pending(rt))
+                return 0;
+        /* without a current task we have to reschedule */
+        if (!t)
+                return 1;
+        /* NOTE: We cannot check for non-preemptibility since we
+         *       don't know what address space we're currently in.
+         */
+        /* non-rt tasks always yield to a pending real-time job */
+        if (!is_realtime(t))
+                return 1;
+        return edf_higher_prio(__next_ready(rt), t) != 0;
+}
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 000000000000..85be716941d8
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,281 @@
+/* fdso.c - file descriptor attached shared objects
+ *
+ * (c) 2007 B. Brandenburg, LITMUS^RT project
+ *
+ * Notes:
+ *   - objects descriptor (OD) tables are not cloned during a fork.
+ *   - objects are created on-demand, and freed after the last reference
+ *     is dropped.
+ *   - for now, object types are hard coded.
+ *   - As long as we have live objects, we keep a reference to the inode.
+ */
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <asm/uaccess.h>
+#include <litmus/fdso.h>
+extern struct fdso_ops fmlp_sem_ops;
+extern struct fdso_ops srp_sem_ops;
+/* Table of per-type operations, indexed by obj_type_t in the helpers
+ * below.
+ * NOTE(review): the entry order presumably has to match the numeric
+ * values of obj_type_t -- confirm against the enum definition. */
+static const struct fdso_ops* fdso_ops[] = {
+        &fmlp_sem_ops,
+        &srp_sem_ops,
+};
+/* Create a new object of the given type through its create() hook.
+ * Returns NULL if the type has no create() hook (or the hook itself
+ * returns NULL). */
+static void* fdso_create(obj_type_t type)
+{
+        const struct fdso_ops* ops = fdso_ops[type];
+        return ops->create ? ops->create() : NULL;
+}
+/* Destroy obj through the destroy() hook of its type. The hook is called
+ * unconditionally, i.e. every type is expected to provide one. */
+static void fdso_destroy(obj_type_t type, void* obj)
+{
+        fdso_ops[type]->destroy(obj);
+}
+/* Run the type-specific open() hook for the object behind entry.
+ * Returns the hook's result, or 0 if the type has no open() hook. */
+static int fdso_open(struct od_table_entry* entry, void* __user config)
+{
+        const struct fdso_ops* ops = fdso_ops[entry->obj->type];
+        if (!ops->open)
+                return 0;
+        return ops->open(entry, config);
+}
+/* Run the type-specific close() hook for the object behind entry.
+ * Returns the hook's result, or 0 if the type has no close() hook. */
+static int fdso_close(struct od_table_entry* entry)
+{
+        const struct fdso_ops* ops = fdso_ops[entry->obj->type];
+        if (!ops->close)
+                return 0;
+        return ops->close(entry);
+}
+/* Create a new shared object of the given type and id and attach it to
+ * inode's object list.
+ *
+ * inode must be locked already.
+ *
+ * The new bookkeeping record starts with a reference count of 1 and
+ * pins the inode by incrementing its i_count.
+ *
+ * Returns the new record, or NULL if either the type-specific object or
+ * the record could not be allocated. In the latter case the already
+ * created type-specific object is destroyed again so it does not leak.
+ */
+static struct inode_obj_id* alloc_inode_obj(struct inode* inode,
+                                            obj_type_t type,
+                                            unsigned int id)
+{
+        struct inode_obj_id* obj;
+        void* raw_obj;
+        raw_obj = fdso_create(type);
+        if (!raw_obj)
+                return NULL;
+        obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+        if (!obj) {
+                /* nobody references raw_obj yet; release it here */
+                fdso_destroy(type, raw_obj);
+                return NULL;
+        }
+        INIT_LIST_HEAD(&obj->list);
+        atomic_set(&obj->count, 1);
+        obj->type  = type;
+        obj->id    = id;
+        obj->obj   = raw_obj;
+        obj->inode = inode;
+        list_add(&obj->list, &inode->i_obj_list);
+        /* keep the inode around for as long as the object lives */
+        atomic_inc(&inode->i_count);
+        printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
+        return obj;
+}
+/* Look up the object with the given type and id on inode's object list.
+ * On a match its reference count is incremented and the record returned;
+ * otherwise a debug message is logged and NULL is returned.
+ *
+ * inode must be locked already.
+ */
+static struct inode_obj_id* get_inode_obj(struct inode* inode,
+                                          obj_type_t type,
+                                          unsigned int id)
+{
+        struct inode_obj_id* cur;
+        list_for_each_entry(cur, &inode->i_obj_list, list) {
+                if (cur->id != id || cur->type != type)
+                        continue;
+                atomic_inc(&cur->count);
+                return cur;
+        }
+        printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
+        return NULL;
+}
+/* Drop one reference to obj.
+ *
+ * When the count reaches zero, the inode's object mutex is taken and the
+ * count is checked again; get_inode_obj() increments the count of
+ * objects it finds on the list, so the object may have been picked up
+ * again in the meantime. Only if the count is still zero is the object
+ * unlinked, destroyed and freed. The inode reference taken when the
+ * object was created is released after the mutex has been dropped. */
+static void put_inode_obj(struct inode_obj_id* obj)
+{
+        struct inode* inode;
+        int let_go = 0;
+        /* remember the inode: obj may be freed below */
+        inode = obj->inode;
+        if (atomic_dec_and_test(&obj->count)) {
+                mutex_lock(&inode->i_obj_mutex);
+                /* no new references can be obtained */
+                if (!atomic_read(&obj->count)) {
+                        list_del(&obj->list);
+                        fdso_destroy(obj->type, obj->obj);
+                        kfree(obj);
+                        let_go = 1;
+                }
+                mutex_unlock(&inode->i_obj_mutex);
+                if (let_go)
+                        iput(inode);
+        }
+}
+/* Reserve a free slot in task t's object descriptor table.
+ *
+ * The table (MAX_OBJECT_DESCRIPTORS zeroed entries) is allocated on
+ * first use. Returns the reserved entry with its used flag set, or NULL
+ * if the table could not be allocated or every slot is taken. */
+static struct od_table_entry*  get_od_entry(struct task_struct* t)
+{
+        struct od_table_entry* tbl = t->od_table;
+        int slot;
+        if (!tbl) {
+                tbl = kzalloc(sizeof(*tbl) * MAX_OBJECT_DESCRIPTORS,
+                              GFP_KERNEL);
+                t->od_table = tbl;
+        }
+        if (!tbl)
+                return NULL;
+        for (slot = 0; slot < MAX_OBJECT_DESCRIPTORS; slot++) {
+                if (tbl[slot].used)
+                        continue;
+                tbl[slot].used = 1;
+                return tbl + slot;
+        }
+        return NULL;
+}
+/* Release an object descriptor: drop the reference it holds on its
+ * object and mark the table slot as free. Always returns 0. */
+static int put_od_entry(struct od_table_entry* od)
+{
+        put_inode_obj(od->obj);
+        od->used = 0;
+        return 0;
+}
+/* Tear down task t's object descriptor table: release every descriptor
+ * that is still in use, free the table and clear the task's pointer to
+ * it. No-op if the task has no table. */
+void exit_od_table(struct task_struct* t)
+{
+        int slot;
+        if (!t->od_table)
+                return;
+        for (slot = 0; slot < MAX_OBJECT_DESCRIPTORS; slot++) {
+                if (t->od_table[slot].used)
+                        put_od_entry(t->od_table + slot);
+        }
+        kfree(t->od_table);
+        t->od_table = NULL;
+}
+/* Open (creating it on demand) the object of the given type and id that
+ * is attached to file's inode, and bind it to a fresh object descriptor
+ * of the current task.
+ *
+ * Returns the descriptor index on success. Returns -ENOMEM if no
+ * descriptor slot is available or the object could not be found or
+ * created. If the type-specific open() hook rejects the request, the
+ * descriptor is released and the hook's negative error is returned.
+ *
+ * The open() hook is only invoked once an object has actually been
+ * attached to the entry: it dereferences entry->obj, which is not valid
+ * on the failure path.
+ */
+static int do_sys_od_open(struct file* file, obj_type_t type, int id,
+                          void* __user config)
+{
+        int idx, err;
+        struct inode* inode;
+        struct inode_obj_id* obj = NULL;
+        struct od_table_entry* entry;
+        inode = file->f_dentry->d_inode;
+        entry = get_od_entry(current);
+        if (!entry)
+                return -ENOMEM;
+        mutex_lock(&inode->i_obj_mutex);
+        obj = get_inode_obj(inode, type, id);
+        if (!obj)
+                obj = alloc_inode_obj(inode, type, id);
+        if (obj) {
+                entry->obj   = obj;
+                entry->extra = NULL;
+        } else {
+                /* give the reserved slot back */
+                entry->used = 0;
+        }
+        mutex_unlock(&inode->i_obj_mutex);
+        /* without an object there is nothing to open */
+        if (!obj)
+                return -ENOMEM;
+        idx = entry - current->od_table;
+        err = fdso_open(entry, config);
+        if (err < 0) {
+                /* The class rejected the open call.
+                 * We need to clean up and tell user space.
+                 */
+                put_od_entry(entry);
+                idx = err;
+        }
+        return idx;
+}
+/* Translate an object descriptor of the current task into its table
+ * entry. Returns NULL if the task has no table, od is out of range, or
+ * the slot is not in use. */
+struct od_table_entry* __od_lookup(int od)
+{
+        struct od_table_entry* tbl = current->od_table;
+        if (!tbl || od < 0 || od >= MAX_OBJECT_DESCRIPTORS || !tbl[od].used)
+                return NULL;
+        return tbl + od;
+}
+/* System call: open an object descriptor for the object (type, obj_id)
+ * attached to the file behind fd.
+ *
+ * The type is range-checked, the file is looked up from fd, and the
+ * actual work (object lookup or creation under the inode's object
+ * mutex, descriptor allocation) is delegated to do_sys_od_open().
+ *
+ * Returns the object descriptor, -EINVAL for an invalid type, -EBADF if
+ * fd does not refer to an open file, or the error from do_sys_od_open().
+ */
+asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
+{
+        struct file* file;
+        int od;
+        if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE)
+                return -EINVAL;
+        file = fget(fd);
+        if (!file)
+                return -EBADF;
+        od = do_sys_od_open(file, type, obj_id, config);
+        /* the file reference is only needed for the duration of the call */
+        fput(file);
+        return od;
+}
+/* System call: close object descriptor od of the current task.
+ *
+ * Returns -EINVAL if od is out of range or not in use. Otherwise the
+ * type-specific close() hook runs first; a non-zero result from it is
+ * returned as is and the descriptor stays open. If the hook accepts,
+ * the descriptor is released and that result is returned. */
+asmlinkage long sys_od_close(int od)
+{
+        struct od_table_entry* tbl = current->od_table;
+        int ret;
+        if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
+                return -EINVAL;
+        if (!tbl || !tbl[od].used)
+                return -EINVAL;
+        /* give the class a chance to reject the close */
+        ret = fdso_close(tbl + od);
+        if (ret != 0)
+                return ret;
+        return put_od_entry(tbl + od);
+}
diff --git a/litmus/fmlp.c b/litmus/fmlp.c
new file mode 100644
index 000000000000..03fa7358d5eb
--- /dev/null
+++ b/litmus/fmlp.c
@@ -0,0 +1,268 @@
+/*
+ * FMLP implementation.
+ * Much of the code here is borrowed from include/asm-i386/semaphore.h
+ */
+#include <asm/atomic.h>
+#include <linux/semaphore.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/fdso.h>
+#include <litmus/trace.h>
+#ifdef CONFIG_FMLP
+/* fdso "create" callback: allocate an FMLP semaphore with count 1, no
+ * sleepers, an empty wait queue and no holder / highest-priority waiter.
+ * Returns the new object, or NULL if kmalloc() fails.
+ */
+static  void* create_fmlp_semaphore(void)
+{
+        struct pi_semaphore *s = kmalloc(sizeof(*s), GFP_KERNEL);
+        int cpu;
+        if (!s)
+                return NULL;
+        /* nobody holds or waits for the semaphore yet */
+        s->holder  = NULL;
+        s->hp.task = NULL;
+        for (cpu = 0; cpu < NR_CPUS; cpu++)
+                s->hp.cpu_task[cpu] = NULL;
+        s->sleepers = 0;
+        atomic_set(&s->count, 1);
+        init_waitqueue_head(&s->wait);
+        return s;
+}
+/* fdso "open" callback: opening succeeds (0) only while fmlp_active()
+ * reports true; otherwise -EBUSY. @entry and @arg are not used.
+ */
+static int open_fmlp_semaphore(struct od_table_entry* entry, void* __user arg)
+{
+        return fmlp_active() ? 0 : -EBUSY;
+}
+/* fdso "destroy" callback: release the memory of a semaphore that was
+ * allocated by create_fmlp_semaphore(). No state is checked before freeing.
+ */
+static void destroy_fmlp_semaphore(void* sem)
+{
+        /* XXX assert invariants */
+        kfree(sem);
+}
+/* fdso operations table for FMLP semaphores. Only create, open and destroy
+ * are provided; the remaining members stay zero-initialised.
+ */
+struct fdso_ops fmlp_sem_ops = {
+        .create  = create_fmlp_semaphore,
+        .open    = open_fmlp_semaphore,
+        .destroy = destroy_fmlp_semaphore
+};
+/* Private data hung off a wait-queue entry by do_fmlp_down(): the task that
+ * is blocking and the semaphore it is blocking on. rt_pi_wake_up() reads it.
+ */
+struct wq_pair {
+        struct task_struct*  tsk;       /* the waiting task            */
+        struct pi_semaphore* sem;       /* the semaphore being awaited */
+};
+/* Wake function installed by do_fmlp_down() on its wait-queue entry.
+ * Marks the waiter with RT_F_EXIT_SEM, asks the active plugin to apply
+ * priority inheritance for (sem, waiter), and then performs the actual
+ * wake-up through default_wake_function().
+ * Always returns 1, regardless of what default_wake_function() reports.
+ */
+static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+                           void *key)
+{
+        /* ->private carries the wq_pair set up by do_fmlp_down() */
+        struct wq_pair* wqp   = (struct wq_pair*) wait->private;
+        set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
+        litmus->inherit_priority(wqp->sem, wqp->tsk);
+        TRACE_TASK(wqp->tsk,
+                   "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
+        /* point to task for default_wake_function() */
+        wait->private = wqp->tsk;
+        default_wake_function(wait, mode, sync, key);
+        /* Always return true since we know that if we encountered a task
+         * that was already running the wake_up raced with the schedule in
+         * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
+         * immediately and own the lock. We must not wake up another task in
+         * any case.
+         */
+        return 1;
+}
+/* Recompute sem->hp.task from the tasks queued on sem->wait.
+ * The field is reset to NULL and then replaced by every waiter for which
+ * edf_higher_prio(waiter, current candidate) holds.
+ * Returns 1 if the field was assigned at least once, 0 otherwise.
+ * The caller is responsible for locking.
+ */
+int edf_set_hp_task(struct pi_semaphore *sem)
+{
+        struct list_head *pos, *nxt;
+        wait_queue_t *entry;
+        struct wq_pair *pair;
+        int changed = 0;
+        sem->hp.task = NULL;
+        list_for_each_safe(pos, nxt, &sem->wait.task_list) {
+                entry = list_entry(pos, wait_queue_t, task_list);
+                pair  = (struct wq_pair*) entry->private;
+                if (!edf_higher_prio(pair->tsk, sem->hp.task))
+                        continue;
+                /* better candidate found */
+                sem->hp.task = pair->tsk;
+                changed = 1;
+        }
+        return changed;
+}
+/* Recompute sem->hp.cpu_task[cpu] from the tasks queued on sem->wait.
+ * Only waiters whose get_partition() equals @cpu are considered; among
+ * those, each one for which edf_higher_prio(waiter, current candidate)
+ * holds replaces the candidate.
+ * Returns 1 if the slot was assigned at least once, 0 otherwise.
+ * The caller is responsible for locking.
+ */
+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
+{
+        struct list_head *pos, *nxt;
+        wait_queue_t *entry;
+        struct wq_pair *pair;
+        int changed = 0;
+        sem->hp.cpu_task[cpu] = NULL;
+        list_for_each_safe(pos, nxt, &sem->wait.task_list) {
+                entry = list_entry(pos, wait_queue_t, task_list);
+                pair  = (struct wq_pair*) entry->private;
+                if (get_partition(pair->tsk) != cpu)
+                        continue;
+                if (!edf_higher_prio(pair->tsk, sem->hp.cpu_task[cpu]))
+                        continue;
+                /* better candidate on this partition */
+                sem->hp.cpu_task[cpu] = pair->tsk;
+                changed = 1;
+        }
+        return changed;
+}
+/* Acquire @sem on behalf of the current task.
+ *
+ * Under sem->wait.lock the count is decremented. If the result is negative
+ * or other tasks are already queued, the caller is appended to the wait
+ * queue (exclusive, with rt_pi_wake_up() as wake function), the plugin is
+ * told via pi_block(), and the task sleeps in schedule(). Otherwise the
+ * caller becomes the holder right away and the plugin's inherit_priority()
+ * is invoked.
+ *
+ * Returns 1 if the task had to suspend, 0 if the lock was taken without
+ * contention. On the suspending path this function itself issues
+ * TS_PI_DOWN_END and preempt_enable_no_resched(); sys_fmlp_down() only does
+ * so when 0 is returned.
+ */
+static int do_fmlp_down(struct pi_semaphore* sem)
+{
+        unsigned long flags;
+        struct task_struct *tsk = current;
+        struct wq_pair pair;
+        int suspended = 1;
+        /* on-stack wait-queue entry; ->private is decoded by rt_pi_wake_up() */
+        wait_queue_t wait = {
+                .private = &pair,
+                .func    = rt_pi_wake_up,
+                .task_list = {NULL, NULL}
+        };
+        pair.tsk = tsk;
+        pair.sem = sem;
+        spin_lock_irqsave(&sem->wait.lock, flags);
+        if (atomic_dec_return(&sem->count) < 0 ||
+            waitqueue_active(&sem->wait)) {
+                /* we need to suspend */
+                tsk->state = TASK_UNINTERRUPTIBLE;
+                add_wait_queue_exclusive_locked(&sem->wait, &wait);
+                TRACE_CUR("suspends on PI lock %p\n", sem);
+                litmus->pi_block(sem, tsk);
+                /* release lock before sleeping */
+                spin_unlock_irqrestore(&sem->wait.lock, flags);
+                TS_PI_DOWN_END;
+                preempt_enable_no_resched();
+                /* we depend on the FIFO order
+                 * Thus, we don't need to recheck when we wake up, we
+                 * are guaranteed to have the lock since there is only one
+                 * wake up per release
+                 */
+                schedule();
+                TRACE_CUR("woke up, now owns PI lock %p\n", sem);
+                /* try_to_wake_up() set our state to TASK_RUNNING,
+                 * all we need to do is to remove our wait queue entry
+                 */
+                remove_wait_queue(&sem->wait, &wait);
+        } else {
+                /* no priority inheritance necessary, since there are no queued
+                 * tasks.
+                 */
+                suspended = 0;
+                TRACE_CUR("acquired PI lock %p, no contention\n", sem);
+                sem->holder  = tsk;
+                /* don't know if we're global or partitioned. */
+                sem->hp.task = tsk;
+                sem->hp.cpu_task[get_partition(tsk)] = tsk;
+                litmus->inherit_priority(sem, tsk);
+                spin_unlock_irqrestore(&sem->wait.lock, flags);
+        }
+        return suspended;
+}
+/* Release @sem.
+ * Under sem->wait.lock: let the plugin undo priority inheritance via
+ * return_priority(), clear the holder, increment the count and, if the
+ * count is still below 1 afterwards, wake the wait queue with
+ * wake_up_locked().
+ */
+static void do_fmlp_up(struct pi_semaphore* sem)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&sem->wait.lock, flags);
+        TRACE_CUR("releases PI lock %p\n", sem);
+        litmus->return_priority(sem);
+        sem->holder = NULL;
+        if (atomic_inc_return(&sem->count) < 1)
+                /* there is a task queued */
+                wake_up_locked(&sem->wait);
+        spin_unlock_irqrestore(&sem->wait.lock, flags);
+}
+/* System call: acquire the FMLP semaphore behind descriptor @sem_od.
+ * Returns 0 once the semaphore has been acquired, or -EINVAL if @sem_od
+ * does not resolve to a semaphore.
+ */
+asmlinkage long sys_fmlp_down(int sem_od)
+{
+        struct pi_semaphore *sem;
+        preempt_disable();
+        TS_PI_DOWN_START;
+        sem = lookup_fmlp_sem(sem_od);
+        if (sem && do_fmlp_down(sem))
+                /* the suspending path in do_fmlp_down() already recorded
+                 * TS_PI_DOWN_END and re-enabled preemption
+                 */
+                return 0;
+        TS_PI_DOWN_END;
+        preempt_enable();
+        return sem ? 0 : -EINVAL;
+}
+/* System call: release the FMLP semaphore behind descriptor @sem_od.
+ * Returns 0 on success or -EINVAL if @sem_od does not resolve to a
+ * semaphore. Runs with preemption disabled from lookup to release.
+ */
+asmlinkage long sys_fmlp_up(int sem_od)
+{
+        struct pi_semaphore *sem;
+        long err = -EINVAL;
+        preempt_disable();
+        TS_PI_UP_START;
+        sem = lookup_fmlp_sem(sem_od);
+        if (sem) {
+                do_fmlp_up(sem);
+                err = 0;
+        }
+        TS_PI_UP_END;
+        preempt_enable();
+        return err;
+}
+#else
+/* CONFIG_FMLP is disabled: provide an empty operations table and system
+ * call stubs that report -ENOSYS.
+ */
+struct fdso_ops fmlp_sem_ops = {};
+asmlinkage long sys_fmlp_down(int sem_od)
+{
+        return -ENOSYS;
+}
+asmlinkage long sys_fmlp_up(int sem_od)
+{
+        return -ENOSYS;
+}
+#endif
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 000000000000..6084b6d6b364
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,43 @@
+#include <linux/types.h>
+#include <litmus/feather_trace.h>
+#ifndef __ARCH_HAS_FEATHER_TRACE
+/* Provide a dummy implementation for architectures that do not define
+ * __ARCH_HAS_FEATHER_TRACE: one enable counter per event id, where a
+ * non-zero counter means the event is enabled.
+ */
+int ft_events[MAX_EVENTS];
+/* Increment the enable counter of event @id.
+ * Returns 1 on success, 0 if @id is not below MAX_EVENTS.
+ */
+int ft_enable_event(unsigned long id)
+{
+        if (id >= MAX_EVENTS)
+                return 0;
+        ft_events[id]++;
+        return 1;
+}
+/* Decrement the enable counter of event @id.
+ * Returns 1 on success, 0 if @id is not below MAX_EVENTS or the counter is
+ * already zero.
+ */
+int ft_disable_event(unsigned long id)
+{
+        if (id >= MAX_EVENTS || !ft_events[id])
+                return 0;
+        ft_events[id]--;
+        return 1;
+}
+/* Reset every enable counter to zero. Returns MAX_EVENTS. */
+int ft_disable_all_events(void)
+{
+        int *slot;
+        for (slot = ft_events; slot < ft_events + MAX_EVENTS; slot++)
+                *slot = 0;
+        return MAX_EVENTS;
+}
+/* Returns 1 if @id is below MAX_EVENTS and its enable counter is non-zero,
+ * 0 otherwise.
+ */
+int ft_is_event_enabled(unsigned long id)
+{
+        if (id >= MAX_EVENTS)
+                return 0;
+        return ft_events[id] != 0;
+}
+#endif
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
new file mode 100644
index 000000000000..8b2d74d816a2
--- /dev/null
+++ b/litmus/ftdev.c
@@ -0,0 +1,359 @@
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <litmus/litmus.h>
+#include <litmus/feather_trace.h>
+#include <litmus/ftdev.h>
+/* Allocate a feather-trace buffer with @count slots of @size bytes each.
+ * The descriptor comes from kmalloc(); slot storage plus one marker byte per
+ * slot comes from __get_free_pages(), with the markers placed after the
+ * slot data.
+ * Returns the initialised buffer, or NULL if an allocation or
+ * init_ft_buffer() fails (nothing is leaked in that case).
+ */
+struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
+{
+        /* slot payload plus one marker byte per slot */
+        size_t bytes  = (size + 1) * count;
+        size_t needed = (bytes / PAGE_SIZE) + (bytes % PAGE_SIZE != 0);
+        int order = 0, pages = 1;
+        struct ft_buffer *buf;
+        char *mem;
+        buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+        if (!buf)
+                return NULL;
+        /* smallest power-of-two page count covering the request */
+        for (; pages < needed; pages *= 2)
+                order++;
+        mem = (char*) __get_free_pages(GFP_KERNEL, order);
+        if (!mem)
+                goto fail_buf;
+        if (!init_ft_buffer(buf, count, size,
+                            mem + (count * size),  /* markers at the end */
+                            mem))                  /* buffer objects     */
+                goto fail_pages;
+        return buf;
+fail_pages:
+        free_pages((unsigned long) mem, order);
+fail_buf:
+        kfree(buf);
+        return NULL;
+}
+/* Free a buffer obtained from alloc_ft_buffer(). The page order is
+ * recomputed from the buffer's slot_size and slot_count with the same
+ * formula used at allocation time. A NULL @buf is ignored.
+ */
+void free_ft_buffer(struct ft_buffer* buf)
+{
+        size_t bytes, needed;
+        int order = 0, pages = 1;
+        if (!buf)
+                return;
+        bytes  = (buf->slot_size + 1) * buf->slot_count;
+        needed = (bytes / PAGE_SIZE) + (bytes % PAGE_SIZE != 0);
+        for (; pages < needed; pages *= 2)
+                order++;
+        free_pages((unsigned long) buf->buffer_mem, order);
+        kfree(buf);
+}
+/* Node of the singly-linked list of events that were enabled through a
+ * minor device; maintained by activate() and deactivate().
+ */
+struct ftdev_event {
+        int id;                         /* feather-trace event id */
+        struct ftdev_event* next;       /* next node, or NULL     */
+};
+/* Enable feather-trace event @id and record it at the head of @chain.
+ * Returns 0 on success or -ENOMEM if the list node cannot be allocated, in
+ * which case the event is not enabled.
+ */
+static int activate(struct ftdev_event** chain, int id)
+{
+        struct ftdev_event *node = kmalloc(sizeof(*node), GFP_KERNEL);
+        if (!node)
+                return -ENOMEM;
+        printk(KERN_INFO
+               "Enabling feather-trace event %d.\n", (int) id);
+        ft_enable_event(id);
+        /* push onto the front of the chain */
+        node->id   = id;
+        node->next = *chain;
+        *chain     = node;
+        return 0;
+}
+/* Remove the first node with id @id from @chain, free it and disable the
+ * event. Does nothing if no such node exists.
+ */
+static void deactivate(struct ftdev_event** chain, int id)
+{
+        struct ftdev_event **link;
+        struct ftdev_event *victim;
+        for (link = chain; *link; link = &(*link)->next) {
+                if ((*link)->id != id)
+                        continue;
+                /* unlink, then release */
+                victim = *link;
+                *link  = victim->next;
+                kfree(victim);
+                printk(KERN_INFO
+                       "Disabling feather-trace event %d.\n", (int) id);
+                ft_disable_event(id);
+                return;
+        }
+}
+/* open() handler. Selects the minor from the inode, consults the optional
+ * can_open() callback, stores the minor in filp->private_data and, under
+ * the minor's lock, invokes the optional alloc() callback for the first
+ * reader before counting the new reader.
+ * Returns 0 on success, -ENODEV for an unknown minor, -ERESTARTSYS if the
+ * lock wait was interrupted, or the error reported by can_open()/alloc().
+ */
+static int ftdev_open(struct inode *in, struct file *filp)
+{
+        unsigned int minor_no = iminor(in);
+        struct ftdev *dev = container_of(in->i_cdev, struct ftdev, cdev);
+        struct ftdev_minor *m;
+        int err;
+        if (minor_no >= dev->minor_cnt)
+                return -ENODEV;
+        if (dev->can_open) {
+                err = dev->can_open(dev, minor_no);
+                if (err)
+                        return err;
+        }
+        m = dev->minor + minor_no;
+        filp->private_data = m;
+        if (mutex_lock_interruptible(&m->lock))
+                return -ERESTARTSYS;
+        err = 0;
+        /* the first reader triggers buffer allocation */
+        if (!m->readers && dev->alloc)
+                err = dev->alloc(dev, minor_no);
+        if (err == 0)
+                m->readers++;
+        mutex_unlock(&m->lock);
+        return err;
+}
+/* release() handler: drop one reader of the minor device.
+ * When the last reader goes away, all events enabled through this minor are
+ * deactivated, the function sleeps for HZ ticks so that in-flight writers
+ * can finish, the number of failed writes is logged, and the optional
+ * free() callback is invoked.
+ * Returns 0, or -ENODEV if the inode's minor number is out of range.
+ */
+static int ftdev_release(struct inode *in, struct file *filp)
+{
+        struct ftdev* ftdev;
+        struct ftdev_minor* ftdm;
+        unsigned int buf_idx = iminor(in);
+        ftdev = container_of(in->i_cdev, struct ftdev, cdev);
+        if (buf_idx >= ftdev->minor_cnt)
+                return -ENODEV;
+        ftdm = ftdev->minor + buf_idx;
+        /* The lock must be taken unconditionally: release runs exactly once
+         * per open file, so bailing out on a pending signal would leave
+         * ->readers too high and the buffer and events would never be torn
+         * down.
+         */
+        mutex_lock(&ftdm->lock);
+        if (ftdm->readers == 1) {
+                while (ftdm->events)
+                        deactivate(&ftdm->events, ftdm->events->id);
+                /* wait for any pending events to complete */
+                set_current_state(TASK_UNINTERRUPTIBLE);
+                schedule_timeout(HZ);
+                /* ->buf stays NULL if nothing ever allocated a buffer */
+                if (ftdm->buf)
+                        printk(KERN_ALERT "Failed trace writes: %u\n",
+                               ftdm->buf->failed_writes);
+                if (ftdev->free)
+                        ftdev->free(ftdev, buf_idx);
+        }
+        ftdm->readers--;
+        mutex_unlock(&ftdm->lock);
+        return 0;
+}
+/* based on ft_buffer_read
+ * Copies the slot at the read index to user space if it is SLOT_READY and
+ * then marks it SLOT_FREE.
+ * @returns -EFAULT : destination could not be written completely
+ *          = 0     : no data available
+ *          = 1     : one slot copied
+ */
+static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest)
+{
+        unsigned int idx;
+        if (buf->free_count == buf->slot_count)
+                /* buffer is empty */
+                return 0;
+        idx = buf->read_idx % buf->slot_count;
+        if (buf->slots[idx] != SLOT_READY)
+                /* writer has not finished this slot yet */
+                return 0;
+        /* copy_to_user() yields the number of bytes it could NOT copy,
+         * never a negative value; map any shortfall to -EFAULT so that it
+         * cannot be mistaken for "one slot copied".
+         */
+        if (copy_to_user(dest, ((char*) buf->buffer_mem) +
+                         idx * buf->slot_size,
+                         buf->slot_size))
+                return -EFAULT;
+        /* copy ok */
+        buf->slots[idx] = SLOT_FREE;
+        buf->read_idx++;
+        fetch_and_inc(&buf->free_count);
+        return 1;
+}
+/* read() handler: copy as many complete slots as fit into @len bytes.
+ * Blocks (in 50-tick naps) only while nothing has been copied yet and at
+ * least one event is enabled on this minor.
+ * Returns the number of bytes copied, -ERESTARTSYS if interrupted before
+ * any data was copied, or -EFAULT if the very first slot could not be
+ * written to @to. A fault after some slots were delivered ends the read
+ * and returns the bytes delivered so far, since those slots have already
+ * been consumed from the buffer.
+ */
+static ssize_t ftdev_read(struct file *filp,
+                          char __user *to, size_t len, loff_t *f_pos)
+{
+        /*      we ignore f_pos, this is strictly sequential */
+        ssize_t err = 0;
+        size_t chunk;
+        int copied;
+        struct ftdev_minor* ftdm = filp->private_data;
+        if (mutex_lock_interruptible(&ftdm->lock)) {
+                err = -ERESTARTSYS;
+                goto out;
+        }
+        chunk = ftdm->buf->slot_size;
+        while (len >= chunk) {
+                copied = ft_buffer_copy_to_user(ftdm->buf, to);
+                if (copied == 1) {
+                        len    -= chunk;
+                        to     += chunk;
+                        err    += chunk;
+                } else if (err == 0 && copied == 0 && ftdm->events) {
+                        /* Only wait if there are any events enabled and only
+                         * if we haven't copied some data yet. We cannot wait
+                         * here with copied data because that data would get
+                         * lost if the task is interrupted (e.g., killed).
+                         */
+                        set_current_state(TASK_INTERRUPTIBLE);
+                        schedule_timeout(50);
+                        if (signal_pending(current)) {
+                                /* nothing read yet, signal problem */
+                                err = -ERESTARTSYS;
+                                break;
+                        }
+                } else if (copied < 0) {
+                        /* page fault: only report it if no data has been
+                         * handed to user space yet
+                         */
+                        if (err == 0)
+                                err = copied;
+                        break;
+                } else
+                        /* nothing left to get, return to user space */
+                        break;
+        }
+        mutex_unlock(&ftdm->lock);
+out:
+        return err;
+}
+/* Unit of the write() protocol: commands and event ids are both 32 bit. */
+typedef uint32_t cmd_t;
+/* write() handler: enable or disable feather-trace events.
+ * The user buffer holds one command word (FTDEV_ENABLE_CMD or
+ * FTDEV_DISABLE_CMD) followed by one or more event ids, each a cmd_t.
+ * Returns the number of bytes processed on success; -EINVAL if @len is not
+ * a multiple of sizeof(cmd_t), is shorter than two words, or the command is
+ * unknown; -EFAULT if user memory cannot be read; -ERESTARTSYS if the lock
+ * wait was interrupted; -ENOMEM if an event could not be recorded. Events
+ * handled before an error stay in effect.
+ */
+static ssize_t ftdev_write(struct file *filp, const char __user *from,
+                           size_t len, loff_t *f_pos)
+{
+        struct ftdev_minor* ftdm = filp->private_data;
+        ssize_t err = -EINVAL;
+        cmd_t cmd;
+        cmd_t id;
+        /* need whole words: one command plus at least one event id */
+        if (len % sizeof(cmd) || len < 2 * sizeof(cmd))
+                goto out;
+        if (copy_from_user(&cmd, from, sizeof(cmd))) {
+                err = -EFAULT;
+                goto out;
+        }
+        len  -= sizeof(cmd);
+        from += sizeof(cmd);
+        if (cmd != FTDEV_ENABLE_CMD && cmd != FTDEV_DISABLE_CMD)
+                goto out;
+        if (mutex_lock_interruptible(&ftdm->lock)) {
+                err = -ERESTARTSYS;
+                goto out;
+        }
+        /* the command word itself counts as consumed */
+        err = sizeof(cmd);
+        while (len) {
+                if (copy_from_user(&id, from, sizeof(cmd))) {
+                        err = -EFAULT;
+                        goto out_unlock;
+                }
+                /* FIXME: check id against list of acceptable events */
+                len  -= sizeof(cmd);
+                from += sizeof(cmd);
+                if (cmd == FTDEV_DISABLE_CMD)
+                        deactivate(&ftdm->events, id);
+                else if (activate(&ftdm->events, id) != 0) {
+                        err = -ENOMEM;
+                        goto out_unlock;
+                }
+                err += sizeof(cmd);
+        }
+out_unlock:
+        mutex_unlock(&ftdm->lock);
+out:
+        return err;
+}
+/* File operations shared by all feather-trace character devices; installed
+ * on the cdev by ftdev_init().
+ */
+struct file_operations ftdev_fops = {
+        .owner   = THIS_MODULE,
+        .open    = ftdev_open,
+        .release = ftdev_release,
+        .write   = ftdev_write,
+        .read    = ftdev_read,
+};
+/* Put @ftdev into its pristine state: no callbacks, zero registered minors,
+ * every minor slot with an initialised lock, no readers, no buffer and no
+ * enabled events; the embedded cdev is bound to ftdev_fops and @owner.
+ */
+void ftdev_init(struct ftdev* ftdev, struct module* owner)
+{
+        int i;
+        /* optional callbacks are installed by the user of the device */
+        ftdev->can_open = NULL;
+        ftdev->alloc    = NULL;
+        ftdev->free     = NULL;
+        ftdev->minor_cnt = 0;
+        for (i = 0; i < MAX_FTDEV_MINORS; i++) {
+                ftdev->minor[i].events  = NULL;
+                ftdev->minor[i].buf     = NULL;
+                ftdev->minor[i].readers = 0;
+                mutex_init(&ftdev->minor[i].lock);
+        }
+        cdev_init(&ftdev->cdev, &ftdev_fops);
+        ftdev->cdev.owner = owner;
+        ftdev->cdev.ops = &ftdev_fops;
+}
+/* Register the character device region for @ftdev and add its cdev.
+ * @name:  name under which the region is registered
+ * @major: major number to claim, or 0 to have one allocated dynamically
+ * Returns 0 on success or the error reported by the chrdev/cdev layer. If
+ * adding the cdev fails, the device number region is released again.
+ */
+int register_ftdev(struct ftdev* ftdev, const char* name, int major)
+{
+        dev_t   trace_dev;
+        int error = 0;
+        if (major) {
+                trace_dev = MKDEV(major, 0);
+                error = register_chrdev_region(trace_dev, ftdev->minor_cnt,
+                                               name);
+        } else {
+                error = alloc_chrdev_region(&trace_dev, 0, ftdev->minor_cnt,
+                                name);
+                /* trace_dev is only filled in when the allocation worked */
+                if (!error)
+                        major = MAJOR(trace_dev);
+        }
+        if (error) {
+                printk(KERN_WARNING "ftdev(%s): "
+                       "Could not register major/minor number %d/%u\n",
+                       name, major, ftdev->minor_cnt);
+                return error;
+        }
+        error = cdev_add(&ftdev->cdev, trace_dev, ftdev->minor_cnt);
+        if (error) {
+                printk(KERN_WARNING "ftdev(%s): "
+                       "Could not add cdev for major/minor = %d/%u.\n",
+                       name, major, ftdev->minor_cnt);
+                /* do not leak the region claimed above */
+                unregister_chrdev_region(trace_dev, ftdev->minor_cnt);
+                return error;
+        }
+        return 0;
+}
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 000000000000..36e314625d86
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,43 @@
+/* litmus/jobs.c - common job control code
+ */
+#include <linux/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+/* Advance @t to its next job: the old deadline becomes the new release
+ * time, the deadline moves one period further, consumed execution time is
+ * cleared and the job sequence number is incremented.
+ */
+void prepare_for_next_period(struct task_struct *t)
+{
+        BUG_ON(!t);
+        /* a new job starts: bump its sequence number, reset its budget */
+        t->rt_param.job_params.job_no++;
+        t->rt_param.job_params.exec_time = 0;
+        /* next release is the previous deadline; deadline is one period on */
+        t->rt_param.job_params.release   = t->rt_param.job_params.deadline;
+        t->rt_param.job_params.deadline += get_rt_period(t);
+        /* don't confuse Linux */
+        t->rt.time_slice = 1;
+}
+/* Set up @t so that its next job is released at time @start and mark the
+ * task RT_F_RUNNING.
+ */
+void release_at(struct task_struct *t, lt_t start)
+{
+        /* prepare_for_next_period() turns the current deadline into the
+         * release time, so seed the deadline with the desired release.
+         */
+        t->rt_param.job_params.deadline = start;
+        prepare_for_next_period(t);
+        set_rt_flags(t, RT_F_RUNNING);
+}
+/*
+ *      Deactivate current task until the beginning of the next period.
+ *      Always returns 0.
+ */
+long complete_job(void)
+{
+        /* Mark that we do not execute anymore */
+        set_rt_flags(current, RT_F_SLEEP);
+        /* call schedule, this will return when a new job arrives
+         * it also takes care of preparing for the next release
+         */
+        schedule();
+        return 0;
+}
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 000000000000..e43596a5104c
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,775 @@
+/*
+ * litmus.c -- Implementation of the LITMUS syscalls,
+ *             the LITMUS initialization code,
+ *             and the procfs interface.
+ */
+#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/sysrq.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <litmus/litmus.h>
+#include <linux/sched.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/bheap.h>
+#include <litmus/trace.h>
+#include <litmus/rt_domain.h>
+/* Number of RT tasks that exist in the system */
+atomic_t rt_task_count          = ATOMIC_INIT(0);
+/* Held by litmus_admit_task() while a task is being admitted, so that the
+ * scheduler plugin cannot change underneath it.
+ */
+static DEFINE_SPINLOCK(task_transition_lock);
+/* synchronize plugin switching */
+atomic_t cannot_use_plugin      = ATOMIC_INIT(0);
+/* Give log messages sequential IDs. */
+atomic_t __log_seq_no = ATOMIC_INIT(0);
+/* current master CPU for handling timer IRQs */
+atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
+/* slab cache backing bheap_node_alloc() / bheap_node_free() */
+static struct kmem_cache * bheap_node_cache;
+/* slab cache for release heaps; defined outside this file */
+extern struct kmem_cache * release_heap_cache;
+/* Allocate a binomial-heap node from bheap_node_cache using @gfp_flags.
+ * Returns whatever kmem_cache_alloc() returns.
+ */
+struct bheap_node* bheap_node_alloc(int gfp_flags)
+{
+        struct bheap_node *node;
+        node = kmem_cache_alloc(bheap_node_cache, gfp_flags);
+        return node;
+}
+/* Return @hn to bheap_node_cache; counterpart of bheap_node_alloc(). */
+void bheap_node_free(struct bheap_node* hn)
+{
+        kmem_cache_free(bheap_node_cache, hn);
+}
+/* Release-heap allocator; declared here, defined outside this file. */
+struct release_heap* release_heap_alloc(int gfp_flags);
+void release_heap_free(struct release_heap* rh);
+/*
+ * sys_set_rt_task_param
+ * @pid: Pid of the task whose scheduling parameters are to be changed
+ * @param: New real-time extension parameters such as the execution cost and
+ *         period
+ * Syscall for setting a task's rt extension params
+ * Returns EINVAL  if pid is negative, param is NULL, the period or the
+ *                 execution cost is not positive, the execution cost
+ *                 exceeds the period, or the requested CPU is out of
+ *                 range or not online
+ *         EFAULT  if param cannot be read from user space
+ *         ESRCH   if pid does not correspond to a valid task
+ *         EBUSY   if pid is already a real-time task
+ *         0       on success
+ *
+ * Only non-real-time tasks may be configured with this system call
+ * to avoid races with the scheduler. In practice, this means that a
+ * task's parameters must be set _before_ calling sys_prepare_rt_task()
+ *
+ * find_task_by_vpid() assumes that we are in the same namespace of the
+ * target.
+ */
+asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+        struct rt_task tp;
+        struct task_struct *target;
+        int retval = -EINVAL;
+        printk("Setting up rt task parameters for process %d.\n", pid);
+        if (pid < 0 || !param) {
+                goto out;
+        }
+        if (copy_from_user(&tp, param, sizeof(tp))) {
+                retval = -EFAULT;
+                goto out;
+        }
+        /* Task search and manipulation must be protected */
+        read_lock_irq(&tasklist_lock);
+        if (!(target = find_task_by_vpid(pid))) {
+                retval = -ESRCH;
+                goto out_unlock;
+        }
+        if (is_realtime(target)) {
+                /* The task is already a real-time task.
+                 * We cannot allow parameter changes at this point.
+                 */
+                retval = -EBUSY;
+                goto out_unlock;
+        }
+        if (tp.exec_cost <= 0)
+                goto out_unlock;
+        if (tp.period <= 0)
+                goto out_unlock;
+        /* tp.cpu comes straight from user space: range-check it before it
+         * is used as a bit index into the online mask.
+         */
+        if ((unsigned int) tp.cpu >= nr_cpu_ids || !cpu_online(tp.cpu))
+                goto out_unlock;
+        if (tp.period < tp.exec_cost)
+        {
+                printk(KERN_INFO "litmus: real-time task %d rejected "
+                       "because wcet > period\n", pid);
+                goto out_unlock;
+        }
+        target->rt_param.task_params = tp;
+        retval = 0;
+      out_unlock:
+        read_unlock_irq(&tasklist_lock);
+      out:
+        return retval;
+}
+/*
+ * Getter of a task's RT params
+ *   returns EINVAL if param is NULL or pid is negative
+ *   returns ESRCH  if pid does not correspond to a valid task
+ *   returns EFAULT if copying the parameters to user space failed
+ *   returns 0      on success
+ *
+ *   find_task_by_vpid() assumes that we are in the same namespace of the
+ *   target.
+ */
+asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+        struct task_struct *source;
+        struct rt_task lp;
+        if (!param || pid < 0)
+                return -EINVAL;
+        read_lock(&tasklist_lock);
+        source = find_task_by_vpid(pid);
+        if (!source) {
+                read_unlock(&tasklist_lock);
+                return -ESRCH;
+        }
+        /* snapshot under the lock, copy out after dropping it */
+        lp = source->rt_param.task_params;
+        read_unlock(&tasklist_lock);
+        if (copy_to_user(param, &lp, sizeof(lp)))
+                return -EFAULT;
+        return 0;
+}
+/*
+ *      Signal completion of the current job of a periodic task.
+ *      After checking that the caller is a real-time task with a positive
+ *      period, the active plugin's complete_job() callback is invoked; the
+ *      plugin has to put the task into an appropriate queue and call
+ *      schedule.
+ *      returns EINVAL if the caller is not a real-time task or has no
+ *              positive period
+ *      otherwise returns the result of the plugin's complete_job()
+ */
+asmlinkage long sys_complete_job(void)
+{
+        int retval;
+        if (!is_realtime(current))
+                return -EINVAL;
+        /* Task with negative or zero period cannot sleep */
+        if (get_rt_period(current) <= 0)
+                return -EINVAL;
+        retval = litmus->complete_job();
+        return retval;
+}
+/*      This is an "improved" version of sys_complete_job that
+ *      addresses the problem of unintentionally missing a job after
+ *      an overrun: the caller is put to sleep only until its job
+ *      sequence number has reached @job.
+ *
+ *      returns 0 once job_no >= @job
+ *      returns EINVAL if the caller is not a real-time task or has no
+ *              positive period
+ *      returns the plugin's error if complete_job() fails
+ */
+asmlinkage long sys_wait_for_job_release(unsigned int job)
+{
+        int retval = 0;
+        /* non-real-time tasks and tasks without a positive period cannot
+         * sleep
+         */
+        if (!is_realtime(current) || get_rt_period(current) <= 0)
+                return -EINVAL;
+        /* Wait until we have "reached" the desired job.
+         *
+         * This implementation has at least two problems:
+         *
+         * 1) It doesn't gracefully handle the wrap around of
+         *    job_no. Since LITMUS is a prototype, this is not much
+         *    of a problem right now.
+         *
+         * 2) It is theoretically racy if a job release occurs
+         *    between checking job_no and calling complete_job().
+         *    A proper solution would require adding another callback
+         *    in the plugin structure and testing the condition with
+         *    interrupts disabled.
+         *
+         * FIXME: At least problem 2 should be taken care of eventually.
+         *
+         * If the last job overran then job <= job_no and we don't send
+         * the task to sleep at all.
+         */
+        while (!retval && job > current->rt_param.job_params.job_no)
+                retval = litmus->complete_job();
+        return retval;
+}
+/*      Helper syscall that reports the caller's current job sequence
+ *      number through @job.
+ *
+ *      returns 0 on successful query
+ *      returns EPERM if the caller is not a real-time task
+ *      otherwise returns the result of put_user()
+ */
+asmlinkage long sys_query_job_no(unsigned int __user *job)
+{
+        if (!is_realtime(current))
+                return -EPERM;
+        return put_user(current->rt_param.job_params.job_no, job);
+}
+/* sys_null_call() exists only to measure raw system call overheads
+ * (kernel entry, kernel exit). If @ts is non-NULL, the value of
+ * get_cycles() is stored there and the result of put_user() is returned;
+ * otherwise the call returns 0 without doing anything.
+ */
+asmlinkage long sys_null_call(cycles_t __user *ts)
+{
+        cycles_t now;
+        if (!ts)
+                return 0;
+        now = get_cycles();
+        return put_user(now, ts);
+}
+/* p is a real-time task. Wipe its rt_param so that it is a best-effort task
+ * again. If @restore is non-zero, the user-provided task parameters and the
+ * control page pointer survive the wipe.
+ */
+static void reinit_litmus_state(struct task_struct* p, int restore)
+{
+        struct rt_task  saved_params = {};
+        void*  saved_page    = NULL;
+        /* Save user-space provided configuration data
+         * and the allocated page.
+         */
+        if (restore) {
+                saved_params = p->rt_param.task_params;
+                saved_page   = p->rt_param.ctrl_page;
+        }
+        /* We probably should not be inheriting any task's priority
+         * at this point in time.
+         */
+        WARN_ON(p->rt_param.inh_task);
+        /* We need to restore the priority of the task.
+         * XXX why is this commented out?
+         *   __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio);
+         */
+        /* Cleanup everything else. */
+        memset(&p->rt_param, 0, sizeof(p->rt_param));
+        if (!restore)
+                return;
+        /* Restore preserved fields. */
+        p->rt_param.task_params = saved_params;
+        p->rt_param.ctrl_page   = saved_page;
+}
+/* Try to turn @tsk into a real-time task.
+ * Validates period, execution cost and partition, allocates the per-task
+ * heap node and release heap, and asks the active plugin to admit the task.
+ * Returns 0 on success, -EINVAL for invalid parameters or an offline CPU,
+ * -ENOMEM if an allocation fails, or the plugin's error code. On any
+ * failure after allocation, the heap node and release heap are freed again
+ * and the task's pointers to them are cleared.
+ */
+long litmus_admit_task(struct task_struct* tsk)
+{
+        long retval = 0;
+        unsigned long flags;
+        BUG_ON(is_realtime(tsk));
+        if (get_rt_period(tsk) == 0 ||
+            get_exec_cost(tsk) > get_rt_period(tsk)) {
+                TRACE_TASK(tsk, "litmus admit: invalid task parameters "
+                           "(%lu, %lu)\n",
+                           get_exec_cost(tsk), get_rt_period(tsk));
+                retval = -EINVAL;
+                goto out;
+        }
+        if (!cpu_online(get_partition(tsk))) {
+                TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
+                           get_partition(tsk));
+                retval = -EINVAL;
+                goto out;
+        }
+        INIT_LIST_HEAD(&tsk_rt(tsk)->list);
+        /* avoid scheduler plugin changing underneath us */
+        spin_lock_irqsave(&task_transition_lock, flags);
+        /* allocate heap node for this task */
+        tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
+        tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
+        if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
+                printk(KERN_WARNING "litmus: no more heap node memory!?\n");
+                retval = -ENOMEM;
+                goto out_free;
+        }
+        bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
+        retval = litmus->admit_task(tsk);
+        if (retval)
+                /* NOTE(review): assumes a plugin that rejects a task has
+                 * not kept a reference to its heap node -- confirm.
+                 */
+                goto out_free;
+        sched_trace_task_name(tsk);
+        sched_trace_task_param(tsk);
+        atomic_inc(&rt_task_count);
+        goto out_unlock;
+out_free:
+        /* Only one of the two allocations may have succeeded; never hand
+         * NULL to the slab allocator and do not leave stale pointers.
+         */
+        if (tsk_rt(tsk)->heap_node) {
+                bheap_node_free(tsk_rt(tsk)->heap_node);
+                tsk_rt(tsk)->heap_node = NULL;
+        }
+        if (tsk_rt(tsk)->rel_heap) {
+                release_heap_free(tsk_rt(tsk)->rel_heap);
+                tsk_rt(tsk)->rel_heap = NULL;
+        }
+out_unlock:
+        spin_unlock_irqrestore(&task_transition_lock, flags);
+out:
+        return retval;
+}
+/* Tear down the real-time state of tsk.
+ * Does nothing if tsk is not a real-time task.
+ */
+void litmus_exit_task(struct task_struct* tsk)
+{
+        if (!is_realtime(tsk))
+                return;
+        sched_trace_task_completion(tsk, 1);
+        /* let the active plugin drop the task first */
+        litmus->task_exit(tsk);
+        /* the heap node must no longer be queued in any heap */
+        BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
+        /* give back what litmus_admit_task() allocated */
+        bheap_node_free(tsk_rt(tsk)->heap_node);
+        release_heap_free(tsk_rt(tsk)->rel_heap);
+        atomic_dec(&rt_task_count);
+        /* wipe the remaining state, keeping the preserved fields */
+        reinit_litmus_state(tsk, 1);
+}
+/* IPI handler used while a plugin switch is in progress: spin on this
+ * CPU until cannot_use_plugin has been cleared again.
+ */
+static void synch_on_plugin_switch(void* info)
+{
+        for (;;) {
+                if (!atomic_read(&cannot_use_plugin))
+                        break;
+                cpu_relax();
+        }
+}
+/* Replace the active scheduler plugin.
+ *
+ * Switching a plugin that is in use is tricky: no real-time task may
+ * exist (and none may be created in parallel), and the plugin must not
+ * be in use on any processor (in theory).
+ *
+ * Returns 0 on success, -EBUSY if real-time tasks exist, or the error
+ * reported by the deactivate/activate callbacks.  If the new plugin
+ * cannot be activated, the Linux plugin is installed instead and the
+ * activation error is returned.
+ */
+int switch_sched_plugin(struct sched_plugin* plugin)
+{
+        unsigned long irq_flags;
+        int err = 0;
+        BUG_ON(!plugin);
+        /* forbid other cpus to use the plugin */
+        atomic_set(&cannot_use_plugin, 1);
+        /* send IPI to force other CPUs to synch with us */
+        smp_call_function(synch_on_plugin_switch, NULL, 0);
+        /* stop task transitions */
+        spin_lock_irqsave(&task_transition_lock, irq_flags);
+        /* don't switch if there are active real-time tasks */
+        if (atomic_read(&rt_task_count) != 0) {
+                err = -EBUSY;
+                goto out;
+        }
+        err = litmus->deactivate_plugin();
+        if (err != 0)
+                goto out;
+        err = plugin->activate_plugin();
+        if (err != 0) {
+                printk(KERN_INFO "Can't activate %s (%d).\n",
+                       plugin->plugin_name, err);
+                /* fall back to the Linux plugin */
+                plugin = &linux_sched_plugin;
+        }
+        printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
+        litmus = plugin;
+out:
+        spin_unlock_irqrestore(&task_transition_lock, irq_flags);
+        /* let the other CPUs leave synch_on_plugin_switch() */
+        atomic_set(&cannot_use_plugin, 0);
+        return err;
+}
+/* Fork hook; p is the newly forked task.
+ * Makes sure the child carries over no LITMUS^RT state.
+ */
+void litmus_fork(struct task_struct* p)
+{
+        if (!is_realtime(p)) {
+                /* non-rt tasks might have ctrl_page set */
+                tsk_rt(p)->ctrl_page = NULL;
+        } else {
+                /* wipe all litmus related state, preserving nothing */
+                reinit_litmus_state(p, 0);
+        }
+        /* od tables are never inherited across a fork */
+        p->od_table = NULL;
+}
+/* execve() hook; current is the task doing the exec.
+ * Address-space specific state (the control page) must not leak
+ * into the new image.
+ */
+void litmus_exec(void)
+{
+        struct task_struct* tsk = current;
+        if (!is_realtime(tsk))
+                return;
+        WARN_ON(tsk->rt_param.inh_task);
+        if (!tsk_rt(tsk)->ctrl_page)
+                return;
+        free_page((unsigned long) tsk_rt(tsk)->ctrl_page);
+        tsk_rt(tsk)->ctrl_page = NULL;
+}
+/* Exit hook for a dying task: releases its control page (if any) and,
+ * for real-time tasks, runs the main LITMUS^RT cleanup.
+ */
+void exit_litmus(struct task_struct *dead_tsk)
+{
+        /* We also allow non-RT tasks to
+         * allocate control pages to allow
+         * measurements with non-RT tasks.
+         * So check if we need to free the page
+         * in any case.
+         */
+        if (tsk_rt(dead_tsk)->ctrl_page) {
+                TRACE_TASK(dead_tsk,
+                           "freeing ctrl_page %p\n",
+                           tsk_rt(dead_tsk)->ctrl_page);
+                free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
+                /* litmus_exit_task() preserves ctrl_page across its state
+                 * reset, so clear it here to avoid keeping a pointer to
+                 * the page that was just freed.
+                 */
+                tsk_rt(dead_tsk)->ctrl_page = NULL;
+        }
+        /* main cleanup only for RT tasks */
+        if (is_realtime(dead_tsk))
+                litmus_exit_task(dead_tsk);
+}
+#ifdef CONFIG_MAGIC_SYSRQ
+int sys_kill(int pid, int sig);
+/* Magic SysRq handler: send SIGKILL to every real-time task. */
+static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
+{
+        struct task_struct *tsk;
+        read_lock(&tasklist_lock);
+        for_each_process(tsk) {
+                if (!is_realtime(tsk))
+                        continue;
+                sys_kill(tsk->pid, SIGKILL);
+        }
+        read_unlock(&tasklist_lock);
+}
+/* SysRq operation descriptor for the handler above. */
+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
+        .handler        = sysrq_handle_kill_rt_tasks,
+        .help_msg       = "quit-rt-tasks(X)",
+        .action_msg     = "sent SIGKILL to all LITMUS^RT real-time tasks",
+};
+#endif
+/* defined in litmus/sync.c */
+int count_tasks_waiting_for_release(void);
+/* Read handler of the "stats" proc entry: prints the number of
+ * real-time tasks and the number of tasks waiting for a release.
+ * Returns the value produced by snprintf().
+ */
+static int proc_read_stats(char *page, char **start,
+                           off_t off, int count,
+                           int *eof, void *data)
+{
+        return snprintf(page, PAGE_SIZE,
+                        "real-time tasks   = %d\n"
+                        "ready for release = %d\n",
+                        atomic_read(&rt_task_count),
+                        count_tasks_waiting_for_release());
+}
+/* Read handler of the "plugins" proc entry: delegates to
+ * print_sched_plugins() and returns its result.
+ */
+static int proc_read_plugins(char *page, char **start,
+                           off_t off, int count,
+                           int *eof, void *data)
+{
+        return print_sched_plugins(page, PAGE_SIZE);
+}
+/* Read handler of the "active_plugin" proc entry: prints the name of
+ * the currently active plugin followed by a newline.
+ */
+static int proc_read_curr(char *page, char **start,
+                          off_t off, int count,
+                          int *eof, void *data)
+{
+        return snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
+}
+/* Write handler of the "active_plugin" proc entry: looks up the plugin
+ * named by the written text (one trailing newline is stripped) and
+ * switches to it.
+ *
+ * Returns count on success.  Errors are reported to the writer instead
+ * of being swallowed: -EINVAL if the input does not fit the name buffer
+ * or names no known plugin, -EFAULT if the user buffer cannot be read,
+ * or the error returned by switch_sched_plugin().
+ */
+static int proc_write_curr(struct file *file,
+                           const char *buffer,
+                           unsigned long count,
+                           void *data)
+{
+        int ret;
+        char name[65];
+        struct sched_plugin* found;
+        /* Reject over-long input rather than truncating it: a short
+         * write would make the caller re-submit the tail as if it were
+         * another plugin name.
+         */
+        if (count > sizeof(name) - 1)
+                return -EINVAL;
+        if (copy_from_user(name, buffer, count))
+                return -EFAULT;
+        name[count] = '\0';
+        /* chomp name */
+        if (count > 1 && name[count - 1] == '\n')
+                name[count - 1] = '\0';
+        found = find_sched_plugin(name);
+        if (!found) {
+                printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
+                return -EINVAL;
+        }
+        ret = switch_sched_plugin(found);
+        if (ret != 0) {
+                printk(KERN_INFO "Could not switch plugin: %d\n", ret);
+                return ret;
+        }
+        return count;
+}
+/* Read handler of the "cluster_cache" proc entry: prints "L2" or "L3"
+ * for cluster_cache_index 2 or 3, and "L1" for any other value.
+ */
+static int proc_read_cluster_size(char *page, char **start,
+                          off_t off, int count,
+                          int *eof, void *data)
+{
+        switch (cluster_cache_index) {
+        case 2:
+                return snprintf(page, PAGE_SIZE, "L2\n");
+        case 3:
+                return snprintf(page, PAGE_SIZE, "L3\n");
+        default: /* (cluster_cache_index == 1) */
+                return snprintf(page, PAGE_SIZE, "L1\n");
+        }
+}
+/* Write handler of the "cluster_cache" proc entry: accepts "L1", "L2"
+ * or "L3" (one trailing newline is stripped) and sets
+ * cluster_cache_index accordingly.
+ *
+ * Returns count on success, -EINVAL if the input does not fit the
+ * buffer or names no known cache level, and -EFAULT if the user buffer
+ * cannot be read.
+ */
+static int proc_write_cluster_size(struct file *file,
+                           const char *buffer,
+                           unsigned long count,
+                           void *data)
+{
+        /* L2, L3 */
+        char cache_name[33];
+        /* Reject over-long input rather than truncating it and
+         * reporting a short write.
+         */
+        if (count > sizeof(cache_name) - 1)
+                return -EINVAL;
+        if (copy_from_user(cache_name, buffer, count))
+                return -EFAULT;
+        cache_name[count] = '\0';
+        /* chomp name */
+        if (count > 1 && cache_name[count - 1] == '\n')
+                cache_name[count - 1] = '\0';
+        /* do a quick and dirty comparison to find the cluster size */
+        if (!strcmp(cache_name, "L2"))
+                cluster_cache_index = 2;
+        else if (!strcmp(cache_name, "L3"))
+                cluster_cache_index = 3;
+        else if (!strcmp(cache_name, "L1"))
+                cluster_cache_index = 1;
+        else {
+                printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
+                /* tell the writer that nothing was changed */
+                return -EINVAL;
+        }
+        return count;
+}
+/* Read handler of the "release_master" proc entry: prints the CPU
+ * number stored in release_master_cpu, or "NO_CPU" if none is set.
+ */
+static int proc_read_release_master(char *page, char **start,
+                                    off_t off, int count,
+                                    int *eof, void *data)
+{
+        int cpu = atomic_read(&release_master_cpu);
+        if (cpu != NO_CPU)
+                return snprintf(page, PAGE_SIZE, "%d\n", cpu);
+        return snprintf(page, PAGE_SIZE, "NO_CPU\n");
+}
+/* Write handler of the "release_master" proc entry: accepts either
+ * "NO_CPU" or the decimal number of an online CPU (one trailing newline
+ * is stripped) and stores it in release_master_cpu.
+ *
+ * Returns count on success, -EINVAL for over-long or invalid input, and
+ * -EFAULT if the user buffer cannot be read.
+ */
+static int proc_write_release_master(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count,
+                                     void *data)
+{
+        /* cpu starts out invalid so that a failed parse never traces an
+         * indeterminate value */
+        int cpu = -1, err, online = 0;
+        char msg[64];
+        if (count > 63)
+                return -EINVAL;
+        if (copy_from_user(msg, buffer, count))
+                return -EFAULT;
+        /* terminate */
+        msg[count] = '\0';
+        /* chomp */
+        if (count > 1 && msg[count - 1] == '\n')
+                msg[count - 1] = '\0';
+        if (strcmp(msg, "NO_CPU") == 0) {
+                atomic_set(&release_master_cpu, NO_CPU);
+                return count;
+        }
+        err = sscanf(msg, "%d", &cpu);
+        if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
+                atomic_set(&release_master_cpu, cpu);
+                return count;
+        }
+        TRACE("invalid release master: '%s' "
+              "(err:%d cpu:%d online:%d)\n",
+              msg, err, cpu, online);
+        return -EINVAL;
+}
+/* proc entries created by init_litmus_proc() */
+static struct proc_dir_entry *litmus_dir = NULL;
+static struct proc_dir_entry *curr_file = NULL;
+static struct proc_dir_entry *stat_file = NULL;
+static struct proc_dir_entry *plugs_file = NULL;
+static struct proc_dir_entry *clus_cache_idx_file = NULL;
+static struct proc_dir_entry *release_master_file = NULL;
+/* Create the "litmus" proc directory and its entries.
+ * Returns 0 on success or -ENOMEM as soon as the directory or one of
+ * the read/write entries cannot be created; the results for the
+ * read-only "stats" and "plugins" entries are not checked.
+ */
+static int __init init_litmus_proc(void)
+{
+        litmus_dir = proc_mkdir("litmus", NULL);
+        if (litmus_dir == NULL) {
+                printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
+                return -ENOMEM;
+        }
+        /* active_plugin: read/write */
+        curr_file = create_proc_entry("active_plugin",
+                                      0644, litmus_dir);
+        if (curr_file == NULL) {
+                printk(KERN_ERR "Could not allocate active_plugin "
+                       "procfs entry.\n");
+                return -ENOMEM;
+        }
+        curr_file->read_proc  = proc_read_curr;
+        curr_file->write_proc = proc_write_curr;
+        /* release_master: read/write */
+        release_master_file = create_proc_entry("release_master",
+                                                0644, litmus_dir);
+        if (release_master_file == NULL) {
+                printk(KERN_ERR "Could not allocate release_master "
+                       "procfs entry.\n");
+                return -ENOMEM;
+        }
+        release_master_file->read_proc = proc_read_release_master;
+        release_master_file->write_proc  = proc_write_release_master;
+        /* cluster_cache: read/write */
+        clus_cache_idx_file = create_proc_entry("cluster_cache",
+                                                0644, litmus_dir);
+        if (clus_cache_idx_file == NULL) {
+                printk(KERN_ERR "Could not allocate cluster_cache "
+                       "procfs entry.\n");
+                return -ENOMEM;
+        }
+        clus_cache_idx_file->read_proc = proc_read_cluster_size;
+        clus_cache_idx_file->write_proc = proc_write_cluster_size;
+        /* stats and plugins: read-only */
+        stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
+                                           proc_read_stats, NULL);
+        plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir,
+                                           proc_read_plugins, NULL);
+        return 0;
+}
+/* Remove every proc entry that was successfully created, and finally
+ * the "litmus" directory itself.
+ */
+static void exit_litmus_proc(void)
+{
+        if (plugs_file != NULL)
+                remove_proc_entry("plugins", litmus_dir);
+        if (stat_file != NULL)
+                remove_proc_entry("stats", litmus_dir);
+        if (curr_file != NULL)
+                remove_proc_entry("active_plugin", litmus_dir);
+        if (clus_cache_idx_file != NULL)
+                remove_proc_entry("cluster_cache", litmus_dir);
+        if (release_master_file != NULL)
+                remove_proc_entry("release_master", litmus_dir);
+        /* the directory goes last */
+        if (litmus_dir != NULL)
+                remove_proc_entry("litmus", NULL);
+}
+extern struct sched_plugin linux_sched_plugin;
+/* Module initialisation: registers the Linux plugin, creates the slab
+ * caches for heap nodes and release heaps, optionally registers the
+ * magic SysRq key and sets up the proc entries.  Always returns 0.
+ */
+static int __init _init_litmus(void)
+{
+        printk("Starting LITMUS^RT kernel\n");
+        register_sched_plugin(&linux_sched_plugin);
+        bheap_node_cache    = KMEM_CACHE(bheap_node, SLAB_PANIC);
+        release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
+#ifdef CONFIG_MAGIC_SYSRQ
+        /* offer some debugging help */
+        if (register_sysrq_key('x', &sysrq_kill_rt_tasks_op) != 0)
+                printk("Could not register kill rt tasks magic sysrq.\n");
+        else
+                printk("Registered kill rt tasks magic sysrq.\n");
+#endif
+        /* the result of the proc setup is not checked here */
+        init_litmus_proc();
+        return 0;
+}
+/* Module teardown: removes the proc entries and destroys both slab
+ * caches created by _init_litmus().
+ */
+static void _exit_litmus(void)
+{
+        exit_litmus_proc();
+        kmem_cache_destroy(bheap_node_cache);
+        kmem_cache_destroy(release_heap_cache);
+}
+module_init(_init_litmus);
+module_exit(_exit_litmus);
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 000000000000..609ff0f82abb
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,310 @@
+/*
+ * litmus/rt_domain.c
+ *
+ * LITMUS real-time infrastructure. This file contains the
+ * functions that manipulate RT domains. RT domains are an abstraction
+ * of a ready queue and a release queue.
+ */
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/rt_domain.h>
+#include <litmus/trace.h>
+#include <litmus/bheap.h>
+/* Placeholder check_resched callback: does nothing and returns 0. */
+static int dummy_resched(rt_domain_t *rt)
+{
+        return 0;
+}
+/* Placeholder ordering callback: returns 0 for every pair of nodes. */
+static int dummy_order(struct bheap_node* a, struct bheap_node* b)
+{
+        return 0;
+}
+/* Default release_jobs callback: hands the released tasks to
+ * merge_ready() for the given domain.
+ */
+static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+        merge_ready(rt, tasks);
+}
+/* Map a point in time to a release queue slot: the time is converted
+ * to quanta (rounding down) and reduced modulo RELEASE_QUEUE_SLOTS.
+ */
+static unsigned int time2slot(lt_t time)
+{
+        unsigned int quanta = (unsigned int) time2quanta(time, FLOOR);
+        return quanta % RELEASE_QUEUE_SLOTS;
+}
+/* hrtimer callback of a release heap: unlinks the heap from its
+ * domain's release queue and passes the queued tasks to the domain's
+ * release_jobs callback.  The timer is never restarted from here.
+ */
+static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
+{
+        unsigned long irq_flags;
+        struct release_heap* heap;
+        TRACE("on_release_timer(0x%p) starts.\n", timer);
+        TS_RELEASE_START;
+        heap = container_of(timer, struct release_heap, timer);
+        spin_lock_irqsave(&heap->dom->release_lock, irq_flags);
+        TRACE("CB has the release_lock 0x%p\n", &heap->dom->release_lock);
+        /* remove from release queue */
+        list_del(&heap->list);
+        spin_unlock_irqrestore(&heap->dom->release_lock, irq_flags);
+        TRACE("CB returned release_lock 0x%p\n", &heap->dom->release_lock);
+        /* call release callback */
+        heap->dom->release_jobs(heap->dom, &heap->heap);
+        /* WARNING: heap can be referenced from other CPUs from now on. */
+        TS_RELEASE_END;
+        TRACE("on_release_timer(0x%p) ends.\n", timer);
+        return  HRTIMER_NORESTART;
+}
+/* the cache itself is created in litmus.c */
+struct kmem_cache * release_heap_cache;
+/* Allocate a release heap from release_heap_cache and set up its
+ * timer to call on_release_timer().  Returns NULL if the allocation
+ * fails.
+ */
+struct release_heap* release_heap_alloc(int gfp_flags)
+{
+        struct release_heap* heap;
+        heap = kmem_cache_alloc(release_heap_cache, gfp_flags);
+        if (!heap)
+                return NULL;
+        /* initialize timer */
+        hrtimer_init(&heap->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+        heap->timer.function = on_release_timer;
+        return heap;
+}
+/* Free a release heap obtained from release_heap_alloc().
+ * Passing NULL is allowed and does nothing, so that error paths can
+ * call this unconditionally after a failed allocation.
+ */
+void release_heap_free(struct release_heap* rh)
+{
+        if (!rh)
+                return;
+        /* make sure timer is no longer in use */
+        hrtimer_cancel(&rh->timer);
+        kmem_cache_free(release_heap_cache, rh);
+}
+/* Caller must hold release lock.
+ * Looks up the release heap for the release time of t in rt.
+ * If none is queued and use_task_heap is set, the pre-allocated heap
+ * of t is inserted at the matching position and returned; if none is
+ * queued and use_task_heap is not set, NULL is returned.
+ */
+static struct release_heap* get_release_heap(rt_domain_t *rt,
+                                             struct task_struct* t,
+                                             int use_task_heap)
+{
+        struct list_head* pos;
+        struct release_heap* found = NULL;
+        struct release_heap* cur;
+        lt_t when = get_release(t);
+        struct list_head* bucket = &rt->release_queue.slot[time2slot(when)];
+        /* list_for_each() initializes pos; for an empty list (or when
+         * the loop runs to completion) pos ends up at the list head
+         */
+        list_for_each(pos, bucket) {
+                cur = list_entry(pos, struct release_heap, list);
+                if (when == cur->release_time) {
+                        /* perfect match -- this happens on hyperperiod
+                         * boundaries
+                         */
+                        found = cur;
+                        break;
+                }
+                if (lt_before(when, cur->release_time)) {
+                        /* we need to insert a new node since cur is
+                         * already in the future
+                         */
+                        break;
+                }
+        }
+        if (found || !use_task_heap)
+                return found;
+        /* use pre-allocated release heap */
+        cur = tsk_rt(t)->rel_heap;
+        cur->dom = rt;
+        cur->release_time = when;
+        /* add to release queue, right in front of pos */
+        list_add(&cur->list, pos->prev);
+        return cur;
+}
+/* Prepare the pre-allocated release heap of t for reuse: wait for a
+ * possibly running timer callback, then reset the heap and the timer
+ * start state.
+ */
+static void reinit_release_heap(struct task_struct* t)
+{
+        /* use pre-allocated release heap */
+        struct release_heap* heap = tsk_rt(t)->rel_heap;
+        /* Make sure it is safe to use.  The timer callback could still
+         * be executing on another CPU; hrtimer_cancel() will wait
+         * until the timer callback has completed.  However, under no
+         * circumstances should the timer be active (= yet to be
+         * triggered).
+         *
+         * WARNING: If the CPU still holds the release_lock at this point,
+         *          deadlock may occur!
+         */
+        BUG_ON(hrtimer_cancel(&heap->timer));
+        /* initialize */
+        bheap_init(&heap->heap);
+        atomic_set(&heap->info.state, HRTIMER_START_ON_INACTIVE);
+}
+/* arm_release_timer() - start local release timer or trigger
+ *     remote timer (pull timer)
+ *
+ * Moves every task queued on tobe_released into the release heap for
+ * its release time; the task whose own heap is used arms the timer.
+ *
+ * Called by add_release() with:
+ * - tobe_lock taken
+ * - IRQ disabled
+ */
+static void arm_release_timer(rt_domain_t *_rt)
+{
+        rt_domain_t *rt = _rt;
+        struct list_head work;
+        struct list_head *cur, *next;
+        struct task_struct* task;
+        struct release_heap* heap;
+        TRACE("arm_release_timer() at %llu\n", litmus_clock());
+        /* take over the pending tasks, leaving tobe_released empty */
+        list_replace_init(&rt->tobe_released, &work);
+        list_for_each_safe(cur, next, &work) {
+                /* pick task of work list */
+                task = list_entry(cur, struct task_struct, rt_param.list);
+                sched_trace_task_release(task);
+                list_del(cur);
+                /* put into release heap while holding release_lock */
+                spin_lock(&rt->release_lock);
+                TRACE_TASK(task, "I have the release_lock 0x%p\n", &rt->release_lock);
+                heap = get_release_heap(rt, task, 0);
+                if (!heap) {
+                        /* need to use our own, but drop lock first */
+                        spin_unlock(&rt->release_lock);
+                        TRACE_TASK(task, "Dropped release_lock 0x%p\n",
+                                   &rt->release_lock);
+                        reinit_release_heap(task);
+                        TRACE_TASK(task, "release_heap ready\n");
+                        spin_lock(&rt->release_lock);
+                        TRACE_TASK(task, "Re-acquired release_lock 0x%p\n",
+                                   &rt->release_lock);
+                        heap = get_release_heap(rt, task, 1);
+                }
+                bheap_insert(rt->order, &heap->heap, tsk_rt(task)->heap_node);
+                TRACE_TASK(task, "arm_release_timer(): added to release heap\n");
+                spin_unlock(&rt->release_lock);
+                TRACE_TASK(task, "Returned the release_lock 0x%p\n", &rt->release_lock);
+                /* To avoid arming the timer multiple times, we only let the
+                 * owner do the arming (which is the "first" task to reference
+                 * this release_heap anyway).
+                 */
+                if (heap != tsk_rt(task)->rel_heap) {
+                        TRACE_TASK(task, "0x%p is not my timer\n", &heap->timer);
+                        continue;
+                }
+                TRACE_TASK(task, "arming timer 0x%p\n", &heap->timer);
+                /* we cannot arm the timer using hrtimer_start()
+                 * as it may deadlock on rq->lock
+                 *
+                 * PINNED mode is ok on both local and remote CPU
+                 */
+                if (rt->release_master != NO_CPU)
+                        hrtimer_start_on(rt->release_master,
+                                        &heap->info, &heap->timer,
+                                        ns_to_ktime(heap->release_time),
+                                        HRTIMER_MODE_ABS_PINNED);
+                else
+                        __hrtimer_start_range_ns(&heap->timer,
+                                        ns_to_ktime(heap->release_time),
+                                        0, HRTIMER_MODE_ABS_PINNED, 0);
+        }
+}
+/* Initialise an RT domain: empty ready queue, empty release queue
+ * slots, no release master, and the given callbacks.  A NULL callback
+ * is replaced by dummy_order / dummy_resched / default_release_jobs.
+ */
+void rt_domain_init(rt_domain_t *rt,
+                    bheap_prio_t order,
+                    check_resched_needed_t check,
+                    release_jobs_t release
+                   )
+{
+        int slot;
+        BUG_ON(!rt);
+        /* callbacks, falling back to the defaults where none is given */
+        rt->check_resched       = check ? check : dummy_resched;
+        rt->release_jobs        = release ? release : default_release_jobs;
+        rt->order               = order ? order : dummy_order;
+        rt->release_master = NO_CPU;
+        /* queues */
+        bheap_init(&rt->ready_queue);
+        INIT_LIST_HEAD(&rt->tobe_released);
+        for (slot = 0; slot < RELEASE_QUEUE_SLOTS; slot++)
+                INIT_LIST_HEAD(&rt->release_queue.slot[slot]);
+        /* locks */
+        spin_lock_init(&rt->ready_lock);
+        spin_lock_init(&rt->release_lock);
+        spin_lock_init(&rt->tobe_lock);
+}
+/* __add_ready - insert a real-time task into the ready queue of rt and
+ *               invoke the domain's check_resched callback.
+ *               The task must be runnable.
+ * @new:         the newly released task
+ */
+void __add_ready(rt_domain_t* rt, struct task_struct *new)
+{
+        struct bheap_node* node = tsk_rt(new)->heap_node;
+        TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n",
+              new->comm, new->pid, get_exec_cost(new), get_rt_period(new),
+              get_release(new), litmus_clock());
+        /* a task must not be queued twice */
+        BUG_ON(bheap_node_in_heap(node));
+        bheap_insert(rt->order, &rt->ready_queue, node);
+        rt->check_resched(rt);
+}
+/* __merge_ready - merge a heap of tasks into the ready queue of rt and
+ *                 invoke the domain's check_resched callback.
+ *                 The tasks must be runnable.
+ * @tasks:         the newly released tasks
+ */
+void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
+{
+        bheap_union(rt->order, &rt->ready_queue, tasks);
+        rt->check_resched(rt);
+}
+/* __add_release - queue a real-time task for release in rt and arm
+ *                 the release timer.
+ * @task:          the sleeping task
+ */
+void __add_release(rt_domain_t* rt, struct task_struct *task)
+{
+        TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
+        /* remember the domain and put the task on the pending list */
+        list_add(&tsk_rt(task)->list, &rt->tobe_released);
+        tsk_rt(task)->domain = rt;
+        /* start release timer */
+        TS_SCHED2_START(task);
+        arm_release_timer(rt);
+        TS_SCHED2_END(task);
+}
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
new file mode 100644
index 000000000000..da44b451c9ad
--- /dev/null
+++ b/litmus/sched_cedf.c
@@ -0,0 +1,756 @@
+/*
+ * litmus/sched_cedf.c
+ *
+ * Implementation of the C-EDF scheduling algorithm.
+ *
+ * This implementation is based on G-EDF:
+ * - CPUs are clustered around L2 or L3 caches.
+ * - Cluster topology is automatically detected (this is arch dependent
+ *   and works only on x86 at the moment --- and only with modern
+ *   CPUs that export cpuid4 information)
+ * - The plugin _does not_ attempt to put tasks in the right cluster, i.e.,
+ *   the programmer needs to be aware of the topology to place tasks
+ *   in the desired cluster
+ * - default clustering is around L2 cache (cache index = 2)
+ *   supported clusters are: L1 (private cache: pedf), L2, L3
+ *
+ *   For details on functions, take a look at sched_gsn_edf.c
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+#include <litmus/bheap.h>
+#include <linux/module.h>
+/* forward declaration: cpu_entry_t points to its cluster, and the cluster
+ * type (defined further down) points back to cpu_entry_t */
+struct clusterdomain;
+/* cpu_entry_t - maintain the linked and scheduled state of one CPU
+ *
+ * A cpu also contains a pointer to the cedf_domain_t cluster
+ * that owns it (struct clusterdomain*)
+ */
+typedef struct  {
+        int                     cpu;            /* id of this CPU */
+        struct clusterdomain*   cluster;        /* owning cluster */
+        struct task_struct*     linked;         /* only RT tasks */
+        struct task_struct*     scheduled;      /* only RT tasks */
+        atomic_t                will_schedule;  /* prevent unneeded IPIs */
+        struct bheap_node*      hn;             /* this CPU's node in the cluster's cpu_heap */
+} cpu_entry_t;
+/* one cpu_entry_t per CPU
+ *
+ * set_will_schedule() and clear_will_schedule() write the will_schedule
+ * flag of the CPU they are executed on; test_will_schedule(cpu) reads the
+ * flag of the given CPU.
+ */
+DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
+#define set_will_schedule() \
+        (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1))
+#define clear_will_schedule() \
+        (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0))
+#define test_will_schedule(cpu) \
+        (atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule))
+/*
+ * In C-EDF there is a cedf domain _per_ cluster.
+ * The number of clusters is dynamically determined according to the
+ * total cpu number and the cluster size.
+ */
+typedef struct clusterdomain {
+        /* rt_domain for this cluster */
+        rt_domain_t     domain;
+        /* cpus in this cluster: array of pointers to the per-cpu entries,
+         * indexed by position within the cluster (not by cpu id) */
+        cpu_entry_t*    *cpus;
+        /* map of this cluster cpus */
+        cpumask_var_t   cpu_map;
+        /* the cpus queue themselves according to priority in here;
+         * heap_node is the array providing one heap node per cpu */
+        struct bheap_node *heap_node;
+        struct bheap      cpu_heap;
+        /* lock for this cluster: an alias for the domain's ready_lock */
+#define lock domain.ready_lock
+} cedf_domain_t;
+/* a cedf_domain per cluster; allocation is done at init/activation time.
+ * remote_cluster(cpu) yields the cluster that owns the given cpu;
+ * task_cpu_cluster(task) yields the cluster of the cpu returned by
+ * get_partition(task).
+ */
+cedf_domain_t *cedf;
+#define remote_cluster(cpu)     ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
+#define task_cpu_cluster(task)  remote_cluster(get_partition(task))
+/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
+ * decisions in the TRACE() log. VERBOSE_INIT (defined below, i.e., enabled
+ * by default) prints verbose information during the initialization of the
+ * plugin (e.g., topology); remove its definition to silence that output.
+#define WANT_ALL_SCHED_EVENTS
+ */
+#define VERBOSE_INIT
+/* cpu_lower_prio - heap order for the per-cluster CPU heap.
+ * @_a, @_b: heap nodes whose values are cpu_entry_t pointers
+ *
+ * The arguments are handed to edf_higher_prio() in swapped order so that
+ * the lowest-priority CPU ends up at the top of the heap.
+ */
+static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
+{
+        cpu_entry_t *first = _a->value;
+        cpu_entry_t *second = _b->value;
+        return edf_higher_prio(second->linked, first->linked);
+}
+/* update_cpu_position - re-sort a cpu entry within its cluster's CPU heap
+ *                       after its link changed. Caller must hold cedf lock.
+ * @entry: the cpu entry to reposition
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+        struct bheap *heap = &entry->cluster->cpu_heap;
+        /* the node is normally queued already; take it out first */
+        if (likely(bheap_node_in_heap(entry->hn)))
+                bheap_delete(cpu_lower_prio, heap, entry->hn);
+        bheap_insert(cpu_lower_prio, heap, entry->hn);
+}
+/* lowest_prio_cpu - return the cpu entry at the top of the cluster's CPU
+ *                   heap. Caller must hold cedf lock.
+ * @cluster: the cluster to inspect
+ */
+static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
+{
+        struct bheap_node *top = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
+        return top->value;
+}
+/* link_task_to_cpu - Update the link of a CPU.
+ *                    Handles the case where the to-be-linked task is already
+ *                    scheduled on a different CPU.
+ * @linked: task to link to @entry; may be NULL to leave the CPU unlinked
+ * @entry:  cpu entry whose link is updated
+ *
+ * The task previously linked to @entry only has its linked_on reset; it is
+ * not requeued here. If @linked is currently scheduled on another CPU, it
+ * is linked to that CPU instead and that CPU's former link (possibly NULL)
+ * is linked to @entry. Every entry whose link changed is repositioned in
+ * the CPU heap.
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+                                      cpu_entry_t *entry)
+{
+        cpu_entry_t *sched;
+        struct task_struct* tmp;
+        int on_cpu;
+        BUG_ON(linked && !is_realtime(linked));
+        /* Currently linked task is set to be unlinked. */
+        if (entry->linked) {
+                entry->linked->rt_param.linked_on = NO_CPU;
+        }
+        /* Link new task to CPU. */
+        if (linked) {
+                set_rt_flags(linked, RT_F_RUNNING);
+                /* handle the case that the task is already scheduled somewhere! */
+                on_cpu = linked->rt_param.scheduled_on;
+                if (on_cpu != NO_CPU) {
+                        sched = &per_cpu(cedf_cpu_entries, on_cpu);
+                        /* this should only happen if not linked already */
+                        BUG_ON(sched->linked == linked);
+                        /* If we are already scheduled on the CPU to which we
+                         * wanted to link, we don't need to do the swap --
+                         * we just link ourselves to the CPU and depend on
+                         * the caller to get things right.
+                         *
+                         * Otherwise link the task where it is scheduled and
+                         * continue with that CPU's former link in its place.
+                         */
+                        if (entry != sched) {
+                                TRACE_TASK(linked,
+                                           "already scheduled on %d, updating link.\n",
+                                           sched->cpu);
+                                tmp = sched->linked;
+                                linked->rt_param.linked_on = sched->cpu;
+                                sched->linked = linked;
+                                update_cpu_position(sched);
+                                linked = tmp;
+                        }
+                }
+                if (linked) /* might be NULL due to swap */
+                        linked->rt_param.linked_on = entry->cpu;
+        }
+        entry->linked = linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+        if (linked)
+                TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
+        else
+                TRACE("NULL linked to %d.\n", entry->cpu);
+#endif
+        update_cpu_position(entry);
+}
+/* unlink - Make sure a task is not linked any longer to an entry
+ *          where it was linked before. Must hold cedf_lock.
+ * @t: the task to unlink; a NULL task is reported via TRACE_BUG_ON()
+ *     and otherwise ignored
+ */
+static noinline void unlink(struct task_struct* t)
+{
+        int cpu;
+        if (unlikely(!t)) {
+                TRACE_BUG_ON(!t);
+                return;
+        }
+        cpu = t->rt_param.linked_on;
+        if (cpu != NO_CPU) {
+                /* linked somewhere: clear the link on both sides */
+                t->rt_param.linked_on = NO_CPU;
+                link_task_to_cpu(NULL, &per_cpu(cedf_cpu_entries, cpu));
+                return;
+        }
+        /* Not linked. If t is scheduled but was just recently unlinked, it
+         * cannot be linked anywhere else (because then it would have been
+         * relinked to this CPU), thus it must be in some queue and has to
+         * be removed from it.
+         *
+         * In the C-EDF case it is somewhere in the queue of its own
+         * domain, which task_cpu_cluster() provides.
+         */
+        if (is_queued(t))
+                remove(&(task_cpu_cluster(t))->domain, t);
+}
+/* preempt - force a CPU to reschedule
+ * @entry: the cpu entry whose scheduled task should be preempted
+ */
+static void preempt(cpu_entry_t *entry)
+{
+        struct task_struct *running = entry->scheduled;
+        int target = entry->cpu;
+        preempt_if_preemptable(running, target);
+}
+/* requeue - Put an unlinked task into the C-EDF domain of its cluster.
+ *           Caller must hold cedf_lock.
+ * @task: the task to queue; must be non-NULL and not queued already
+ *
+ * A task whose job is already released goes to the ready queue, any other
+ * task is handed to the release queue.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+        cedf_domain_t *cluster;
+        /* check before task_cpu_cluster() dereferences the task */
+        BUG_ON(!task);
+        cluster = task_cpu_cluster(task);
+        /* sanity check before insertion */
+        BUG_ON(is_queued(task));
+        if (is_released(task, litmus_clock()))
+                __add_ready(&cluster->domain, task);
+        else {
+                /* it has got to wait */
+                add_release(&cluster->domain, task);
+        }
+}
+/* check_for_preemptions - link ready tasks to CPUs for as long as the
+ *                         cluster's lowest-priority CPU must be preempted.
+ * @cluster: the cluster to check
+ *
+ * Each round takes the head of the ready queue, requeues the task that was
+ * linked to the lowest-priority CPU (if any), links the new task there and
+ * triggers a preemption of that CPU.
+ */
+static void check_for_preemptions(cedf_domain_t *cluster)
+{
+        struct task_struct *candidate;
+        cpu_entry_t *target = lowest_prio_cpu(cluster);
+        while (edf_preemption_needed(&cluster->domain, target->linked)) {
+                /* preemption necessary */
+                candidate = __take_ready(&cluster->domain);
+                TRACE("check_for_preemptions: attempting to link task %d to %d\n",
+                      candidate->pid, target->cpu);
+                if (target->linked)
+                        requeue(target->linked);
+                link_task_to_cpu(candidate, target);
+                preempt(target);
+                /* the heap order changed, look up the new lowest CPU */
+                target = lowest_prio_cpu(cluster);
+        }
+}
+/* cedf_job_arrival - task is either resumed or released
+ * @task: the arriving task; must be non-NULL
+ *
+ * Queues the task in its cluster and then checks whether the arrival
+ * makes any preemption in that cluster necessary.
+ */
+static noinline void cedf_job_arrival(struct task_struct* task)
+{
+        cedf_domain_t *cluster;
+        /* check before task_cpu_cluster() dereferences the task */
+        BUG_ON(!task);
+        cluster = task_cpu_cluster(task);
+        requeue(task);
+        check_for_preemptions(cluster);
+}
+/* cedf_release_jobs - release callback of a cluster's rt_domain
+ * @rt:    the domain embedded in the releasing cluster
+ * @tasks: heap of tasks that have just been released
+ *
+ * Merges the tasks into the ready queue and checks for preemptions, all
+ * under the cluster lock with interrupts disabled.
+ */
+static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+        unsigned long irq_state;
+        /* rt is the 'domain' member of its cluster */
+        cedf_domain_t *owner = container_of(rt, cedf_domain_t, domain);
+        spin_lock_irqsave(&owner->lock, irq_state);
+        __merge_ready(rt, tasks);
+        check_for_preemptions(owner);
+        spin_unlock_irqrestore(&owner->lock, irq_state);
+}
+/* job_completion - finish the current job of a task and set up the next one
+ * @t:      the task whose job ended; must be non-NULL
+ * @forced: handed to sched_trace_task_completion()
+ *
+ * Caller holds cedf_lock.
+ */
+static noinline void job_completion(struct task_struct *t, int forced)
+{
+        BUG_ON(!t);
+        sched_trace_task_completion(t, forced);
+        TRACE_TASK(t, "job_completion().\n");
+        /* flag the job as completed, then move on to the next period */
+        set_rt_flags(t, RT_F_SLEEP);
+        prepare_for_next_period(t);
+        if (is_released(t, litmus_clock()))
+                sched_trace_task_release(t);
+        /* drop the link (or queue membership) of the finished job */
+        unlink(t);
+        /* a blocking task must not be requeued */
+        if (!is_running(t))
+                return;
+        cedf_job_arrival(t);
+}
+/* cedf_tick - called for every local timer interrupt
+ * @t: the task that is currently running on this CPU
+ *
+ * If t is a real-time task that exhausted its budget, either force a
+ * reschedule (t is preemptable) or ask t to leave its non-preemptive
+ * section (t is non-preemptable on behalf of user space).
+ */
+static void cedf_tick(struct task_struct* t)
+{
+        /* nothing to do unless a real-time task ran out of budget */
+        if (!is_realtime(t) || !budget_exhausted(t))
+                return;
+        if (!is_np(t)) {
+                /* np tasks will be preempted when they become
+                 * preemptable again
+                 */
+                set_tsk_need_resched(t);
+                set_will_schedule();
+                TRACE("cedf_scheduler_tick: "
+                      "%d is preemptable "
+                      " => FORCE_RESCHED\n", t->pid);
+                return;
+        }
+        if (is_user_np(t)) {
+                TRACE("cedf_scheduler_tick: "
+                      "%d is non-preemptable, "
+                      "preemption delayed.\n", t->pid);
+                request_exit_np(t);
+        }
+}
+/* cedf_schedule - pick the task that runs next on the local CPU.
+ * @prev: the task that was running on this CPU until now
+ *
+ * Returns the task linked to this CPU if a switch is due, prev if a
+ * real-time task simply continues, and NULL otherwise. The whole decision
+ * is taken under the lock of the local CPU's cluster.
+ *
+ * Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include a scheduler_tick() determined that it
+ * was necessary, because sys_exit_np() was called, because some Linux
+ * subsystem determined so, or even (in the worst case) because there is a bug
+ * hidden somewhere. Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ *      - !is_running(scheduled)        // the job blocks
+ *      - scheduled->timeslice == 0     // the job completed (forcefully)
+ *      - get_rt_flags() == RT_F_SLEEP  // the job completed (by syscall)
+ *      - linked != scheduled           // we need to reschedule (for any reason)
+ *      - is_np(scheduled)              // rescheduling must be delayed,
+ *                                         sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ */
+static struct task_struct* cedf_schedule(struct task_struct * prev)
+{
+        cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
+        cedf_domain_t *cluster = entry->cluster;
+        int out_of_time, sleep, preempt, np, exists, blocks;
+        struct task_struct* next = NULL;
+        spin_lock(&cluster->lock);
+        clear_will_schedule();
+        /* sanity checking: entry->scheduled and prev must agree */
+        BUG_ON(entry->scheduled && entry->scheduled != prev);
+        BUG_ON(entry->scheduled && !is_realtime(prev));
+        BUG_ON(is_realtime(prev) && !entry->scheduled);
+        /* (0) Determine state */
+        exists      = entry->scheduled != NULL;
+        blocks      = exists && !is_running(entry->scheduled);
+        out_of_time = exists && budget_exhausted(entry->scheduled);
+        np          = exists && is_np(entry->scheduled);
+        sleep       = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
+        preempt     = entry->scheduled != entry->linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+        TRACE_TASK(prev, "invoked cedf_schedule.\n");
+#endif
+        if (exists)
+                TRACE_TASK(prev,
+                           "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
+                           "state:%d sig:%d\n",
+                           blocks, out_of_time, np, sleep, preempt,
+                           prev->state, signal_pending(prev));
+        if (entry->linked && preempt)
+                TRACE_TASK(prev, "will be preempted by %s/%d\n",
+                           entry->linked->comm, entry->linked->pid);
+        /* If a task blocks we have no choice but to reschedule.
+         */
+        if (blocks)
+                unlink(entry->scheduled);
+        /* Request a sys_exit_np() call if we would like to preempt but cannot.
+         * We need to make sure to update the link structure anyway in case
+         * that we are still linked. Multiple calls to request_exit_np() don't
+         * hurt.
+         */
+        if (np && (out_of_time || preempt || sleep)) {
+                unlink(entry->scheduled);
+                request_exit_np(entry->scheduled);
+        }
+        /* Any task that is preemptable and either exhausts its execution
+         * budget or wants to sleep completes. We may have to reschedule after
+         * this. Don't do a job completion if we block (can't have timers running
+         * for blocked jobs). Preemptions go first for the same reason.
+         */
+        if (!np && (out_of_time || sleep) && !blocks && !preempt)
+                job_completion(entry->scheduled, !sleep);
+        /* Link pending task if we became unlinked.
+         */
+        if (!entry->linked)
+                link_task_to_cpu(__take_ready(&cluster->domain), entry);
+        /* The final scheduling decision. Do we need to switch for some reason?
+         * If linked is different from scheduled, then select linked as next.
+         * A non-preemptable task is only switched away from if it blocks.
+         */
+        if ((!np || blocks) &&
+            entry->linked != entry->scheduled) {
+                /* Schedule a linked job? */
+                if (entry->linked) {
+                        entry->linked->rt_param.scheduled_on = entry->cpu;
+                        next = entry->linked;
+                }
+                if (entry->scheduled) {
+                        /* not gonna be scheduled soon */
+                        entry->scheduled->rt_param.scheduled_on = NO_CPU;
+                        TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+                }
+        } else
+                /* Only override Linux scheduler if we have a real-time task
+                 * scheduled that needs to continue.
+                 */
+                if (exists)
+                        next = prev;
+        spin_unlock(&cluster->lock);
+#ifdef WANT_ALL_SCHED_EVENTS
+        TRACE("cedf_lock released, next=0x%p\n", next);
+        if (next)
+                TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+        else if (exists && !next)
+                TRACE("becomes idle at %llu.\n", litmus_clock());
+#endif
+        return next;
+}
+/* cedf_finish_switch - we just finished the switch away from prev
+ * @prev: the task that was switched out
+ *
+ * Records the now-current task as scheduled on the local CPU if it is a
+ * real-time task, and NULL otherwise.
+ */
+static void cedf_finish_switch(struct task_struct *prev)
+{
+        cpu_entry_t *local = &__get_cpu_var(cedf_cpu_entries);
+        if (is_realtime(current))
+                local->scheduled = current;
+        else
+                local->scheduled = NULL;
+#ifdef WANT_ALL_SCHED_EVENTS
+        TRACE_TASK(prev, "switched away from\n");
+#endif
+}
+/* cedf_task_new - Prepare a task for running in RT mode
+ * @t:       the task that becomes a real-time task
+ * @on_rq:   not used by this plugin
+ * @running: non-zero if t is currently executing on its CPU
+ *
+ * Releases the first job of t right away, records where t is scheduled
+ * (if it is running) and queues it in its cluster.
+ */
+static void cedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+        unsigned long           flags;
+        cpu_entry_t*            entry;
+        cedf_domain_t*          cluster;
+        TRACE("c-edf: task new %d\n", t->pid);
+        /* the cluster doesn't change even if t is running */
+        cluster = task_cpu_cluster(t);
+        /* cluster->lock is the domain's ready_lock; spelled like in all
+         * other entry points of this plugin */
+        spin_lock_irqsave(&cluster->lock, flags);
+        /* setup job params */
+        release_at(t, litmus_clock());
+        if (running) {
+                entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
+                /* no other real-time task may occupy that CPU */
+                BUG_ON(entry->scheduled);
+                entry->scheduled = t;
+                tsk_rt(t)->scheduled_on = task_cpu(t);
+        } else {
+                t->rt_param.scheduled_on = NO_CPU;
+        }
+        t->rt_param.linked_on          = NO_CPU;
+        cedf_job_arrival(t);
+        spin_unlock_irqrestore(&cluster->lock, flags);
+}
+/* cedf_task_wake_up - a real-time task resumes
+ * @task: the task that woke up
+ *
+ * Decides whether the wake-up continues the current job or starts a new
+ * sporadic release, then queues the task in its cluster.
+ */
+static void cedf_task_wake_up(struct task_struct *task)
+{
+        unsigned long irq_state;
+        lt_t now;
+        cedf_domain_t *owner;
+        TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+        owner = task_cpu_cluster(task);
+        spin_lock_irqsave(&owner->lock, irq_state);
+        /* We need to take suspensions because of semaphores into
+         * account! If a job resumes after being suspended due to acquiring
+         * a semaphore, it should never be treated as a new job release.
+         */
+        if (get_rt_flags(task) == RT_F_EXIT_SEM) {
+                set_rt_flags(task, RT_F_RUNNING);
+        } else {
+                now = litmus_clock();
+                if (is_tardy(task, now)) {
+                        /* new sporadic release */
+                        release_at(task, now);
+                        sched_trace_task_release(task);
+                } else if (task->rt.time_slice) {
+                        /* came back in time before deadline */
+                        set_rt_flags(task, RT_F_RUNNING);
+                }
+        }
+        cedf_job_arrival(task);
+        spin_unlock_irqrestore(&owner->lock, irq_state);
+}
+/* cedf_task_block - a real-time task suspends
+ * @t: the blocking task
+ *
+ * Removes t from the link/queue structures of its cluster while holding
+ * the cluster lock.
+ */
+static void cedf_task_block(struct task_struct *t)
+{
+        unsigned long irq_state;
+        cedf_domain_t *owner;
+        TRACE_TASK(t, "block at %llu\n", litmus_clock());
+        owner = task_cpu_cluster(t);
+        /* unlink if necessary */
+        spin_lock_irqsave(&owner->lock, irq_state);
+        unlink(t);
+        spin_unlock_irqrestore(&owner->lock, irq_state);
+        BUG_ON(!is_realtime(t));
+}
+/* cedf_task_exit - a real-time task leaves the plugin
+ * @t: the exiting task
+ *
+ * Unlinks t and, if it is still recorded as scheduled on some CPU, clears
+ * that CPU's scheduled pointer.
+ */
+static void cedf_task_exit(struct task_struct * t)
+{
+        unsigned long flags;
+        cpu_entry_t *cpu;
+        cedf_domain_t *cluster = task_cpu_cluster(t);
+        /* unlink if necessary */
+        spin_lock_irqsave(&cluster->lock, flags);
+        unlink(t);
+        if (tsk_rt(t)->scheduled_on != NO_CPU) {
+                /* scheduled_on holds a system-wide CPU id, whereas
+                 * cluster->cpus[] is indexed by position within the
+                 * cluster; go through the per-cpu entry instead.
+                 */
+                cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on);
+                cpu->scheduled = NULL;
+                tsk_rt(t)->scheduled_on = NO_CPU;
+        }
+        spin_unlock_irqrestore(&cluster->lock, flags);
+        BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "RIP\n");
+}
+/* cedf_admit_task - decide whether a task may become a C-EDF task
+ * @tsk: the candidate task
+ *
+ * Returns 0 if the task currently runs on the CPU named in its task
+ * parameters, -EINVAL otherwise.
+ */
+static long cedf_admit_task(struct task_struct* tsk)
+{
+        if (task_cpu(tsk) != tsk->rt_param.task_params.cpu)
+                return -EINVAL;
+        return 0;
+}
+/* total number of clusters */
+static int num_clusters;
+/* size of every cluster; clusters of different sizes are not supported */
+static unsigned int cluster_size;
+#ifdef VERBOSE_INIT
+/* print_cluster_topology - log the CPUs of a mask as a CPU list
+ * @mask: the CPUs to print
+ * @cpu:  the CPU the mask belongs to (only used in the message)
+ */
+static void print_cluster_topology(cpumask_var_t mask, int cpu)
+{
+        char buf[255];
+        /* leave room for the terminator written below */
+        int len = cpulist_scnprintf(buf, sizeof(buf) - 1, mask);
+        buf[len] = '\0';
+        printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf);
+}
+#endif
+/* non-zero while cedf[] and its per-cluster arrays are allocated */
+static int clusters_allocated = 0;
+/* cleanup_cedf - free all cluster state allocated by plugin activation.
+ *
+ * Safe to call repeatedly: the allocation flag and the cedf pointer are
+ * reset, so a second call (or a call after a failed re-activation) does
+ * not free the same memory twice.
+ */
+static void cleanup_cedf(void)
+{
+        int i;
+        if (!clusters_allocated)
+                return;
+        for (i = 0; i < num_clusters; i++) {
+                kfree(cedf[i].cpus);
+                kfree(cedf[i].heap_node);
+                free_cpumask_var(cedf[i].cpu_map);
+        }
+        kfree(cedf);
+        /* forget the freed state */
+        cedf = NULL;
+        clusters_allocated = 0;
+}
+/* cedf_activate_plugin - (re)build the cluster structures
+ *
+ * Determines the cluster size from the CPUs that share a cache with CPU 0
+ * at cluster_cache_index, allocates one cedf_domain_t per cluster and
+ * assigns every online CPU to a cluster.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails and -EINVAL if the
+ * online CPUs cannot be grouped into clusters of equal size. On failure
+ * nothing stays allocated.
+ */
+static long cedf_activate_plugin(void)
+{
+        int i, j, cpu, ccpu, cpu_count;
+        cpu_entry_t *entry;
+        cpumask_var_t mask;
+        int chk = 0;
+        long err;
+        /* de-allocate old clusters, if any */
+        cleanup_cedf();
+        /* nothing is allocated until activation has succeeded */
+        clusters_allocated = 0;
+        cedf = NULL;
+        printk(KERN_INFO "C-EDF: Activate Plugin, cache index = %d\n",
+                        cluster_cache_index);
+        /* need to get cluster_size first */
+        if(!zalloc_cpumask_var(&mask, GFP_ATOMIC))
+                return -ENOMEM;
+        chk = get_shared_cpu_map(mask, 0, cluster_cache_index);
+        if (chk) {
+                /* if chk != 0 then it is the max allowed index */
+                printk(KERN_INFO "C-EDF: Cannot support cache index = %d\n",
+                                cluster_cache_index);
+                printk(KERN_INFO "C-EDF: Using cache index = %d\n",
+                                chk);
+                cluster_cache_index = chk;
+        }
+        cluster_size = cpumask_weight(mask);
+        /* an empty map would divide by zero below */
+        if (cluster_size == 0 || (num_online_cpus() % cluster_size) != 0) {
+                /* this can't be right, some cpus are left out */
+                printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n",
+                                num_online_cpus(), cluster_size);
+                err = -EINVAL;
+                goto out_free_mask;
+        }
+        num_clusters = num_online_cpus() / cluster_size;
+        printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
+                        num_clusters, cluster_size);
+        /* initialize clusters; zeroed so that the error path can free
+         * clusters that were not (fully) set up yet */
+        cedf = kcalloc(num_clusters, sizeof(cedf_domain_t), GFP_ATOMIC);
+        if (!cedf) {
+                err = -ENOMEM;
+                goto out_free_mask;
+        }
+        for (i = 0; i < num_clusters; i++) {
+                /* cpus[] holds pointers to the per-cpu entries */
+                cedf[i].cpus = kmalloc(cluster_size * sizeof(*cedf[i].cpus),
+                                GFP_ATOMIC);
+                cedf[i].heap_node = kmalloc(
+                                cluster_size * sizeof(struct bheap_node),
+                                GFP_ATOMIC);
+                if (!cedf[i].cpus || !cedf[i].heap_node ||
+                    !zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) {
+                        err = -ENOMEM;
+                        goto out_free_clusters;
+                }
+                bheap_init(&(cedf[i].cpu_heap));
+                edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
+        }
+        /* cycle through cluster and add cpus to them */
+        for (i = 0; i < num_clusters; i++) {
+                for_each_online_cpu(cpu) {
+                        /* check if the cpu is already in a cluster */
+                        for (j = 0; j < num_clusters; j++)
+                                if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
+                                        break;
+                        /* if it is in a cluster go to next cpu; j equals
+                         * num_clusters if no cluster contains the cpu */
+                        if (j < num_clusters)
+                                continue;
+                        /* this cpu isn't in any cluster */
+                        /* get the shared cpus */
+                        get_shared_cpu_map(mask, cpu, cluster_cache_index);
+                        /* cpus[] and heap_node[] only have cluster_size
+                         * slots */
+                        if (cpumask_weight(mask) > cluster_size) {
+                                printk(KERN_ERR "C-EDF: cluster of cpu %d "
+                                                "exceeds cluster size %d!\n",
+                                                cpu, cluster_size);
+                                err = -EINVAL;
+                                goto out_free_clusters;
+                        }
+                        cpumask_copy(cedf[i].cpu_map, mask);
+#ifdef VERBOSE_INIT
+                        print_cluster_topology(mask, cpu);
+#endif
+                        /* add cpus to current cluster and init cpu_entry_t */
+                        cpu_count = 0;
+                        for_each_cpu(ccpu, cedf[i].cpu_map) {
+                                entry = &per_cpu(cedf_cpu_entries, ccpu);
+                                cedf[i].cpus[cpu_count] = entry;
+                                atomic_set(&entry->will_schedule, 0);
+                                entry->cpu = ccpu;
+                                entry->cluster = &cedf[i];
+                                entry->hn = &(cedf[i].heap_node[cpu_count]);
+                                bheap_node_init(&entry->hn, entry);
+                                cpu_count++;
+                                entry->linked = NULL;
+                                entry->scheduled = NULL;
+                                update_cpu_position(entry);
+                        }
+                        /* done with this cluster */
+                        break;
+                }
+        }
+        free_cpumask_var(mask);
+        clusters_allocated = 1;
+        return 0;
+out_free_clusters:
+        for (i = 0; i < num_clusters; i++) {
+                kfree(cedf[i].cpus);
+                kfree(cedf[i].heap_node);
+                free_cpumask_var(cedf[i].cpu_map);
+        }
+        kfree(cedf);
+        cedf = NULL;
+out_free_mask:
+        free_cpumask_var(mask);
+        return err;
+}
+/* C-EDF plugin descriptor: name plus the callbacks implemented above
+ * (complete_job is not defined in this file) */
+static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
+        .plugin_name            = "C-EDF",
+        /* plugin life cycle */
+        .activate_plugin        = cedf_activate_plugin,
+        /* task life cycle */
+        .admit_task             = cedf_admit_task,
+        .task_new               = cedf_task_new,
+        .task_wake_up           = cedf_task_wake_up,
+        .task_block             = cedf_task_block,
+        .task_exit              = cedf_task_exit,
+        .complete_job           = complete_job,
+        /* scheduling */
+        .tick                   = cedf_tick,
+        .schedule               = cedf_schedule,
+        .finish_switch          = cedf_finish_switch,
+};
+/* module entry point: hand the plugin descriptor to register_sched_plugin() */
+static int __init init_cedf(void)
+{
+        int err = register_sched_plugin(&cedf_plugin);
+        return err;
+}
+/* module exit point: release whatever activation allocated */
+static void clean_cedf(void)
+{
+        cleanup_cedf();
+}
+module_init(init_cedf);
+module_exit(clean_cedf);
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
new file mode 100644
index 000000000000..b9310dd6f75c
--- /dev/null
+++ b/litmus/sched_gsn_edf.c
@@ -0,0 +1,828 @@
+/*
+ * litmus/sched_gsn_edf.c
+ *
+ * Implementation of the GSN-EDF scheduling algorithm.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+#include <litmus/bheap.h>
+#include <linux/module.h>
+/* Overview of GSN-EDF operations.
+ *
+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
+ * description only covers how the individual operations are implemented in
+ * LITMUS.
+ *
+ * link_task_to_cpu(T, cpu)     - Low-level operation to update the linkage
+ *                                structure (NOT the actually scheduled
+ *                                task). If there is another linked task To
+ *                                already it will set To->linked_on = NO_CPU
+ *                                (thereby removing its association with this
+ *                                CPU). However, it will not requeue the
+ *                                previously linked task (if any). It will set
+ *                                T's state to RT_F_RUNNING and check whether
+ *                                it is already running somewhere else. If T
+ *                                is scheduled somewhere else it will link
+ *                                it to that CPU instead (and pull the linked
+ *                                task to cpu). T may be NULL.
+ *
+ * unlink(T)                    - Unlink removes T from all scheduler data
+ *                                structures. If it is linked to some CPU it
+ *                                will link NULL to that CPU. If it is
+ *                                currently queued in the gsnedf queue it will
+ *                                be removed from the rt_domain. It is safe to
+ *                                call unlink(T) if T is not linked. T may not
+ *                                be NULL.
+ *
+ * requeue(T)                   - Requeue will insert T into the appropriate
+ *                                queue. If the system is in real-time mode and
+ *                                T is released already, it will go into the
+ *                                ready queue. If the system is not in
+ *                                real-time mode, then T will go into the
+ *                                release queue. If T's release time is in the
+ *                                future, it will go into the release
+ *                                queue. That means that T's release time/job
+ *                                no/etc. has to be updated before requeue(T) is
+ *                                called. It is not safe to call requeue(T)
+ *                                when T is already queued. T may not be NULL.
+ *
+ * gsnedf_job_arrival(T)        - This is the catch-all function when T enters
+ *                                the system after either a suspension or at a
+ *                                job release. It will queue T (which means it
+ *                                is not safe to call gsnedf_job_arrival(T) if
+ *                                T is already queued) and then check whether a
+ *                                preemption is necessary. If a preemption is
+ *                                necessary it will update the linkage
+ *                                accordingly and cause schedule() to be called
+ *                                (either with an IPI or need_resched). It is
+ *                                safe to call gsnedf_job_arrival(T) if T's
+ *                                next job has not been actually released yet
+ *                                (release time in the future). T will be put
+ *                                on the release queue in that case.
+ *
+ * job_completion(T)            - Take care of everything that needs to be done
+ *                                to prepare T for its next release and place
+ *                                it in the right queue with
+ *                                gsnedf_job_arrival().
+ *
+ *
+ * When we know that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
+ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
+ * the functions will automatically propagate pending task from the ready queue
+ * to a linked task. This is the job of the calling function ( by means of
+ * __take_ready).
+ */
+/* cpu_entry_t - maintain the linked and scheduled state
+ *
+ * One instance exists per CPU (gsnedf_cpu_entries). "linked" is the task
+ * that link_task_to_cpu() assigned to the CPU, "scheduled" is the real-time
+ * task recorded as running there; gsnedf_schedule() reconciles the two.
+ */
+typedef struct  {
+        int                     cpu;            /* CPU number, set by init_gsn_edf() */
+        struct task_struct*     linked;         /* only RT tasks */
+        struct task_struct*     scheduled;      /* only RT tasks */
+        atomic_t                will_schedule;  /* prevent unneeded IPIs */
+        struct bheap_node*      hn;             /* this CPU's node in gsnedf_cpu_heap */
+} cpu_entry_t;
+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); /* one cpu_entry_t per CPU */
+cpu_entry_t* gsnedf_cpus[NR_CPUS]; /* CPU number -> entry; filled in by init_gsn_edf().
+ *
+ * will_schedule helpers: set_will_schedule() and clear_will_schedule() act
+ * on the entry of the CPU executing the macro (__get_cpu_var), while
+ * test_will_schedule(cpu) reads the flag of an arbitrary CPU.
+ */
+#define set_will_schedule() \
+        (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
+#define clear_will_schedule() \
+        (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
+#define test_will_schedule(cpu) \
+        (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
+/* the cpus queue themselves according to priority in here;
+ * gsnedf_heap_node[] provides the backing node for each CPU's entry->hn
+ */
+static struct bheap_node gsnedf_heap_node[NR_CPUS];
+static struct bheap      gsnedf_cpu_heap;
+static rt_domain_t gsnedf; /* the single global EDF domain */
+#define gsnedf_lock (gsnedf.ready_lock) /* the domain's ready-queue lock doubles as the plugin lock */
+/* Uncomment this if you want to see all scheduling decisions in the
+ * TRACE() log.
+#define WANT_ALL_SCHED_EVENTS
+ */
+/* cpu_lower_prio - ordering callback used for gsnedf_cpu_heap.
+ *
+ * Compares the tasks linked to the two CPU entries stored in the heap
+ * nodes. The operands handed to edf_higher_prio() are deliberately swapped
+ * so that the lowest-priority CPU ends up at the top of the heap.
+ */
+static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
+{
+        cpu_entry_t *lhs = _a->value;
+        cpu_entry_t *rhs = _b->value;
+        return edf_higher_prio(rhs->linked, lhs->linked);
+}
+/* update_cpu_position - Move the cpu entry to the correct place to maintain
+ *                       order in the cpu queue. Caller must hold gsnedf lock.
+ *
+ * The entry is re-sorted by removing its node (if it is currently in the
+ * heap) and inserting it again; an entry that is not yet in the heap is
+ * simply inserted.
+ *
+ * NOTE(review): entry->hn is deliberately re-read for every heap call. The
+ * node is registered via bheap_node_init(&entry->hn, ...), i.e. by address
+ * of the pointer, so the heap code can presumably retarget entry->hn while
+ * deleting -- confirm against the bheap implementation before caching it in
+ * a local.
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+        if (likely(bheap_node_in_heap(entry->hn)))
+                bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
+        bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
+}
+/* lowest_prio_cpu - CPU entry stored in the node that bheap_peek() reports
+ *                   for gsnedf_cpu_heap. Caller must hold gsnedf lock.
+ *
+ * The peeked node is dereferenced without a NULL check.
+ */
+static cpu_entry_t* lowest_prio_cpu(void)
+{
+        struct bheap_node *top = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
+        return top->value;
+}
+/* link_task_to_cpu - Update the link of a CPU.
+ *                    Handles the case where the to-be-linked task is already
+ *                    scheduled on a different CPU.
+ *
+ * @linked: task to link to @entry, or NULL to leave @entry without a linked
+ *          task. BUG()s if non-NULL and not a real-time task.
+ * @entry:  CPU entry whose link is replaced.
+ *
+ * The task previously linked to @entry (if any) only gets its linked_on
+ * reset to NO_CPU; it is not requeued here. If @linked is currently
+ * scheduled on a different CPU, it is linked to that CPU instead and the
+ * task that was linked there (possibly NULL) is linked to @entry. Every
+ * entry whose link changed is re-sorted via update_cpu_position().
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+                                      cpu_entry_t *entry)
+{
+        cpu_entry_t *sched;
+        struct task_struct* tmp;
+        int on_cpu;
+        BUG_ON(linked && !is_realtime(linked));
+        /* Currently linked task is set to be unlinked. */
+        if (entry->linked) {
+                entry->linked->rt_param.linked_on = NO_CPU;
+        }
+        /* Link new task to CPU. */
+        if (linked) {
+                set_rt_flags(linked, RT_F_RUNNING);
+                /* handle task is already scheduled somewhere! */
+                on_cpu = linked->rt_param.scheduled_on;
+                if (on_cpu != NO_CPU) {
+                        sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
+                        /* this should only happen if not linked already */
+                        BUG_ON(sched->linked == linked);
+                        /* If we are already scheduled on the CPU to which we
+                         * wanted to link, we don't need to do the swap --
+                         * we just link ourselves to the CPU and depend on
+                         * the caller to get things right.
+                         */
+                        if (entry != sched) {
+                                TRACE_TASK(linked,
+                                           "already scheduled on %d, updating link.\n",
+                                           sched->cpu);
+                                tmp = sched->linked; /* task displaced from the other CPU */
+                                linked->rt_param.linked_on = sched->cpu;
+                                sched->linked = linked;
+                                update_cpu_position(sched);
+                                linked = tmp; /* the displaced task is what @entry gets */
+                        }
+                }
+                if (linked) /* might be NULL due to swap */
+                        linked->rt_param.linked_on = entry->cpu;
+        }
+        entry->linked = linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+        if (linked)
+                TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
+        else
+                TRACE("NULL linked to %d.\n", entry->cpu);
+#endif
+        update_cpu_position(entry); /* @entry's link changed: re-sort it in the CPU heap */
+}
+/* unlink - Detach a task from the scheduler structure that holds it.
+ *          Must hold gsnedf_lock.
+ *
+ * A task that is linked to a CPU has that CPU's link cleared. Otherwise, a
+ * task for which is_queued() holds is taken out of the gsnedf domain with
+ * remove(). A NULL task is reported through TRACE_BUG_ON() and ignored.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+        cpu_entry_t *cpu_state;
+        int cpu;
+        if (unlikely(!t)) {
+                TRACE_BUG_ON(!t);
+                return;
+        }
+        cpu = t->rt_param.linked_on;
+        if (cpu != NO_CPU) {
+                /* linked somewhere: clear the task side first, then let
+                 * link_task_to_cpu() clear the CPU side
+                 */
+                cpu_state = &per_cpu(gsnedf_cpu_entries, cpu);
+                t->rt_param.linked_on = NO_CPU;
+                link_task_to_cpu(NULL, cpu_state);
+                return;
+        }
+        if (is_queued(t)) {
+                /* Interesting corner case: t is scheduled but was unlinked
+                 * only recently. It cannot be linked to another CPU (it
+                 * would have been relinked to this one instead), so it has
+                 * to sit in some queue and must be taken out of it.
+                 */
+                remove(&gsnedf, t);
+        }
+}
+/* preempt - ask the CPU described by @entry to reschedule.
+ *
+ * Delegates to preempt_if_preemptable() with the task currently recorded
+ * as scheduled on that CPU.
+ */
+static void preempt(cpu_entry_t *entry)
+{
+        struct task_struct *running = entry->scheduled;
+        preempt_if_preemptable(running, entry->cpu);
+}
+/* requeue - Insert an unlinked task into the gsn-edf domain.
+ *           Caller must hold gsnedf_lock.
+ *
+ * A task that is_released() at the current litmus_clock() goes to the ready
+ * queue; any other task goes to the release queue. BUG()s on a NULL task
+ * and on a task that is already queued.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+        BUG_ON(!task);
+        /* a task must never be enqueued twice */
+        BUG_ON(is_queued(task));
+        if (!(is_released(task, litmus_clock()))) {
+                /* it has got to wait */
+                add_release(&gsnedf, task);
+                return;
+        }
+        __add_ready(&gsnedf, task);
+}
+/* check_for_preemptions - repeatedly hand the head of the ready queue to
+ * the lowest-priority CPU for as long as edf_preemption_needed() says that
+ * CPU's linked task should be displaced. The displaced task (if any) is
+ * requeued and the CPU is asked to reschedule.
+ */
+static void check_for_preemptions(void)
+{
+        struct task_struct *candidate;
+        cpu_entry_t *target;
+        target = lowest_prio_cpu();
+        while (edf_preemption_needed(&gsnedf, target->linked)) {
+                /* preemption necessary */
+                candidate = __take_ready(&gsnedf);
+                TRACE("check_for_preemptions: attempting to link task %d to %d\n",
+                      candidate->pid, target->cpu);
+                if (target->linked)
+                        requeue(target->linked);
+                link_task_to_cpu(candidate, target);
+                preempt(target);
+                /* the heap changed: look up the lowest-priority CPU again */
+                target = lowest_prio_cpu();
+        }
+}
+/* gsnedf_job_arrival - a task is either resumed or released.
+ *
+ * Queues the task with requeue() and then runs the preemption check.
+ * BUG()s on a NULL task.
+ */
+static noinline void gsnedf_job_arrival(struct task_struct* task)
+{
+        BUG_ON(!task);
+        /* step 1: put the task into the ready or release queue */
+        requeue(task);
+        /* step 2: it may now deserve a CPU */
+        check_for_preemptions();
+}
+/* gsnedf_release_jobs - release callback installed by edf_domain_init().
+ *
+ * Under gsnedf_lock (IRQs saved and disabled), merges the heap of released
+ * tasks into the ready queue of @rt and runs the preemption check.
+ */
+static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+        unsigned long irq_state;
+        spin_lock_irqsave(&gsnedf_lock, irq_state);
+        __merge_ready(rt, tasks);
+        check_for_preemptions();
+        spin_unlock_irqrestore(&gsnedf_lock, irq_state);
+}
+/* job_completion - finish the current job of @t and set up the next one.
+ *                  Caller holds gsnedf_lock.
+ *
+ * @forced is passed through to sched_trace_task_completion(). The task is
+ * flagged RT_F_SLEEP, advanced to its next period, unlinked and -- unless
+ * it is blocking -- handed to gsnedf_job_arrival().
+ */
+static noinline void job_completion(struct task_struct *t, int forced)
+{
+        BUG_ON(!t);
+        sched_trace_task_completion(t, forced);
+        TRACE_TASK(t, "job_completion().\n");
+        /* mark the job as completed */
+        set_rt_flags(t, RT_F_SLEEP);
+        /* move release time/deadline on to the next period */
+        prepare_for_next_period(t);
+        if (is_released(t, litmus_clock()))
+                sched_trace_task_release(t);
+        /* drop the task from the CPU link / queues */
+        unlink(t);
+        /* a blocking task must not be requeued */
+        if (!is_running(t))
+                return;
+        gsnedf_job_arrival(t);
+}
+/* gsnedf_tick - plugin hook invoked on every local timer interrupt.
+ *
+ * Only acts on a real-time task whose budget is exhausted: a preemptable
+ * task is flagged for rescheduling, while a task in a user-space
+ * non-preemptive section is asked to leave that section.
+ */
+static void gsnedf_tick(struct task_struct* t)
+{
+        if (!(is_realtime(t) && budget_exhausted(t)))
+                return;
+        if (!is_np(t)) {
+                /* np tasks will be preempted when they become
+                 * preemptable again
+                 */
+                set_tsk_need_resched(t);
+                set_will_schedule();
+                TRACE("gsnedf_scheduler_tick: "
+                      "%d is preemptable "
+                      " => FORCE_RESCHED\n", t->pid);
+                return;
+        }
+        if (is_user_np(t)) {
+                TRACE("gsnedf_scheduler_tick: "
+                      "%d is non-preemptable, "
+                      "preemption delayed.\n", t->pid);
+                request_exit_np(t);
+        }
+}
+/* Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include a scheduler_tick() determined that it
+ * was necessary, because sys_exit_np() was called, because some Linux
+ * subsystem determined so, or even (in the worst case) because there is a bug
+ * hidden somewhere. Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ *      - !is_running(scheduled)        // the job blocks
+ *      - budget_exhausted(scheduled)   // the job completed (forcefully)
+ *      - get_rt_flags() == RT_F_SLEEP  // the job completed (by syscall)
+ *      - linked != scheduled           // we need to reschedule (for any reason)
+ *      - is_np(scheduled)              // rescheduling must be delayed,
+ *                                         sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ *
+ * Returns the real-time task to run next on this CPU: the linked task when
+ * a switch is made, prev when a scheduled real-time task keeps running, and
+ * NULL otherwise (including always on the release master CPU).
+ */
+static struct task_struct* gsnedf_schedule(struct task_struct * prev)
+{
+        cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+        int out_of_time, sleep, preempt, np, exists, blocks; /* NB: local "preempt" shadows the preempt() helper */
+        struct task_struct* next = NULL;
+        /* Bail out early if we are the release master.
+         * The release master never schedules any real-time tasks.
+         */
+        if (gsnedf.release_master == entry->cpu)
+                return NULL;
+        spin_lock(&gsnedf_lock);
+        clear_will_schedule();
+        /* sanity checking */
+        BUG_ON(entry->scheduled && entry->scheduled != prev);
+        BUG_ON(entry->scheduled && !is_realtime(prev));
+        BUG_ON(is_realtime(prev) && !entry->scheduled);
+        /* (0) Determine state */
+        exists      = entry->scheduled != NULL;
+        blocks      = exists && !is_running(entry->scheduled);
+        out_of_time = exists && budget_exhausted(entry->scheduled);
+        np          = exists && is_np(entry->scheduled);
+        sleep       = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
+        preempt     = entry->scheduled != entry->linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+        TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
+#endif
+        if (exists)
+                TRACE_TASK(prev,
+                           "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
+                           "state:%d sig:%d\n",
+                           blocks, out_of_time, np, sleep, preempt,
+                           prev->state, signal_pending(prev));
+        if (entry->linked && preempt)
+                TRACE_TASK(prev, "will be preempted by %s/%d\n",
+                           entry->linked->comm, entry->linked->pid);
+        /* If a task blocks we have no choice but to reschedule.
+         */
+        if (blocks)
+                unlink(entry->scheduled);
+        /* Request a sys_exit_np() call if we would like to preempt but cannot.
+         * We need to make sure to update the link structure anyway in case
+         * that we are still linked. Multiple calls to request_exit_np() don't
+         * hurt.
+         */
+        if (np && (out_of_time || preempt || sleep)) {
+                unlink(entry->scheduled);
+                request_exit_np(entry->scheduled);
+        }
+        /* Any task that is preemptable and either exhausts its execution
+         * budget or wants to sleep completes. We may have to reschedule after
+         * this. Don't do a job completion if we block (can't have timers running
+         * for blocked jobs). Preemption go first for the same reason.
+         */
+        if (!np && (out_of_time || sleep) && !blocks && !preempt)
+                job_completion(entry->scheduled, !sleep); /* forced == budget overrun, not a voluntary sleep */
+        /* Link pending task if we became unlinked.
+         */
+        if (!entry->linked)
+                link_task_to_cpu(__take_ready(&gsnedf), entry);
+        /* The final scheduling decision. Do we need to switch for some reason?
+         * If linked is different from scheduled, then select linked as next.
+         */
+        if ((!np || blocks) &&
+            entry->linked != entry->scheduled) {
+                /* Schedule a linked job? */
+                if (entry->linked) {
+                        entry->linked->rt_param.scheduled_on = entry->cpu;
+                        next = entry->linked;
+                }
+                if (entry->scheduled) {
+                        /* not gonna be scheduled soon */
+                        entry->scheduled->rt_param.scheduled_on = NO_CPU;
+                        TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+                }
+        } else
+                /* Only override Linux scheduler if we have a real-time task
+                 * scheduled that needs to continue.
+                 */
+                if (exists)
+                        next = prev;
+        spin_unlock(&gsnedf_lock);
+#ifdef WANT_ALL_SCHED_EVENTS
+        TRACE("gsnedf_lock released, next=0x%p\n", next);
+        if (next)
+                TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+        else if (exists && !next)
+                TRACE("becomes idle at %llu.\n", litmus_clock());
+#endif
+        return next;
+}
+/* gsnedf_finish_switch - called once the switch away from @prev is done.
+ *
+ * Records on the local CPU entry which real-time task (if any) is now
+ * current; a non-real-time current task is recorded as NULL.
+ */
+static void gsnedf_finish_switch(struct task_struct *prev)
+{
+        cpu_entry_t *local = &__get_cpu_var(gsnedf_cpu_entries);
+        if (is_realtime(current))
+                local->scheduled = current;
+        else
+                local->scheduled = NULL;
+#ifdef WANT_ALL_SCHED_EVENTS
+        TRACE_TASK(prev, "switched away from\n");
+#endif
+}
+/* gsnedf_task_new - prepare a task for running in RT mode.
+ *
+ * @t:       the new real-time task
+ * @on_rq:   not used by this plugin
+ * @running: non-zero if @t is currently executing on task_cpu(t)
+ *
+ * Under gsnedf_lock (IRQs saved and disabled): releases the first job at
+ * the current time, records where @t is scheduled (nowhere if it is not
+ * running or if its CPU is the release master), marks it unlinked and
+ * passes it to gsnedf_job_arrival().
+ */
+static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+        unsigned long           irq_state;
+        cpu_entry_t*            cpu_state;
+        TRACE("gsn edf: task new %d\n", t->pid);
+        spin_lock_irqsave(&gsnedf_lock, irq_state);
+        /* setup job params */
+        release_at(t, litmus_clock());
+        if (!running) {
+                tsk_rt(t)->scheduled_on = NO_CPU;
+        } else {
+                cpu_state = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
+                BUG_ON(cpu_state->scheduled);
+                if (cpu_state->cpu == gsnedf.release_master) {
+                        /* do not schedule on release master */
+                        preempt(cpu_state); /* force resched */
+                        tsk_rt(t)->scheduled_on = NO_CPU;
+                } else {
+                        cpu_state->scheduled = t;
+                        tsk_rt(t)->scheduled_on = task_cpu(t);
+                }
+        }
+        tsk_rt(t)->linked_on = NO_CPU;
+        gsnedf_job_arrival(t);
+        spin_unlock_irqrestore(&gsnedf_lock, irq_state);
+}
+/* gsnedf_task_wake_up - a suspended task becomes runnable again.
+ *
+ * Under gsnedf_lock (IRQs saved and disabled), decides whether the wake-up
+ * starts a new sporadic job and then queues the task through
+ * gsnedf_job_arrival().
+ */
+static void gsnedf_task_wake_up(struct task_struct *task)
+{
+        unsigned long irq_state;
+        lt_t now;
+        TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+        spin_lock_irqsave(&gsnedf_lock, irq_state);
+        /* Suspensions caused by semaphores have to be taken into account:
+         * a job that resumes after waiting for a semaphore continues its
+         * current job and must never be treated as a new job release.
+         */
+        if (get_rt_flags(task) != RT_F_EXIT_SEM) {
+                now = litmus_clock();
+                if (!(is_tardy(task, now))) {
+                        /* came back in time before deadline */
+                        if (task->rt.time_slice)
+                                set_rt_flags(task, RT_F_RUNNING);
+                } else {
+                        /* new sporadic release */
+                        release_at(task, now);
+                        sched_trace_task_release(task);
+                }
+        } else {
+                set_rt_flags(task, RT_F_RUNNING);
+        }
+        gsnedf_job_arrival(task);
+        spin_unlock_irqrestore(&gsnedf_lock, irq_state);
+}
+/* gsnedf_task_block - plugin hook for a task that suspends.
+ *
+ * Unlinks the task under gsnedf_lock (IRQs saved and disabled). The
+ * real-time sanity check runs only after the lock has been dropped.
+ */
+static void gsnedf_task_block(struct task_struct *t)
+{
+        unsigned long irq_state;
+        TRACE_TASK(t, "block at %llu\n", litmus_clock());
+        spin_lock_irqsave(&gsnedf_lock, irq_state);
+        /* drops the CPU link or the queue membership, whichever applies */
+        unlink(t);
+        spin_unlock_irqrestore(&gsnedf_lock, irq_state);
+        BUG_ON(!is_realtime(t));
+}
+/* gsnedf_task_exit - plugin hook for a task leaving real-time mode.
+ *
+ * Under gsnedf_lock (IRQs saved and disabled): unlinks the task and, if it
+ * is still recorded as scheduled on some CPU, clears that record on both
+ * the CPU entry and the task.
+ */
+static void gsnedf_task_exit(struct task_struct * t)
+{
+        unsigned long irq_state;
+        int cpu;
+        spin_lock_irqsave(&gsnedf_lock, irq_state);
+        /* unlink if necessary */
+        unlink(t);
+        cpu = tsk_rt(t)->scheduled_on;
+        if (cpu != NO_CPU) {
+                gsnedf_cpus[cpu]->scheduled = NULL;
+                tsk_rt(t)->scheduled_on = NO_CPU;
+        }
+        spin_unlock_irqrestore(&gsnedf_lock, irq_state);
+        BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "RIP\n");
+}
+#ifdef CONFIG_FMLP
+/* Update the queue position of a task that got its priority boosted via
+ * priority inheritance.
+ *
+ * @holder: lock holder whose effective priority just changed.
+ *
+ * If @holder is linked to a CPU, that CPU's entry is re-sorted in the CPU
+ * heap. Otherwise, if @holder is queued, its heap position is updated with
+ * bheap_decrease() under gsnedf.release_lock, and a preemption check is run
+ * when bheap_decrease() returns zero.
+ *
+ * Assumption: caller holds gsnedf_lock.
+ */
+static void update_queue_position(struct task_struct *holder)
+{
+        /* We don't know whether holder is in the ready queue. It should, but
+         * on a budget overrun it may already be in a release queue.  Hence,
+         * calling unlink() is not possible since it assumes that the task is
+         * not in a release queue.  However, we can safely check whether
+         * sem->holder is currently in a queue or scheduled after locking both
+         * the release and the ready queue lock. */
+        /* Assumption: caller holds gsnedf_lock */
+        int check_preempt = 0;
+        if (tsk_rt(holder)->linked_on != NO_CPU) {
+                TRACE_TASK(holder, "%s: linked  on %d\n",
+                           __FUNCTION__, tsk_rt(holder)->linked_on);
+                /* Holder is linked to a CPU; need to re-order CPUs.
+                 * We can't use heap_decrease() here since
+                 * the cpu_heap is ordered in reverse direction, so
+                 * it is actually an increase. Re-sort through the shared
+                 * helper (delete + insert) rather than open-coding it. */
+                update_cpu_position(gsnedf_cpus[tsk_rt(holder)->linked_on]);
+        } else {
+                /* holder may be queued: first stop queue changes */
+                spin_lock(&gsnedf.release_lock);
+                if (is_queued(holder)) {
+                        TRACE_TASK(holder, "%s: is queued\n",
+                                   __FUNCTION__);
+                        /* We need to update the position
+                         * of holder in some heap. Note that this
+                         * may be a release heap. */
+                        check_preempt =
+                                !bheap_decrease(edf_ready_order,
+                                               tsk_rt(holder)->heap_node);
+                } else {
+                        /* Nothing to do: if it is not queued and not linked
+                         * then it is currently being moved by other code
+                         * (e.g., a timer interrupt handler) that will use the
+                         * correct priority when enqueuing the task. */
+                        TRACE_TASK(holder, "%s: is NOT queued => Done.\n",
+                                   __FUNCTION__);
+                }
+                spin_unlock(&gsnedf.release_lock);
+                /* If holder was enqueued in a release heap, then the following
+                 * preemption check is pointless, but we can't easily detect
+                 * that case. If you want to fix this, then consider that
+                 * simply adding a state flag requires O(n) time to update when
+                 * releasing n tasks, which conflicts with the goal to have
+                 * O(log n) merges. */
+                if (check_preempt) {
+                        /* heap_decrease() hit the top level of the heap: make
+                         * sure preemption checks get the right task, not the
+                         * potentially stale cache. */
+                        bheap_uncache_min(edf_ready_order,
+                                         &gsnedf.ready_queue);
+                        check_for_preemptions();
+                }
+        }
+}
+/* gsnedf_pi_block - a new waiter was added to the wait queue of @sem.
+ *
+ * If @new_waiter has a higher priority than the semaphore's current
+ * highest-priority task, it becomes the new sem->hp.task and the holder
+ * (if there is one) inherits from it and is repositioned. Always returns 0.
+ */
+static long gsnedf_pi_block(struct pi_semaphore *sem,
+                            struct task_struct *new_waiter)
+{
+        BUG_ON(!new_waiter);
+        /* nothing to boost unless the waiter beats the current hp task */
+        if (!edf_higher_prio(new_waiter, sem->hp.task))
+                return 0;
+        TRACE_TASK(new_waiter, " boosts priority via %p\n", sem);
+        /* called with IRQs disabled */
+        spin_lock(&gsnedf_lock);
+        /* store new highest-priority task */
+        sem->hp.task = new_waiter;
+        if (sem->holder) {
+                TRACE_TASK(sem->holder,
+                           " holds %p and will inherit from %s/%d\n",
+                           sem,
+                           new_waiter->comm, new_waiter->pid);
+                /* let holder inherit */
+                sem->holder->rt_param.inh_task = new_waiter;
+                update_queue_position(sem->holder);
+        }
+        spin_unlock(&gsnedf_lock);
+        return 0;
+}
+/* gsnedf_inherit_priority - let the new owner of @sem inherit from the
+ * semaphore's highest-priority task, if there is one other than the owner
+ * itself. Always returns 0.
+ */
+static long gsnedf_inherit_priority(struct pi_semaphore *sem,
+                                    struct task_struct *new_owner)
+{
+        /* gsnedf_lock is not taken here: at the time of this call new_owner
+         * is still sleeping (not scheduled yet), and the caller already
+         * holds sem->wait.lock, which keeps sem->hp.task from changing
+         * concurrently.
+         */
+        if (!sem->hp.task || sem->hp.task == new_owner) {
+                TRACE_TASK(new_owner,
+                           "cannot inherit priority, "
+                           "no higher priority job waits.\n");
+        } else {
+                new_owner->rt_param.inh_task = sem->hp.task;
+                TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
+                           sem->hp.task->comm, sem->hp.task->pid);
+        }
+        return 0;
+}
+/* gsnedf_return_priority - called on a semaphore release; assumes that the
+ * current task is also the semaphore holder.
+ *
+ * Recomputes the semaphore's highest-priority task if that was the current
+ * task, and drops any inherited priority, re-running the arrival path so
+ * that the task is re-evaluated with its own priority. Always returns 0.
+ */
+static long gsnedf_return_priority(struct pi_semaphore *sem)
+{
+        struct task_struct *holder = current;
+        /* Pick a new highest-priority semaphore task if the holder was it.
+         * The calling function holds sem->wait.lock.
+         */
+        if (holder == sem->hp.task)
+                edf_set_hp_task(sem);
+        TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
+        if (!holder->rt_param.inh_task)
+                return 0;
+        /* interrupts already disabled by PI code */
+        spin_lock(&gsnedf_lock);
+        /* give the inherited priority back */
+        holder->rt_param.inh_task = NULL;
+        /* Check if rescheduling is necessary */
+        unlink(holder);
+        gsnedf_job_arrival(holder);
+        spin_unlock(&gsnedf_lock);
+        return 0;
+}
+#endif
+/* gsnedf_admit_task - admission hook; performs no checks and returns 0 for
+ *                     every task.
+ */
+static long gsnedf_admit_task(struct task_struct *tsk)
+{
+        /* unconditional: nothing about tsk is inspected */
+        return 0;
+}
+/* gsnedf_activate_plugin - reset plugin state when GSN-EDF is activated.
+ *
+ * Re-initializes the CPU heap, latches the release master CPU, and clears
+ * the link/schedule state of every online CPU. Every online CPU except the
+ * release master is inserted into the CPU heap. Always returns 0.
+ */
+static long gsnedf_activate_plugin(void)
+{
+        cpu_entry_t *cpu_state;
+        int cpu;
+        bheap_init(&gsnedf_cpu_heap);
+        gsnedf.release_master = atomic_read(&release_master_cpu);
+        for_each_online_cpu(cpu) {
+                cpu_state = &per_cpu(gsnedf_cpu_entries, cpu);
+                bheap_node_init(&cpu_state->hn, cpu_state);
+                atomic_set(&cpu_state->will_schedule, 0);
+                cpu_state->linked    = NULL;
+                cpu_state->scheduled = NULL;
+                if (cpu == gsnedf.release_master) {
+                        /* stays out of the heap, so it is never picked */
+                        TRACE("GSN-EDF: CPU %d is release master.\n", cpu);
+                        continue;
+                }
+                TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu);
+                update_cpu_position(cpu_state);
+        }
+        return 0;
+}
+/*      Plugin object
+ *
+ * Callback table for the "GSN-EDF" plugin; init_gsn_edf() passes it to
+ * register_sched_plugin(). The priority-inheritance hooks and fmlp_active
+ * are only filled in when CONFIG_FMLP is enabled.
+ */
+static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
+        .plugin_name            = "GSN-EDF",
+        .finish_switch          = gsnedf_finish_switch,
+        .tick                   = gsnedf_tick,
+        .task_new               = gsnedf_task_new,
+        .complete_job           = complete_job,
+        .task_exit              = gsnedf_task_exit,
+        .schedule               = gsnedf_schedule,
+        .task_wake_up           = gsnedf_task_wake_up,
+        .task_block             = gsnedf_task_block,
+#ifdef CONFIG_FMLP
+        .fmlp_active            = 1,
+        .pi_block               = gsnedf_pi_block,
+        .inherit_priority       = gsnedf_inherit_priority,
+        .return_priority        = gsnedf_return_priority,
+#endif
+        .admit_task             = gsnedf_admit_task,
+        .activate_plugin        = gsnedf_activate_plugin,
+};
+/* init_gsn_edf - module entry point for the GSN-EDF plugin.
+ *
+ * Initializes the CPU heap, wires the per-CPU entry of every possible CPU
+ * to its slot in gsnedf_cpus[] and gsnedf_heap_node[], sets up the EDF
+ * domain with gsnedf_release_jobs() as release callback, and registers the
+ * plugin.
+ *
+ * Only possible CPUs are visited: per_cpu() storage must not be touched for
+ * CPU numbers the system can never bring up. The gsnedf_cpus[] slots of
+ * such CPUs keep their static NULL initialization.
+ *
+ * Returns the result of register_sched_plugin().
+ */
+static int __init init_gsn_edf(void)
+{
+        int cpu;
+        cpu_entry_t *entry;
+        bheap_init(&gsnedf_cpu_heap);
+        /* initialize CPU state */
+        for_each_possible_cpu(cpu) {
+                entry = &per_cpu(gsnedf_cpu_entries, cpu);
+                gsnedf_cpus[cpu] = entry;
+                atomic_set(&entry->will_schedule, 0);
+                entry->cpu       = cpu;
+                entry->hn        = &gsnedf_heap_node[cpu];
+                /* registers entry->hn by address with the heap node */
+                bheap_node_init(&entry->hn, entry);
+        }
+        edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
+        return register_sched_plugin(&gsn_edf_plugin);
+}
+module_init(init_gsn_edf); /* registers the GSN-EDF plugin; this file declares no module_exit() */
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
new file mode 100644
index 000000000000..c1fc7748e590
--- /dev/null
+++ b/litmus/sched_litmus.c
@@ -0,0 +1,318 @@
+/* This file is included from kernel/sched.c */
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+/*
+ * Charge the time elapsed between p->se.exec_start and rq->clock to the
+ * task's per-job and cumulative execution-time counters (and to cpuacct),
+ * then restart the measurement interval at rq->clock.
+ */
+static void update_time_litmus(struct rq *rq, struct task_struct *p)
+{
+        u64 ran = rq->clock - p->se.exec_start;
+        /* a negative difference is clamped to zero */
+        if (unlikely((s64)ran < 0))
+                ran = 0;
+        /* the next accounting interval starts now */
+        p->se.exec_start = rq->clock;
+        /* cumulative task counter, then per-job counter */
+        p->se.sum_exec_runtime += ran;
+        p->rt_param.job_params.exec_time += ran;
+        cpuacct_charge(p, ran);
+}
+/* Forward declarations of the two-runqueue lock helpers. They are not
+ * defined in this file; presumably the definitions follow in
+ * kernel/sched.c, which includes this file -- TODO confirm.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
+/*
+ * litmus_tick - per-tick hook into the active LITMUS plugin.
+ * Invoked by scheduler_tick() at HZ frequency; interrupts are disabled.
+ * NOTE(review): TS_PLUGIN_TICK_START has no matching end marker in this
+ * function -- confirm whether the caller emits it.
+ */
+static void litmus_tick(struct rq *rq, struct task_struct *p)
+{
+        TS_PLUGIN_TICK_START;
+        /* execution time is only tracked for real-time tasks */
+        if (is_realtime(p)) {
+                update_time_litmus(rq, p);
+        }
+        /* the plugin's tick handler runs regardless of the task type */
+        litmus->tick(p);
+}
+/*
+ * litmus_schedule - let the active plugin pick the next task and, if that
+ * task currently belongs to a different runqueue, migrate it to this CPU.
+ *
+ * @rq:   this CPU's runqueue. Its lock is released while waiting for a
+ *        migrating task and re-acquired before returning.
+ * @prev: the task being switched out.
+ *
+ * Returns the task to run next, or NULL if the plugin selected nothing,
+ * the wait for the remote task timed out, or the post-migration sanity
+ * check failed.
+ *
+ * The runqueue lock is a raw_spinlock_t in the 2.6.34 base, so it must be
+ * taken and released with the raw_spin_*() primitives.
+ */
+static struct task_struct *
+litmus_schedule(struct rq *rq, struct task_struct *prev)
+{
+        struct rq* other_rq;
+        struct task_struct *next;
+        long was_running;
+        lt_t _maybe_deadlock = 0;
+        /* let the plugin schedule */
+        next = litmus->schedule(prev);
+        /* check if a global plugin pulled a task from a different RQ */
+        if (next && task_rq(next) != rq) {
+                /* we need to migrate the task */
+                other_rq = task_rq(next);
+                TRACE_TASK(next, "migrate from %d\n", other_rq->cpu);
+                /* while we drop the lock, the prev task could change its
+                 * state
+                 */
+                was_running = is_running(prev);
+                mb();
+                raw_spin_unlock(&rq->lock);
+                /* Don't race with a concurrent switch.  This could deadlock in
+                 * the case of cross or circular migrations.  It's the job of
+                 * the plugin to make sure that doesn't happen.
+                 */
+                TRACE_TASK(next, "stack_in_use=%d\n",
+                           next->rt_param.stack_in_use);
+                if (next->rt_param.stack_in_use != NO_CPU) {
+                        TRACE_TASK(next, "waiting to deschedule\n");
+                        /* remember when we started spinning */
+                        _maybe_deadlock = litmus_clock();
+                }
+                /* spin until the task's stack is no longer in use */
+                while (next->rt_param.stack_in_use != NO_CPU) {
+                        cpu_relax();
+                        mb();
+                        if (next->rt_param.stack_in_use == NO_CPU)
+                                TRACE_TASK(next,"descheduled. Proceeding.\n");
+                        if (lt_before(_maybe_deadlock + 10000000,
+                                      litmus_clock())) {
+                                /* We've been spinning for 10ms.
+                                 * Something can't be right!
+                                 * Let's abandon the task and bail out; at least
+                                 * we will have debug info instead of a hard
+                                 * deadlock.
+                                 */
+                                TRACE_TASK(next,"stack too long in use. "
+                                           "Deadlock?\n");
+                                next = NULL;
+                                /* bail out, but re-take our own lock first */
+                                raw_spin_lock(&rq->lock);
+                                return next;
+                        }
+                }
+#ifdef  __ARCH_WANT_UNLOCKED_CTXSW
+                if (next->oncpu)
+                        TRACE_TASK(next, "waiting for !oncpu\n");
+                while (next->oncpu) {
+                        cpu_relax();
+                        mb();
+                }
+#endif
+                /* takes both rq->lock and other_rq->lock */
+                double_rq_lock(rq, other_rq);
+                mb();
+                if (is_realtime(prev) && is_running(prev) != was_running) {
+                        TRACE_TASK(prev,
+                                   "state changed while we dropped"
+                                   " the lock: is_running=%d, was_running=%d\n",
+                                   is_running(prev), was_running);
+                        if (is_running(prev) && !was_running) {
+                                /* prev task became unblocked
+                                 * we need to simulate normal sequence of events
+                                 * to scheduler plugins.
+                                 */
+                                litmus->task_block(prev);
+                                litmus->task_wake_up(prev);
+                        }
+                }
+                set_task_cpu(next, smp_processor_id());
+                /* DEBUG: now that we have the lock we need to make sure a
+                 *  couple of things still hold:
+                 *  - it is still a real-time task
+                 *  - it is still runnable (could have been stopped)
+                 * If either is violated, then the active plugin is
+                 * doing something wrong.
+                 */
+                if (!is_realtime(next) || !is_running(next)) {
+                        /* BAD BAD BAD */
+                        TRACE_TASK(next,"BAD: migration invariant FAILED: "
+                                   "rt=%d running=%d\n",
+                                   is_realtime(next),
+                                   is_running(next));
+                        /* drop the task */
+                        next = NULL;
+                }
+                /* release the other CPU's runqueue, but keep ours */
+                raw_spin_unlock(&other_rq->lock);
+        }
+        if (next) {
+                /* mark the stack as in use on this CPU and start the
+                 * execution-time measurement interval
+                 */
+                next->rt_param.stack_in_use = rq->cpu;
+                next->se.exec_start = rq->clock;
+        }
+        return next;
+}
+/*
+ * Enqueue hook of the LITMUS scheduling class. Only wake-ups are acted
+ * upon: the task is marked present, the plugin is notified, and the
+ * runqueue's LITMUS task count is incremented. Any other enqueue is
+ * merely traced.
+ */
+static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
+                                int wakeup)
+{
+        if (!wakeup) {
+                TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
+                return;
+        }
+        sched_trace_task_resume(p);
+        tsk_rt(p)->present = 1;
+        litmus->task_wake_up(p);
+        rq->litmus.nr_running++;
+}
+/*
+ * Dequeue hook of the LITMUS scheduling class. Only a task going to
+ * sleep is acted upon: the plugin is notified, the task is marked not
+ * present, and the runqueue's LITMUS task count is decremented. Any
+ * other dequeue is merely traced.
+ */
+static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, int sleep)
+{
+        if (!sleep) {
+                TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
+                return;
+        }
+        litmus->task_block(p);
+        tsk_rt(p)->present = 0;
+        sched_trace_task_block(p);
+        rq->litmus.nr_running--;
+}
+/* Yield hook of the LITMUS scheduling class; must be called for the task
+ * that is currently running on rq (enforced by the BUG_ON below).
+ */
+static void yield_task_litmus(struct rq *rq)
+{
+        BUG_ON(rq->curr != current);
+        /* sched_yield() is called to trigger delayed preemptions.
+         * Thus, mark the current task as needing to be rescheduled.
+         * This will cause the scheduler plugin to be invoked, which can
+         * then determine if a preemption is still required.
+         */
+        clear_exit_np(current);
+        set_tsk_need_resched(current);
+}
+/* Plugins are responsible for this.
+ * Intentionally empty: this function only exists to fill the
+ * .check_preempt_curr slot of litmus_sched_class.
+ */
+static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+/* Intentionally empty: fills the .put_prev_task slot of
+ * litmus_sched_class; nothing is done here when a task is switched out.
+ */
+static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+/*
+ * Pre-schedule hook: bring prev's execution-time accounting up to date
+ * and, if prev is no longer running, mark it as not present.
+ */
+static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev)
+{
+        update_time_litmus(rq, prev);
+        if (is_running(prev))
+                return;
+        tsk_rt(prev)->present = 0;
+}
+/* pick_next_task_litmus() - entry point into litmus_schedule()
+ *
+ * Consumes the task stored in rq->litmus.prev (the task being switched
+ * out) and asks litmus_schedule() for its successor.
+ *
+ * Returns the next task to run, or NULL if rq->litmus.prev was not set,
+ * i.e., the call did not originate from schedule() (e.g., migration).
+ */
+static struct task_struct *pick_next_task_litmus(struct rq *rq)
+{
+        struct task_struct *outgoing = rq->litmus.prev;
+        struct task_struct *chosen;
+        if (!outgoing)
+                return NULL;
+        /* consume the hand-over slot */
+        rq->litmus.prev = NULL;
+        TS_PLUGIN_SCHED_START;
+        chosen = litmus_schedule(rq, outgoing);
+        TS_PLUGIN_SCHED_END;
+        return chosen;
+}
+/* Fills the .task_tick slot of litmus_sched_class; deliberately a no-op. */
+static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
+{
+        /* nothing to do; tick related tasks are done by litmus_tick() */
+        return;
+}
+/* Fills the .switched_to slot of litmus_sched_class; intentionally empty. */
+static void switched_to_litmus(struct rq *rq, struct task_struct *p, int running)
+{
+}
+/* Fills the .prio_changed slot of litmus_sched_class; intentionally empty. */
+static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
+                                int oldprio, int running)
+{
+}
+/* Fills the .get_rr_interval slot of litmus_sched_class. Always returns 0.
+ * NOTE(review): presumably the caller interprets 0 as "no round-robin
+ * time slice" (i.e., infinity) -- confirm against the sched_class users.
+ */
+unsigned int get_rr_interval_litmus(struct task_struct *p)
+{
+        /* return infinity */
+        return 0;
+}
+/* Called when a task became a real-time task, either because of a SCHED_*
+ * class transition or because of PI mutex inheritance. Linux PI mutex
+ * inheritance is not handled yet (and probably never will be); use the
+ * synchronization primitives provided by LITMUS instead.
+ *
+ * Restarts the execution-time measurement of rq's current task at
+ * rq->clock.
+ */
+static void set_curr_task_litmus(struct rq *rq)
+{
+        struct task_struct *running = rq->curr;
+        running->se.exec_start = rq->clock;
+}
+#ifdef CONFIG_SMP
+/* execve tries to rebalance the task in this scheduling domain.
+ * Returns the CPU the caller is executing on, so the task stays put.
+ */
+static int select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
+{
+        /* Preemption is already disabled, so the current CPU id is
+         * stable; we do not want to change CPUs here.
+         */
+        int this_cpu = smp_processor_id();
+        return this_cpu;
+}
+/* we don't repartition at runtime
+ * Stub for the .load_balance slot of litmus_sched_class: ignores all
+ * arguments and always returns 0.
+ */
+static unsigned long
+load_balance_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                unsigned long max_load_move,
+                struct sched_domain *sd, enum cpu_idle_type idle,
+                int *all_pinned, int *this_best_prio)
+{
+        return 0;
+}
+/* Stub for the .move_one_task slot of litmus_sched_class: ignores all
+ * arguments and always returns 0.
+ */
+static int
+move_one_task_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                 struct sched_domain *sd, enum cpu_idle_type idle)
+{
+        return 0;
+}
+#endif
+/*
+ * The LITMUS scheduling class. It is chained in front of rt_sched_class
+ * via .next and routes the class callbacks to the *_litmus functions
+ * defined in this file.
+ */
+const struct sched_class litmus_sched_class = {
+        .next                   = &rt_sched_class,
+        /* queue management */
+        .enqueue_task           = enqueue_task_litmus,
+        .dequeue_task           = dequeue_task_litmus,
+        .yield_task             = yield_task_litmus,
+        /* picking and switching tasks */
+        .check_preempt_curr     = check_preempt_curr_litmus,
+        .pick_next_task         = pick_next_task_litmus,
+        .put_prev_task          = put_prev_task_litmus,
+        .set_curr_task          = set_curr_task_litmus,
+#ifdef CONFIG_SMP
+        /* these callbacks are only installed on SMP builds */
+        .select_task_rq         = select_task_rq_litmus,
+        .pre_schedule           = pre_schedule_litmus,
+        .load_balance           = load_balance_litmus,
+        .move_one_task          = move_one_task_litmus,
+#endif
+        /* ticks and class/priority transitions */
+        .task_tick              = task_tick_litmus,
+        .get_rr_interval        = get_rr_interval_litmus,
+        .switched_to            = switched_to_litmus,
+        .prio_changed           = prio_changed_litmus,
+};
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..2ea39223e7f0
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,896 @@
+/*
+ * litmus/sched_pfair.c
+ *
+ * Implementation of the (global) Pfair scheduling algorithm.
+ *
+ */
+#include <asm/div64.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/rt_domain.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/bheap.h>
+/* Parameters of one Pfair subtask. The accessors in this file add the
+ * owning task's release time to the release, deadline and (non-zero)
+ * group_deadline values to obtain absolute quanta.
+ */
+struct subtask {
+        /* measured in quanta relative to job release */
+        quanta_t release;
+        quanta_t deadline;
+        quanta_t overlap; /* called "b bit" by PD^2 */
+        quanta_t group_deadline;
+};
+/* Per-task Pfair state, reached through tsk_pfair(). The subtask table is
+ * a trailing flexible array member, so the structure must be allocated
+ * with room for 'quanta' subtask entries behind it.
+ */
+struct pfair_param   {
+        quanta_t        quanta;       /* number of subtasks */
+        quanta_t        cur;          /* index of current subtask */
+        quanta_t        release;      /* in quanta */
+        quanta_t        period;       /* in quanta */
+        quanta_t        last_quantum; /* when scheduled last */
+        int             last_cpu;     /* where scheduled last */
+        unsigned int    sporadic_release; /* On wakeup, new sporadic release? */
+        struct subtask  subtasks[];   /* allocate together with pfair_param */
+};
+/* Shorthand for the struct pfair_param pointer stored in a task's rt_param. */
+#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
+/* Per-CPU Pfair scheduler state; one instance per CPU is created with
+ * DEFINE_PER_CPU(struct pfair_state, pfair_state).
+ */
+struct pfair_state {
+        int cpu;
+        volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
+                                        * the time */
+        volatile quanta_t local_tick;  /* What tick is the local CPU currently
+                                        * executing? Updated only by the local
+                                        * CPU. In QEMU, this may lag behind the
+                                        * current tick. In a real system, with
+                                        * proper timers and aligned quanta,
+                                        * that should only be the
+                                        * case for a very short time after the
+                                        * time advanced. With staggered quanta,
+                                        * it will lag for the duration of the
+                                        * offset.
+                                        */
+        struct task_struct* linked;    /* the task that should be executing */
+        struct task_struct* local;     /* the local copy of linked          */
+        struct task_struct* scheduled; /* what is actually scheduled        */
+        unsigned long missed_quanta;   /* incremented when a quantum was not
+                                        * acknowledged by this CPU */
+        lt_t offset;                    /* stagger offset */
+};
+/* Currently, we limit the maximum period of any task to 2000 quanta.
+ * The reason is that it makes the implementation easier since we do not
+ * need to reallocate the release wheel on task arrivals.
+ * NOTE(review): the original comment ended in an unfinished sentence
+ * ("In the future"); presumably this limit was meant to be lifted
+ * eventually -- confirm with the authors.
+ */
+#define PFAIR_MAX_PERIOD 2000
+/* This is the release queue wheel. It is indexed by pfair_time %
+ * PFAIR_MAX_PERIOD.  Each heap is ordered by PFAIR priority, so that it can be
+ * merged with the ready queue.
+ */
+static struct bheap release_queue[PFAIR_MAX_PERIOD];
+/* per-CPU scheduler state */
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+/* Shortcut to the per-CPU state; used as pstate[cpu] throughout this file.
+ * NOTE(review): its allocation and initialization are not part of this
+ * section of the file.
+ */
+struct pfair_state* *pstate; /* short cut */
+static quanta_t pfair_time = 0; /* the "official" PFAIR clock */
+static quanta_t merge_time = 0; /* Updated after the release queue has been
+                                 * merged. Used by drop_all_references().
+                                 */
+/* the real-time domain; provides the ready queue and the ready_lock */
+static rt_domain_t pfair;
+/* The pfair_lock is used to serialize all scheduling events.
+ * It is an alias for the ready_lock of the pfair domain.
+ */
+#define pfair_lock pfair.ready_lock
+/* Enable for lots of trace info.
+ * #define PFAIR_DEBUG
+ *
+ * With PFAIR_DEBUG defined, PTRACE_TASK() and PTRACE() forward to
+ * TRACE_TASK() and TRACE(); otherwise they expand to nothing.
+ */
+#ifdef PFAIR_DEBUG
+#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
+#define PTRACE(f, args...) TRACE(f, ## args)
+#else
+#define PTRACE_TASK(t, f, args...)
+#define PTRACE(f, args...)
+#endif
+/* gcc will inline all of these accessor functions... */
+/* Returns a pointer to the entry of t's subtask table selected by the
+ * task's current subtask index.
+ */
+static struct subtask* cur_subtask(struct task_struct* t)
+{
+        struct pfair_param* p = tsk_pfair(t);
+        return &p->subtasks[p->cur];
+}
+/* Deadline of t's current subtask, offset by the task's release time. */
+static quanta_t cur_deadline(struct task_struct* t)
+{
+        quanta_t base = tsk_pfair(t)->release;
+        return base + cur_subtask(t)->deadline;
+}
+/* Release of t's current subtask, offset by the task's release time. */
+static quanta_t cur_sub_release(struct task_struct* t)
+{
+        quanta_t base = tsk_pfair(t)->release;
+        return base + cur_subtask(t)->release;
+}
+/* Release time that determines where t is queued: the current subtask's
+ * release, or -- when built with EARLY_RELEASE -- the task's release time.
+ */
+static quanta_t cur_release(struct task_struct* t)
+{
+#ifndef EARLY_RELEASE
+        return cur_sub_release(t);
+#else
+        /* with early releasing, only the release of the first subtask
+         * counts */
+        return tsk_pfair(t)->release;
+#endif
+}
+/* Overlap value (the PD^2 "b bit") of t's current subtask. */
+static quanta_t cur_overlap(struct task_struct* t)
+{
+        struct subtask* sub = cur_subtask(t);
+        return sub->overlap;
+}
+/* Group deadline of t's current subtask. A stored value of zero is
+ * returned unchanged; any other value is offset by the task's release
+ * time.
+ */
+static quanta_t cur_group_deadline(struct task_struct* t)
+{
+        quanta_t rel_gdl = cur_subtask(t)->group_deadline;
+        if (!rel_gdl)
+                return rel_gdl;
+        return rel_gdl + tsk_pfair(t)->release;
+}
+/* pfair_higher_prio - priority comparison between two tasks.
+ *
+ * Returns non-zero iff 'first' has higher priority than 'second'.
+ * A NULL 'first' never has higher priority; an existing 'first' always
+ * beats a NULL or non-real-time 'second'. Otherwise the order is decided
+ * by, in this sequence: earlier subtask deadline, larger overlap value,
+ * later group deadline, and finally the smaller PID.
+ */
+static int pfair_higher_prio(struct task_struct* first,
+                             struct task_struct* second)
+{
+        return  /* first task must exist */
+                first && (
+                /* Does the second task exist and is it a real-time task?  If
+                 * not, the first task (which is a RT task) has higher
+                 * priority.
+                 */
+                !second || !is_realtime(second)  ||
+                /* Is the (subtask) deadline of the first task earlier?
+                 * Then it has higher priority.
+                 */
+                time_before(cur_deadline(first), cur_deadline(second)) ||
+                /* Do we have a deadline tie?
+                 * Then break by B-bit.
+                 */
+                (cur_deadline(first) == cur_deadline(second) &&
+                 (cur_overlap(first) > cur_overlap(second) ||
+                /* Do we have a B-bit tie?
+                 * Then break by group deadline.
+                 */
+                (cur_overlap(first) == cur_overlap(second) &&
+                 (time_after(cur_group_deadline(first),
+                             cur_group_deadline(second)) ||
+                /* Do we have a group deadline tie?
+                 * Then break by PID, which are unique.
+                 */
+                (cur_group_deadline(first) ==
+                 cur_group_deadline(second) &&
+                 first->pid < second->pid))))));
+}
+/* Heap ordering callback: non-zero iff the task behind node a has higher
+ * Pfair priority than the task behind node b.
+ */
+int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+        struct task_struct* ta = bheap2task(a);
+        struct task_struct* tb = bheap2task(b);
+        return pfair_higher_prio(ta, tb);
+}
+/* Map time t to its slot of the release wheel (t modulo PFAIR_MAX_PERIOD). */
+static struct bheap* relq(quanta_t t)
+{
+        return &release_queue[t % PFAIR_MAX_PERIOD];
+}
+/* Start a new release of t at quantum 'at': record the release time and
+ * rewind the task to its first subtask.
+ */
+static void prepare_release(struct task_struct* t, quanta_t at)
+{
+        struct pfair_param* p = tsk_pfair(t);
+        p->cur     = 0;
+        p->release = at;
+}
+/* Insert t's heap node into the given heap, ordered by Pfair priority. */
+static void __pfair_add_release(struct task_struct* t, struct bheap* queue)
+{
+        bheap_insert(pfair_ready_order, queue, tsk_rt(t)->heap_node);
+}
+/* Queue t in the release-wheel slot that matches its current release.
+ * The task's heap node must not be in a heap already.
+ */
+static void pfair_add_release(struct task_struct* t)
+{
+        struct bheap* slot;
+        BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
+        slot = relq(cur_release(t));
+        __pfair_add_release(t, slot);
+}
+/* Merge the release-wheel slot for 'time' into the ready queue of the
+ * pfair domain and remember 'time' as the latest merge time.
+ */
+static void poll_releases(quanta_t time)
+{
+        struct bheap* due = relq(time);
+        __merge_ready(&pfair, due);
+        merge_time = time;
+}
+/* If t is present but linked to a different CPU than the one it is
+ * scheduled on, trigger a reschedule on the CPU it is linked to or, if
+ * it is not linked anywhere, on the CPU it is scheduled on.
+ */
+static void check_preempt(struct task_struct* t)
+{
+        int target;
+        /* nothing to do if the task is where it ought to be, or if it
+         * cannot be scheduled at all
+         */
+        if (tsk_rt(t)->linked_on == tsk_rt(t)->scheduled_on ||
+            !tsk_rt(t)->present)
+                return;
+        if (tsk_rt(t)->linked_on != NO_CPU)
+                target = tsk_rt(t)->linked_on;
+        else
+                target = tsk_rt(t)->scheduled_on;
+        PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
+                   tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
+        /* preempt: remote CPUs get a reschedule request, the local CPU
+         * just gets the resched flag set
+         */
+        if (target != smp_processor_id())
+                smp_send_reschedule(target);
+        else
+                set_tsk_need_resched(current);
+}
+/* drop_all_references - remove t from whichever heap it is queued in and
+ * clear every per-CPU pointer (linked, local, scheduled) that refers to it.
+ *
+ * caller must hold pfair_lock
+ */
+static void drop_all_references(struct task_struct *t)
+{
+        int cpu;
+        struct pfair_state* s;
+        struct bheap* q;
+        if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
+                /* figure out what queue the node is in: releases up to
+                 * merge_time have already been merged into the ready queue
+                 */
+                if (time_before_eq(cur_release(t), merge_time))
+                        q = &pfair.ready_queue;
+                else
+                        q = relq(cur_release(t));
+                bheap_delete(pfair_ready_order, q,
+                            tsk_rt(t)->heap_node);
+        }
+        /* Visit the online CPUs by id; counting from 0 up to
+         * num_online_cpus() would miss CPUs when the online ids are not
+         * contiguous.
+         */
+        for_each_online_cpu(cpu) {
+                s = &per_cpu(pfair_state, cpu);
+                if (s->linked == t)
+                        s->linked = NULL;
+                if (s->local  == t)
+                        s->local  = NULL;
+                if (s->scheduled  == t)
+                        s->scheduled = NULL;
+        }
+}
+/* advance_subtask - move t on to its next subtask.
+ *
+ * When the subtask index wraps around to zero the job is complete: a
+ * present task starts its next job, while a task that is not present is
+ * removed from all queues and flagged for a sporadic release on wake-up.
+ *
+ * Returns non-zero if the task needs to go to the release queue, i.e.,
+ * if its (new) release lies after 'time'; returns 0 otherwise, including
+ * when the task was removed from the system.
+ */
+static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
+{
+        struct pfair_param* param = tsk_pfair(t);
+        int needs_relq;
+        param->cur = (param->cur + 1) % param->quanta;
+        if (param->cur == 0) {
+                sched_trace_task_completion(t, 1);
+                if (!tsk_rt(t)->present) {
+                        /* remove task from system until it wakes */
+                        drop_all_references(t);
+                        param->sporadic_release = 1;
+                        TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
+                                   cpu, param->cur);
+                        return 0;
+                }
+                /* we start a new job */
+                prepare_for_next_period(t);
+                sched_trace_task_release(t);
+                get_rt_flags(t) = RT_F_RUNNING;
+                param->release += param->period;
+        }
+        needs_relq = time_after(cur_release(t), time);
+        TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d\n",
+                   cpu, param->cur, needs_relq);
+        return needs_relq;
+}
+/* advance_subtasks - for every online CPU, record where and when its
+ * linked task ran last and advance that task to its next subtask. Tasks
+ * whose next release lies in the future are unlinked and moved to the
+ * release queue.
+ */
+static void advance_subtasks(quanta_t time)
+{
+        int cpu;
+        struct task_struct* l;
+        struct pfair_param* p;
+        for_each_online_cpu(cpu) {
+                l = pstate[cpu]->linked;
+                if (l) {
+                        p = tsk_pfair(l);
+                        p->last_quantum = time;
+                        p->last_cpu     =  cpu;
+                        if (advance_subtask(time, l, cpu)) {
+                                pstate[cpu]->linked = NULL;
+                                pfair_add_release(l);
+                        }
+                }
+        }
+}
+/* target_cpu - decide on which CPU t should be linked.
+ *
+ * Returns the CPU t is currently scheduled on, if any. Otherwise, if t
+ * ran in the immediately preceding quantum, returns the CPU it ran on
+ * last, provided that CPU's linked task is not also scheduled there.
+ * In all other cases default_cpu is returned.
+ */
+static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
+{
+        int last;
+        /* always observe scheduled_on linkage */
+        if (tsk_rt(t)->scheduled_on != NO_CPU)
+                return tsk_rt(t)->scheduled_on;
+        /* not back2back quanta: nothing else to consider */
+        if (tsk_pfair(t)->last_quantum != time - 1)
+                return default_cpu;
+        /* Only observe last_quantum if no scheduled_on is in the way.
+         * This should only kick in if a CPU missed quanta, and that
+         * *should* only happen in QEMU.
+         */
+        last = tsk_pfair(t)->last_cpu;
+        if (!pstate[last]->linked ||
+            tsk_rt(pstate[last]->linked)->scheduled_on != last)
+                return last;
+        return default_cpu;
+}
+/* pfair_link - link task t, taken from the ready queue on behalf of 'cpu'.
+ *
+ * The task is linked to the CPU chosen by target_cpu(), which may differ
+ * from 'cpu'. Tasks displaced by the new link are either moved to 'cpu'
+ * or put back into the ready queue.
+ *
+ * returns one if linking was redirected (the caller should try again for
+ * 'cpu'), zero if 'cpu' has been dealt with
+ */
+static int pfair_link(quanta_t time, int cpu,
+                      struct task_struct* t)
+{
+        int target = target_cpu(time, t, cpu);
+        struct task_struct* prev  = pstate[cpu]->linked;
+        struct task_struct* other;
+        if (target != cpu) {
+                /* t goes to a different CPU; remember what was linked there */
+                other = pstate[target]->linked;
+                pstate[target]->linked = t;
+                tsk_rt(t)->linked_on   = target;
+                if (!other)
+                        /* linked ok, but reschedule this CPU */
+                        return 1;
+                if (target < cpu) {
+                        /* link other to cpu instead */
+                        tsk_rt(other)->linked_on = cpu;
+                        pstate[cpu]->linked      = other;
+                        if (prev) {
+                                /* prev got pushed back into the ready queue */
+                                tsk_rt(prev)->linked_on = NO_CPU;
+                                __add_ready(&pfair, prev);
+                        }
+                        /* we are done with this cpu */
+                        return 0;
+                } else {
+                        /* re-add other, it's original CPU was not considered yet */
+                        tsk_rt(other)->linked_on = NO_CPU;
+                        __add_ready(&pfair, other);
+                        /* reschedule this CPU */
+                        return 1;
+                }
+        } else {
+                /* no redirection: t replaces whatever was linked to cpu */
+                pstate[cpu]->linked  = t;
+                tsk_rt(t)->linked_on = cpu;
+                if (prev) {
+                        /* prev got pushed back into the ready queue */
+                        tsk_rt(prev)->linked_on = NO_CPU;
+                        __add_ready(&pfair, prev);
+                }
+                /* we are done with this CPU */
+                return 0;
+        }
+}
+/* schedule_subtasks - for each online CPU, keep linking the head of the
+ * ready queue for as long as it has higher priority than the task
+ * currently linked to that CPU and pfair_link() reports a redirection.
+ */
+static void schedule_subtasks(quanta_t time)
+{
+        int cpu, again;
+        for_each_online_cpu(cpu) {
+                do {
+                        again = 0;
+                        if (pfair_higher_prio(__peek_ready(&pfair),
+                                              pstate[cpu]->linked))
+                                again = pfair_link(time, cpu,
+                                                   __take_ready(&pfair));
+                } while (again);
+        }
+}
+/* schedule_next_quantum - compute the task-to-CPU linkage for quantum
+ * 'time' and publish the new tick to all online CPUs.
+ *
+ * Under pfair_lock: advances the subtasks of all linked tasks, merges the
+ * releases due at 'time' into the ready queue, relinks the CPUs, and
+ * finally stores 'time' in every online CPU's cur_tick. A CPU whose
+ * local_tick still differs from its cur_tick at that point has not
+ * acknowledged the previous quantum and gets missed_quanta incremented.
+ *
+ * called with interrupts disabled
+ */
+static void schedule_next_quantum(quanta_t time)
+{
+        int cpu;
+        PTRACE("--- Q %lu at %llu PRE-SPIN\n",
+               time, litmus_clock());
+        spin_lock(&pfair_lock);
+        PTRACE("<<< Q %lu at %llu\n",
+               time, litmus_clock());
+        sched_trace_quantum_boundary();
+        advance_subtasks(time);
+        poll_releases(time);
+        schedule_subtasks(time);
+        /* Iterate over the online CPUs by id (as the helpers above do);
+         * online ids are not guaranteed to be contiguous.
+         */
+        for_each_online_cpu(cpu) {
+                if (pstate[cpu]->linked) {
+                        PTRACE_TASK(pstate[cpu]->linked,
+                                    " linked on %d.\n", cpu);
+                } else {
+                        PTRACE("(null) linked on %d.\n", cpu);
+                }
+        }
+        /* We are done. Advance time. */
+        mb();
+        for_each_online_cpu(cpu) {
+                if (pstate[cpu]->local_tick != pstate[cpu]->cur_tick) {
+                        TRACE("BAD Quantum not acked on %d "
+                              "(l:%lu c:%lu p:%lu)\n",
+                              cpu,
+                              pstate[cpu]->local_tick,
+                              pstate[cpu]->cur_tick,
+                              pfair_time);
+                        pstate[cpu]->missed_quanta++;
+                }
+                pstate[cpu]->cur_tick = time;
+        }
+        PTRACE(">>> Q %lu at %llu\n",
+               time, litmus_clock());
+        spin_unlock(&pfair_lock);
+}
+/* wait_for_quantum - busy-wait until state->cur_tick is no longer before
+ * quantum q. The first check is made without a preceding memory barrier;
+ * every retry is preceded by cpu_relax() and mb().
+ */
+static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
+{
+        quanta_t seen = state->cur_tick;
+        /* FIXME: what if loc > cur? */
+        while (time_before(seen, q)) {
+                cpu_relax();
+                mb();
+                seen = state->cur_tick;
+        }
+        PTRACE("observed cur_tick:%lu >= q:%lu\n",
+               seen, q);
+}
+/* Current quantum as seen by this CPU: the LITMUS clock minus the CPU's
+ * stagger offset, converted to quanta and rounded down.
+ */
+static quanta_t current_quantum(struct pfair_state* state)
+{
+        lt_t local_now = litmus_clock() - state->offset;
+        return time2quanta(local_now, FLOOR);
+}
+/* catchup_quanta - step through the quanta from 'from' up to 'target'
+ * one at a time. For each step, wait until this CPU's cur_tick has
+ * reached the quantum, then try to advance pfair_time by one; the CPU
+ * that wins the cmpxchg schedules the following quantum.
+ */
+static void catchup_quanta(quanta_t from, quanta_t target,
+                           struct pfair_state* state)
+{
+        quanta_t q, seen;
+        TRACE("+++< BAD catching up quanta from %lu to %lu\n",
+              from, target);
+        for (q = from; time_before(q, target); q++) {
+                wait_for_quantum(q, state);
+                seen = cmpxchg(&pfair_time,
+                               q,         /* expected */
+                               q + 1      /* next     */
+                        );
+                if (seen == q)
+                        schedule_next_quantum(q + 1);
+        }
+        TRACE("+++> catching up done\n");
+}
+/* pfair_tick - this function is called for every local timer
+ *                         interrupt.
+ *
+ * Tries to advance the global pfair_time to the quantum this CPU
+ * believes is current. The CPU that wins the cmpxchg prepares the new
+ * quantum; the other CPUs wait for it. Afterwards the local copy of the
+ * linkage is refreshed, the tick is acknowledged through local_tick, and
+ * a reschedule is requested if a different task should be running.
+ * The parameter t is not used by this function.
+ */
+static void pfair_tick(struct task_struct* t)
+{
+        struct pfair_state* state = &__get_cpu_var(pfair_state);
+        quanta_t time, cur;
+        /* upper bound on the number of attempts in the abnormal cases */
+        int retry = 10;
+        do {
+                cur  = current_quantum(state);
+                PTRACE("q %lu at %llu\n", cur, litmus_clock());
+                /* Attempt to advance time. First CPU to get here
+                 * will prepare the next quantum.
+                 */
+                time = cmpxchg(&pfair_time,
+                               cur - 1,   /* expected */
+                               cur        /* next     */
+                        );
+                if (time == cur - 1) {
+                        /* exchange succeeded */
+                        wait_for_quantum(cur - 1, state);
+                        schedule_next_quantum(cur);
+                        retry = 0;
+                } else if (time_before(time, cur - 1)) {
+                        /* the whole system missed a tick !? */
+                        catchup_quanta(time, cur, state);
+                        retry--;
+                } else if (time_after(time, cur)) {
+                        /* our timer lagging behind!? */
+                        TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
+                        retry--;
+                } else {
+                        /* Some other CPU already started scheduling
+                         * this quantum. Let it do its job and then update.
+                         */
+                        retry = 0;
+                }
+        } while (retry);
+        /* Spin locally until time advances. */
+        wait_for_quantum(cur, state);
+        /* copy assignment */
+        /* FIXME: what if we race with a future update? Corrupted state? */
+        state->local      = state->linked;
+        /* signal that we are done */
+        mb();
+        state->local_tick = state->cur_tick;
+        /* Request a reschedule if the task that should run here is not
+         * the current one and either the current task is a real-time task
+         * or the newly assigned task is present.
+         */
+        if (state->local != current
+            && (is_realtime(current) || is_present(state->local)))
+                set_tsk_need_resched(current);
+}
+/* safe_to_schedule - may t be scheduled on 'cpu' right now?
+ *
+ * Returns 0 if t is still scheduled on some other CPU. Otherwise returns
+ * non-zero iff t is present and its rt flags equal RT_F_RUNNING.
+ */
+static int safe_to_schedule(struct task_struct* t, int cpu)
+{
+        int where = tsk_rt(t)->scheduled_on;
+        if (where == NO_CPU || where == cpu)
+                return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING;
+        TRACE_TASK(t, "BAD: can't be scheduled on %d, "
+                   "scheduled already on %d.\n", cpu, where);
+        return 0;
+}
+/* pfair_schedule - pick the task this CPU should run next.
+ *
+ * Under pfair_lock: selects the CPU's local task if it is safe to
+ * schedule, and updates the scheduled_on fields of prev and of the chosen
+ * task when the two differ.
+ *
+ * Returns the selected task, or NULL if there is none.
+ */
+static struct task_struct* pfair_schedule(struct task_struct * prev)
+{
+        struct pfair_state* state = &__get_cpu_var(pfair_state);
+        struct task_struct* next = NULL;
+        spin_lock(&pfair_lock);
+        if (state->local && safe_to_schedule(state->local, state->cpu))
+                next = state->local;
+        if (prev != next) {
+                /* prev leaves this CPU; next (if any) takes it over */
+                tsk_rt(prev)->scheduled_on = NO_CPU;
+                if (next)
+                        tsk_rt(next)->scheduled_on = state->cpu;
+        }
+        spin_unlock(&pfair_lock);
+        if (next) {
+                TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
+                           tsk_pfair(next)->release, pfair_time, litmus_clock());
+        } else if (is_realtime(prev)) {
+                TRACE("Becomes idle at %lu (%llu)\n", pfair_time, litmus_clock());
+        }
+        return next;
+}
+/* pfair_task_new - plugin callback for a newly admitted task.
+ *
+ * Under pfair_lock (interrupts saved and disabled): records where the
+ * task is currently scheduled, prepares its first release for the
+ * quantum following pfair_time, queues it in the release wheel, and
+ * checks whether a preemption is needed. The on_rq argument is not used.
+ */
+static void pfair_task_new(struct task_struct * t, int on_rq, int running)
+{
+        unsigned long           irq_flags;
+        TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
+        spin_lock_irqsave(&pfair_lock, irq_flags);
+        t->rt_param.scheduled_on = running ? task_cpu(t) : NO_CPU;
+        prepare_release(t, pfair_time + 1);
+        tsk_pfair(t)->sporadic_release = 0;
+        pfair_add_release(t);
+        check_preempt(t);
+        spin_unlock_irqrestore(&pfair_lock, irq_flags);
+}
+/* Plugin task_wake_up() callback.
+ *
+ * If the task's sporadic_release flag is set, the wake-up is treated as a
+ * new sporadic job release at the current time; in every case
+ * check_preempt() is run for the task. Runs under pfair_lock with IRQs
+ * disabled.
+ */
+static void pfair_task_wake_up(struct task_struct *t)
+{
+        unsigned long irq_flags;
+        TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
+                   litmus_clock(), cur_release(t), pfair_time);
+        spin_lock_irqsave(&pfair_lock, irq_flags);
+        /* How to treat Pfair tasks that block for a while and then wake
+         * is not entirely settled. Current policy: a task that blocks
+         * and wakes before its next job release may simply resume if it
+         * is still linked somewhere (as if it had never blocked);
+         * otherwise the wake-up counts as a new sporadic job release.
+         */
+        if (tsk_pfair(t)->sporadic_release) {
+                lt_t wake_time = litmus_clock();
+                release_at(t, wake_time);
+                prepare_release(t, time2quanta(wake_time, CEIL));
+                sched_trace_task_release(t);
+                /* FIXME: race with pfair_time advancing */
+                pfair_add_release(t);
+                tsk_pfair(t)->sporadic_release = 0;
+        }
+        check_preempt(t);
+        spin_unlock_irqrestore(&pfair_lock, irq_flags);
+        TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
+}
+/* Plugin task_block() callback: only sanity-checks that t is a real-time
+ * task and logs the event; no scheduler state is changed here.
+ */
+static void pfair_task_block(struct task_struct *t)
+{
+        BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "blocks at %llu, state:%d\n", litmus_clock(), t->state);
+}
+/* Plugin task_exit() callback: detach t from all scheduler state and free
+ * its per-task Pfair parameters.
+ */
+static void pfair_task_exit(struct task_struct * t)
+{
+        unsigned long irq_flags;
+        BUG_ON(!is_realtime(t));
+        /* Remove the task from the release or ready queue, and make
+         * sure it is not the scheduled task of ANY CPU. The check is a
+         * blanket one because, occasionally, when a task exits while
+         * blocked, its task_cpu is not the CPU that the PFAIR scheduler
+         * picked for it.
+         */
+        spin_lock_irqsave(&pfair_lock, irq_flags);
+        TRACE_TASK(t, "RIP, state:%d\n", t->state);
+        drop_all_references(t);
+        spin_unlock_irqrestore(&pfair_lock, irq_flags);
+        /* the parameter block is released outside the lock */
+        kfree(t->rt_param.pfair);
+        t->rt_param.pfair = NULL;
+}
+/* Plugin release_at() callback: schedule the task's release at time start.
+ *
+ * The release quantum is time2quanta(start, CEIL), limited to at most
+ * PFAIR_MAX_PERIOD quanta past pfair_time. All existing references to the
+ * task are dropped before it is re-queued for release.
+ */
+static void pfair_release_at(struct task_struct* task, lt_t start)
+{
+        unsigned long irq_flags;
+        quanta_t first_quantum;
+        BUG_ON(!is_realtime(task));
+        spin_lock_irqsave(&pfair_lock, irq_flags);
+        release_at(task, start);
+        first_quantum = time2quanta(start, CEIL);
+        /* NOTE(review): if quanta_t is unsigned, a start that lies before
+         * pfair_time wraps here and is clamped to the far end as well --
+         * confirm this is intended.
+         */
+        if (first_quantum - pfair_time >= PFAIR_MAX_PERIOD)
+                first_quantum = pfair_time + PFAIR_MAX_PERIOD;
+        TRACE_TASK(task, "sys release at %lu\n", first_quantum);
+        drop_all_references(task);
+        prepare_release(task, first_quantum);
+        pfair_add_release(task);
+        /* This release supersedes any pending sporadic release on wake,
+         * so the flag is cleared.
+         */
+        tsk_pfair(task)->sporadic_release = 0;
+        spin_unlock_irqrestore(&pfair_lock, irq_flags);
+}
+/* Fill in the window parameters of subtask i (zero-based) of a task that
+ * needs quanta quanta every period quanta:
+ *   release        = floor(period * i / quanta)
+ *   deadline       = ceil(period * (i + 1) / quanta)
+ *   overlap        = deadline - floor(period * (i + 1) / quanta)
+ *   group_deadline = 0 for light tasks (2 * quanta < period); otherwise
+ *                    derived from the formula in Uma's thesis.
+ */
+static void init_subtask(struct subtask* sub, unsigned long i,
+                         lt_t quanta, lt_t period)
+{
+        /* i is zero-based, hence the formulas are shifted by one */
+        lt_t rel;
+        lt_t next;
+        lt_t gdl;
+        int  has_remainder;
+        /* release: floor */
+        rel = period * i;
+        do_div(rel, quanta);
+        sub->release = (quanta_t) rel;
+        /* One division yields both values: do_div() leaves the floor
+         * (= next release) in next and reports the remainder.
+         */
+        next = period * (i + 1);
+        has_remainder = do_div(next, quanta) != 0;
+        /* deadline: ceil */
+        sub->deadline = (quanta_t) (has_remainder ? next + 1 : next);
+        sub->overlap  = sub->deadline - (quanta_t) next;
+        /* Group deadline, based on the formula given in Uma's thesis. */
+        if (2 * quanta < period) {
+                /* light */
+                sub->group_deadline = 0;
+                return;
+        }
+        /* heavy */
+        gdl = (sub->deadline - (i + 1)) * period;
+        /* no division when period == quanta (divisor would be zero) */
+        if (period > quanta &&
+            do_div(gdl, (period - quanta))) /* ceil */
+                gdl++;
+        sub->group_deadline = (quanta_t) gdl;
+}
+/* Trace the release, deadline, overlap (b-bit) and group deadline of every
+ * subtask of t; subtasks are numbered from 1 in the output.
+ */
+static void dump_subtasks(struct task_struct* t)
+{
+        struct pfair_param* param = t->rt_param.pfair;
+        struct subtask* sub;
+        unsigned long idx;
+        for (idx = 0; idx < param->quanta; idx++) {
+                sub = &param->subtasks[idx];
+                TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
+                           idx + 1,
+                           sub->release,
+                           sub->deadline,
+                           sub->overlap,
+                           sub->group_deadline);
+        }
+}
+/* Plugin admit_task() callback: convert the task's parameters to quanta
+ * and precompute its subtask table.
+ *
+ * Returns 0 on success, -EINVAL if the period (in quanta) is not below
+ * PFAIR_MAX_PERIOD, and -ENOMEM if the parameter block cannot be
+ * allocated. On success any previous t->rt_param.pfair allocation is freed
+ * and replaced.
+ */
+static long pfair_admit_task(struct task_struct* t)
+{
+        lt_t quanta;
+        lt_t period;
+        s64  quantum_length = ktime_to_ns(tick_period);
+        struct pfair_param* param;
+        unsigned long i;
+        /* Pfair is a tick-based method, so the time
+         * of interest is jiffies. Calculate tick-based
+         * times for everything.
+         * (Ceiling of exec cost, floor of period.)
+         */
+        period = get_rt_period(t);
+        quanta = time2quanta(get_exec_cost(t), CEIL);
+        /* do_div() leaves the floored quotient in period and returns the
+         * remainder, so a non-zero result means a non-integral period.
+         */
+        if (do_div(period, quantum_length))
+                printk(KERN_WARNING
+                       "The period of %s/%d is not a multiple of %llu.\n",
+                       t->comm, t->pid, (unsigned long long) quantum_length);
+        if (period >= PFAIR_MAX_PERIOD) {
+                printk(KERN_WARNING
+                       "PFAIR: Rejecting task %s/%d; its period is too long.\n",
+                       t->comm, t->pid);
+                return -EINVAL;
+        }
+        if (quanta == period) {
+                /* special case: task has weight 1.0 */
+                printk(KERN_INFO
+                       "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
+                       t->comm, t->pid, quanta, period);
+                quanta = 1;
+                period = 1;
+        }
+        /* one struct subtask per quantum follows the header */
+        param = kmalloc(sizeof(*param) +
+                        quanta * sizeof(struct subtask), GFP_ATOMIC);
+        if (!param)
+                return -ENOMEM;
+        param->quanta  = quanta;
+        param->cur     = 0;
+        param->release = 0;
+        param->period  = period;
+        for (i = 0; i < quanta; i++)
+                init_subtask(param->subtasks + i, i, quanta, period);
+        if (t->rt_param.pfair)
+                /* get rid of stale allocation */
+                kfree(t->rt_param.pfair);
+        t->rt_param.pfair = param;
+        /* spew out some debug info */
+        dump_subtasks(t);
+        return 0;
+}
+/* Plugin activate_plugin() callback: set pfair_time to the current quantum
+ * as seen by this CPU, then reset the tick bookkeeping of every CPU to that
+ * quantum. Always returns 0.
+ */
+static long pfair_activate_plugin(void)
+{
+        struct pfair_state* cpu_state = &__get_cpu_var(pfair_state);
+        int cpu;
+        pfair_time = current_quantum(cpu_state);
+        TRACE("Activating PFAIR at q=%lu\n", pfair_time);
+        for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
+                cpu_state = &per_cpu(pfair_state, cpu);
+                cpu_state->offset        = cpu_stagger_offset(cpu);
+                cpu_state->missed_quanta = 0;
+                cpu_state->local_tick    = pfair_time;
+                cpu_state->cur_tick      = pfair_time;
+        }
+        return 0;
+}
+/* PFAIR plugin descriptor: binds the callbacks defined in this file to the
+ * scheduler plugin interface. It is registered from init_pfair().
+ */
+static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
+        .plugin_name            = "PFAIR",
+        /* plugin life cycle */
+        .activate_plugin        = pfair_activate_plugin,
+        /* scheduling decisions */
+        .tick                   = pfair_tick,
+        .schedule               = pfair_schedule,
+        /* task life cycle */
+        .admit_task             = pfair_admit_task,
+        .task_new               = pfair_task_new,
+        .task_wake_up           = pfair_task_wake_up,
+        .task_block             = pfair_task_block,
+        .task_exit              = pfair_task_exit,
+        /* job control */
+        .release_at             = pfair_release_at,
+        .complete_job           = complete_job,
+};
+/* Module init: allocate the pstate shortcut array, initialise the release
+ * queues and the per-CPU state, set up the pfair domain and register the
+ * plugin.
+ *
+ * Returns -ENOMEM if the shortcut array cannot be allocated, otherwise the
+ * result of register_sched_plugin().
+ */
+static int __init init_pfair(void)
+{
+        int cpu, i;
+        struct pfair_state *state;
+        /*
+         * initialize short_cut for per-cpu pfair state;
+         * there may be a problem here if someone removes a cpu
+         * while we are doing this initialization... and if cpus
+         * are added / removed later... is it a _real_ problem?
+         */
+        pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
+        /* pstate[] is written below, so a failed allocation must abort */
+        if (!pstate)
+                return -ENOMEM;
+        /* initialize release queue */
+        for (i = 0; i < PFAIR_MAX_PERIOD; i++)
+                bheap_init(&release_queue[i]);
+        /* initialize CPU state */
+        for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
+                state = &per_cpu(pfair_state, cpu);
+                state->cpu        = cpu;
+                state->cur_tick   = 0;
+                state->local_tick = 0;
+                state->linked     = NULL;
+                state->local      = NULL;
+                state->scheduled  = NULL;
+                state->missed_quanta = 0;
+                state->offset     = cpu_stagger_offset(cpu);
+                pstate[cpu] = state;
+        }
+        rt_domain_init(&pfair, pfair_ready_order, NULL, NULL);
+        return register_sched_plugin(&pfair_plugin);
+}
+/* Module exit: release the pstate shortcut array allocated in init_pfair().
+ * The plugin itself is not unregistered here.
+ */
+static void __exit clean_pfair(void)
+{
+        kfree(pstate);
+}
+/* module entry and exit hooks */
+module_init(init_pfair);
+module_exit(clean_pfair);
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 000000000000..3767b30e610a
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,265 @@
+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
+ *
+ * This file includes the initialization of the plugin system, the no-op Linux
+ * scheduler plugin, some dummy functions, and some helper functions.
+ */
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/jobs.h>
+/*
+ * Generic helper that lets scheduler plugins trigger a preemption on the
+ * local or on a remote CPU. It is aware of non-preemptive sections: the
+ * scheduler is not invoked and no IPI is sent while the task to be
+ * preempted is non-preemptive.
+ *
+ * t is the real-time task executing on CPU on_cpu; if t is NULL, on_cpu is
+ * currently running background (non-real-time) work.
+ */
+void preempt_if_preemptable(struct task_struct* t, int on_cpu)
+{
+        int need_ipi;
+        if (smp_processor_id() == on_cpu) {
+                /* local CPU case */
+                if (!t)
+                        /* move non-real-time task out of the way */
+                        set_tsk_need_resched(current);
+                else if (is_user_np(t))
+                        /* userspace is non-preemptive: poke it */
+                        request_exit_np(t);
+                else if (!is_kernel_np(t))
+                        /* preempting the running task is allowed */
+                        set_tsk_need_resched(t);
+                return;
+        }
+        /* remote CPU case */
+        if (t) {
+                /* currently schedules real-time work */
+                if (is_user_np(t)) {
+                        /* User space must be told about the delayed
+                         * preemption. To avoid a race, set the flag
+                         * first and test again afterwards.
+                         */
+                        request_exit_np(t);
+                        /* make sure it got written */
+                        mb();
+                }
+                /* Only send an IPI if the remote task might have raced
+                 * with our request, i.e., if it has already left its
+                 * critical section.
+                 */
+                need_ipi = !is_np(t) && !is_kernel_np(t);
+        } else
+                /* currently schedules non-real-time work */
+                need_ipi = 1;
+        if (likely(need_ipi))
+                smp_send_reschedule(on_cpu);
+}
+/*************************************************************
+ *                   Dummy plugin functions                  *
+ *************************************************************/
+/* The litmus_dummy_* functions below are the fallbacks that
+ * register_sched_plugin() installs, via CHECK(), for any callback a plugin
+ * leaves unset; they also make up the "Linux" plugin.
+ */
+/* Default finish_switch callback: does nothing. */
+static void litmus_dummy_finish_switch(struct task_struct * prev)
+{
+}
+/* Default schedule callback: never selects a task (returns NULL). */
+static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
+{
+        return NULL;
+}
+/* Default tick callback: does nothing. */
+static void litmus_dummy_tick(struct task_struct* tsk)
+{
+}
+/* Default admit_task callback: logs the task and rejects it with -EINVAL. */
+static long litmus_dummy_admit_task(struct task_struct* tsk)
+{
+        printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
+                tsk->comm, tsk->pid);
+        return -EINVAL;
+}
+/* Default task_new callback: does nothing. */
+static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
+{
+}
+/* Default task_wake_up callback: does nothing. */
+static void litmus_dummy_task_wake_up(struct task_struct *task)
+{
+}
+/* Default task_block callback: does nothing. */
+static void litmus_dummy_task_block(struct task_struct *task)
+{
+}
+/* Default task_exit callback: does nothing. */
+static void litmus_dummy_task_exit(struct task_struct *task)
+{
+}
+/* Default complete_job callback: not supported, returns -ENOSYS. */
+static long litmus_dummy_complete_job(void)
+{
+        return -ENOSYS;
+}
+/* Default activate_plugin callback: nothing to set up, returns 0. */
+static long litmus_dummy_activate_plugin(void)
+{
+        return 0;
+}
+/* Default deactivate_plugin callback: nothing to tear down, returns 0. */
+static long litmus_dummy_deactivate_plugin(void)
+{
+        return 0;
+}
+#ifdef CONFIG_FMLP
+/* Default inherit_priority callback (CONFIG_FMLP only): not supported,
+ * returns -ENOSYS.
+ */
+static long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
+                                          struct task_struct *new_owner)
+{
+        return -ENOSYS;
+}
+/* Default return_priority callback (CONFIG_FMLP only): not supported,
+ * returns -ENOSYS.
+ */
+static long litmus_dummy_return_priority(struct pi_semaphore *sem)
+{
+        return -ENOSYS;
+}
+/* Default pi_block callback (CONFIG_FMLP only): not supported, returns
+ * -ENOSYS.
+ */
+static long litmus_dummy_pi_block(struct pi_semaphore *sem,
+                                  struct task_struct *new_waiter)
+{
+        return -ENOSYS;
+}
+#endif
+/* The default scheduler plugin. Every callback is one of the dummy
+ * functions above, so the plugin does nothing itself and lets Linux do its
+ * job.
+ */
+struct sched_plugin linux_sched_plugin = {
+        .plugin_name            = "Linux",
+        /* plugin life cycle */
+        .activate_plugin        = litmus_dummy_activate_plugin,
+        .deactivate_plugin      = litmus_dummy_deactivate_plugin,
+        /* scheduling decisions */
+        .tick                   = litmus_dummy_tick,
+        .schedule               = litmus_dummy_schedule,
+        .finish_switch          = litmus_dummy_finish_switch,
+        /* task life cycle */
+        .admit_task             = litmus_dummy_admit_task,
+        .task_new               = litmus_dummy_task_new,
+        .task_wake_up           = litmus_dummy_task_wake_up,
+        .task_block             = litmus_dummy_task_block,
+        .task_exit              = litmus_dummy_task_exit,
+        .complete_job           = litmus_dummy_complete_job,
+#ifdef CONFIG_FMLP
+        /* priority inheritance */
+        .inherit_priority       = litmus_dummy_inherit_priority,
+        .return_priority        = litmus_dummy_return_priority,
+        .pi_block               = litmus_dummy_pi_block,
+#endif
+};
+/*
+ * The cluster size is needed in C-EDF. It only makes sense to cluster
+ * around the L2 or L3 cache: with cluster_cache_index = 2 (the default)
+ * all CPUs that share an L2 cache form a cluster, while with
+ * cluster_cache_index = 3 all CPUs that share an L3 cache form a cluster.
+ */
+int cluster_cache_index = 2;
+/*
+ *      Reference to the plugin that is currently used to schedule tasks
+ *      within the system; it holds the pointers to the actual function
+ *      implementations. It starts out pointing at the no-op Linux plugin.
+ *      Should be initialized by calling "init_***_plugin()"
+ */
+struct sched_plugin *litmus = &linux_sched_plugin;
+/* the list of registered scheduling plugins, protected by
+ * sched_plugins_lock */
+static LIST_HEAD(sched_plugins);
+static DEFINE_SPINLOCK(sched_plugins_lock);
+#define CHECK(func) {\
+        if (!plugin->func) \
+                plugin->func = litmus_dummy_ ## func;}
+/* Register a scheduler plugin.
+ *
+ * Every callback the plugin left unset is replaced by its litmus_dummy_*
+ * fallback (release_at falls back to the generic release_at()), so later
+ * code never trips over NULL function pointers. The plugin is then added
+ * to the list of registered plugins. Always returns 0.
+ *
+ * FIXME: get reference to module
+ */
+int register_sched_plugin(struct sched_plugin* plugin)
+{
+        printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
+               plugin->plugin_name);
+        /* plugin life cycle */
+        CHECK(activate_plugin);
+        CHECK(deactivate_plugin);
+        /* scheduling decisions */
+        CHECK(tick);
+        CHECK(schedule);
+        CHECK(finish_switch);
+        /* task life cycle */
+        CHECK(admit_task);
+        CHECK(task_new);
+        CHECK(task_wake_up);
+        CHECK(task_block);
+        CHECK(task_exit);
+        CHECK(complete_job);
+#ifdef CONFIG_FMLP
+        /* priority inheritance */
+        CHECK(inherit_priority);
+        CHECK(return_priority);
+        CHECK(pi_block);
+#endif
+        /* release_at has a real default rather than a dummy */
+        if (!plugin->release_at)
+                plugin->release_at = release_at;
+        spin_lock(&sched_plugins_lock);
+        list_add(&plugin->list, &sched_plugins);
+        spin_unlock(&sched_plugins_lock);
+        return 0;
+}
+/* Look up a registered plugin by name.
+ *
+ * Returns the first plugin on the list whose plugin_name equals name, or
+ * NULL if there is none. The list is walked under sched_plugins_lock; the
+ * returned pointer is not reference counted.
+ *
+ * FIXME: reference counting, etc.
+ */
+struct sched_plugin* find_sched_plugin(const char* name)
+{
+        struct sched_plugin *candidate;
+        struct sched_plugin *match = NULL;
+        spin_lock(&sched_plugins_lock);
+        list_for_each_entry(candidate, &sched_plugins, list) {
+                if (strcmp(candidate->plugin_name, name) == 0) {
+                        match = candidate;
+                        break;
+                }
+        }
+        spin_unlock(&sched_plugins_lock);
+        return match;
+}
+/* Write the names of all registered plugins, one per line, into buf.
+ *
+ * buf has room for max bytes. Returns the number of characters actually
+ * stored in buf, not counting the terminating NUL; this is always smaller
+ * than max. If the names do not all fit, the output is truncated. Returns
+ * 0 without touching buf when max is not positive.
+ */
+int print_sched_plugins(char* buf, int max)
+{
+        int count = 0;
+        struct list_head *pos;
+        struct sched_plugin *plugin;
+        if (max <= 0)
+                return 0;
+        spin_lock(&sched_plugins_lock);
+        list_for_each(pos, &sched_plugins) {
+                plugin = list_entry(pos, struct sched_plugin, list);
+                count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
+                if (count >= max) {
+                        /* snprintf() reports the length it would have
+                         * needed; on truncation only max - 1 characters
+                         * plus the NUL were stored, so report that.
+                         */
+                        count = max - 1;
+                        break;
+                }
+        }
+        spin_unlock(&sched_plugins_lock);
+        return  count;
+}
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
new file mode 100644
index 000000000000..7f71ecfaaaae
--- /dev/null
+++ b/litmus/sched_psn_edf.c
@@ -0,0 +1,478 @@
+/*
+ * litmus/sched_psn_edf.c
+ *
+ * Implementation of the PSN-EDF scheduler plugin.
+ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
+ *
+ * Suspensions and non-preemptable sections are supported.
+ * Priority inheritance is not supported.
+ */
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+/* Per-CPU state of one PSN-EDF partition: the EDF domain, the CPU it
+ * belongs to, and the real-time task currently scheduled there.
+ */
+typedef struct {
+        rt_domain_t             domain;
+        int                     cpu;
+        struct task_struct*     scheduled; /* only RT tasks */
+/*
+ * scheduling lock slock
+ * protects the domain and serializes scheduling decisions
+ * (it is an alias for the domain's ready_lock)
+ */
+#define slock domain.ready_lock
+} psnedf_domain_t;
+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
+/* Accessors: local_* refer to the executing CPU, remote_*(cpu) to a given
+ * CPU, and task_*(task) to the CPU named by the task's partition. The
+ * *_edf forms yield the rt_domain_t, the *_pedf forms the whole
+ * psnedf_domain_t.
+ */
+#define local_edf               (&__get_cpu_var(psnedf_domains).domain)
+#define local_pedf              (&__get_cpu_var(psnedf_domains))
+#define remote_edf(cpu)         (&per_cpu(psnedf_domains, cpu).domain)
+#define remote_pedf(cpu)        (&per_cpu(psnedf_domains, cpu))
+#define task_edf(task)          remote_edf(get_partition(task))
+#define task_pedf(task)         remote_pedf(get_partition(task))
+/* Initialise one partition: remember its CPU, start with no scheduled
+ * task, and set up the embedded EDF domain with the given callbacks.
+ */
+static void psnedf_domain_init(psnedf_domain_t* pedf,
+                               check_resched_needed_t check,
+                               release_jobs_t release,
+                               int cpu)
+{
+        pedf->scheduled = NULL;
+        pedf->cpu       = cpu;
+        edf_domain_init(&pedf->domain, check, release);
+}
+/* Put t back into domain edf: mark it RT_F_RUNNING, then add it to the
+ * ready queue if its job is already released, or to the release queue if
+ * it still has to wait. A task that is not TASK_RUNNING is only logged.
+ */
+static void requeue(struct task_struct* t, rt_domain_t *edf)
+{
+        if (t->state != TASK_RUNNING)
+                TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
+        set_rt_flags(t, RT_F_RUNNING);
+        if (!is_released(t, litmus_clock())) {
+                /* it has got to wait */
+                add_release(edf, t);
+                return;
+        }
+        __add_ready(edf, t);
+}
+/* Request a preemption of whatever is scheduled on this partition's CPU,
+ * honouring non-preemptive sections. The caller is assumed to hold the
+ * partition's lock.
+ */
+static void preempt(psnedf_domain_t *pedf)
+{
+        preempt_if_preemptable(pedf->scheduled, pedf->cpu);
+}
+/* Resched check callback of the EDF domain. In a partitioned system only
+ * the CPU of the partition has to be considered.
+ *
+ * Returns 1 if a preemption was requested, 0 otherwise.
+ */
+static int psnedf_check_resched(rt_domain_t *edf)
+{
+        psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
+        /* This is a callback from rt_domain_t, so the lock needed for the
+         * ready queue is already held.
+         */
+        if (!edf_preemption_needed(edf, pedf->scheduled))
+                return 0;
+        preempt(pedf);
+        return 1;
+}
+/* Finish the current job of t: record the completion (forced is passed on
+ * to the trace), mark the task RT_F_SLEEP and set up its next period.
+ */
+static void job_completion(struct task_struct* t, int forced)
+{
+        sched_trace_task_completion(t, forced);
+        TRACE_TASK(t, "job_completion().\n");
+        set_rt_flags(t, RT_F_SLEEP);
+        prepare_for_next_period(t);
+}
+/* Plugin tick() callback: enforce the budget of the running real-time task.
+ *
+ * If t is real-time and out of budget, a reschedule is forced when t is
+ * preemptable; when t is in a userspace non-preemptive section it is asked
+ * to leave it instead.
+ */
+static void psnedf_tick(struct task_struct *t)
+{
+        psnedf_domain_t *pedf = local_pedf;
+        /* Consistency check. No lock is needed: ->scheduled only changes
+         * in schedule, which obviously cannot run in parallel on this CPU.
+         */
+        BUG_ON(is_realtime(t) && t != pedf->scheduled);
+        if (!is_realtime(t) || !budget_exhausted(t))
+                return;
+        if (is_np(t)) {
+                if (is_user_np(t)) {
+                        TRACE("psnedf_scheduler_tick: "
+                              "%d is non-preemptable, "
+                              "preemption delayed.\n", t->pid);
+                        request_exit_np(t);
+                }
+                return;
+        }
+        set_tsk_need_resched(t);
+        TRACE("psnedf_scheduler_tick: "
+              "%d is preemptable "
+              " => FORCE_RESCHED\n", t->pid);
+}
+/* Plugin schedule() callback: make the scheduling decision for this CPU's
+ * partition.
+ *
+ * Under the partition lock, the state of the currently scheduled task is
+ * sampled (blocked, out of budget, non-preemptive, sleeping) together with
+ * whether the ready queue calls for a preemption. A preemptable job that
+ * is out of budget or wants to sleep is completed. If a switch is both
+ * needed and allowed, the previous task is requeued (unless it blocked)
+ * and the next task is taken from the ready queue; otherwise the current
+ * real-time task, if any, keeps running.
+ *
+ * Returns the task to run, or NULL if there is none; the result is also
+ * stored in pedf->scheduled.
+ */
+static struct task_struct* psnedf_schedule(struct task_struct * prev)
+{
+        psnedf_domain_t*        pedf = local_pedf;
+        rt_domain_t*            edf  = &pedf->domain;
+        struct task_struct*     next;
+        int                     out_of_time, sleep, preempt,
+                                np, exists, blocks, resched;
+        spin_lock(&pedf->slock);
+        /* sanity checking
+         * differently from gedf, when a task exits (dead)
+         * pedf->scheduled may be null and prev _is_ realtime
+         */
+        BUG_ON(pedf->scheduled && pedf->scheduled != prev);
+        BUG_ON(pedf->scheduled && !is_realtime(prev));
+        /* (0) Determine state */
+        exists      = pedf->scheduled != NULL;
+        blocks      = exists && !is_running(pedf->scheduled);
+        out_of_time = exists && budget_exhausted(pedf->scheduled);
+        np          = exists && is_np(pedf->scheduled);
+        sleep       = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
+        preempt     = edf_preemption_needed(edf, prev);
+        /* If we need to preempt do so.
+         * The following checks set resched to 1 in case of special
+         * circumstances.
+         */
+        resched = preempt;
+        /* If a task blocks we have no choice but to reschedule.
+         */
+        if (blocks)
+                resched = 1;
+        /* Request a sys_exit_np() call if we would like to preempt but cannot.
+         * Multiple calls to request_exit_np() don't hurt.
+         */
+        if (np && (out_of_time || preempt || sleep))
+                request_exit_np(pedf->scheduled);
+        /* Any task that is preemptable and either exhausts its execution
+         * budget or wants to sleep completes. We may have to reschedule after
+         * this.
+         */
+        if (!np && (out_of_time || sleep) && !blocks) {
+                /* the completion counts as forced unless the task asked to sleep */
+                job_completion(pedf->scheduled, !sleep);
+                resched = 1;
+        }
+        /* The final scheduling decision. Do we need to switch for some reason?
+         * Switch if we are in RT mode and have no task or if we need to
+         * resched.
+         */
+        next = NULL;
+        if ((!np || blocks) && (resched || !exists)) {
+                /* Take care of a previously scheduled
+                 * job by taking it out of the Linux runqueue.
+                 */
+                if (pedf->scheduled && !blocks)
+                        requeue(pedf->scheduled, edf);
+                next = __take_ready(edf);
+        } else
+                /* Only override Linux scheduler if we have a real-time task
+                 * scheduled that needs to continue.
+                 */
+                if (exists)
+                        next = prev;
+        if (next) {
+                TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+                set_rt_flags(next, RT_F_RUNNING);
+        } else {
+                TRACE("becoming idle at %llu\n", litmus_clock());
+        }
+        pedf->scheduled = next;
+        spin_unlock(&pedf->slock);
+        return next;
+}
+/* Plugin task_new() callback: prepare a task for running in RT mode.
+ *
+ * Sets up the first job as released now. A task that is already running
+ * becomes the partition's scheduled task; otherwise it is queued and a
+ * preemption check is triggered. on_rq is not used.
+ */
+static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+        psnedf_domain_t*        pedf = task_pedf(t);
+        rt_domain_t*            edf  = task_edf(t);
+        unsigned long           irq_flags;
+        TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
+                   t->rt_param.task_params.cpu);
+        /* setup job parameters */
+        release_at(t, litmus_clock());
+        /* The task should be running in the queue, otherwise signal
+         * code will try to wake it up with fatal consequences.
+         */
+        spin_lock_irqsave(&pedf->slock, irq_flags);
+        if (!running) {
+                requeue(t, edf);
+                /* maybe we have to reschedule */
+                preempt(pedf);
+        } else {
+                /* there shouldn't be anything else running at the time */
+                BUG_ON(pedf->scheduled);
+                pedf->scheduled = t;
+        }
+        spin_unlock_irqrestore(&pedf->slock, irq_flags);
+}
+/* Plugin task_wake_up() callback.
+ *
+ * A task that wakes up tardy, and not because it just left a semaphore
+ * (RT_F_EXIT_SEM), gets a new sporadic release at the current time. The
+ * task is then requeued unless it is still the partition's scheduled task.
+ */
+static void psnedf_task_wake_up(struct task_struct *task)
+{
+        psnedf_domain_t*        pedf = task_pedf(task);
+        rt_domain_t*            edf  = task_edf(task);
+        unsigned long           irq_flags;
+        lt_t                    wake_time;
+        TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+        spin_lock_irqsave(&pedf->slock, irq_flags);
+        BUG_ON(is_queued(task));
+        /* Suspensions caused by semaphores have to be taken into account:
+         * a job that resumes after it was suspended while acquiring a
+         * semaphore must never be treated as a new job release.
+         *
+         * FIXME: This should be done in some more predictable and userspace-controlled way.
+         */
+        wake_time = litmus_clock();
+        if (get_rt_flags(task) != RT_F_EXIT_SEM &&
+            is_tardy(task, wake_time)) {
+                /* new sporadic release */
+                release_at(task, wake_time);
+                sched_trace_task_release(task);
+        }
+        /* Do not add the currently-scheduled task to the ready queue.
+         * It can show up here if it was woken concurrently on a remote
+         * CPU before the executing CPU got around to de-scheduling it,
+         * i.e., wake_up() raced with schedule() and won.
+         */
+        if (pedf->scheduled != task)
+                requeue(task, edf);
+        spin_unlock_irqrestore(&pedf->slock, irq_flags);
+        TRACE_TASK(task, "wake up done\n");
+}
+/* Plugin task_block() callback: logs the event and checks the invariants.
+ * Only running tasks can block, so t must not be in any queue.
+ */
+static void psnedf_task_block(struct task_struct *t)
+{
+        TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
+        BUG_ON(!is_realtime(t));
+        BUG_ON(is_queued(t));
+}
+/* Plugin task_exit() callback: take t out of its partition (queue and
+ * scheduled slot) and trigger a preemption check on that partition.
+ */
+static void psnedf_task_exit(struct task_struct * t)
+{
+        psnedf_domain_t*        pedf = task_pedf(t);
+        rt_domain_t*            edf  = task_edf(t);
+        unsigned long           irq_flags;
+        spin_lock_irqsave(&pedf->slock, irq_flags);
+        /* dequeue */
+        if (is_queued(t))
+                remove(edf, t);
+        if (pedf->scheduled == t)
+                pedf->scheduled = NULL;
+        TRACE_TASK(t, "RIP, now reschedule\n");
+        preempt(pedf);
+        spin_unlock_irqrestore(&pedf->slock, irq_flags);
+}
+#ifdef CONFIG_FMLP
+/* Plugin pi_block() callback: new_waiter is about to block on sem.
+ *
+ * If new_waiter has higher priority than the semaphore's current
+ * highest-priority task for its partition, it becomes that task, and a
+ * holder in the same partition inherits its priority (being re-queued if
+ * it sits in the domain). A preemption check follows. The caller is
+ * expected to have interrupts disabled. Always returns 0.
+ */
+static long psnedf_pi_block(struct pi_semaphore *sem,
+                            struct task_struct *new_waiter)
+{
+        psnedf_domain_t*        pedf;
+        rt_domain_t*            edf;
+        struct task_struct*     t;
+        int                     cpu;
+        /* validate new_waiter before it is used to look up its partition */
+        BUG_ON(!new_waiter);
+        cpu = get_partition(new_waiter);
+        if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
+                TRACE_TASK(new_waiter, " boosts priority\n");
+                pedf = task_pedf(new_waiter);
+                edf  = task_edf(new_waiter);
+                /* interrupts already disabled */
+                spin_lock(&pedf->slock);
+                /* store new highest-priority task */
+                sem->hp.cpu_task[cpu] = new_waiter;
+                if (sem->holder &&
+                    get_partition(sem->holder) == get_partition(new_waiter)) {
+                        /* let holder inherit */
+                        sem->holder->rt_param.inh_task = new_waiter;
+                        t = sem->holder;
+                        if (is_queued(t)) {
+                                /* queued in domain*/
+                                remove(edf, t);
+                                /* readd to make priority change take place */
+                                /* FIXME: this looks outdated */
+                                if (is_released(t, litmus_clock()))
+                                        __add_ready(edf, t);
+                                else
+                                        add_release(edf, t);
+                        }
+                }
+                /* check if we need to reschedule */
+                if (edf_preemption_needed(edf, current))
+                        preempt(pedf);
+                spin_unlock(&pedf->slock);
+        }
+        return 0;
+}
+/* Plugin inherit_priority() callback: new_owner acquires sem.
+ *
+ * new_owner inherits from the semaphore's highest-priority task of its own
+ * partition (possibly NULL) and is made non-preemptable. Always returns 0.
+ */
+static long psnedf_inherit_priority(struct pi_semaphore *sem,
+                                    struct task_struct *new_owner)
+{
+        int                 cpu = get_partition(new_owner);
+        struct task_struct* hp  = sem->hp.cpu_task[cpu];
+        new_owner->rt_param.inh_task = hp;
+        if (!hp || new_owner == hp)
+                TRACE_TASK(new_owner,
+                           "cannot inherit priority: "
+                           "no higher priority job waits on this CPU!\n");
+        else
+                TRACE_TASK(new_owner,
+                           "inherited priority from %s/%d\n",
+                           hp->comm,
+                           hp->pid);
+        /* FMLP under PSN-EDF requires the new owner to be
+         * non-preemptable.
+         */
+        make_np(new_owner);
+        return 0;
+}
+/* Plugin return_priority() callback. It is called on a semaphore release
+ * and assumes that the current task is the semaphore holder.
+ *
+ * Recomputes the semaphore's highest-priority task for this partition if
+ * the holder was it, ends the holder's non-preemptive section, drops its
+ * inherited priority and runs a preemption check. Always returns 0.
+ */
+static long psnedf_return_priority(struct pi_semaphore *sem)
+{
+        struct task_struct*     holder = current;
+        psnedf_domain_t*        pedf   = task_pedf(holder);
+        rt_domain_t*            edf    = task_edf(holder);
+        int                     cpu    = get_partition(holder);
+        int                     nested;
+        /* Find the new highest-priority semaphore task if the holder is
+         * the current hp.cpu_task[cpu].
+         *
+         * Calling function holds sem->wait.lock.
+         */
+        if (holder == sem->hp.cpu_task[cpu])
+                edf_set_hp_cpu_task(sem, cpu);
+        nested = take_np(holder);
+        /* Resources are not nested, so this should always be zero. */
+        BUG_ON(nested);
+        if (!holder->rt_param.inh_task)
+                TRACE_CUR(" no priority to return %p\n", sem);
+        else
+                TRACE_CUR("return priority of %s/%d\n",
+                          holder->rt_param.inh_task->comm,
+                          holder->rt_param.inh_task->pid);
+        /* Always check for delayed preemptions that might have become
+         * necessary due to non-preemptive execution.
+         */
+        spin_lock(&pedf->slock);
+        /* Reset inh_task to NULL. */
+        holder->rt_param.inh_task = NULL;
+        /* check if we need to reschedule */
+        if (edf_preemption_needed(edf, holder))
+                preempt(pedf);
+        spin_unlock(&pedf->slock);
+        return 0;
+}
+#endif
+/* Plugin admit_task() callback: a task is admitted only if it currently
+ * runs on the CPU named in its task parameters. Returns 0 on success and
+ * -EINVAL otherwise.
+ */
+static long psnedf_admit_task(struct task_struct* tsk)
+{
+        if (task_cpu(tsk) != tsk->rt_param.task_params.cpu)
+                return -EINVAL;
+        return 0;
+}
+/* PSN-EDF plugin descriptor: binds the callbacks defined in this file to
+ * the scheduler plugin interface. It is registered from init_psn_edf().
+ */
+static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
+        .plugin_name            = "PSN-EDF",
+#ifdef CONFIG_SRP
+        .srp_active             = 1,
+#endif
+        /* scheduling decisions */
+        .tick                   = psnedf_tick,
+        .schedule               = psnedf_schedule,
+        /* task life cycle */
+        .admit_task             = psnedf_admit_task,
+        .task_new               = psnedf_task_new,
+        .task_wake_up           = psnedf_task_wake_up,
+        .task_block             = psnedf_task_block,
+        .task_exit              = psnedf_task_exit,
+        .complete_job           = complete_job,
+#ifdef CONFIG_FMLP
+        /* priority inheritance */
+        .fmlp_active            = 1,
+        .pi_block               = psnedf_pi_block,
+        .inherit_priority       = psnedf_inherit_priority,
+        .return_priority        = psnedf_return_priority,
+#endif
+};
+/* Module init: set up one partition per online CPU and register the
+ * plugin. Returns the result of register_sched_plugin().
+ */
+static int __init init_psn_edf(void)
+{
+        int cpu;
+        /* We do not really want to support cpu hotplug, do we? ;)
+         * However, if we are so crazy to do so,
+         * we cannot use num_online_cpu()
+         */
+        for (cpu = 0; cpu < num_online_cpus(); cpu++)
+                psnedf_domain_init(remote_pedf(cpu),
+                                   psnedf_check_resched,
+                                   NULL, cpu);
+        return register_sched_plugin(&psn_edf_plugin);
+}
+module_init(init_psn_edf);
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
new file mode 100644
index 000000000000..39a543e22d41
--- /dev/null
+++ b/litmus/sched_task_trace.c
@@ -0,0 +1,204 @@
+/*
+ * sched_task_trace.c -- record scheduling events to a byte stream
+ */
+#define NO_TASK_TRACE_DECLS
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <litmus/ftdev.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_trace.h>
+#include <litmus/feather_trace.h>
+#include <litmus/ftdev.h>
+/* set MAJOR to 0 to have it dynamically assigned */
+#define FT_TASK_TRACE_MAJOR     253
+/* number of record slots (and flag bytes) in each per-CPU local_buffer */
+#define NO_EVENTS               4096 /* this is a buffer of 12 4k pages per CPU */
+/* clock used to fill the "when" fields of the trace records below */
+#define now() litmus_clock()
+/* Backing storage for one per-CPU trace buffer. init_sched_task_trace()
+ * hands record[] and flag[] to init_ft_buffer() to set up ftbuf over them.
+ */
+struct local_buffer {
+        struct st_event_record record[NO_EVENTS];  /* event record slots */
+        char   flag[NO_EVENTS];                    /* one flag byte per slot */
+        struct ft_buffer ftbuf;                    /* buffer handle used by writers */
+};
+/* one event buffer per CPU; accessed through get_record()/put_record() */
+DEFINE_PER_CPU(struct local_buffer, st_event_buffer);
+/* device whose minors are bound to the per-CPU buffers at init time */
+static struct ftdev st_dev;
+/* can_open hook for st_dev: a minor may only be opened while the CPU it
+ * corresponds to is online.
+ *
+ * Returns 0 if the open is allowed, -ENODEV otherwise.
+ */
+static int st_dev_can_open(struct ftdev *dev, unsigned int cpu)
+{
+        if (!cpu_online(cpu))
+                return -ENODEV;
+        return 0;
+}
+/* Initialize the per-CPU event buffers, bind each one to the matching
+ * minor of st_dev and register the device as "sched_trace".
+ *
+ * Returns the result of register_ftdev(), or -EINVAL if not every buffer
+ * could be initialized.
+ */
+static int __init init_sched_task_trace(void)
+{
+        struct local_buffer* lbuf;
+        int cpu, initialized = 0;
+        ftdev_init(&st_dev, THIS_MODULE);
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                lbuf = &per_cpu(st_event_buffer, cpu);
+                /* init_ft_buffer() results are summed and compared
+                 * against NR_CPUS below
+                 */
+                initialized += init_ft_buffer(&lbuf->ftbuf, NO_EVENTS,
+                                              sizeof(struct st_event_record),
+                                              lbuf->flag,
+                                              lbuf->record);
+                st_dev.minor[cpu].buf = &lbuf->ftbuf;
+        }
+        if (initialized != NR_CPUS)
+                return -EINVAL;
+        st_dev.minor_cnt = NR_CPUS;
+        st_dev.can_open = st_dev_can_open;
+        return register_ftdev(&st_dev, "sched_trace", FT_TASK_TRACE_MAJOR);
+}
+module_init(init_sched_task_trace);
+/* Reserve a record slot in this CPU's event buffer and fill in its header.
+ *
+ * @type: record type stored in the header
+ * @t:    task the record refers to, or NULL (pid and job are then 0)
+ *
+ * Returns the reserved record, or NULL if no slot could be obtained.
+ * On success the per-CPU reference taken via get_cpu_var() is kept and
+ * must be dropped by calling put_record(); on failure it is dropped here.
+ */
+static inline struct st_event_record* get_record(u8 type, struct task_struct* t)
+{
+        struct st_event_record* slot = NULL;
+        struct local_buffer* lbuf = &get_cpu_var(st_event_buffer);
+        if (!ft_buffer_start_write(&lbuf->ftbuf, (void**) &slot)) {
+                put_cpu_var(st_event_buffer);
+                /* slot is expected to still be NULL here */
+                return slot;
+        }
+        slot->hdr.type = type;
+        slot->hdr.cpu  = smp_processor_id();
+        if (t) {
+                slot->hdr.pid = t->pid;
+                slot->hdr.job = t->rt_param.job_params.job_no;
+        } else {
+                slot->hdr.pid = 0;
+                slot->hdr.job = 0;
+        }
+        return slot;
+}
+/* Commit a record obtained from get_record() and drop the per-CPU
+ * reference that get_record() left held.
+ */
+static inline void put_record(struct st_event_record* rec)
+{
+        struct local_buffer* lbuf = &__get_cpu_var(st_event_buffer);
+        ft_buffer_finish_write(&lbuf->ftbuf, rec);
+        put_cpu_var(st_event_buffer);
+}
+/* Record an ST_NAME event carrying the command name of the task.
+ * _task is a struct task_struct pointer passed as unsigned long.
+ * Nothing is recorded if no record slot is available.
+ */
+feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec = get_record(ST_NAME, t);
+        int pos;
+        if (!rec)
+                return;
+        /* copy no more than the smaller of the two name fields */
+        for (pos = 0; pos < min(TASK_COMM_LEN, ST_NAME_LEN); pos++)
+                rec->data.name.cmd[pos] = t->comm[pos];
+        put_record(rec);
+}
+/* Record an ST_PARAM event with the task's execution cost, period, phase
+ * and partition. _task is a struct task_struct pointer passed as
+ * unsigned long. Nothing is recorded if no record slot is available.
+ */
+feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec = get_record(ST_PARAM, t);
+        if (!rec)
+                return;
+        rec->data.param.wcet      = get_exec_cost(t);
+        rec->data.param.period    = get_rt_period(t);
+        rec->data.param.phase     = get_rt_phase(t);
+        rec->data.param.partition = get_partition(t);
+        put_record(rec);
+}
+/* Record an ST_RELEASE event with the task's current release time and
+ * deadline. Nothing is recorded if no record slot is available.
+ */
+feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec = get_record(ST_RELEASE, t);
+        if (!rec)
+                return;
+        rec->data.release.release  = get_release(t);
+        rec->data.release.deadline = get_deadline(t);
+        put_record(rec);
+}
+/* skipped: st_assigned_data, we don't use it atm */
+/* Record an ST_SWITCH_TO event (timestamp and execution time so far).
+ * Only real-time tasks are traced; other tasks are ignored.
+ */
+feather_callback void do_sched_trace_task_switch_to(unsigned long id,
+                                                    unsigned long _task)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec;
+        if (!is_realtime(t))
+                return;
+        rec = get_record(ST_SWITCH_TO, t);
+        if (!rec)
+                return;
+        rec->data.switch_to.when      = now();
+        rec->data.switch_to.exec_time = get_exec_time(t);
+        put_record(rec);
+}
+/* Record an ST_SWITCH_AWAY event (timestamp and execution time so far).
+ * Only real-time tasks are traced; other tasks are ignored.
+ */
+feather_callback void do_sched_trace_task_switch_away(unsigned long id,
+                                                      unsigned long _task)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec;
+        if (!is_realtime(t))
+                return;
+        rec = get_record(ST_SWITCH_AWAY, t);
+        if (!rec)
+                return;
+        rec->data.switch_away.when      = now();
+        rec->data.switch_away.exec_time = get_exec_time(t);
+        put_record(rec);
+}
+/* Record an ST_COMPLETION event with the current time and the caller
+ * supplied "forced" value. Nothing is recorded if no slot is available.
+ */
+feather_callback void do_sched_trace_task_completion(unsigned long id,
+                                                     unsigned long _task,
+                                                     unsigned long forced)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec = get_record(ST_COMPLETION, t);
+        if (!rec)
+                return;
+        rec->data.completion.when   = now();
+        rec->data.completion.forced = forced;
+        put_record(rec);
+}
+/* Record an ST_BLOCK event stamped with the current time.
+ * Nothing is recorded if no record slot is available.
+ */
+feather_callback void do_sched_trace_task_block(unsigned long id,
+                                                unsigned long _task)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec = get_record(ST_BLOCK, t);
+        if (!rec)
+                return;
+        rec->data.block.when      = now();
+        put_record(rec);
+}
+/* Record an ST_RESUME event stamped with the current time.
+ * Nothing is recorded if no record slot is available.
+ */
+feather_callback void do_sched_trace_task_resume(unsigned long id,
+                                                 unsigned long _task)
+{
+        struct task_struct *t = (struct task_struct*) _task;
+        struct st_event_record* rec = get_record(ST_RESUME, t);
+        if (!rec)
+                return;
+        rec->data.resume.when      = now();
+        put_record(rec);
+}
+/* Record an ST_SYS_RELEASE event. _start is a pointer to an lt_t holding
+ * the release time, passed as unsigned long; the record is not associated
+ * with any task. Nothing is recorded if no record slot is available.
+ */
+feather_callback void do_sched_trace_sys_release(unsigned long id,
+                                                 unsigned long _start)
+{
+        lt_t *start = (lt_t*) _start;
+        struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL);
+        if (!rec)
+                return;
+        rec->data.sys_release.when    = now();
+        rec->data.sys_release.release = *start;
+        put_record(rec);
+}
diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
new file mode 100644
index 000000000000..ad0b138d4b01
--- /dev/null
+++ b/litmus/sched_trace.c
@@ -0,0 +1,378 @@
+/*
+ * sched_trace.c -- record scheduling events to a byte stream.
+ */
+#include <linux/spinlock.h>
+#include <linux/semaphore.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/sysrq.h>
+#include <linux/kfifo.h>
+#include <litmus/sched_trace.h>
+#include <litmus/litmus.h>
+/* name under which the log misc device is registered */
+#define SCHED_TRACE_NAME "litmus/log"
+/* Allocate a buffer of about 32k per CPU */
+#define LITMUS_TRACE_BUF_PAGES 8
+/* total size of the single, shared log buffer (scaled by NR_CPUS) */
+#define LITMUS_TRACE_BUF_SIZE (PAGE_SIZE * LITMUS_TRACE_BUF_PAGES * NR_CPUS)
+/* Max length of one read from the buffer */
+#define MAX_READ_LEN (64 * 1024)
+/* Max length for one write --- from kernel --- to the buffer */
+#define MSG_SIZE 255
+/* Inner ring buffer structure */
+typedef struct {
+        /* taken for writing while kfifo is installed or freed and for
+         * reading while the fifo is accessed by rb_put()/rb_get()
+         */
+        rwlock_t        del_lock;
+        /* the buffer; NULL while no buffer is allocated */
+        struct kfifo    *kfifo;
+} ring_buffer_t;
+/* Main buffer structure */
+typedef struct {
+        ring_buffer_t           buf;          /* the underlying ring buffer */
+        atomic_t                reader_cnt;   /* opens minus releases of the device */
+        struct semaphore        reader_mutex; /* serializes open/read/release */
+} trace_buffer_t;
+/*
+ * Inner buffer management functions
+ */
+/* Put a ring buffer into its initial state: no kfifo attached and the
+ * deletion lock initialized.
+ */
+void rb_init(ring_buffer_t* buf)
+{
+        buf->kfifo = NULL;
+        rwlock_init(&buf->del_lock);
+}
+/*
+ * rb_alloc_buf - allocate the kfifo backing a ring buffer
+ * @buf:  ring buffer that receives the new kfifo
+ * @size: requested kfifo size in bytes
+ *
+ * Returns 0 on success or the negative error code encoded in the pointer
+ * returned by kfifo_alloc(). On failure buf->kfifo is left untouched, so
+ * the "!buf->kfifo" checks in rb_put()/rb_get() never see an error
+ * pointer.
+ */
+int rb_alloc_buf(ring_buffer_t* buf, unsigned int size)
+{
+        unsigned long flags;
+        struct kfifo *fifo;
+        fifo = kfifo_alloc(size, GFP_ATOMIC, NULL);
+        if (IS_ERR(fifo)) {
+                printk(KERN_ERR "kfifo_alloc failed\n");
+                return PTR_ERR(fifo);
+        }
+        /* only a successfully allocated fifo is ever published */
+        write_lock_irqsave(&buf->del_lock, flags);
+        buf->kfifo = fifo;
+        write_unlock_irqrestore(&buf->del_lock, flags);
+        return 0;
+}
+/* Free the kfifo of a ring buffer and mark the buffer as unallocated.
+ * Calling this without an allocated kfifo is a bug (BUG_ON).
+ *
+ * Always returns 0.
+ */
+int rb_free_buf(ring_buffer_t* buf)
+{
+        unsigned long irq_flags;
+        /* exclude rb_put()/rb_get(), which hold del_lock for reading */
+        write_lock_irqsave(&buf->del_lock, irq_flags);
+        BUG_ON(!buf->kfifo);
+        kfifo_free(buf->kfifo);
+        buf->kfifo = NULL;
+        write_unlock_irqrestore(&buf->del_lock, irq_flags);
+        return 0;
+}
+/*
+ * rb_put - append len bytes from mem to the ring buffer
+ *
+ * Assumption: concurrent writes are serialized externally.
+ *
+ * Returns 0 if all len bytes were stored, -ENODEV if no kfifo is
+ * allocated, and -ENOMEM if fewer than len bytes were accepted.
+ * NOTE(review): in the -ENOMEM case __kfifo_put() may already have
+ * stored part of the message -- confirm against the kfifo implementation.
+ */
+int rb_put(ring_buffer_t* buf, char* mem, size_t len)
+{
+        unsigned long irq_flags;
+        int ret = 0;
+        read_lock_irqsave(&buf->del_lock, irq_flags);
+        if (!buf->kfifo)
+                ret = -ENODEV;
+        else if ((__kfifo_put(buf->kfifo, mem, len)) < len)
+                ret = -ENOMEM;
+        read_unlock_irqrestore(&buf->del_lock, irq_flags);
+        return ret;
+}
+/*
+ * rb_get - fetch up to len bytes from the ring buffer into mem
+ *
+ * Assumption: concurrent reads are serialized externally.
+ *
+ * Returns the value reported by __kfifo_get(), or -ENODEV if no kfifo
+ * is allocated.
+ */
+int rb_get(ring_buffer_t* buf, char* mem, size_t len)
+{
+        unsigned long irq_flags;
+        int ret;
+        read_lock_irqsave(&buf->del_lock, irq_flags);
+        if (buf->kfifo)
+                ret = __kfifo_get(buf->kfifo, (unsigned char*)mem, len);
+        else
+                ret = -ENODEV;
+        read_unlock_irqrestore(&buf->del_lock, irq_flags);
+        return ret;
+}
+/*
+ * Device Driver management
+ */
+/* Serializes writers in sched_trace_log_message(). Defined with
+ * DEFINE_SPINLOCK() instead of the deprecated SPIN_LOCK_UNLOCKED
+ * initializer.
+ */
+static DEFINE_SPINLOCK(log_buffer_lock);
+/* the single global log buffer behind the log device */
+static trace_buffer_t log_buffer;
+/* Bring log_buffer into its initial state: no readers, reader mutex
+ * available, ring buffer without storage.
+ */
+static void init_log_buffer(void)
+{
+        atomic_set(&log_buffer.reader_cnt,0);
+        init_MUTEX(&log_buffer.reader_mutex);
+        rb_init(&log_buffer.buf);
+}
+/* per-CPU scratch buffer in which sched_trace_log_message() formats a
+ * message before copying it into log_buffer
+ */
+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
+/*
+ * sched_trace_log_message - format a message and append it to log_buffer
+ * @fmt: printf-style format string, followed by its arguments
+ *
+ * This is the only in-kernel writer of log_buffer. The message is
+ * formatted into this CPU's fmt_buffer with interrupts disabled, so it
+ * is limited to what fits into MSG_SIZE bytes. Writers are serialized
+ * by log_buffer_lock. The result of rb_put() is ignored.
+ */
+void sched_trace_log_message(const char* fmt, ...)
+{
+        unsigned long   irq_flags;
+        va_list         ap;
+        size_t          msg_len;
+        char*           msg;
+        va_start(ap, fmt);
+        local_irq_save(irq_flags);
+        /* render the message into the local scratch buffer */
+        msg     = __get_cpu_var(fmt_buffer);
+        msg_len = vscnprintf(msg, MSG_SIZE, fmt, ap);
+        spin_lock(&log_buffer_lock);
+        /* Only msg_len bytes are copied: the trailing null byte stays
+         * behind, since null bytes are unwanted in a text file.
+         */
+        rb_put(&log_buffer.buf, msg, msg_len);
+        spin_unlock(&log_buffer_lock);
+        local_irq_restore(irq_flags);
+        va_end(ap);
+}
+/*
+ * log_read - Read the trace buffer
+ * @filp:  open file; private_data points to the trace buffer
+ * @to:    userspace destination
+ * @len:   maximum number of bytes to read (capped at MAX_READ_LEN)
+ * @f_pos: ignored, reads are strictly sequential
+ *
+ * This function is called as a file operation from userspace.
+ * Readers can sleep. Access is serialized through reader_mutex.
+ * While the buffer is empty the reader polls it, sleeping between
+ * attempts, until data arrives or a signal is pending.
+ *
+ * Returns the number of bytes copied, 0 for a zero-length request,
+ * -ERESTARTSYS if interrupted by a signal, -ENOMEM if the bounce buffer
+ * cannot be allocated, -EFAULT if the copy to userspace fails, or the
+ * error reported by rb_get().
+ */
+static ssize_t log_read(struct file *filp, char __user *to, size_t len,
+                      loff_t *f_pos)
+{
+        /* we ignore f_pos, this is strictly sequential */
+        ssize_t error = -EINVAL;
+        char*   mem;
+        trace_buffer_t *tbuf = filp->private_data;
+        /* A zero-length request can only ever yield 0 bytes, which the
+         * polling loop below treats as "no data yet"; answer it right
+         * away instead of sleeping until a signal arrives.
+         */
+        if (!len)
+                return 0;
+        if (down_interruptible(&tbuf->reader_mutex)) {
+                error = -ERESTARTSYS;
+                goto out;
+        }
+        if (len > MAX_READ_LEN)
+                len = MAX_READ_LEN;
+        mem = kmalloc(len, GFP_KERNEL);
+        if (!mem) {
+                error = -ENOMEM;
+                goto out_unlock;
+        }
+        error = rb_get(&tbuf->buf, mem, len);
+        /* 0 means the buffer is currently empty: sleep and retry */
+        while (!error) {
+                set_current_state(TASK_INTERRUPTIBLE);
+                schedule_timeout(110);
+                if (signal_pending(current))
+                        error = -ERESTARTSYS;
+                else
+                        error = rb_get(&tbuf->buf, mem, len);
+        }
+        if (error > 0 && copy_to_user(to, mem, error))
+                error = -EFAULT;
+        kfree(mem);
+ out_unlock:
+        up(&tbuf->reader_mutex);
+ out:
+        return error;
+}
+/*
+ * Enable redirection of printk() messages to the trace buffer.
+ * Defined in kernel/printk.c
+ */
+extern int trace_override;
+extern int trace_recurse;
+/*
+ * log_open - open the global log message ring buffer.
+ *
+ * The first opener allocates the kfifo behind log_buffer. Every
+ * successful open increments trace_override (see the comment at its
+ * declaration) and stores the buffer in filp->private_data.
+ *
+ * Returns 0 on success, -ERESTARTSYS if interrupted while waiting for
+ * reader_mutex, or the error reported by rb_alloc_buf().
+ */
+static int log_open(struct inode *in, struct file *filp)
+{
+        int error = -EINVAL;
+        trace_buffer_t* tbuf;
+        tbuf = &log_buffer;
+        if (down_interruptible(&tbuf->reader_mutex)) {
+                error = -ERESTARTSYS;
+                goto out;
+        }
+        /* first open must allocate buffers */
+        if (atomic_inc_return(&tbuf->reader_cnt) == 1) {
+                if ((error = rb_alloc_buf(&tbuf->buf, LITMUS_TRACE_BUF_SIZE)))
+                {
+                        /* undo the reader count taken above */
+                        atomic_dec(&tbuf->reader_cnt);
+                        goto out_unlock;
+                }
+        }
+        error = 0;
+        filp->private_data = tbuf;
+        /* print the start of the data buffer itself, not the address of
+         * the pointer field inside struct kfifo
+         */
+        printk(KERN_DEBUG
+               "sched_trace kfifo at 0x%p with buffer starting at: 0x%p\n",
+               tbuf->buf.kfifo, tbuf->buf.kfifo->buffer);
+        /* override printk() */
+        trace_override++;
+ out_unlock:
+        up(&tbuf->reader_mutex);
+ out:
+        return error;
+}
+/*
+ * log_release - release one reference to the global log buffer.
+ *
+ * Undoes log_open(): drops the reader count, frees the kfifo when the
+ * last reader goes away and decrements trace_override.
+ *
+ * The mutex is taken uninterruptibly: bailing out on a signal would
+ * skip the teardown and leave reader_cnt and trace_override elevated.
+ *
+ * Returns 0, or the result of rb_free_buf() for the last reader.
+ */
+static int log_release(struct inode *in, struct file *filp)
+{
+        int error = 0;
+        trace_buffer_t* tbuf = filp->private_data;
+        BUG_ON(!filp->private_data);
+        down(&tbuf->reader_mutex);
+        /* last release must deallocate buffers */
+        if (atomic_dec_return(&tbuf->reader_cnt) == 0) {
+                error = rb_free_buf(&tbuf->buf);
+        }
+        /* release printk() overriding */
+        trace_override--;
+        printk(KERN_DEBUG "sched_trace kfifo released\n");
+        up(&tbuf->reader_mutex);
+        return error;
+}
+/*
+ * log_fops  - The file operations for accessing the global LITMUS log message
+ *             buffer.
+ *
+ * Except for opening the device file it uses the same operations as trace_fops.
+ * NOTE(review): no trace_fops is visible in this file -- confirm that the
+ * sentence above is still accurate.
+ */
+static struct file_operations log_fops = {
+        .owner   = THIS_MODULE,
+        .open    = log_open,
+        .release = log_release,
+        .read    = log_read,
+};
+/* misc device exposing the log buffer; registered in init_sched_trace()
+ * with a dynamically assigned minor number
+ */
+static struct miscdevice litmus_log_dev = {
+        .name    = SCHED_TRACE_NAME,
+        .minor   = MISC_DYNAMIC_MINOR,
+        .fops    = &log_fops,
+};
+#ifdef CONFIG_MAGIC_SYSRQ
+/* Drain the log buffer to printk() in chunks of at most 79 bytes.
+ *
+ * @max: maximum number of chunks to print; 0 means no limit
+ *
+ * trace_recurse is set while dumping and cleared afterwards.
+ */
+void dump_trace_buffer(int max)
+{
+        char chunk[80];
+        int got;
+        int printed = 0;
+        /* potential, but very unlikely, race... */
+        trace_recurse = 1;
+        for (;;) {
+                /* the chunk counter only matters when a limit is set */
+                if (max != 0 && printed++ >= max)
+                        break;
+                got = rb_get(&log_buffer.buf, chunk, sizeof(chunk) - 1);
+                if (got <= 0)
+                        break;
+                chunk[got] = '\0';
+                printk("%s", chunk);
+        }
+        trace_recurse = 0;
+}
+/* sysrq handler: dump at most 100 chunks of the log buffer via printk() */
+static void sysrq_dump_trace_buffer(int key, struct tty_struct *tty)
+{
+        dump_trace_buffer(100);
+}
+/* sysrq operation registered for key 'y' in init_sched_trace() */
+static struct sysrq_key_op sysrq_dump_trace_buffer_op = {
+        .handler        = sysrq_dump_trace_buffer,
+        .help_msg       = "dump-trace-buffer(Y)",
+        .action_msg     = "writing content of TRACE() buffer",
+};
+#endif
+/* Initialize the log buffer, optionally hook up the 'y' magic sysrq key
+ * and register the log misc device.
+ *
+ * Returns the result of misc_register(). A failure to register the sysrq
+ * key is only reported, not treated as an error.
+ */
+static int __init init_sched_trace(void)
+{
+        printk("Initializing TRACE() device\n");
+        init_log_buffer();
+#ifdef CONFIG_MAGIC_SYSRQ
+        /* offer some debugging help */
+        if (register_sysrq_key('y', &sysrq_dump_trace_buffer_op))
+                printk("Could not register dump-trace-buffer(Y) magic sysrq.\n");
+        else
+                printk("Registered dump-trace-buffer(Y) magic sysrq.\n");
+#endif
+        return misc_register(&litmus_log_dev);
+}
+static void __exit exit_sched_trace(void)
+{
+        misc_deregister(&litmus_log_dev);
+}
+module_init(init_sched_trace);
+module_exit(exit_sched_trace);
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 000000000000..71639b991630
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,318 @@
+/* ************************************************************************** */
+/*                          STACK RESOURCE POLICY                             */
+/* ************************************************************************** */
+#include <asm/atomic.h>
+#include <linux/wait.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/fdso.h>
+#include <litmus/trace.h>
+#ifdef CONFIG_SRP
+/* Priority descriptor used for SRP ceilings and for comparing tasks
+ * against them (see srp_higher_prio()).
+ */
+struct srp_priority {
+        struct list_head        list;   /* link in a per-CPU ceiling list */
+        unsigned int            period; /* 0 means "no priority" */
+        pid_t                   pid;    /* tie-break for equal periods */
+};
+/* map a ceiling-list node back to its srp_priority */
+#define list2prio(l) list_entry(l, struct srp_priority, list)
+/* SRP priority comparison: returns non-zero iff first has higher priority
+ * than second. A smaller period means higher priority, ties are broken
+ * by the smaller PID. Special case: period == 0 <=> no priority, i.e. such
+ * an entry never beats anything and is beaten by every real priority.
+ */
+static int srp_higher_prio(struct srp_priority* first,
+                           struct srp_priority* second)
+{
+        if (!first->period)
+                return 0;
+        if (!second->period)
+                return 1;
+        if (first->period != second->period)
+                return first->period < second->period;
+        return first->pid < second->pid;
+}
+/* Per-CPU SRP state. */
+struct srp {
+        /* ceilings of the currently held semaphores; srp_add_prio()
+         * keeps the list ordered by srp_higher_prio()
+         */
+        struct list_head        ceiling;
+        /* tasks waiting until they exceed the ceiling */
+        wait_queue_head_t       ceiling_blocked;
+};
+/* number of existing SRP semaphore objects; lets srp_ceiling_block()
+ * bail out early while it is zero
+ */
+atomic_t srp_objects_in_use = ATOMIC_INIT(0);
+DEFINE_PER_CPU(struct srp, srp);
+/* Boot-time setup of the per-CPU SRP state: every CPU slot gets an empty
+ * ceiling list and an initialized wait queue. Always returns 0.
+ */
+static int __init srp_init(void)
+{
+        int cpu;
+        printk("Initializing SRP per-CPU ceilings...");
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                INIT_LIST_HEAD(&per_cpu(srp, cpu).ceiling);
+                init_waitqueue_head(&per_cpu(srp, cpu).ceiling_blocked);
+        }
+        printk(" done!\n");
+        return 0;
+}
+module_init(srp_init);
+/* first entry of the ceiling list; only meaningful if the list is not
+ * empty
+ */
+#define system_ceiling(srp) list2prio(srp->ceiling.next)
+/* value of srp_semaphore.cpu until the first open binds it to a CPU */
+#define UNDEF_SEM -2
+/* struct for uniprocessor SRP "semaphore" */
+struct srp_semaphore {
+        struct srp_priority ceiling; /* priority ceiling of the resource */
+        struct task_struct* owner;   /* current holder, NULL if free */
+        int cpu; /* cpu associated with this "semaphore" and resource */
+};
+/* map an embedded ceiling back to its semaphore */
+#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
+/* Check whether a task may proceed under the given per-CPU SRP state.
+ *
+ * Returns non-zero if the ceiling list is empty, if the task's priority
+ * (period, then PID) is higher than the first ceiling in the list, or if
+ * the task owns the semaphore that defines that ceiling.
+ */
+static int srp_exceeds_ceiling(struct task_struct* first,
+                               struct srp* srp)
+{
+        struct srp_priority* top;
+        if (list_empty(&srp->ceiling))
+                return 1;
+        top = system_ceiling(srp);
+        if (get_rt_period(first) < top->period)
+                return 1;
+        if (get_rt_period(first) == top->period && first->pid < top->pid)
+                return 1;
+        return ceiling2sem(top)->owner == first;
+}
+/* Insert a ceiling into the per-CPU ceiling list, in front of the first
+ * entry it has higher priority than (or at the tail if there is none).
+ * A ceiling that is already linked is reported and left alone.
+ */
+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
+{
+        struct list_head *pos;
+        if (in_list(&prio->list)) {
+                printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
+                       "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
+                return;
+        }
+        /* find the first entry that prio beats */
+        list_for_each(pos, &srp->ceiling) {
+                if (unlikely(srp_higher_prio(prio, list2prio(pos))))
+                        break;
+        }
+        /* Link prio right before pos. If the loop ran to completion, pos
+         * is the list head and this appends at the tail.
+         */
+        list_add_tail(&prio->list, pos);
+}
+/* fdso "create" hook: allocate an SRP semaphore that has no owner, no
+ * ceiling priority (period 0) and is not yet bound to a CPU.
+ *
+ * Returns the new semaphore or NULL if the allocation fails.
+ */
+static void* create_srp_semaphore(void)
+{
+        struct srp_semaphore* sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+        if (!sem)
+                return NULL;
+        sem->owner   = NULL;
+        sem->cpu     = UNDEF_SEM;
+        sem->ceiling.period = 0;
+        INIT_LIST_HEAD(&sem->ceiling.list);
+        atomic_inc(&srp_objects_in_use);
+        return sem;
+}
+/* fdso "open" hook: attach the current task to an SRP semaphore.
+ *
+ * The first opener binds the semaphore to its partition; later openers
+ * must belong to the same partition. The semaphore's ceiling is raised
+ * to the opener's priority if that priority is higher.
+ *
+ * Returns 0 on success, -EBUSY if SRP is not active and -EPERM if the
+ * semaphore is bound to a different partition.
+ */
+static noinline int open_srp_semaphore(struct od_table_entry* entry, void* __user arg)
+{
+        struct srp_semaphore* sem = (struct srp_semaphore*) entry->obj->obj;
+        struct task_struct* t = current;
+        struct srp_priority t_prio;
+        TRACE("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
+        if (!srp_active())
+                return -EBUSY;
+        if (sem->cpu == UNDEF_SEM)
+                sem->cpu = get_partition(t);
+        else if (sem->cpu != get_partition(t))
+                return -EPERM;
+        /* raise the ceiling if this task outranks it */
+        t_prio.period = get_rt_period(t);
+        t_prio.pid    = t->pid;
+        if (srp_higher_prio(&t_prio, &sem->ceiling)) {
+                sem->ceiling.period = t_prio.period;
+                sem->ceiling.pid    = t_prio.pid;
+        }
+        return 0;
+}
+/* fdso "destroy" hook: drop the object from the in-use count and free it.
+ * No check is made that the semaphore is unowned or unlinked.
+ */
+static void destroy_srp_semaphore(void* sem)
+{
+        /* XXX invariants */
+        atomic_dec(&srp_objects_in_use);
+        kfree(sem);
+}
+/* fdso operations for SRP semaphore objects */
+struct fdso_ops srp_sem_ops = {
+        .create  = create_srp_semaphore,
+        .open    = open_srp_semaphore,
+        .destroy = destroy_srp_semaphore
+};
+/* Acquire an SRP semaphore on the local CPU: its ceiling is linked into
+ * this CPU's ceiling list and the current task becomes the owner.
+ * Warns if the semaphore already has an owner.
+ */
+static void do_srp_down(struct srp_semaphore* sem)
+{
+        /* Update ceiling. */
+        srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
+        WARN_ON(sem->owner != NULL);
+        sem->owner = current;
+        TRACE_CUR("acquired srp 0x%p\n", sem);
+}
+/* Release an SRP semaphore on the local CPU: its ceiling is unlinked from
+ * the ceiling list (with a warning if it was not linked), ownership is
+ * cleared and all ceiling-blocked waiters of this CPU are woken.
+ */
+static void do_srp_up(struct srp_semaphore* sem)
+{
+        /* Determine new system priority ceiling for this CPU:
+         * WARN_ON() yields its condition, so the entry is only removed
+         * if it really is on the list.
+         */
+        if (!WARN_ON(!in_list(&sem->ceiling.list)))
+                list_del(&sem->ceiling.list);
+        sem->owner = NULL;
+        /* Wake tasks on this CPU, if they exceed current ceiling. */
+        TRACE_CUR("released srp 0x%p\n", sem);
+        wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
+}
+/* System call: claim an SRP resource and adjust the priority ceiling of
+ * the local CPU accordingly.
+ *
+ * Returns 0 on success, or -EINVAL if sem_od does not resolve to an SRP
+ * semaphore or the semaphore belongs to another CPU.
+ */
+asmlinkage long sys_srp_down(int sem_od)
+{
+        struct srp_semaphore* sem;
+        int this_cpu;
+        int ret = -EINVAL;
+        /* disabling preemptions is sufficient protection since
+         * SRP is strictly per CPU and we don't interfere with any
+         * interrupt handlers
+         */
+        preempt_disable();
+        TS_SRP_DOWN_START;
+        this_cpu = smp_processor_id();
+        sem = lookup_srp_sem(sem_od);
+        if (!sem || sem->cpu != this_cpu)
+                goto out;
+        do_srp_down(sem);
+        ret = 0;
+ out:
+        TS_SRP_DOWN_END;
+        preempt_enable();
+        return ret;
+}
+/* System call: free an SRP resource and adjust the priority ceiling of
+ * the local CPU accordingly.
+ *
+ * Returns 0 on success, or -EINVAL if sem_od does not resolve to an SRP
+ * semaphore or the semaphore belongs to another CPU.
+ */
+asmlinkage long sys_srp_up(int sem_od)
+{
+        struct srp_semaphore* sem;
+        int this_cpu;
+        int ret = -EINVAL;
+        /* runs with preemption disabled, like sys_srp_down() */
+        preempt_disable();
+        TS_SRP_UP_START;
+        this_cpu = smp_processor_id();
+        sem = lookup_srp_sem(sem_od);
+        if (!sem || sem->cpu != this_cpu)
+                goto out;
+        do_srp_up(sem);
+        ret = 0;
+ out:
+        TS_SRP_UP_END;
+        preempt_enable();
+        return ret;
+}
+/* Wake function for entries on a ceiling_blocked wait queue.
+ *
+ * The waiting task is only woken (via default_wake_function()) if the
+ * wake-up runs on the task's own partition and the task now exceeds that
+ * CPU's ceiling. A wake-up on the wrong CPU is traced and ignored.
+ *
+ * Returns the result of default_wake_function(), or 0 if the task was
+ * not woken.
+ */
+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+                       void *key)
+{
+        int cpu = smp_processor_id();
+        struct task_struct *tsk = wait->private;
+        if (cpu != get_partition(tsk))
+                TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
+                           get_partition(tsk));
+        else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+                return default_wake_function(wait, mode, sync, key);
+        return 0;
+}
+/* Suspend tsk on this CPU's ceiling_blocked queue until srp_wake_up()
+ * decides to wake it. Called from srp_ceiling_block() with preemption
+ * disabled; preemption is re-enabled around schedule() and disabled
+ * again before returning.
+ */
+static void do_ceiling_block(struct task_struct *tsk)
+{
+        /* on-stack wait queue entry using the SRP-specific wake function */
+        wait_queue_t wait = {
+                .private   = tsk,
+                .func      = srp_wake_up,
+                .task_list = {NULL, NULL}
+        };
+        tsk->state = TASK_UNINTERRUPTIBLE;
+        add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+        /* makes srp_ceiling_block() return early while this task is
+         * inside schedule()
+         */
+        tsk->rt_param.srp_non_recurse = 1;
+        preempt_enable_no_resched();
+        schedule();
+        preempt_disable();
+        tsk->rt_param.srp_non_recurse = 0;
+        remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+}
+/* Block the current task until its priority exceeds the priority ceiling
+ * of the local CPU. Returns immediately for non-real-time tasks, for
+ * tasks that are already inside a ceiling block, and while no SRP
+ * objects exist.
+ */
+void srp_ceiling_block(void)
+{
+        struct task_struct *tsk = current;
+        /* Non-real-time tasks are not subject to SRP; the common case
+         * is assumed to be a real-time task.
+         */
+        if (unlikely(!is_realtime(tsk)))
+                return;
+        /* Avoid recursive ceiling blocking. */
+        if (unlikely(tsk->rt_param.srp_non_recurse))
+                return;
+        /* Bail out early if there aren't any SRP resources around. */
+        if (likely(!atomic_read(&srp_objects_in_use)))
+                return;
+        preempt_disable();
+        if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
+                TRACE_CUR("is not priority ceiling blocked\n");
+        } else {
+                TRACE_CUR("is priority ceiling blocked.\n");
+                /* re-check after every wake-up */
+                while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+                        do_ceiling_block(tsk);
+                TRACE_CUR("finally exceeds system ceiling.\n");
+        }
+        preempt_enable();
+}
+#else
+/* Stubs used when CONFIG_SRP is disabled: both system calls report
+ * -ENOSYS and the fdso operations table is left empty.
+ */
+asmlinkage long sys_srp_down(int sem_od)
+{
+        return -ENOSYS;
+}
+asmlinkage long sys_srp_up(int sem_od)
+{
+        return -ENOSYS;
+}
+struct fdso_ops srp_sem_ops = {};
+#endif
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 000000000000..bf75fde5450b
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,104 @@
+/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
+ *
+ *
+ */
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_trace.h>
+/* completion on which tasks wait for the task system release; signalled
+ * by do_release_ts()
+ */
+static DECLARE_COMPLETION(ts_release);
+/* Sleep until the task system is released or a signal arrives.
+ *
+ * Returns the result of wait_for_completion_interruptible().
+ */
+static long do_wait_for_ts_release(void)
+{
+        /* If the interruption races with a release, the completion object
+         * may be left with a non-zero counter. Switching to
+         * wait_for_completion() would avoid that problem.
+         *
+         * The wait is kept interruptible for now for debugging purposes.
+         */
+        return wait_for_completion_interruptible(&ts_release);
+}
+/* Count the entries currently queued on the ts_release wait queue.
+ * The queue lock is held (with interrupts disabled) while counting.
+ */
+int count_tasks_waiting_for_release(void)
+{
+        unsigned long irq_flags;
+        struct list_head *node;
+        int waiting = 0;
+        spin_lock_irqsave(&ts_release.wait.lock, irq_flags);
+        list_for_each(node, &ts_release.wait.task_list)
+                waiting++;
+        spin_unlock_irqrestore(&ts_release.wait.lock, irq_flags);
+        return waiting;
+}
+/* Release all tasks waiting on ts_release.
+ *
+ * @start: release time of the task system; every waiting task is
+ *         released at start plus its own phase
+ *
+ * Returns the number of tasks that were released.
+ */
+static long do_release_ts(lt_t start)
+{
+        unsigned long irq_flags;
+        wait_queue_t            *waiter;
+        struct task_struct      *t;
+        int released = 0;
+        spin_lock_irqsave(&ts_release.wait.lock, irq_flags);
+        TRACE("<<<<<< synchronous task system release >>>>>>\n");
+        sched_trace_sys_release(&start);
+        /* set up the release of every queued task while the queue is
+         * locked
+         */
+        list_for_each_entry(waiter, &ts_release.wait.task_list, task_list) {
+                t = (struct task_struct*) waiter->private;
+                released++;
+                litmus->release_at(t, start + t->rt_param.task_params.phase);
+                sched_trace_task_release(t);
+        }
+        spin_unlock_irqrestore(&ts_release.wait.lock, irq_flags);
+        /* wake exactly as many waiters as were prepared above */
+        complete_n(&ts_release, released);
+        return released;
+}
+/* System call: wait for the task system release.
+ *
+ * Returns -EPERM if the caller is not a real-time task, otherwise the
+ * result of do_wait_for_ts_release().
+ */
+asmlinkage long sys_wait_for_ts_release(void)
+{
+        if (!is_realtime(current))
+                return -EPERM;
+        return do_wait_for_ts_release();
+}
+/* System call: release the task system after a delay.
+ *
+ * @__delay: userspace pointer to the delay that is added to the current
+ *           litmus_clock() value to obtain the release time
+ *
+ * Returns the number of released tasks, or -EFAULT if the delay cannot
+ * be read from userspace. copy_from_user() reports the number of bytes
+ * it could not copy, which must not be passed back as the result.
+ */
+asmlinkage long sys_release_ts(lt_t __user *__delay)
+{
+        lt_t delay;
+        /* FIXME: check capabilities... */
+        if (copy_from_user(&delay, __delay, sizeof(delay)))
+                return -EFAULT;
+        return do_release_ts(litmus_clock() + delay);
+}
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 000000000000..440376998dc9
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,103 @@
+#include <linux/module.h>
+#include <litmus/ftdev.h>
+#include <litmus/litmus.h>
+#include <litmus/trace.h>
+/******************************************************************************/
+/*                          Allocation                                        */
+/******************************************************************************/
+/* Feather-Trace device backing the overhead timestamps (registered as "ft_trace"). */
+static struct ftdev overhead_dev;
+/* Shorthand for the buffer of minor 0, the only minor this file configures. */
+#define trace_ts_buf overhead_dev.minor[0].buf
+/* Running sequence number; advanced once for every timestamp attempt. */
+static unsigned int ts_seq_no = 0;
+/*
+ * Record one timestamp for the given event, tagged with the supplied task
+ * type and CPU number.  A sequence number is drawn on every call, even when
+ * no buffer slot can be obtained and the sample is therefore not stored.
+ */
+static inline void __save_timestamp_cpu(unsigned long event,
+                                        uint8_t type, uint8_t cpu)
+{
+        struct timestamp *slot;
+        unsigned int seq = fetch_and_inc((int *) &ts_seq_no);
+        /* Nothing to fill in if the buffer could not hand out a slot. */
+        if (!ft_buffer_start_write(trace_ts_buf, (void **) &slot))
+                return;
+        slot->event     = event;
+        slot->timestamp = ft_timestamp();
+        slot->seq_no    = seq;
+        slot->cpu       = cpu;
+        slot->task_type = type;
+        ft_buffer_finish_write(trace_ts_buf, slot);
+}
+/* Record a timestamp for event, tagged with the CPU reported by raw_smp_processor_id(). */
+static inline void __save_timestamp(unsigned long event,
+                                   uint8_t type)
+{
+        uint8_t this_cpu = raw_smp_processor_id();
+        __save_timestamp_cpu(event, type, this_cpu);
+}
+/* Feather-Trace callback: timestamp an event with task type TSK_UNKNOWN. */
+feather_callback void save_timestamp(unsigned long event)
+{
+        __save_timestamp_cpu(event, TSK_UNKNOWN, raw_smp_processor_id());
+}
+/* Feather-Trace callback: timestamp an event with a caller-supplied task type. */
+feather_callback void save_timestamp_def(unsigned long event,
+                                         unsigned long type)
+{
+        /* The timestamp record stores the type in eight bits. */
+        uint8_t task_type = (uint8_t) type;
+        __save_timestamp(event, task_type);
+}
+/*
+ * Feather-Trace callback: timestamp an event and classify it by task.
+ * t_ptr carries a struct task_struct pointer; real-time tasks are tagged
+ * TSK_RT, all others TSK_BE.
+ */
+feather_callback void save_timestamp_task(unsigned long event,
+                                          unsigned long t_ptr)
+{
+        struct task_struct *tsk = (struct task_struct *) t_ptr;
+        if (is_realtime(tsk))
+                __save_timestamp(event, TSK_RT);
+        else
+                __save_timestamp(event, TSK_BE);
+}
+/*
+ * Feather-Trace callback: timestamp an event on behalf of an explicitly
+ * named CPU instead of the one executing the callback.
+ */
+feather_callback void save_timestamp_cpu(unsigned long event,
+                                         unsigned long cpu)
+{
+        /* The timestamp record stores the CPU number in eight bits. */
+        uint8_t target_cpu = (uint8_t) cpu;
+        __save_timestamp_cpu(event, TSK_UNKNOWN, target_cpu);
+}
+/******************************************************************************/
+/*                        DEVICE FILE DRIVER                                  */
+/******************************************************************************/
+/*
+ * Number of timestamp slots requested for the trace buffer.
+ *
+ * The buffer should be 8M, which is the most that can be requested from the
+ * buddy system allocator (MAX_ORDER); even that request may not be granted,
+ * in which case alloc_timestamp_buffer() retries with smaller counts.
+ * NOTE(review): 2 << 11 evaluates to 4096 entries; confirm that this matches
+ * the intended 8M for sizeof(struct timestamp).
+ */
+#define NO_TIMESTAMPS (2 << 11)
+/* Set FT_TRACE_MAJOR to 0 to have the major number dynamically assigned. */
+#define FT_TRACE_MAJOR  252
+/*
+ * ftdev alloc hook: allocate the timestamp buffer for minor idx.
+ *
+ * Starts at NO_TIMESTAMPS entries and halves the request after every attempt
+ * until the minor has a buffer or the count reaches zero.
+ *
+ * Returns 0 if the minor has a buffer afterwards, -ENOMEM otherwise.
+ */
+static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
+{
+        unsigned int count = NO_TIMESTAMPS;
+        /*
+         * Test the slot that is actually being filled rather than the
+         * trace_ts_buf shorthand (minor 0); otherwise a non-zero idx would
+         * keep allocating, and leaking, buffers until count hits zero.
+         */
+        while (count && !ftdev->minor[idx].buf) {
+                printk("time stamp buffer: trying to allocate %u time stamps.\n", count);
+                ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
+                count /= 2;
+        }
+        return ftdev->minor[idx].buf ? 0 : -ENOMEM;
+}
+/* ftdev free hook: release the buffer of minor idx and clear its pointer. */
+static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
+{
+        struct ft_buffer *old = ftdev->minor[idx].buf;
+        free_ft_buffer(old);
+        ftdev->minor[idx].buf = NULL;
+}
+/*
+ * Module init: prepare the overhead tracing ftdev with a single minor, hook
+ * up the buffer alloc/free callbacks and register it as "ft_trace" under
+ * FT_TRACE_MAJOR.  Returns whatever register_ftdev() returns.
+ */
+static int __init init_ft_overhead_trace(void)
+{
+        int err;
+        printk("Initializing Feather-Trace overhead tracing device.\n");
+        ftdev_init(&overhead_dev, THIS_MODULE);
+        overhead_dev.alloc = alloc_timestamp_buffer;
+        overhead_dev.free  = free_timestamp_buffer;
+        overhead_dev.minor_cnt = 1; /* only one buffer */
+        err = register_ftdev(&overhead_dev, "ft_trace", FT_TRACE_MAJOR);
+        return err;
+}
+module_init(init_ft_overhead_trace);
author	Andrea Bastoni <bastoni@cs.unc.edu>	2010-05-29 23:35:01 -0400
committer	Andrea Bastoni <bastoni@cs.unc.edu>	2010-05-29 23:35:01 -0400
commit	6ffc1fee98c4b995eb3a0285f4f8fb467cb0306e (patch)
tree	69a05892a41e7f7400fa598ee0bdf8027c8f0fd6 /litmus
parent	e40152ee1e1c7a63f4777791863215e3faa37a86 (diff)
parent	7c1ff4c544dd650cceff3cd69a04bcba60856678 (diff)