author		Bjoern B. Brandenburg <bbb@cs.unc.edu>	2008-02-13 14:13:15 -0500
committer	Bjoern B. Brandenburg <bbb@cs.unc.edu>	2008-02-13 14:13:15 -0500
commit		8ce9b0cb97d9266b3b64b2b57835e17f6e03f585 (patch)
tree		a6ef1acaf9c9dc116ccc9f24f5233fa7d25cd426
parent		49914084e797530d9baaf51df9eda77babc98fa8 (diff)
LITMUS 2008: Initial Port
This introduces the core changes ported from LITMUS 2007. The kernel seems to work under QEMU, but many bugs probably remain.
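
For orientation, the sketch below shows how a user-space task could switch into the new scheduling class using the constants this patch introduces (SCHED_LITMUS = 6 and __NR_set_rt_task_param = 325 from the 32-bit syscall table). The struct layout mirrors struct rt_task from include/litmus/rt_param.h; the call order, the nanosecond interpretation of lt_t, and the concrete parameter values are illustrative assumptions, not something this commit specifies.

/* Illustrative user-space sketch only. The struct mirrors the kernel's
 * struct rt_task; setting parameters before switching the policy is an
 * assumption (litmus_admit_task() is invoked from sched_setscheduler()).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>

#define SCHED_LITMUS            6    /* from the include/linux/sched.h hunk */
#define __NR_set_rt_task_param  325  /* from the include/asm-x86/unistd_32.h hunk */

struct rt_task {                     /* user-space view of the kernel struct */
	unsigned long long exec_cost;
	unsigned long long period;
	unsigned int cpu;
	unsigned int cls;            /* task_class_t: 0 hard, 1 soft, 2 best effort */
};

int main(void)
{
	struct rt_task p;
	struct sched_param sp;

	memset(&p, 0, sizeof(p));
	p.exec_cost = 10000000ULL;   /* 10 ms, assuming lt_t is in nanoseconds */
	p.period    = 100000000ULL;  /* 100 ms */
	p.cpu       = 0;
	p.cls       = 1;             /* RT_CLASS_SOFT */

	if (syscall(__NR_set_rt_task_param, getpid(), &p) < 0)
		perror("set_rt_task_param");

	memset(&sp, 0, sizeof(sp));  /* sched_priority must be 0 for SCHED_LITMUS */
	if (sched_setscheduler(getpid(), SCHED_LITMUS, &sp) < 0)
		perror("sched_setscheduler(SCHED_LITMUS)");

	return 0;
}
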
-rw-r--r--   Makefile                               2
-rw-r--r--   arch/x86/Kconfig                       2
-rw-r--r--   arch/x86/kernel/syscall_table_32.S    15
-rw-r--r--   fs/exec.c                              3
-rw-r--r--   fs/inode.c                             2
-rw-r--r--   include/asm-x86/unistd_32.h           16
-rw-r--r--   include/linux/fs.h                     5
-rw-r--r--   include/linux/sched.h                 16
-rw-r--r--   include/linux/uaccess.h               16
-rw-r--r--   include/litmus/edf_common.h           35
-rw-r--r--   include/litmus/fdso.h                 69
-rw-r--r--   include/litmus/feather_buffer.h      108
-rw-r--r--   include/litmus/feather_trace.h        93
-rw-r--r--   include/litmus/litmus.h              192
-rw-r--r--   include/litmus/rt_domain.h            94
-rw-r--r--   include/litmus/rt_param.h            135
-rw-r--r--   include/litmus/sched_plugin.h        118
-rw-r--r--   include/litmus/sched_trace.h          31
-rw-r--r--   include/litmus/trace.h                74
-rw-r--r--   kernel/exit.c                          4
-rw-r--r--   kernel/fork.c                          5
-rw-r--r--   kernel/sched.c                        34
-rw-r--r--   kernel/sched_fair.c                    2
-rw-r--r--   kernel/sched_rt.c                      2
-rw-r--r--   litmus/Kconfig                        24
-rw-r--r--   litmus/Makefile                        8
-rw-r--r--   litmus/edf_common.c                  132
-rw-r--r--   litmus/fdso.c                        279
-rw-r--r--   litmus/ft_event.c                    104
-rw-r--r--   litmus/litmus.c                      799
-rw-r--r--   litmus/litmus_sem.c                  566
-rw-r--r--   litmus/rt_domain.c                   130
-rw-r--r--   litmus/sched_gsn_edf.c               719
-rw-r--r--   litmus/sched_litmus.c                149
-rw-r--r--   litmus/sched_plugin.c                174
-rw-r--r--   litmus/sched_psn_edf.c               440
-rw-r--r--   litmus/sched_trace.c                 541
-rw-r--r--   litmus/trace.c                       303
38 files changed, 5434 insertions, 7 deletions
diff --git a/Makefile b/Makefile
index 189d8ef416..d9e4495038 100644
--- a/Makefile
+++ b/Makefile
@@ -597,7 +597,7 @@ export mod_strip_cmd
 
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 80b7ba4056..f99330fed0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1620,3 +1620,5 @@ source "security/Kconfig"
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
+
+source "litmus/Kconfig"
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8344c70adf..9c9ffbe8b6 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -324,3 +324,18 @@ ENTRY(sys_call_table)
 	.long sys_timerfd
 	.long sys_eventfd
 	.long sys_fallocate
+	/* LITMUS */
+	.long sys_set_rt_task_param	/* 325 */
+	.long sys_get_rt_task_param
+	.long sys_complete_job
+	.long sys_register_np_flag
+	.long sys_exit_np
+	.long sys_od_open		/* 330 */
+	.long sys_od_close
+	.long sys_pi_down
+	.long sys_pi_up
+	.long sys_srp_down
+	.long sys_srp_up		/* 335 */
+	.long sys_reg_task_srp_sem
+	.long sys_query_job_no
+	.long sys_wait_for_job_release	/* 338 */
diff --git a/fs/exec.c b/fs/exec.c
index 282240afe9..6f47786702 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,8 @@
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
 
+#include <litmus/litmus.h>
+
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
@@ -1309,6 +1311,7 @@ int do_execve(char * filename,
 		goto out_kfree;
 
 	sched_exec();
+	litmus_exec();
 
 	bprm->file = file;
 	bprm->filename = filename;
diff --git a/fs/inode.c b/fs/inode.c
index ed35383d0b..ef71ea06c6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -220,6 +220,8 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
+	INIT_LIST_HEAD(&inode->i_obj_list);
+	mutex_init(&inode->i_obj_mutex);
 }
 
 EXPORT_SYMBOL(inode_init_once);
diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h
index 9b15545eb9..063c5856f2 100644
--- a/include/asm-x86/unistd_32.h
+++ b/include/asm-x86/unistd_32.h
@@ -330,10 +330,24 @@
 #define __NR_timerfd		322
 #define __NR_eventfd		323
 #define __NR_fallocate		324
+#define __NR_set_rt_task_param	325
+#define __NR_get_rt_task_param	326
+#define __NR_sleep_next_period	327
+#define __NR_register_np_flag	328
+#define __NR_exit_np		329
+#define __NR_od_open		330
+#define __NR_od_close		331
+#define __NR_pi_down		332
+#define __NR_pi_up		333
+#define __NR_srp_down		334
+#define __NR_srp_up		335
+#define __NR_reg_task_srp_sem	336
+#define __NR_query_job_no	337
+#define __NR_wait_for_job_release 338
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 325
+#define NR_syscalls 339
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ec4a496d..22f856c14e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -588,6 +588,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 #define i_size_ordered_init(inode) do { } while (0)
 #endif
 
+struct inode_obj_id_table;
+
 struct inode {
 	struct hlist_node	i_hash;
 	struct list_head	i_list;
@@ -653,6 +655,9 @@ struct inode {
 	void			*i_security;
 #endif
 	void			*i_private; /* fs or device private pointer */
+
+	struct list_head	i_obj_list;
+	struct mutex		i_obj_mutex;
 };
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cc14656f86..9541cc8fe8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -37,6 +37,7 @@
 #define SCHED_BATCH		3
 /* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
+#define SCHED_LITMUS		6
 
 #ifdef __KERNEL__
 
@@ -91,6 +92,8 @@ struct sched_param {
 
 #include <asm/processor.h>
 
+#include <litmus/rt_param.h>
+
 struct exec_domain;
 struct futex_pi_state;
 struct bio;
@@ -914,6 +917,8 @@ struct sched_entity {
 #endif
 };
 
+struct od_table_entry;
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1178,6 +1183,17 @@ struct task_struct {
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+
+	/* litmus parameters and state */
+	struct rt_param rt_param;
+
+	/* allow scheduler plugins to queue in release lists, etc.
+	 * Cleanup: Move this into the rt_param struct.
+	 */
+	struct list_head rt_list;
+
+	/* references to PI semaphores, etc. */
+	struct od_table_entry* od_table;
 };
 
 /*
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 975c963e57..6ae0ff9494 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 		ret;						\
 	})
 
+/* This is a naive attempt at a write version of the above native Linux macro.
+ */
+#define poke_kernel_address(val, addr)				\
+	({							\
+		long ret;					\
+		mm_segment_t old_fs = get_fs();			\
+								\
+		set_fs(KERNEL_DS);				\
+		pagefault_disable();				\
+		ret = __put_user(val, (__force typeof(val) __user *)(addr)); \
+		pagefault_enable();				\
+		set_fs(old_fs);					\
+		ret;						\
+	})
+
+
 #endif		/* __LINUX_UACCESS_H__ */
diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
new file mode 100644
index 0000000000..f3c930b137
--- /dev/null
+++ b/include/litmus/edf_common.h
@@ -0,0 +1,35 @@
1/* EDF common data structures and utility functions shared by all EDF
2 * based scheduler plugins
3 */
4
5/* CLEANUP: Add comments and make it less messy.
6 *
7 */
8
9#ifndef __UNC_EDF_COMMON_H__
10#define __UNC_EDF_COMMON_H__
11
12#include <litmus/rt_domain.h>
13
14
15void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched);
16
17int edf_higher_prio(struct task_struct* first,
18 struct task_struct* second);
19
20int edf_ready_order(struct list_head* a, struct list_head* b);
21
22void edf_release_at(struct task_struct *t, lt_t start);
23
24int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
25long edf_complete_job(void);
26
27void edf_prepare_for_next_period(struct task_struct *t);
28
29#define job_completed(t) (!is_be(t) && \
30 (t)->rt_param.job_params.exec_time == (t)->rt_param.task_params.exec_cost)
31
32int edf_set_hp_task(struct pi_semaphore *sem);
33int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu);
34
35#endif
diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
new file mode 100644
index 0000000000..5a783555e7
--- /dev/null
+++ b/include/litmus/fdso.h
@@ -0,0 +1,69 @@
1/* fdso.h - file descriptor attached shared objects
2 *
3 * (c) 2007 B. Brandenburg, LITMUS^RT project
4 */
5
6#ifndef _LINUX_FDSO_H_
7#define _LINUX_FDSO_H_
8
9#include <linux/list.h>
10#include <asm/atomic.h>
11
12#include <linux/fs.h>
13
14#define MAX_OBJECT_DESCRIPTORS 32
15
16typedef enum {
17 MIN_OBJ_TYPE = 0,
18
19 PI_SEM = 0,
20 SRP_SEM = 1,
21
22 MAX_OBJ_TYPE = 1
23} obj_type_t;
24
25struct inode_obj_id {
26 struct list_head list;
27 atomic_t count;
28 struct inode* inode;
29
30 obj_type_t type;
31 void* obj;
32 unsigned int id;
33};
34
35
36struct od_table_entry {
37 unsigned int used;
38
39 struct inode_obj_id* obj;
40 void* extra;
41};
42
43struct fdso_ops {
44 void* (*create) (void);
45 void (*destroy)(void*);
46 int (*open) (struct od_table_entry*, void* __user);
47 int (*close) (struct od_table_entry*);
48};
49
50/* translate a userspace supplied od into the raw table entry
51 * returns NULL if od is invalid
52 */
53struct od_table_entry* __od_lookup(int od);
54
55/* translate a userspace supplied od into the associated object
56 * returns NULL if od is invalid
57 */
58static inline void* od_lookup(int od, obj_type_t type)
59{
60 struct od_table_entry* e = __od_lookup(od);
61 return e && e->obj->type == type ? e->obj->obj : NULL;
62}
63
64#define lookup_pi_sem(od) ((struct pi_semaphore*) od_lookup(od, PI_SEM))
65#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
66#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID))
67
68
69#endif
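
A rough user-space sketch of how the object-descriptor interface above might be exercised. The syscall numbers come from the unistd_32.h hunk in this patch; the file used as a namespace and passing NULL as the per-type config argument for PI_SEM are assumptions, not something this commit documents.

/* Illustrative only: attach a PI semaphore object to an inode and release it. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define __NR_od_open   330
#define __NR_od_close  331
#define PI_SEM         0   /* obj_type_t from include/litmus/fdso.h */

int main(void)
{
	/* Shared objects are attached to inodes, so cooperating tasks open
	 * the same file and use the same object id.
	 */
	int fd = open("/tmp/litmus_ns", O_RDONLY | O_CREAT, 0666);
	int od;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	od = syscall(__NR_od_open, fd, PI_SEM, 0 /* object id */, NULL);
	if (od < 0)
		perror("od_open");
	else
		syscall(__NR_od_close, od);

	close(fd);
	return 0;
}
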
diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
new file mode 100644
index 0000000000..c788227905
--- /dev/null
+++ b/include/litmus/feather_buffer.h
@@ -0,0 +1,108 @@
1#ifndef _FEATHER_BUFFER_H_
2#define _FEATHER_BUFFER_H_
3
4/* requires UINT_MAX and memcpy */
5
6static inline int fetch_and_inc(int *val)
7{
8 int ret = 1;
9 __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
10 return ret;
11}
12
13static inline int fetch_and_dec(int *val)
14{
15 int ret = -1;
16 __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
17 return ret;
18}
19
20#define SLOT_FREE 0
21#define SLOT_BUSY 1
22#define SLOT_READY 2
23
24struct ft_buffer {
25 unsigned int slot_count;
26 unsigned int slot_size;
27
28 int free_count;
29 unsigned int write_idx;
30 unsigned int read_idx;
31
32 char* slots;
33 void* buffer_mem;
34 unsigned int failed_writes;
35};
36
37static inline int init_ft_buffer(struct ft_buffer* buf,
38 unsigned int slot_count,
39 unsigned int slot_size,
40 char* slots,
41 void* buffer_mem)
42{
43 int i = 0;
44 if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
45 /* The slot count must divide UINT_MAX + 1 so that when it
46 * wraps around the index correctly points to 0.
47 */
48 return 0;
49 } else {
50 buf->slot_count = slot_count;
51 buf->slot_size = slot_size;
52 buf->slots = slots;
53 buf->buffer_mem = buffer_mem;
54 buf->free_count = slot_count;
55 buf->write_idx = 0;
56 buf->read_idx = 0;
57 buf->failed_writes = 0;
58 for (i = 0; i < slot_count; i++)
59 buf->slots[i] = SLOT_FREE;
60 return 1;
61 }
62}
63
64static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
65{
66 int free = fetch_and_dec(&buf->free_count);
67 unsigned int idx;
68 if (free <= 0) {
69 fetch_and_inc(&buf->free_count);
70 *ptr = 0;
71 fetch_and_inc(&buf->failed_writes);
72 return 0;
73 } else {
74 idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
75 buf->slots[idx] = SLOT_BUSY;
76 *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
77 return 1;
78 }
79}
80
81static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
82{
83 unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
84 buf->slots[idx] = SLOT_READY;
85}
86
87
88/* exclusive reader access is assumed */
89static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
90{
91 unsigned int idx;
92 if (buf->free_count == buf->slot_count)
93 /* nothing available */
94 return 0;
95 idx = buf->read_idx % buf->slot_count;
96 if (buf->slots[idx] == SLOT_READY) {
97 memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
98 buf->slot_size);
99 buf->slots[idx] = SLOT_FREE;
100 buf->read_idx++;
101 fetch_and_inc(&buf->free_count);
102 return 1;
103 } else
104 return 0;
105}
106
107
108#endif
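
Because this header only depends on UINT_MAX and memcpy (plus x86 for the xaddl-based atomics), it can be exercised from a single-threaded user-space program. A minimal sketch, assuming the header has been copied out of the tree as "feather_buffer.h"; a slot count of 8 satisfies the divide-UINT_MAX+1 requirement noted in init_ft_buffer().

/* Minimal single-threaded exercise of the ft_buffer API above. */
#include <limits.h>
#include <string.h>
#include <stdio.h>

#include "feather_buffer.h"   /* assumed local copy of the header above */

#define SLOTS 8

int main(void)
{
	struct ft_buffer buf;
	static char slot_states[SLOTS];
	static unsigned long slot_mem[SLOTS];
	unsigned long *in;
	unsigned long out;
	int i;

	if (!init_ft_buffer(&buf, SLOTS, sizeof(unsigned long),
			    slot_states, slot_mem))
		return 1;

	/* writer side: reserve a slot, fill it, mark it ready */
	for (i = 0; i < 3; i++) {
		if (ft_buffer_start_write(&buf, (void**) &in)) {
			*in = 100 + i;
			ft_buffer_finish_write(&buf, in);
		}
	}

	/* reader side: drain whatever is ready */
	while (ft_buffer_read(&buf, &out))
		printf("read %lu\n", out);

	return 0;
}
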
diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
new file mode 100644
index 0000000000..5c37ea71ea
--- /dev/null
+++ b/include/litmus/feather_trace.h
@@ -0,0 +1,93 @@
1#ifndef _FEATHER_TRACE_H_
2#define _FEATHER_TRACE_H_
3
4#define feather_callback __attribute__((regparm(0)))
5
6/* make the compiler reload any register that is not saved in
7 * a cdecl function call
8 */
9#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
10
11#define ft_event(id, callback) \
12 __asm__ __volatile__( \
13 "1: jmp 2f \n\t" \
14 " call " #callback " \n\t" \
15 ".section __event_table, \"aw\" \n\t" \
16 ".long " #id ", 0, 1b, 2f \n\t" \
17 ".previous \n\t" \
18 "2: \n\t" \
19 : : : CLOBBER_LIST)
20
21#define ft_event0(id, callback) \
22 __asm__ __volatile__( \
23 "1: jmp 2f \n\t" \
24 " subl $4, %%esp \n\t" \
25 " movl $" #id ", (%%esp) \n\t" \
26 " call " #callback " \n\t" \
27 " addl $4, %%esp \n\t" \
28 ".section __event_table, \"aw\" \n\t" \
29 ".long " #id ", 0, 1b, 2f \n\t" \
30 ".previous \n\t" \
31 "2: \n\t" \
32 : : : CLOBBER_LIST)
33
34#define ft_event1(id, callback, param) \
35 __asm__ __volatile__( \
36 "1: jmp 2f \n\t" \
37 " subl $8, %%esp \n\t" \
38 " movl %0, 4(%%esp) \n\t" \
39 " movl $" #id ", (%%esp) \n\t" \
40 " call " #callback " \n\t" \
41 " addl $8, %%esp \n\t" \
42 ".section __event_table, \"aw\" \n\t" \
43 ".long " #id ", 0, 1b, 2f \n\t" \
44 ".previous \n\t" \
45 "2: \n\t" \
46 : : "r" (param) : CLOBBER_LIST)
47
48#define ft_event2(id, callback, param, param2) \
49 __asm__ __volatile__( \
50 "1: jmp 2f \n\t" \
51 " subl $12, %%esp \n\t" \
52 " movl %1, 8(%%esp) \n\t" \
53 " movl %0, 4(%%esp) \n\t" \
54 " movl $" #id ", (%%esp) \n\t" \
55 " call " #callback " \n\t" \
56 " addl $12, %%esp \n\t" \
57 ".section __event_table, \"aw\" \n\t" \
58 ".long " #id ", 0, 1b, 2f \n\t" \
59 ".previous \n\t" \
60 "2: \n\t" \
61 : : "r" (param), "r" (param2) : CLOBBER_LIST)
62
63
64#define ft_event3(id, callback, p, p2, p3) \
65 __asm__ __volatile__( \
66 "1: jmp 2f \n\t" \
67 " subl $16, %%esp \n\t" \
68 " movl %1, 12(%%esp) \n\t" \
69 " movl %1, 8(%%esp) \n\t" \
70 " movl %0, 4(%%esp) \n\t" \
71 " movl $" #id ", (%%esp) \n\t" \
72 " call " #callback " \n\t" \
73 " addl $16, %%esp \n\t" \
74 ".section __event_table, \"aw\" \n\t" \
75 ".long " #id ", 0, 1b, 2f \n\t" \
76 ".previous \n\t" \
77 "2: \n\t" \
78 : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST)
79
80
81static inline unsigned long long ft_read_tsc(void)
82{
83 unsigned long long ret;
84 __asm__ __volatile__("rdtsc" : "=A" (ret));
85 return ret;
86}
87
88int ft_enable_event(unsigned long id);
89int ft_disable_event(unsigned long id);
90int ft_is_event_enabled(unsigned long id);
91int ft_disable_all_events(void);
92
93#endif
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
new file mode 100644
index 0000000000..6e99e651d7
--- /dev/null
+++ b/include/litmus/litmus.h
@@ -0,0 +1,192 @@
1/*
2 * Constant definitions related to
3 * scheduling policy.
4 */
5
6#ifndef _LINUX_LITMUS_H_
7#define _LINUX_LITMUS_H_
8
9#include <linux/jiffies.h>
10#include <litmus/sched_trace.h>
11
12typedef enum {
13 SCHED_LINUX = 0,
14 SCHED_GSN_EDF = 10,
15 SCHED_PSN_EDF = 11,
16 /* Add your scheduling policy here */
17
18 SCHED_DEFAULT = 0,
19 SCHED_INVALID = -1,
20} spolicy;
21
22
23typedef enum {
24 LITMUS_RESERVED_RANGE = 1024,
25
26} sched_setup_cmd_t;
27
28/* per-task modes */
29enum rt_task_mode_t {
30 BACKGROUND_TASK = 0,
31 LITMUS_RT_TASK = 1
32};
33
34/* Plugin boot options, for convenience */
35#define PLUGIN_LINUX "linux"
36#define PLUGIN_GSN_EDF "gsn_edf"
37#define PLUGIN_PSN_EDF "psn_edf"
38
39extern spolicy sched_policy;
40
41/* RT mode start time */
42extern volatile unsigned long rt_start_time;
43
44#define TRACE(fmt, args...) \
45 sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args)
46
47#define TRACE_TASK(t, fmt, args...) \
48 TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
49
50#define TRACE_CUR(fmt, args...) \
51 TRACE_TASK(current, fmt, ## args)
52
53#define TRACE_BUG_ON(cond) \
54 do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
55 "called from %p current=%s/%d state=%d " \
56 "flags=%x partition=%d cpu=%d rtflags=%d"\
57 " job=%u knp=%d timeslice=%u\n", \
58 #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
59 current->pid, current->state, current->flags, \
60 get_partition(current), smp_processor_id(), get_rt_flags(current), \
61 current->rt_param.job_params.job_no, current->rt_param.kernel_np, \
62 current->time_slice\
63 ); } while(0);
64
65
66/* in_list - is a given list_head queued on some list?
67 */
68static inline int in_list(struct list_head* list)
69{
70 return !( /* case 1: deleted */
71 (list->next == LIST_POISON1 &&
72 list->prev == LIST_POISON2)
73 ||
74 /* case 2: initialized */
75 (list->next == list &&
76 list->prev == list)
77 );
78}
79
80typedef int (*list_cmp_t)(struct list_head*, struct list_head*);
81
82static inline unsigned int list_insert(struct list_head* new,
83 struct list_head* head,
84 list_cmp_t order_before)
85{
86 struct list_head *pos;
87 unsigned int passed = 0;
88
89 BUG_ON(!new);
90
91 /* find a spot where the new entry is less than the next */
92 list_for_each(pos, head) {
93 if (unlikely(order_before(new, pos))) {
94 /* pos is not less than new, thus insert here */
95 __list_add(new, pos->prev, pos);
96 goto out;
97 }
98 passed++;
99 }
100 /* if we get to this point either the list is empty or every
101 * queued element is less than new.
102 * Let's add new to the end. */
103 list_add_tail(new, head);
104 out:
105 return passed;
106}
107
108void list_qsort(struct list_head* list, list_cmp_t less_than);
109
110
111#define RT_PREEMPTIVE 0x2050 /* = NP */
112#define RT_NON_PREEMPTIVE 0x4e50 /* = P */
113#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */
114
115/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE
116 */
117int is_np(struct task_struct *t);
118
119/* request that the task should call sys_exit_np()
120 */
121void request_exit_np(struct task_struct *t);
122
123/* kill naughty tasks
124 */
125void scheduler_signal(struct task_struct *t, unsigned int signal);
126void send_scheduler_signals(void);
127void np_mem_kill(struct task_struct *t);
128
129void litmus_fork(struct task_struct *tsk);
130void litmus_exec(void);
131/* clean up real-time state of a task */
132void exit_litmus(struct task_struct *dead_tsk);
133
134long litmus_admit_task(struct task_struct *tsk);
135void litmus_exit_task(struct task_struct *tsk);
136
137#define is_realtime(t) ((t)->policy == SCHED_LITMUS)
138#define rt_transition_pending(t) \
139 ((t)->rt_param.transition_pending)
140
141/* Realtime utility macros */
142#define get_rt_flags(t) ((t)->rt_param.flags)
143#define set_rt_flags(t,f) (t)->rt_param.flags=(f)
144#define get_exec_cost(t) ((t)->rt_param.task_params.exec_cost)
145#define get_exec_time(t) ((t)->rt_param.job_params.exec_time)
146#define get_rt_period(t) ((t)->rt_param.task_params.period)
147#define get_partition(t) (t)->rt_param.task_params.cpu
148#define get_deadline(t) ((t)->rt_param.job_params.deadline)
149#define get_class(t) ((t)->rt_param.task_params.cls)
150
151inline static int budget_exhausted(struct task_struct* t)
152{
153 return get_exec_time(t) >= get_exec_cost(t);
154}
155
156#define is_subject_to_srp(t) ((t)->rt_param.subject_to_srp)
157#define is_hrt(t) \
158 ((t)->rt_param.task_params.cls == RT_CLASS_HARD)
159#define is_srt(t) \
160 ((t)->rt_param.task_params.cls == RT_CLASS_SOFT)
161#define is_be(t) \
162 ((t)->rt_param.task_params.cls == RT_CLASS_BEST_EFFORT)
163
164#define get_release(t) ((t)->rt_param.job_params.release)
165
166/* Honor the flag in the preempt_count variable that is set
167 * when scheduling is in progress.
168 */
169#define is_running(t) \
170 ((t)->state == TASK_RUNNING || \
171 task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
172
173#define is_blocked(t) \
174 (!is_running(t))
175#define is_released(t, now) \
176 (lt_before_eq(get_release(t), now))
177#define is_tardy(t, now) \
178 (lt_before_eq((t)->rt_param.job_params.deadline, now))
179
180/* real-time comparison macros */
181#define earlier_deadline(a, b) (lt_before(\
182 (a)->rt_param.job_params.deadline,\
183 (b)->rt_param.job_params.deadline))
184#define earlier_release(a, b) (lt_before(\
185 (a)->rt_param.job_params.release,\
186 (b)->rt_param.job_params.release))
187
188#define make_np(t) do {t->rt_param.kernel_np++;} while(0);
189#define take_np(t) do {t->rt_param.kernel_np--;} while(0);
190
191
192#endif
diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
new file mode 100644
index 0000000000..79b6034f22
--- /dev/null
+++ b/include/litmus/rt_domain.h
@@ -0,0 +1,94 @@
1/* CLEANUP: Add comments and make it less messy.
2 *
3 */
4
5#ifndef __UNC_RT_DOMAIN_H__
6#define __UNC_RT_DOMAIN_H__
7
8struct _rt_domain;
9
10typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
11typedef void (*release_at_t)(struct task_struct *t, lt_t start);
12
13typedef struct _rt_domain {
14 /* runnable rt tasks are in here */
15 rwlock_t ready_lock;
16 struct list_head ready_queue;
17
18 /* real-time tasks waiting for release are in here */
19 spinlock_t release_lock;
20 struct list_head release_queue;
21
22 /* how do we check if we need to kick another CPU? */
23 check_resched_needed_t check_resched;
24
25 /* how are tasks ordered in the ready queue? */
26 list_cmp_t order;
27} rt_domain_t;
28
29#define next_ready(rt) \
30 (list_entry((rt)->ready_queue.next, struct task_struct, rt_list))
31
32#define ready_jobs_pending(rt) \
33 (!list_empty(&(rt)->ready_queue))
34
35void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f,
36 list_cmp_t order);
37
38void __add_ready(rt_domain_t* rt, struct task_struct *new);
39void __add_release(rt_domain_t* rt, struct task_struct *task);
40
41struct task_struct* __take_ready(rt_domain_t* rt);
42struct task_struct* __peek_ready(rt_domain_t* rt);
43
44void try_release_pending(rt_domain_t* rt);
45void __release_pending(rt_domain_t* rt);
46
47static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
48{
49 unsigned long flags;
50 /* first we need the write lock for rt_ready_queue */
51 write_lock_irqsave(&rt->ready_lock, flags);
52 __add_ready(rt, new);
53 write_unlock_irqrestore(&rt->ready_lock, flags);
54}
55
56static inline struct task_struct* take_ready(rt_domain_t* rt)
57{
58 unsigned long flags;
59 struct task_struct* ret;
60 /* first we need the write lock for rt_ready_queue */
61 write_lock_irqsave(&rt->ready_lock, flags);
62 ret = __take_ready(rt);
63 write_unlock_irqrestore(&rt->ready_lock, flags);
64 return ret;
65}
66
67
68static inline void add_release(rt_domain_t* rt, struct task_struct *task)
69{
70 unsigned long flags;
71 /* first we need the write lock for rt_ready_queue */
72 spin_lock_irqsave(&rt->release_lock, flags);
73 __add_release(rt, task);
74 spin_unlock_irqrestore(&rt->release_lock, flags);
75}
76
77static inline int __jobs_pending(rt_domain_t* rt)
78{
79 return !list_empty(&rt->ready_queue);
80}
81
82static inline int jobs_pending(rt_domain_t* rt)
83{
84 unsigned long flags;
85 int ret;
86 /* first we need the write lock for rt_ready_queue */
87 read_lock_irqsave(&rt->ready_lock, flags);
88 ret = __jobs_pending(rt);
89 read_unlock_irqrestore(&rt->ready_lock, flags);
90 return ret;
91}
92
93
94#endif
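
A kernel-side sketch of how a plugin might drive an rt_domain_t using the helpers above together with edf_domain_init() from later in this patch. The function names are made up and the resched callback is a stub; a real plugin would trigger a reschedule on the affected CPU.

/* Kernel-context sketch only; names are hypothetical. */
#include <litmus/litmus.h>
#include <litmus/rt_domain.h>
#include <litmus/edf_common.h>

static rt_domain_t demo_domain;

static int demo_check_resched(rt_domain_t *rt)
{
	/* a real plugin would set need_resched on some CPU here */
	return 0;
}

static void demo_domain_setup(void)
{
	/* order the ready queue by EDF priority */
	edf_domain_init(&demo_domain, demo_check_resched);
}

static struct task_struct* demo_pick_next(void)
{
	/* would be called from the plugin's schedule() hook; takes the
	 * ready lock internally and dequeues the highest-priority job
	 */
	return take_ready(&demo_domain);
}
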
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
new file mode 100644
index 0000000000..9fb5b19b78
--- /dev/null
+++ b/include/litmus/rt_param.h
@@ -0,0 +1,135 @@
1/*
2 * Definition of the scheduler plugin interface.
3 *
4 */
5#ifndef _LINUX_RT_PARAM_H_
6#define _LINUX_RT_PARAM_H_
7
8/* Litmus time type. */
9typedef unsigned long long lt_t;
10
11static inline int lt_after(lt_t a, lt_t b)
12{
13 return ((long long) b) - ((long long) a) < 0;
14}
15#define lt_before(a, b) lt_after(b, a)
16
17static inline int lt_after_eq(lt_t a, lt_t b)
18{
19 return ((long long) a) - ((long long) b) >= 0;
20}
21#define lt_before_eq(a, b) lt_after_eq(b, a)
22
23/* different types of clients */
24typedef enum {
25 RT_CLASS_HARD,
26 RT_CLASS_SOFT,
27 RT_CLASS_BEST_EFFORT
28} task_class_t;
29
30struct rt_task {
31 lt_t exec_cost;
32 lt_t period;
33 unsigned int cpu;
34 task_class_t cls;
35};
36
37/* don't export internal data structures to user space (liblitmus) */
38#ifdef __KERNEL__
39
40struct rt_job {
41 /* Time instant at which the job was or will be released. */
42 lt_t release;
43 /* What is the current deadline? */
44 lt_t deadline;
45 /* How much service has this job received so far?
46 */
47 lt_t exec_time;
48
49 /* Which job is this. This is used to let user space
50 * specify which job to wait for, which is important if jobs
51 * overrun. If we just call sys_sleep_next_period() then we
52 * will unintentionally miss jobs after an overrun.
53 *
54 * Increase this sequence number when a job is released.
55 */
56 unsigned int job_no;
57
58 /* when did this job start executing? */
59 lt_t exec_start;
60};
61
62
63/* RT task parameters for scheduling extensions
64 * These parameters are inherited during clone and therefore must
65 * be explicitly set up before the task set is launched.
66 */
67struct rt_param {
68 /* is the task sleeping? */
69 unsigned int flags:8;
70
71 /* Did this task register any SRP controlled resource accesses?
72 * This, of course, should only ever be true under partitioning.
73 * However, this limitation is not currently enforced.
74 */
75 unsigned int subject_to_srp:1;
76
77 /* user controlled parameters */
78 struct rt_task task_params;
79
80 /* timing parameters */
81 struct rt_job job_params;
82
83 /* task representing the current "inherited" task
84 * priority, assigned by inherit_priority and
85 * return priority in the scheduler plugins.
86 * could point to self if PI does not result in
87 * an increased task priority.
88 */
89 struct task_struct* inh_task;
90
91 /* Don't just dereference this pointer in kernel space!
92 * It might very well point to junk or nothing at all.
93 * NULL indicates that the task has not requested any non-preemptable
94 * section support.
95 * Not inherited upon fork.
96 */
97 short* np_flag;
98
99 /* For the FMLP under PSN-EDF, it is required to make the task
100 * non-preemptive from kernel space. In order not to interfere with
101 * user space, this counter indicates the kernel space np setting.
102 * kernel_np > 0 => task is non-preemptive
103 */
104 unsigned int kernel_np;
105
106 /* This field can be used by plugins to store where the task
107 * is currently scheduled. It is the responsibility of the
108 * plugin to avoid race conditions.
109 *
110 * Used by GSN-EDF.
111 */
112 volatile int scheduled_on;
113
114 /* This field can be used by plugins to store where the task
115 * is currently linked. It is the responsibility of the plugin
116 * to avoid race conditions.
117 *
118 * Used by GSN-EDF.
119 */
120 volatile int linked_on;
121
122 /* Fields saved before BE->RT transition.
123 */
124 int old_policy;
125 int old_prio;
126};
127
128/* Possible RT flags */
129#define RT_F_RUNNING 0x00000000
130#define RT_F_SLEEP 0x00000001
131#define RT_F_EXIT_SEM 0x00000008
132
133#endif
134
135#endif
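
The lt_t comparison helpers rely on signed subtraction, so they stay correct even if the time counter wraps around. A small user-space check (the helpers are repeated here only to make the snippet standalone; the time values are made up):

/* Quick check of the wraparound behaviour of the lt_t helpers above. */
#include <assert.h>
#include <stdio.h>

typedef unsigned long long lt_t;

static inline int lt_after(lt_t a, lt_t b)
{
	return ((long long) b) - ((long long) a) < 0;
}
#define lt_before(a, b) lt_after(b, a)

int main(void)
{
	lt_t before_wrap = ~0ULL - 10;  /* just below the 64-bit limit */
	lt_t after_wrap  = 5;           /* clock has wrapped around */

	/* a plain unsigned comparison would get this wrong */
	assert(lt_after(after_wrap, before_wrap));
	assert(lt_before(before_wrap, after_wrap));

	printf("lt_t comparisons handle wraparound as expected\n");
	return 0;
}
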
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
new file mode 100644
index 0000000000..421c54f517
--- /dev/null
+++ b/include/litmus/sched_plugin.h
@@ -0,0 +1,118 @@
1/*
2 * Definition of the scheduler plugin interface.
3 *
4 */
5#ifndef _LINUX_SCHED_PLUGIN_H_
6#define _LINUX_SCHED_PLUGIN_H_
7
8#include <linux/sched.h>
9
10/* struct for semaphore with priority inheritance */
11struct pi_semaphore {
12 atomic_t count;
13 int sleepers;
14 wait_queue_head_t wait;
15 union {
16 /* highest-prio holder/waiter */
17 struct task_struct *task;
18 struct task_struct* cpu_task[NR_CPUS];
19 } hp;
20 /* current lock holder */
21 struct task_struct *holder;
22};
23
24
25/********************* scheduler invocation ******************/
26
27/* Plugin-specific realtime tick handler */
28typedef void (*scheduler_tick_t) (struct task_struct *cur);
29/* Novell make sched decision function */
30typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
31/* Clean up after the task switch has occurred.
32 * This function is called after every (even non-rt) task switch.
33 */
34typedef void (*finish_switch_t)(struct task_struct *prev);
35
36
37/********************* task state changes ********************/
38
39/* Called to setup a new real-time task.
40 * Release the first job, enqueue, etc.
41 * Task may already be running.
42 */
43typedef void (*task_new_t) (struct task_struct *task,
44 int on_rq,
45 int running);
46
47/* Called to re-introduce a task after blocking.
48 * Can potentially be called multiple times.
49 */
50typedef void (*task_wake_up_t) (struct task_struct *task);
51/* called to notify the plugin of a blocking real-time task
52 * it will only be called for real-time tasks and before schedule is called */
53typedef void (*task_block_t) (struct task_struct *task);
54/* Called when a real-time task exits or changes to a different scheduling
55 * class.
56 * Free any allocated resources
57 */
58typedef void (*task_exit_t) (struct task_struct *);
59
60/* Called when the new_owner is released from the wait queue
61 * it should now inherit the priority from sem, _before_ it gets readded
62 * to any queue
63 */
64typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
65 struct task_struct *new_owner);
66
67/* Called when the current task releases a semaphore from which it might
68 * have inherited a priority
69 */
70typedef long (*return_priority_t) (struct pi_semaphore *sem);
71
72/* Called when a task tries to acquire a semaphore and fails. Check if its
73 * priority is higher than that of the current holder.
74 */
75typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
76
77
78/********************* sys call backends ********************/
79/* This function causes the caller to sleep until the next release */
80typedef long (*complete_job_t) (void);
81
82typedef long (*admit_task_t)(struct task_struct* tsk);
83
84struct sched_plugin {
85 struct list_head list;
86 /* basic info */
87 char *plugin_name;
88
89 /* scheduler invocation */
90 scheduler_tick_t tick;
91 schedule_t schedule;
92 finish_switch_t finish_switch;
93
94 /* syscall backend */
95 complete_job_t complete_job;
96
97 /* task state changes */
98 admit_task_t admit_task;
99
100 task_new_t task_new;
101 task_wake_up_t task_wake_up;
102 task_block_t task_block;
103 task_exit_t task_exit;
104
105 /* priority inheritance */
106 inherit_priority_t inherit_priority;
107 return_priority_t return_priority;
108 pi_block_t pi_block;
109} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
110
111
112extern struct sched_plugin *litmus;
113
114int register_sched_plugin(struct sched_plugin* plugin);
115struct sched_plugin* find_sched_plugin(const char* name);
116int print_sched_plugins(char* buf, int max);
117
118#endif
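
A kernel-side sketch of a do-nothing plugin against the interface above. The name "demo" and the stub bodies are made up; stubs are supplied for every hook because sched.c in this patch calls litmus->finish_switch() and litmus->task_new() unconditionally, so leaving callbacks NULL is assumed to be unsafe.

/* Kernel-context sketch only; "demo" is hypothetical. */
#include <linux/init.h>
#include <linux/errno.h>
#include <litmus/sched_plugin.h>

static void demo_tick(struct task_struct *cur) { }
static struct task_struct* demo_schedule(struct task_struct *prev)
{
	return NULL;   /* nothing to schedule: fall back to Linux tasks */
}
static void demo_finish_switch(struct task_struct *prev) { }
static long demo_complete_job(void) { return 0; }
static long demo_admit_task(struct task_struct *tsk) { return -EINVAL; }
static void demo_task_new(struct task_struct *t, int on_rq, int running) { }
static void demo_task_wake_up(struct task_struct *t) { }
static void demo_task_block(struct task_struct *t) { }
static void demo_task_exit(struct task_struct *t) { }
static long demo_inherit_priority(struct pi_semaphore *sem,
				  struct task_struct *new_owner) { return 0; }
static long demo_return_priority(struct pi_semaphore *sem) { return 0; }
static long demo_pi_block(struct pi_semaphore *sem,
			  struct task_struct *t) { return 0; }

static struct sched_plugin demo_plugin = {
	.plugin_name      = "demo",
	.tick             = demo_tick,
	.schedule         = demo_schedule,
	.finish_switch    = demo_finish_switch,
	.complete_job     = demo_complete_job,
	.admit_task       = demo_admit_task,
	.task_new         = demo_task_new,
	.task_wake_up     = demo_task_wake_up,
	.task_block       = demo_task_block,
	.task_exit        = demo_task_exit,
	.inherit_priority = demo_inherit_priority,
	.return_priority  = demo_return_priority,
	.pi_block         = demo_pi_block,
};

static int __init demo_plugin_init(void)
{
	return register_sched_plugin(&demo_plugin);
}
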
diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
new file mode 100644
index 0000000000..60dcbfb0ae
--- /dev/null
+++ b/include/litmus/sched_trace.h
@@ -0,0 +1,31 @@
1/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
2 */
3#ifndef _LINUX_SCHED_TRACE_H_
4#define _LINUX_SCHED_TRACE_H_
5
6#include <linux/sched.h>
7
8/* dummies, need to be re-implemented */
9
10/* used in sched.c */
11#define sched_trace_task_arrival(t)
12#define sched_trace_task_departure(t)
13#define sched_trace_task_preemption(t, by)
14#define sched_trace_task_scheduled(t)
15
16/* used in scheduler plugins */
17#define sched_trace_job_release(t)
18#define sched_trace_job_completion(t)
19
20
21#ifdef CONFIG_SCHED_DEBUG_TRACE
22void sched_trace_log_message(const char* fmt, ...);
23
24#else
25
26#define sched_trace_log_message(fmt, ...)
27
28#endif
29
30
31#endif
diff --git a/include/litmus/trace.h b/include/litmus/trace.h
new file mode 100644
index 0000000000..04510237ec
--- /dev/null
+++ b/include/litmus/trace.h
@@ -0,0 +1,74 @@
1
2#ifndef _SYS_TRACE_H_
3#define _SYS_TRACE_H_
4
5#include <litmus/feather_trace.h>
6#include <litmus/feather_buffer.h>
7
8
9/*********************** TIMESTAMPS ************************/
10
11struct timestamp {
12 unsigned long event;
13 unsigned long long timestamp;
14 unsigned int seq_no;
15 int cpu;
16};
17
18
19/* buffer holding time stamps - will be provided by driver */
20extern struct ft_buffer* trace_ts_buf;
21
22/* save_timestamp: stores current time as struct timestamp
23 * in trace_ts_buf
24 */
25asmlinkage void save_timestamp(unsigned long event);
26
27#define TIMESTAMP(id) ft_event0(id, save_timestamp)
28
29/* Convention for timestamps
30 * =========================
31 *
32 * In order to process the trace files with a common tool, we use the following
33 * convention to measure execution times: The end time id of a code segment is
34 * always the next number after the start time event id.
35 */
36
37#define TS_SCHED_START TIMESTAMP(100)
38#define TS_SCHED_END TIMESTAMP(101)
39#define TS_CXS_START TIMESTAMP(102)
40#define TS_CXS_END TIMESTAMP(103)
41
42#define TS_TICK_START TIMESTAMP(110)
43#define TS_TICK_END TIMESTAMP(111)
44
45#define TS_PLUGIN_SCHED_START TIMESTAMP(120)
46#define TS_PLUGIN_SCHED_END TIMESTAMP(121)
47
48#define TS_PLUGIN_TICK_START TIMESTAMP(130)
49#define TS_PLUGIN_TICK_END TIMESTAMP(131)
50
51#define TS_ENTER_NP_START TIMESTAMP(140)
52#define TS_ENTER_NP_END TIMESTAMP(141)
53
54#define TS_EXIT_NP_START TIMESTAMP(150)
55#define TS_EXIT_NP_END TIMESTAMP(151)
56
57#define TS_SRP_UP_START TIMESTAMP(160)
58#define TS_SRP_UP_END TIMESTAMP(161)
59#define TS_SRP_DOWN_START TIMESTAMP(162)
60#define TS_SRP_DOWN_END TIMESTAMP(163)
61
62#define TS_PI_UP_START TIMESTAMP(170)
63#define TS_PI_UP_END TIMESTAMP(171)
64#define TS_PI_DOWN_START TIMESTAMP(172)
65#define TS_PI_DOWN_END TIMESTAMP(173)
66
67#define TS_FIFO_UP_START TIMESTAMP(180)
68#define TS_FIFO_UP_END TIMESTAMP(181)
69#define TS_FIFO_DOWN_START TIMESTAMP(182)
70#define TS_FIFO_DOWN_END TIMESTAMP(183)
71
72
73
74#endif /* !_SYS_TRACE_H_ */
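
A short kernel-side sketch of the start/end convention described above: a measured region is bracketed by a pair of TIMESTAMP events whose ids differ by one, so an offline tool can pair them up.

/* Kernel-context sketch only; the measured region is hypothetical. */
#include <litmus/trace.h>

static void demo_measured_region(void)
{
	TS_SCHED_START;   /* records event id 100 with a TSC timestamp */
	/* ... code being measured ... */
	TS_SCHED_END;     /* records the matching end event, id 101 */
}
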
diff --git a/kernel/exit.c b/kernel/exit.c
index 549c0558ba..bc313b74a1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,8 @@
 
 extern void sem_exit (void);
 
+extern void exit_od_table(struct task_struct* t);
+
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
@@ -987,6 +989,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
 
+	exit_od_table(tsk);
+
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff2810..9e42d3a207 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,6 +59,9 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -121,6 +124,8 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	exit_litmus(tsk);
+
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11ca6d..4890a12786 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,6 +324,8 @@ struct rq {
 
 	atomic_t nr_iowait;
 
+	struct task_struct* litmus_next;
+
 #ifdef CONFIG_SMP
 	struct sched_domain *sd;
 
@@ -875,11 +877,12 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "../litmus/sched_litmus.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&litmus_sched_class)
 
 /*
  * Update delta_exec, delta_fair fields for rq.
@@ -1529,7 +1532,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
+	if (unlikely(task_running(rq, p) || is_realtime(p)))
 		goto out_activate;
 
 	new_cpu = cpu;
@@ -1890,6 +1893,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	litmus->finish_switch(prev);
 	finish_lock_switch(rq, prev);
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
@@ -3491,6 +3495,7 @@ void scheduler_tick(void)
 	update_cpu_load(rq);
 	if (curr != rq->idle) /* FIXME: needed? */
 		curr->sched_class->task_tick(rq, curr);
+	litmus_tick(rq, curr);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -3641,6 +3646,10 @@ need_resched_nonpreemptible:
 	 */
 	local_irq_disable();
 	__update_rq_clock(rq);
+	/* do litmus scheduling outside of rq lock, so that we
+	 * can do proper migrations for global schedulers
+	 */
+	litmus_schedule(rq, prev);
 	spin_lock(&rq->lock);
 	clear_tsk_need_resched(prev);
 
@@ -4236,6 +4245,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	case SCHED_RR:
 		p->sched_class = &rt_sched_class;
 		break;
+	case SCHED_LITMUS:
+		p->sched_class = &litmus_sched_class;
+		break;
 	}
 
 	p->rt_priority = prio;
@@ -4268,7 +4280,7 @@ recheck:
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
 			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-			policy != SCHED_IDLE)
+			policy != SCHED_IDLE && policy != SCHED_LITMUS)
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4282,6 +4294,9 @@ recheck:
 	if (rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
 
+	if (policy == SCHED_LITMUS && policy == p->policy)
+		return -EINVAL;
+
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
@@ -4316,6 +4331,12 @@ recheck:
 			return -EPERM;
 	}
 
+	if (policy == SCHED_LITMUS) {
+		retval = litmus_admit_task(p);
+		if (retval)
+			return retval;
+	}
+
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
 		return retval;
@@ -4345,9 +4366,15 @@ recheck:
 		p->sched_class->put_prev_task(rq, p);
 	}
 
+	if (p->policy == SCHED_LITMUS)
+		litmus_exit_task(p);
+
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (policy == SCHED_LITMUS)
+		litmus->task_new(p, on_rq, running);
+
 	if (on_rq) {
 		if (running)
 			p->sched_class->set_curr_task(rq);
@@ -4364,6 +4391,7 @@ recheck:
 			check_preempt_curr(rq, p);
 		}
 	}
+
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da7c061e72..de30496263 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -845,7 +845,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	unsigned long gran;
 
-	if (unlikely(rt_prio(p->prio))) {
+	if (unlikely(rt_prio(p->prio) || p->policy == SCHED_LITMUS)) {
 		update_rq_clock(rq);
 		update_curr(cfs_rq);
 		resched_task(curr);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9ba3daa034..c7c938cee2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -70,7 +70,7 @@ yield_task_rt(struct rq *rq)
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 {
-	if (p->prio < rq->curr->prio)
+	if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS)
 		resched_task(rq->curr);
 }
 
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 0000000000..e6c5469d70
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,24 @@
1menu "LITMUS^RT"
2
3config SCHED_TASK_TRACE
4 bool "Trace real-time tasks"
5 default y
6 help
7 Include support for the sched_trace_XXX() tracing functions. This
8 allows the collection of real-time task events such as job
9 completions, job releases, early completions, etc. This results in a
10 small overhead in the scheduling code. Disable if the overhead is not
11 acceptable (e.g., benchmarking).
12
13config SCHED_DEBUG_TRACE
14 bool "TRACE() debugging"
15 default y
16 help
17 Include support for sched_trace_log_message(), which is used to
18 implement TRACE(). If disabled, no TRACE() messages will be included
19 in the kernel, and no overheads due to debugging statements will be
20 incurred by the scheduler. Disable if the overhead is not acceptable
21 (e.g. benchmarking).
22
23
24endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 0000000000..4ad854f117
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for LITMUS^RT
3#
4
5obj-y = sched_plugin.o litmus.o sched_trace.o \
6 edf_common.o \
7 sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \
8 trace.o ft_event.o rt_domain.o fdso.o
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 0000000000..3d9dca852d
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,132 @@
1/*
2 * kernel/edf_common.c
3 *
4 * Common functions for EDF based scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15
16#include <litmus/edf_common.h>
17
18/* edf_higher_prio - returns true if first has a higher EDF priority
19 * than second. Deadline ties are broken by PID.
20 *
21 * first must not be NULL and must be a real-time task.
22 * second may be NULL or a non-rt task.
23 */
24int edf_higher_prio(struct task_struct* first,
25 struct task_struct* second)
26{
27 struct task_struct *first_task = first;
28 struct task_struct *second_task = second;
29
30 /* Check for inherited priorities. Change task
31 * used for comparison in such a case.
32 */
33 if (first && first->rt_param.inh_task)
34 first_task = first->rt_param.inh_task;
35 if (second && second->rt_param.inh_task)
36 second_task = second->rt_param.inh_task;
37
38 return
39 /* does the second task exist and is it a real-time task? If
40 * not, the first task (which is a RT task) has higher
41 * priority.
42 */
43 !second_task || !is_realtime(second_task) ||
44
45 /* is the deadline of the first task earlier?
46 * Then it has higher priority.
47 */
48 earlier_deadline(first_task, second_task) ||
49
50 /* Do we have a deadline tie?
51 * Then break by PID.
52 */
53 (get_deadline(first_task) == get_deadline(second_task) &&
54 (first_task->pid < second_task->pid ||
55
56 /* If the PIDs are the same then the task with the inherited
57 * priority wins.
58 */
59 (first_task->pid == second_task->pid &&
60 !second->rt_param.inh_task)));
61}
62
63int edf_ready_order(struct list_head* a, struct list_head* b)
64{
65 return edf_higher_prio(
66 list_entry(a, struct task_struct, rt_list),
67 list_entry(b, struct task_struct, rt_list));
68}
69
70void edf_release_at(struct task_struct *t, lt_t start)
71{
72 t->rt_param.job_params.deadline = start;
73 edf_prepare_for_next_period(t);
74 set_rt_flags(t, RT_F_RUNNING);
75}
76
77void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
78{
79 rt_domain_init(rt, resched, edf_ready_order);
80}
81
82void edf_prepare_for_next_period(struct task_struct *t)
83{
84 BUG_ON(!t);
85 /* prepare next release */
86 t->rt_param.job_params.release = t->rt_param.job_params.deadline;
87 t->rt_param.job_params.deadline += get_rt_period(t);
88 t->rt_param.job_params.exec_time = 0;
89 /* update job sequence number */
90 t->rt_param.job_params.job_no++;
91
92 /* don't confuse Linux */
93 t->time_slice = 1;
94}
95
96/* need_to_preempt - check whether the task t needs to be preempted
97 * call only with irqs disabled and with ready_lock acquired
98 * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
99 */
100int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
101{
102 /* we need the read lock for edf_ready_queue */
103 /* no need to preempt if there is nothing pending */
104 if (!ready_jobs_pending(rt))
105 return 0;
106 /* we need to reschedule if t doesn't exist */
107 if (!t)
108 return 1;
109
110 /* NOTE: We cannot check for non-preemptibility since we
111 * don't know what address space we're currently in.
112 */
113
114 /* make sure to get non-rt stuff out of the way */
115 return !is_realtime(t) || edf_higher_prio(next_ready(rt), t);
116}
117
118
119/*
120 * Deactivate current task until the beginning of the next period.
121 */
122long edf_complete_job(void)
123{
124 /* Mark that we do not execute anymore */
125 set_rt_flags(current, RT_F_SLEEP);
126 /* call schedule, this will return when a new job arrives
127 * it also takes care of preparing for the next release
128 */
129 schedule();
130 return 0;
131}
132
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 0000000000..ca9557d877
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,279 @@
1/* fdso.c - file descriptor attached shared objects
2 *
3 * (c) 2007 B. Brandenburg, LITMUS^RT project
4 *
5 * Notes:
6 * - objects descriptor (OD) tables are not cloned during a fork.
7 * - objects are created on-demand, and freed after the last reference
8 * is dropped.
9 * - for now, object types are hard coded.
10 * - As long as we have live objects, we keep a reference to the inode.
11 */
12
13#include <linux/errno.h>
14#include <linux/sched.h>
15#include <linux/mutex.h>
16#include <linux/file.h>
17#include <asm/uaccess.h>
18
19#include <litmus/fdso.h>
20
21extern struct fdso_ops pi_sem_ops;
22extern struct fdso_ops srp_sem_ops;
23
24static const struct fdso_ops* fdso_ops[] = {
25 &pi_sem_ops,
26 &srp_sem_ops,
27};
28
29static void* fdso_create(obj_type_t type)
30{
31 return fdso_ops[type]->create();
32}
33
34static void fdso_destroy(obj_type_t type, void* obj)
35{
36 fdso_ops[type]->destroy(obj);
37}
38
39static int fdso_open(struct od_table_entry* entry, void* __user config)
40{
41 if (fdso_ops[entry->obj->type]->open)
42 return fdso_ops[entry->obj->type]->open(entry, config);
43 else
44 return 0;
45}
46
47static int fdso_close(struct od_table_entry* entry)
48{
49 if (fdso_ops[entry->obj->type]->close)
50 return fdso_ops[entry->obj->type]->close(entry);
51 else
52 return 0;
53}
54
55/* inode must be locked already */
56static struct inode_obj_id* alloc_inode_obj(struct inode* inode,
57 obj_type_t type,
58 unsigned int id)
59{
60 struct inode_obj_id* obj;
61 void* raw_obj;
62
63 raw_obj = fdso_create(type);
64 if (!raw_obj)
65 return NULL;
66
67 obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL);
68 if (!obj)
69 return NULL;
70 INIT_LIST_HEAD(&obj->list);
71 atomic_set(&obj->count, 1);
72 obj->type = type;
73 obj->id = id;
74 obj->obj = raw_obj;
75 obj->inode = inode;
76
77 list_add(&obj->list, &inode->i_obj_list);
78 atomic_inc(&inode->i_count);
79
80 printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
81 return obj;
82}
83
84/* inode must be locked already */
85static struct inode_obj_id* get_inode_obj(struct inode* inode,
86 obj_type_t type,
87 unsigned int id)
88{
89 struct list_head* pos;
90 struct inode_obj_id* obj = NULL;
91
92 list_for_each(pos, &inode->i_obj_list) {
93 obj = list_entry(pos, struct inode_obj_id, list);
94 if (obj->id == id && obj->type == type) {
95 atomic_inc(&obj->count);
96 return obj;
97 }
98 }
99 printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
100 return NULL;
101}
102
103
104static void put_inode_obj(struct inode_obj_id* obj)
105{
106 struct inode* inode;
107 int let_go = 0;
108
109 inode = obj->inode;
110 if (atomic_dec_and_test(&obj->count)) {
111
112 mutex_lock(&inode->i_obj_mutex);
113 /* no new references can be obtained */
114 if (!atomic_read(&obj->count)) {
115 list_del(&obj->list);
116 fdso_destroy(obj->type, obj->obj);
117 kfree(obj);
118 let_go = 1;
119 }
120 mutex_unlock(&inode->i_obj_mutex);
121 if (let_go)
122 iput(inode);
123 }
124}
125
126static struct od_table_entry* get_od_entry(struct task_struct* t)
127{
128 struct od_table_entry* table;
129 int i;
130
131
132 table = t->od_table;
133 if (!table) {
134 table = (struct od_table_entry*)
135 kzalloc(sizeof(struct od_table_entry) *
136 MAX_OBJECT_DESCRIPTORS, GFP_KERNEL);
137 t->od_table = table;
138 }
139
140 for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
141 if (!table[i].used) {
142 table[i].used = 1;
143 return table + i;
144 }
145 return NULL;
146}
147
148static int put_od_entry(struct od_table_entry* od)
149{
150 put_inode_obj(od->obj);
151 od->used = 0;
152 return 0;
153}
154
155void exit_od_table(struct task_struct* t)
156{
157 int i;
158
159 if (t->od_table) {
160 for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
161 if (t->od_table[i].used)
162 put_od_entry(t->od_table + i);
163 kfree(t->od_table);
164 t->od_table = NULL;
165 }
166}
167
168static int do_sys_od_open(struct file* file, obj_type_t type, int id,
169 void* __user config)
170{
171 int idx = 0, err;
172 struct inode* inode;
173 struct inode_obj_id* obj = NULL;
174 struct od_table_entry* entry;
175
176 inode = file->f_dentry->d_inode;
177
178 entry = get_od_entry(current);
179 if (!entry)
180 return -ENOMEM;
181
182 mutex_lock(&inode->i_obj_mutex);
183 obj = get_inode_obj(inode, type, id);
184 if (!obj)
185 obj = alloc_inode_obj(inode, type, id);
186 if (!obj) {
187 idx = -ENOMEM;
188 entry->used = 0;
189 } else {
190 entry->obj = obj;
191 entry->extra = NULL;
192 idx = entry - current->od_table;
193 }
194
195 mutex_unlock(&inode->i_obj_mutex);
196
197 err = fdso_open(entry, config);
198 if (err < 0) {
199 /* The class rejected the open call.
200 * We need to clean up and tell user space.
201 */
202 put_od_entry(entry);
203 idx = err;
204 }
205
206 return idx;
207}
208
209
210struct od_table_entry* __od_lookup(int od)
211{
212 struct task_struct *t = current;
213
214 if (!t->od_table)
215 return NULL;
216 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
217 return NULL;
218 if (!t->od_table[od].used)
219 return NULL;
220 return t->od_table + od;
221}
222
223
224asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config)
225{
226 int ret = 0;
227 struct file* file;
228
229 /*
230 1) get file from fd, get inode from file
231 2) lock inode
232 3) try to lookup object
233 4) if not present create and enqueue object, inc inode refcnt
234 5) increment refcnt of object
235 6) alloc od_table_entry, setup ptrs
236 7) unlock inode
237 8) return offset in od_table as OD
238 */
239
240 if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
241 ret = -EINVAL;
242 goto out;
243 }
244
245 file = fget(fd);
246 if (!file) {
247 ret = -EBADF;
248 goto out;
249 }
250
251 ret = do_sys_od_open(file, type, obj_id, config);
252
253 fput(file);
254
255out:
256 return ret;
257}
258
259
260asmlinkage int sys_od_close(int od)
261{
262 int ret = -EINVAL;
263 struct task_struct *t = current;
264
265 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
266 return ret;
267
268 if (!t->od_table || !t->od_table[od].used)
269 return ret;
270
271
272 /* give the class a chance to reject the close
273 */
274 ret = fdso_close(t->od_table + od);
275 if (ret == 0)
276 ret = put_od_entry(t->od_table + od);
277
278 return ret;
279}
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 0000000000..b1d80c52d7
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,104 @@
1#include <linux/types.h>
2
3#include <litmus/feather_trace.h>
4
5/* the feather trace management functions assume
6 * exclusive access to the event table
7 */
8
9
10#define BYTE_JUMP 0xeb
11#define BYTE_JUMP_LEN 0x02
12
13/* for each event, there is an entry in the event table */
14struct trace_event {
15 long id;
16 long count;
17 long start_addr;
18 long end_addr;
19};
20
21extern struct trace_event __start___event_table[];
22extern struct trace_event __stop___event_table[];
23
24int ft_enable_event(unsigned long id)
25{
26 struct trace_event* te = __start___event_table;
27 int count = 0;
28 char* delta;
29 unsigned char* instr;
30
31 while (te < __stop___event_table) {
32 if (te->id == id && ++te->count == 1) {
33 instr = (unsigned char*) te->start_addr;
34 /* make sure we don't clobber something wrong */
35 if (*instr == BYTE_JUMP) {
36 delta = (((unsigned char*) te->start_addr) + 1);
37 *delta = 0;
38 }
39 }
40 if (te->id == id)
41 count++;
42 te++;
43 }
44 return count;
45}
46
47int ft_disable_event(unsigned long id)
48{
49 struct trace_event* te = __start___event_table;
50 int count = 0;
51 char* delta;
52 unsigned char* instr;
53
54 while (te < __stop___event_table) {
55 if (te->id == id && --te->count == 0) {
56 instr = (unsigned char*) te->start_addr;
57 if (*instr == BYTE_JUMP) {
58 delta = (((unsigned char*) te->start_addr) + 1);
59 *delta = te->end_addr - te->start_addr -
60 BYTE_JUMP_LEN;
61 }
62 }
63 if (te->id == id)
64 count++;
65 te++;
66 }
67 return count;
68}
69
70int ft_disable_all_events(void)
71{
72 struct trace_event* te = __start___event_table;
73 int count = 0;
74 char* delta;
75 unsigned char* instr;
76
77 while (te < __stop___event_table) {
78 if (te->count) {
79 instr = (unsigned char*) te->start_addr;
80 if (*instr == BYTE_JUMP) {
81 delta = (((unsigned char*) te->start_addr)
82 + 1);
83 *delta = te->end_addr - te->start_addr -
84 BYTE_JUMP_LEN;
85 te->count = 0;
86 count++;
87 }
88 }
89 te++;
90 }
91 return count;
92}
93
94int ft_is_event_enabled(unsigned long id)
95{
96 struct trace_event* te = __start___event_table;
97
98 while (te < __stop___event_table) {
99 if (te->id == id)
100 return te->count;
101 te++;
102 }
103 return 0;
104}
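
The three management functions above all patch the same 2-byte short jump; the common operation can be summarized in one helper. This is an illustrative sketch only; the code that actually emits the jump and the event table entries is presumably in litmus/feather_trace.h and is not shown in this hunk.

	/* Illustrative sketch of the patching performed by ft_enable_event()
	 * and ft_disable_event(): the byte at start_addr is expected to be a
	 * short jump (0xeb) whose displacement either falls through into the
	 * event body (enabled, delta = 0) or skips it entirely (disabled).
	 */
	static void ft_patch_event(struct trace_event *te, int enable)
	{
		unsigned char *instr = (unsigned char *) te->start_addr;
		char *delta = ((char *) te->start_addr) + 1;

		if (*instr != BYTE_JUMP)
			return;	/* don't clobber something that isn't our jump */

		if (enable)
			*delta = 0;
		else
			*delta = te->end_addr - te->start_addr - BYTE_JUMP_LEN;
	}
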
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 0000000000..8ab96452e6
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,799 @@
1/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS initialization
2 * code, and the procfs interface.
3 */
4#include <asm/uaccess.h>
5#include <linux/uaccess.h>
6#include <linux/sysrq.h>
7
8#include <linux/module.h>
9#include <linux/proc_fs.h>
10
11
12#include <litmus/litmus.h>
13#include <linux/sched.h>
14#include <litmus/sched_plugin.h>
15
16#include <litmus/trace.h>
17
18/* Number of RT tasks that exist in the system */
19atomic_t rt_task_count = ATOMIC_INIT(0);
20static DEFINE_SPINLOCK(task_transition_lock);
21
22/* To send signals from the scheduler
23 * Must drop locks first.
24 */
25static LIST_HEAD(sched_sig_list);
26static DEFINE_SPINLOCK(sched_sig_list_lock);
27
28/*
29 * sys_set_rt_task_param
30 * @pid: PID of the task whose scheduling parameters are to be changed
31 * @param: New real-time extension parameters such as the execution cost and
32 *         period
33 * Syscall for setting a task's real-time extension parameters.
34 * Returns -EINVAL if pid is negative, param is NULL, the execution cost or
35 *                 period is invalid, or the requested CPU is offline
36 *         -EFAULT if param could not be copied from user space
37 *         -ESRCH  if pid does not correspond to a valid task
38 *         -EBUSY  if the task is already a real-time task
39 *         0       on success
40 *
41 * Only non-real-time tasks may be configured with this system call
42 * to avoid races with the scheduler. In practice, this means that a
43 * task's parameters must be set _before_ calling sys_prepare_rt_task()
44 */
45asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
46{
47 struct rt_task tp;
48 struct task_struct *target;
49 int retval = -EINVAL;
50
51 printk("Setting up rt task parameters for process %d.\n", pid);
52
53 if (pid < 0 || param == 0) {
54 goto out;
55 }
56 if (copy_from_user(&tp, param, sizeof(tp))) {
57 retval = -EFAULT;
58 goto out;
59 }
60
61 /* Task search and manipulation must be protected */
62 read_lock_irq(&tasklist_lock);
63 if (!(target = find_task_by_pid(pid))) {
64 retval = -ESRCH;
65 goto out_unlock;
66 }
67
68 if (is_realtime(target)) {
69 /* The task is already a real-time task.
70		 * We cannot allow parameter changes at this point.
71 */
72 retval = -EBUSY;
73 goto out_unlock;
74 }
75
76 if (tp.exec_cost <= 0)
77 goto out_unlock;
78 if (tp.period <= 0)
79 goto out_unlock;
80 if (!cpu_online(tp.cpu))
81 goto out_unlock;
82 if (tp.period < tp.exec_cost)
83 {
84 printk(KERN_INFO "litmus: real-time task %d rejected "
85 "because wcet > period\n", pid);
86 goto out_unlock;
87 }
88
89 target->rt_param.task_params = tp;
90
91 retval = 0;
92 out_unlock:
93 read_unlock_irq(&tasklist_lock);
94 out:
95 return retval;
96}
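
A minimal user-space sketch of the contract above follows; the __NR_set_rt_task_param number, the availability of struct rt_task to user space, and the assumption that exec_cost/period are given in nanoseconds are not established by this patch.

	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/syscall.h>
	#include <litmus/rt_param.h>	/* assumed user-space export of struct rt_task */

	/* Hypothetical wrapper: configure pid as an (exec_cost, period) task on
	 * the given CPU before it becomes a real-time task.
	 */
	int set_rt_task_param(pid_t pid, unsigned long long exec_cost,
			      unsigned long long period, int cpu)
	{
		struct rt_task tp = {
			.exec_cost = exec_cost,	/* assumed to be in nanoseconds */
			.period    = period,
			.cpu       = cpu,
		};

		return syscall(__NR_set_rt_task_param, pid, &tp);
	}
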
97
98/* Getter of a task's RT params
99 *   returns -EINVAL if param is NULL or pid is negative
100 * returns ESRCH if pid does not correspond to a valid task
101 * returns EFAULT if copying of parameters has failed.
102 */
103asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
104{
105 int retval = -EINVAL;
106 struct task_struct *source;
107 struct rt_task lp;
108 if (param == 0 || pid < 0)
109 goto out;
110 read_lock(&tasklist_lock);
111 if (!(source = find_task_by_pid(pid))) {
112 retval = -ESRCH;
113 goto out_unlock;
114 }
115 lp = source->rt_param.task_params;
116 read_unlock(&tasklist_lock);
117 /* Do copying outside the lock */
118 retval =
119 copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
120 return retval;
121 out_unlock:
122 read_unlock(&tasklist_lock);
123 out:
124 return retval;
125
126}
127
128/* implemented in kernel/litmus_sem.c */
129void srp_ceiling_block(void);
130
131/*
132 * This is the crucial function for the periodic task implementation.
133 * It checks whether the task is periodic, whether this kind of sleep
134 * is permitted, and calls the plugin-specific sleep, which puts the
135 * task onto a wait queue.
136 * returns 0 on successful wakeup
137 * returns EPERM if current conditions do not permit such sleep
138 * returns EINVAL if current task is not able to go to sleep
139 */
140asmlinkage long sys_complete_job(void)
141{
142 int retval = -EPERM;
143 if (!is_realtime(current)) {
144 retval = -EINVAL;
145 goto out;
146 }
147 /* Task with negative or zero period cannot sleep */
148 if (get_rt_period(current) <= 0) {
149 retval = -EINVAL;
150 goto out;
151 }
152 /* The plugin has to put the task into an
153 * appropriate queue and call schedule
154 */
155 retval = litmus->complete_job();
156 if (!retval && is_subject_to_srp(current))
157 srp_ceiling_block();
158 out:
159 return retval;
160}
161
162/* This is an "improved" version of sys_complete_job that
163 * addresses the problem of unintentionally missing a job after
164 * an overrun.
165 *
166 * returns 0 on successful wakeup
167 * returns EPERM if current conditions do not permit such sleep
168 * returns EINVAL if current task is not able to go to sleep
169 */
170asmlinkage long sys_wait_for_job_release(unsigned int job)
171{
172 int retval = -EPERM;
173 if (!is_realtime(current)) {
174 retval = -EINVAL;
175 goto out;
176 }
177
178 /* Task with negative or zero period cannot sleep */
179 if (get_rt_period(current) <= 0) {
180 retval = -EINVAL;
181 goto out;
182 }
183
184 retval = 0;
185
186 /* first wait until we have "reached" the desired job
187 *
188 * This implementation has at least two problems:
189 *
190 * 1) It doesn't gracefully handle the wrap around of
191 * job_no. Since LITMUS is a prototype, this is not much
192 * of a problem right now.
193 *
194 * 2) It is theoretically racy if a job release occurs
195 * between checking job_no and calling sleep_next_period().
196	 * A proper solution would require adding another callback
197 * in the plugin structure and testing the condition with
198 * interrupts disabled.
199 *
200 * FIXME: At least problem 2 should be taken care of eventually.
201 */
202 while (!retval && job > current->rt_param.job_params.job_no)
203 /* If the last job overran then job <= job_no and we
204 * don't send the task to sleep.
205 */
206 retval = litmus->complete_job();
207
208 /* We still have to honor the SRP after the actual release.
209 */
210 if (!retval && is_subject_to_srp(current))
211 srp_ceiling_block();
212 out:
213 return retval;
214}
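
Taken together, sys_complete_job() and sys_wait_for_job_release() suggest the following periodic main loop in user space; a sketch that uses raw syscall() invocations with assumed __NR_* numbers instead of library wrappers.

	#include <unistd.h>
	#include <sys/syscall.h>

	/* Hypothetical periodic job loop: do one job's worth of work, then
	 * sleep until the next release via sys_complete_job().
	 */
	void periodic_loop(unsigned int jobs)
	{
		unsigned int job;

		for (job = 0; job < jobs; job++) {
			/* ... job body, at most exec_cost long ... */
			if (syscall(__NR_complete_job) != 0)
				break;	/* not a real-time task, or sleep not permitted */
		}
	}
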
215
216/* This is a helper syscall to query the current job sequence number.
217 *
218 * returns 0 on successful query
219 * returns EPERM if task is not a real-time task.
220 * returns EFAULT if &job is not a valid pointer.
221 */
222asmlinkage long sys_query_job_no(unsigned int __user *job)
223{
224 int retval = -EPERM;
225 if (is_realtime(current))
226 retval = put_user(current->rt_param.job_params.job_no, job);
227
228 return retval;
229}
230
231struct sched_sig {
232 struct list_head list;
233 struct task_struct* task;
234 unsigned int signal:31;
235 int force:1;
236};
237
238static void __scheduler_signal(struct task_struct *t, unsigned int signo,
239 int force)
240{
241 struct sched_sig* sig;
242
243	sig = kmalloc(sizeof(struct sched_sig), GFP_ATOMIC);
244 if (!sig) {
245		TRACE_TASK(t, "dropping signal: %u\n", signo);
246 return;
247 }
248
249 spin_lock(&sched_sig_list_lock);
250
251 sig->signal = signo;
252 sig->force = force;
253 sig->task = t;
254 get_task_struct(t);
255 list_add(&sig->list, &sched_sig_list);
256
257 spin_unlock(&sched_sig_list_lock);
258}
259
260void scheduler_signal(struct task_struct *t, unsigned int signo)
261{
262 __scheduler_signal(t, signo, 0);
263}
264
265void force_scheduler_signal(struct task_struct *t, unsigned int signo)
266{
267 __scheduler_signal(t, signo, 1);
268}
269
270/* FIXME: get rid of the locking and do this on a per-processor basis */
271void send_scheduler_signals(void)
272{
273 unsigned long flags;
274 struct list_head *p, *extra;
275 struct siginfo info;
276 struct sched_sig* sig;
277 struct task_struct* t;
278 struct list_head claimed;
279
280 if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) {
281 if (list_empty(&sched_sig_list))
282 p = NULL;
283 else {
284 p = sched_sig_list.next;
285 list_del(&sched_sig_list);
286 INIT_LIST_HEAD(&sched_sig_list);
287 }
288 spin_unlock_irqrestore(&sched_sig_list_lock, flags);
289
290 /* abort if there are no signals */
291 if (!p)
292 return;
293
294 /* take signal list we just obtained */
295 list_add(&claimed, p);
296
297 list_for_each_safe(p, extra, &claimed) {
298 list_del(p);
299 sig = list_entry(p, struct sched_sig, list);
300 t = sig->task;
301 info.si_signo = sig->signal;
302 info.si_errno = 0;
303 info.si_code = SI_KERNEL;
304 info.si_pid = 1;
305 info.si_uid = 0;
306 TRACE("sending signal %d to %d\n", info.si_signo,
307 t->pid);
308 if (sig->force)
309 force_sig_info(sig->signal, &info, t);
310 else
311 send_sig_info(sig->signal, &info, t);
312 put_task_struct(t);
313 kfree(sig);
314 }
315 }
316
317}
318
319static inline void np_mem_error(struct task_struct* t, const char* reason)
320{
321 if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) {
322 TRACE("np section: %s => %s/%d killed\n",
323 reason, t->comm, t->pid);
324 force_scheduler_signal(t, SIGKILL);
325 }
326}
327
328/* sys_register_np_flag() allows real-time tasks to register an
329 * np section indicator.
330 * returns 0 if the flag was successfully registered
331 * returns EINVAL if current task is not a real-time task
332 * returns EFAULT if *flag couldn't be written
333 */
334asmlinkage long sys_register_np_flag(short __user *flag)
335{
336 int retval = -EINVAL;
337 short test_val = RT_PREEMPTIVE;
338
339 /* avoid races with the scheduler */
340 preempt_disable();
341 TRACE("reg_np_flag(%p) for %s/%d\n", flag,
342 current->comm, current->pid);
343
344 /* Let's first try to write to the address.
345 * That way it is initialized and any bugs
346	 * involving dangling pointers will be caught
347 * early.
348 * NULL indicates disabling np section support
349 * and should not be tested.
350 */
351 if (flag)
352 retval = poke_kernel_address(test_val, flag);
353 else
354 retval = 0;
355 TRACE("reg_np_flag: retval=%d\n", retval);
356 if (unlikely(0 != retval))
357 np_mem_error(current, "np flag: not writable");
358 else
359 /* the pointer is ok */
360 current->rt_param.np_flag = flag;
361
362 preempt_enable();
363 return retval;
364}
365
366
367void request_exit_np(struct task_struct *t)
368{
369 int ret;
370 short flag;
371
372 /* We can only do this if t is actually currently scheduled on this CPU
373 * because otherwise we are in the wrong address space. Thus make sure
374 * to check.
375 */
376 BUG_ON(t != current);
377
378 if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) {
379 TRACE_TASK(t, "request_exit_np(): BAD TASK!\n");
380 return;
381 }
382
383 flag = RT_EXIT_NP_REQUESTED;
384 ret = poke_kernel_address(flag, t->rt_param.np_flag + 1);
385 TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid);
386 if (unlikely(0 != ret))
387 np_mem_error(current, "request_exit_np(): flag not writable");
388
389}
390
391
392int is_np(struct task_struct* t)
393{
394 int ret;
395 unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/
396
397 BUG_ON(t != current);
398
399 if (unlikely(t->rt_param.kernel_np))
400 return 1;
401 else if (unlikely(t->rt_param.np_flag == NULL) ||
402 t->flags & PF_EXITING ||
403 t->state == TASK_DEAD)
404 return 0;
405 else {
406 /* This is the tricky part. The process has registered a
407 * non-preemptive section marker. We now need to check whether
408		 * it is set to RT_NON_PREEMPTIVE. Along the way we could
409 * discover that the pointer points to an unmapped region (=>
410 * kill the task) or that the location contains some garbage
411 * value (=> also kill the task). Killing the task in any case
412 * forces userspace to play nicely. Any bugs will be discovered
413 * immediately.
414 */
415 ret = probe_kernel_address(t->rt_param.np_flag, flag);
416 if (0 == ret && (flag == RT_NON_PREEMPTIVE ||
417 flag == RT_PREEMPTIVE))
418 return flag != RT_PREEMPTIVE;
419 else {
420 /* either we could not read from the address or
421 * it contained garbage => kill the process
422 * FIXME: Should we cause a SEGFAULT instead?
423 */
424 TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret,
425 flag & 0xff, (flag >> 8) & 0xff, flag);
426 np_mem_error(t, "is_np() could not read");
427 return 0;
428 }
429 }
430}
431
432/*
433 * sys_exit_np() allows a real-time task to signal that it has left a
434 * non-preemptable section. It will be called after the kernel requested a
435 * callback in the preemption indicator flag.
436 * returns 0 if the signal was valid and processed.
437 * returns EINVAL if current task is not a real-time task
438 */
439asmlinkage long sys_exit_np(void)
440{
441 int retval = -EINVAL;
442
443 TS_EXIT_NP_START;
444
445 if (!is_realtime(current))
446 goto out;
447
448 TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid);
449 /* force rescheduling so that we can be preempted */
450 set_tsk_need_resched(current);
451 retval = 0;
452 out:
453
454 TS_EXIT_NP_END;
455 return retval;
456}
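
A user-space sketch of the non-preemptive-section protocol implied by sys_register_np_flag(), is_np(), request_exit_np(), and sys_exit_np(): the flag layout (state word at the registered address, exit request in the following short) follows the kernel code above, while the __NR_* numbers and the assumption that the RT_* constants are exported to user space are not part of this patch.

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <litmus/litmus.h>	/* assumed user-space export of the RT_* values */

	/* np_flag[0]: RT_PREEMPTIVE / RT_NON_PREEMPTIVE, read by is_np()
	 * np_flag[1]: set to RT_EXIT_NP_REQUESTED by request_exit_np()
	 */
	static volatile short np_flag[2];

	void np_init(void)
	{
		np_flag[0] = RT_PREEMPTIVE;
		syscall(__NR_register_np_flag, &np_flag[0]);
	}

	void np_enter(void)
	{
		np_flag[0] = RT_NON_PREEMPTIVE;
	}

	void np_exit(void)
	{
		np_flag[0] = RT_PREEMPTIVE;
		if (np_flag[1] == RT_EXIT_NP_REQUESTED) {
			np_flag[1] = 0;
			syscall(__NR_exit_np);	/* let the delayed preemption happen */
		}
	}
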
457
458/* p is a real-time task. Re-init its state as a best-effort task. */
459static void reinit_litmus_state(struct task_struct* p, int restore)
460{
461 struct rt_task user_config = {};
462 __user short *np_flag = NULL;
463
464 if (restore) {
465		/* Save user-space-provided configuration data.
466 * FIXME: This is missing service levels for adaptive tasks.
467 */
468 user_config = p->rt_param.task_params;
469 np_flag = p->rt_param.np_flag;
470 }
471
472 /* We probably should not be inheriting any task's priority
473 * at this point in time.
474 */
475 WARN_ON(p->rt_param.inh_task);
476
477 /* We need to restore the priority of the task. */
478// __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio);
479
480 /* Cleanup everything else. */
481	memset(&p->rt_param, 0, sizeof(p->rt_param));
482
483 /* Restore preserved fields. */
484 if (restore) {
485 p->rt_param.task_params = user_config;
486 p->rt_param.np_flag = np_flag;
487 }
488}
489
490long litmus_admit_task(struct task_struct* tsk)
491{
492 long retval;
493	unsigned long flags;
494
495 BUG_ON(is_realtime(tsk));
496
497 if (get_rt_period(tsk) == 0 ||
498 get_exec_cost(tsk) > get_rt_period(tsk)) {
499 TRACE_TASK(tsk, "litmus admit: invalid task parameters "
500 "(%lu, %lu)\n",
501 get_exec_cost(tsk), get_rt_period(tsk));
502 return -EINVAL;
503 }
504
505 if (!cpu_online(get_partition(tsk)))
506 {
507 TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
508 get_partition(tsk));
509 return -EINVAL;
510 }
511
512 INIT_LIST_HEAD(&tsk->rt_list);
513
514 /* avoid scheduler plugin changing underneath us */
515 spin_lock_irqsave(&task_transition_lock, flags);
516 retval = litmus->admit_task(tsk);
517
518 if (!retval)
519 atomic_inc(&rt_task_count);
520 spin_unlock_irqrestore(&task_transition_lock, flags);
521
522 return retval;
523
524}
525
526void litmus_exit_task(struct task_struct* tsk)
527{
528 if (is_realtime(tsk)) {
529 litmus->task_exit(tsk);
530 atomic_dec(&rt_task_count);
531 reinit_litmus_state(tsk, 1);
532 }
533}
534
535/* Switching a plugin in use is tricky.
536 * We must watch out that no real-time tasks exist
537 * (and that none are created in parallel) and that the plugin is not
538 * currently in use on any processor (in theory).
539 *
540 * For now, we don't enforce the second part since it is unlikely to cause
541 * any trouble by itself as long as we don't unload modules.
542 */
543int switch_sched_plugin(struct sched_plugin* plugin)
544{
545	unsigned long flags;
546 int ret = 0;
547
548 BUG_ON(!plugin);
549
550 /* stop task transitions */
551 spin_lock_irqsave(&task_transition_lock, flags);
552
553 /* don't switch if there are active real-time tasks */
554 if (atomic_read(&rt_task_count) == 0) {
555 printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
556 litmus = plugin;
557 } else
558 ret = -EBUSY;
559
560 spin_unlock_irqrestore(&task_transition_lock, flags);
561 return ret;
562}
563
564/* Called upon fork.
565 * p is the newly forked task.
566 */
567void litmus_fork(struct task_struct* p)
568{
569 if (is_realtime(p))
570 /* clean out any litmus related state, don't preserve anything*/
571 reinit_litmus_state(p, 0);
572}
573
574/* Called upon execve().
575 * current is doing the exec.
576 * Don't let address space specific stuff leak.
577 */
578void litmus_exec(void)
579{
580 struct task_struct* p = current;
581
582 if (is_realtime(p)) {
583 WARN_ON(p->rt_param.inh_task);
584 p->rt_param.np_flag = NULL;
585 }
586}
587
588void exit_litmus(struct task_struct *dead_tsk)
589{
590 if (is_realtime(dead_tsk))
591 litmus_exit_task(dead_tsk);
592}
593
594
595void list_qsort(struct list_head* list, list_cmp_t less_than)
596{
597 struct list_head lt;
598 struct list_head geq;
599 struct list_head *pos, *extra, *pivot;
600 int n_lt = 0, n_geq = 0;
601 BUG_ON(!list);
602
603 if (list->next == list)
604 return;
605
606 INIT_LIST_HEAD(&lt);
607 INIT_LIST_HEAD(&geq);
608
609 pivot = list->next;
610 list_del(pivot);
611 list_for_each_safe(pos, extra, list) {
612 list_del(pos);
613 if (less_than(pos, pivot)) {
614 list_add(pos, &lt);
615 n_lt++;
616 } else {
617 list_add(pos, &geq);
618 n_geq++;
619 }
620 }
621 if (n_lt < n_geq) {
622 list_qsort(&lt, less_than);
623 list_qsort(&geq, less_than);
624 } else {
625 list_qsort(&geq, less_than);
626 list_qsort(&lt, less_than);
627 }
628 list_splice(&geq, list);
629 list_add(pivot, list);
630 list_splice(&lt, list);
631}
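
As a usage sketch (hypothetical helpers, not part of this patch), list_qsort() can order a list of tasks linked through rt_list with the same EDF comparison used by the semaphore code:

	#include <linux/list.h>
	#include <linux/sched.h>
	#include <litmus/edf_common.h>	/* for edf_higher_prio() */

	/* Hypothetical comparison callback for list_qsort(). */
	static int edf_list_order(struct list_head *a, struct list_head *b)
	{
		return edf_higher_prio(list_entry(a, struct task_struct, rt_list),
				       list_entry(b, struct task_struct, rt_list));
	}

	/* Hypothetical helper: sort a task list by EDF priority. */
	static void sort_by_edf_prio(struct list_head *tasks)
	{
		list_qsort(tasks, edf_list_order);
	}
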
632
633#ifdef CONFIG_MAGIC_SYSRQ
634int sys_kill(int pid, int sig);
635
636static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
637{
638 struct task_struct *t;
639 read_lock(&tasklist_lock);
640 for_each_process(t) {
641 if (is_realtime(t)) {
642 sys_kill(t->pid, SIGKILL);
643 }
644 }
645 read_unlock(&tasklist_lock);
646}
647
648static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
649 .handler = sysrq_handle_kill_rt_tasks,
650 .help_msg = "Quit-rt-tasks",
651 .action_msg = "sent SIGKILL to all real-time tasks",
652};
653#endif
654
655static int proc_read_stats(char *page, char **start,
656 off_t off, int count,
657 int *eof, void *data)
658{
659 int len;
660
661 len = snprintf(page, PAGE_SIZE,
662 "real-time task count = %d\n",
663 atomic_read(&rt_task_count));
664 return len;
665}
666
667static int proc_read_plugins(char *page, char **start,
668 off_t off, int count,
669 int *eof, void *data)
670{
671 int len;
672
673 len = print_sched_plugins(page, PAGE_SIZE);
674 return len;
675}
676
677static int proc_read_curr(char *page, char **start,
678 off_t off, int count,
679 int *eof, void *data)
680{
681 int len;
682
683 len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
684 return len;
685}
686
687static int proc_write_curr(struct file *file,
688 const char *buffer,
689 unsigned long count,
690 void *data)
691{
692 int len, ret;
693 char name[65];
694 struct sched_plugin* found;
695
696	if (count > 64)
697 len = 64;
698 else
699 len = count;
700
701	if (copy_from_user(name, buffer, len))
702 return -EFAULT;
703
704 name[len] = '\0';
705 /* chomp name */
706 if (len > 1 && name[len - 1] == '\n')
707 name[len - 1] = '\0';
708
709 found = find_sched_plugin(name);
710
711 if (found) {
712 ret = switch_sched_plugin(found);
713 if (ret != 0)
714 printk(KERN_INFO "Could not switch plugin: %d\n", ret);
715 } else
716 printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
717
718 return len;
719}
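
From user space, the handler above amounts to writing a plugin name into /proc/litmus/active_plugin (created in init_litmus_proc() below); a sketch follows, where the plugin name "GSN-EDF" is an assumption about what litmus/sched_gsn_edf.c registers itself as.

	#include <stdio.h>

	/* Hypothetical helper: select the active scheduling plugin by name,
	 * e.g. select_plugin("GSN-EDF").
	 */
	int select_plugin(const char *name)
	{
		FILE *f = fopen("/proc/litmus/active_plugin", "w");

		if (!f)
			return -1;
		fprintf(f, "%s\n", name);
		return fclose(f);
	}
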
720
721
722static struct proc_dir_entry *litmus_dir = NULL,
723 *curr_file = NULL,
724 *stat_file = NULL,
725 *plugs_file = NULL;
726
727static int __init init_litmus_proc(void)
728{
729 litmus_dir = proc_mkdir("litmus", NULL);
730 if (!litmus_dir) {
731 printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
732 return -ENOMEM;
733 }
734 litmus_dir->owner = THIS_MODULE;
735
736 curr_file = create_proc_entry("active_plugin",
737 0644, litmus_dir);
738 if (!curr_file) {
739 printk(KERN_ERR "Could not allocate active_plugin "
740 "procfs entry.\n");
741 return -ENOMEM;
742 }
743 curr_file->owner = THIS_MODULE;
744 curr_file->read_proc = proc_read_curr;
745 curr_file->write_proc = proc_write_curr;
746
747 stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
748 proc_read_stats, NULL);
749
750 plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir,
751 proc_read_plugins, NULL);
752
753 return 0;
754}
755
756static void exit_litmus_proc(void)
757{
758 if (plugs_file)
759 remove_proc_entry("plugins", litmus_dir);
760 if (stat_file)
761 remove_proc_entry("stats", litmus_dir);
762 if (curr_file)
763 remove_proc_entry("active_plugin", litmus_dir);
764 if (litmus_dir)
765 remove_proc_entry("litmus", NULL);
766}
767
768extern struct sched_plugin linux_sched_plugin;
769
770static int __init _init_litmus(void)
771{
772 /* Common initializers,
773 * mode change lock is used to enforce single mode change
774 * operation.
775 */
776 printk("Starting LITMUS^RT kernel\n");
777
778 register_sched_plugin(&linux_sched_plugin);
779
780#ifdef CONFIG_MAGIC_SYSRQ
781 /* offer some debugging help */
782 if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op))
783 printk("Registered kill rt tasks magic sysrq.\n");
784 else
785 printk("Could not register kill rt tasks magic sysrq.\n");
786#endif
787
788 init_litmus_proc();
789
790 return 0;
791}
792
793static void _exit_litmus(void)
794{
795 exit_litmus_proc();
796}
797
798module_init(_init_litmus);
799module_exit(_exit_litmus);
diff --git a/litmus/litmus_sem.c b/litmus/litmus_sem.c
new file mode 100644
index 0000000000..f52941c5ca
--- /dev/null
+++ b/litmus/litmus_sem.c
@@ -0,0 +1,566 @@
1/*
2 * PI semaphores and SRP implementations.
3 * Much of the code here is borrowed from include/asm-i386/semaphore.h.
4 *
5 * NOTE: This implementation is very much a prototype and horribly insecure. It
6 * is intended to be a proof of concept, not a feature-complete solution.
7 */
8
9#include <asm/atomic.h>
10#include <asm/semaphore.h>
11#include <linux/sched.h>
12#include <linux/wait.h>
13#include <linux/spinlock.h>
14#include <litmus/litmus.h>
15#include <litmus/sched_plugin.h>
16#include <litmus/edf_common.h>
17
18#include <litmus/fdso.h>
19
20#include <litmus/trace.h>
21
22/* ************************************************************************** */
23/* PRIORITY INHERITANCE */
24/* ************************************************************************** */
25
26static void* create_pi_semaphore(void)
27{
28 struct pi_semaphore* sem;
29 int i;
30
31 sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL);
32 if (!sem)
33 return NULL;
34 atomic_set(&sem->count, 1);
35 sem->sleepers = 0;
36 init_waitqueue_head(&sem->wait);
37 sem->hp.task = NULL;
38 sem->holder = NULL;
39 for (i = 0; i < NR_CPUS; i++)
40 sem->hp.cpu_task[i] = NULL;
41 return sem;
42}
43
44static void destroy_pi_semaphore(void* sem)
45{
46 /* XXX assert invariants */
47 kfree(sem);
48}
49
50struct fdso_ops pi_sem_ops = {
51 .create = create_pi_semaphore,
52 .destroy = destroy_pi_semaphore
53};
54
55struct wq_pair {
56 struct task_struct* tsk;
57 struct pi_semaphore* sem;
58};
59
60static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
61 void *key)
62{
63 struct wq_pair* wqp = (struct wq_pair*) wait->private;
64 set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
65 litmus->inherit_priority(wqp->sem, wqp->tsk);
66 TRACE_TASK(wqp->tsk,
67 "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
68 /* point to task for default_wake_function() */
69 wait->private = wqp->tsk;
70 default_wake_function(wait, mode, sync, key);
71
72 /* Always return true since we know that if we encountered a task
73 * that was already running the wake_up raced with the schedule in
74 * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
75 * immediately and own the lock. We must not wake up another task in
76 * any case.
77 */
78 return 1;
79}
80
81/* caller is responsible for locking */
82int edf_set_hp_task(struct pi_semaphore *sem)
83{
84 struct list_head *tmp, *next;
85 struct task_struct *queued;
86 int ret = 0;
87
88 sem->hp.task = NULL;
89 list_for_each_safe(tmp, next, &sem->wait.task_list) {
90 queued = ((struct wq_pair*)
91 list_entry(tmp, wait_queue_t,
92 task_list)->private)->tsk;
93
94 /* Compare task prios, find high prio task. */
95 if (edf_higher_prio(queued, sem->hp.task)) {
96 sem->hp.task = queued;
97 ret = 1;
98 }
99 }
100 return ret;
101}
102
103/* caller is responsible for locking */
104int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
105{
106 struct list_head *tmp, *next;
107 struct task_struct *queued;
108 int ret = 0;
109
110 sem->hp.cpu_task[cpu] = NULL;
111 list_for_each_safe(tmp, next, &sem->wait.task_list) {
112 queued = ((struct wq_pair*)
113 list_entry(tmp, wait_queue_t,
114 task_list)->private)->tsk;
115
116 /* Compare task prios, find high prio task. */
117 if (get_partition(queued) == cpu &&
118 edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
119 sem->hp.cpu_task[cpu] = queued;
120 ret = 1;
121 }
122 }
123 return ret;
124}
125
126int do_pi_down(struct pi_semaphore* sem)
127{
128 unsigned long flags;
129 struct task_struct *tsk = current;
130 struct wq_pair pair;
131 int suspended = 1;
132 wait_queue_t wait = {
133 .private = &pair,
134 .func = rt_pi_wake_up,
135 .task_list = {NULL, NULL}
136 };
137
138 pair.tsk = tsk;
139 pair.sem = sem;
140 spin_lock_irqsave(&sem->wait.lock, flags);
141
142 if (atomic_dec_return(&sem->count) < 0 ||
143 waitqueue_active(&sem->wait)) {
144 /* we need to suspend */
145 tsk->state = TASK_UNINTERRUPTIBLE;
146 add_wait_queue_exclusive_locked(&sem->wait, &wait);
147
148 TRACE_CUR("suspends on PI lock %p\n", sem);
149 litmus->pi_block(sem, tsk);
150
151 /* release lock before sleeping */
152 spin_unlock_irqrestore(&sem->wait.lock, flags);
153
154 TS_PI_DOWN_END;
155 preempt_enable_no_resched();
156
157
158		/* We depend on the FIFO order of the wait queue:
159		 * there is no need to recheck after waking up, since we
160		 * are guaranteed to hold the lock because there is only
161		 * one wake-up per release.
162		 */
163 schedule();
164
165 TRACE_CUR("woke up, now owns PI lock %p\n", sem);
166
167 /* try_to_wake_up() set our state to TASK_RUNNING,
168 * all we need to do is to remove our wait queue entry
169 */
170 remove_wait_queue(&sem->wait, &wait);
171 } else {
172 /* no priority inheritance necessary, since there are no queued
173 * tasks.
174 */
175 suspended = 0;
176 TRACE_CUR("acquired PI lock %p, no contention\n", sem);
177 sem->holder = tsk;
178 sem->hp.task = tsk;
179 litmus->inherit_priority(sem, tsk);
180 spin_unlock_irqrestore(&sem->wait.lock, flags);
181 }
182 return suspended;
183}
184
185void do_pi_up(struct pi_semaphore* sem)
186{
187 unsigned long flags;
188
189 spin_lock_irqsave(&sem->wait.lock, flags);
190
191 TRACE_CUR("releases PI lock %p\n", sem);
192 litmus->return_priority(sem);
193 sem->holder = NULL;
194 if (atomic_inc_return(&sem->count) < 1)
195 /* there is a task queued */
196 wake_up_locked(&sem->wait);
197
198 spin_unlock_irqrestore(&sem->wait.lock, flags);
199}
200
201asmlinkage long sys_pi_down(int sem_od)
202{
203 long ret = 0;
204 struct pi_semaphore * sem;
205 int suspended = 0;
206
207 preempt_disable();
208 TS_PI_DOWN_START;
209
210 sem = lookup_pi_sem(sem_od);
211 if (sem)
212 suspended = do_pi_down(sem);
213 else
214 ret = -EINVAL;
215
216 if (!suspended) {
217 TS_PI_DOWN_END;
218 preempt_enable();
219 }
220
221 return ret;
222}
223
224asmlinkage long sys_pi_up(int sem_od)
225{
226 long ret = 0;
227 struct pi_semaphore * sem;
228
229 preempt_disable();
230 TS_PI_UP_START;
231
232 sem = lookup_pi_sem(sem_od);
233 if (sem)
234 do_pi_up(sem);
235 else
236 ret = -EINVAL;
237
238
239 TS_PI_UP_END;
240 preempt_enable();
241
242 return ret;
243}
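
Combined with the object-descriptor syscalls from litmus/fdso.c, the two calls above yield the usual lock/unlock pattern; a hedged user-space sketch, assuming the od was obtained earlier via sys_od_open() for a PI semaphore and that the __NR_* numbers are exported:

	#include <unistd.h>
	#include <sys/syscall.h>

	/* Hypothetical critical section guarded by a PI semaphore whose object
	 * descriptor 'od' was obtained earlier via sys_od_open().
	 */
	void pi_protected(int od, void (*critical_section)(void))
	{
		if (syscall(__NR_pi_down, od) != 0)
			return;		/* invalid object descriptor */
		critical_section();
		syscall(__NR_pi_up, od);
	}
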
244
245/* Clear wait queue and wakeup waiting tasks, and free semaphore. */
246/*
247asmlinkage long sys_pi_sema_free(int sem_id)
248{
249 struct list_head *tmp, *next;
250 unsigned long flags;
251
252 if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
253 return -EINVAL;
254
255 if (!pi_sems[sem_id].used)
256 return -EINVAL;
257
258 spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags);
259 if (waitqueue_active(&pi_sems[sem_id].wait)) {
260 list_for_each_safe(tmp, next,
261 &pi_sems[sem_id].wait.task_list) {
262 wait_queue_t *curr = list_entry(tmp, wait_queue_t,
263 task_list);
264 list_del(tmp);
265 set_rt_flags((struct task_struct*)curr->private,
266 RT_F_EXIT_SEM);
267 curr->func(curr,
268 TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
269 0, NULL);
270 }
271 }
272
273 spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags);
274 pi_sems[sem_id].used = 0;
275
276 return 0;
277}
278*/
279
280
281
282/* ************************************************************************** */
283/* STACK RESOURCE POLICY */
284/* ************************************************************************** */
285
286
287struct srp_priority {
288 struct list_head list;
289 unsigned int period;
290 pid_t pid;
291};
292
293#define list2prio(l) list_entry(l, struct srp_priority, list)
294
295/* SRP task priority comparison function. Smaller periods have higher
296 * priority; ties are broken by PID. Special case: period == 0 <=> no priority.
297 */
298static int srp_higher_prio(struct srp_priority* first,
299 struct srp_priority* second)
300{
301 if (!first->period)
302 return 0;
303 else
304 return !second->period ||
305 first->period < second->period || (
306 first->period == second->period &&
307 first->pid < second->pid);
308}
309
310struct srp {
311 struct list_head ceiling;
312 wait_queue_head_t ceiling_blocked;
313};
314
315
316DEFINE_PER_CPU(struct srp, srp);
317
318#define system_ceiling(srp) list2prio(srp->ceiling.next)
319
320static int srp_exceeds_ceiling(struct task_struct* first,
321 struct srp* srp)
322{
323 return list_empty(&srp->ceiling) ||
324 get_rt_period(first) < system_ceiling(srp)->period ||
325 (get_rt_period(first) == system_ceiling(srp)->period &&
326 first->pid < system_ceiling(srp)->pid);
327}
328
329static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
330{
331 struct list_head *pos;
332 if (in_list(&prio->list)) {
333 TRACE_CUR("WARNING: SRP violation detected, prio is already in "
334 "ceiling list!\n");
335 return;
336 }
337 list_for_each(pos, &srp->ceiling)
338 if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
339 __list_add(&prio->list, pos->prev, pos);
340 return;
341 }
342
343 list_add_tail(&prio->list, &srp->ceiling);
344}
345
346/* struct for uniprocessor SRP "semaphore" */
347struct srp_semaphore {
348 struct srp_priority ceiling;
349 int cpu; /* cpu associated with this "semaphore" and resource */
350 int claimed; /* is the resource claimed (ceiling should be used)? */
351};
352
353
354static void* create_srp_semaphore(void)
355{
356 struct srp_semaphore* sem;
357
358 if (!is_realtime(current))
359 /* XXX log error */
360 return NULL;
361
362 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
363 if (!sem)
364 return NULL;
365
366 INIT_LIST_HEAD(&sem->ceiling.list);
367 sem->ceiling.period = 0;
368 sem->claimed = 0;
369 sem->cpu = get_partition(current);
370 return sem;
371}
372
373static void destroy_srp_semaphore(void* sem)
374{
375 /* XXX invariants */
376 kfree(sem);
377}
378
379struct fdso_ops srp_sem_ops = {
380 .create = create_srp_semaphore,
381 .destroy = destroy_srp_semaphore
382};
383
384/* Initialize SRP semaphores at boot time. */
385static int __init srp_sema_boot_init(void)
386{
387 int i;
388
389 printk("Initializing SRP per-CPU ceilings...");
390 for (i = 0; i < NR_CPUS; i++) {
391 init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
392 INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
393 }
394 printk(" done!\n");
395
396 return 0;
397}
398__initcall(srp_sema_boot_init);
399
400
401void do_srp_down(struct srp_semaphore* sem)
402{
403 /* claim... */
404 sem->claimed = 1;
405 /* ...and update ceiling */
406 srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
407}
408
409void do_srp_up(struct srp_semaphore* sem)
410{
411 sem->claimed = 0;
412
413 /* Determine new system priority ceiling for this CPU. */
414 if (in_list(&sem->ceiling.list))
415 list_del(&sem->ceiling.list);
416 else
417 TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling"
418 " list!\n");
419
420 /* Wake tasks on this CPU, if they exceed current ceiling. */
421 wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
422}
423
424/* Adjust the system-wide priority ceiling if resource is claimed. */
425asmlinkage long sys_srp_down(int sem_od)
426{
427 int cpu;
428 int ret = -EINVAL;
429 struct srp_semaphore* sem;
430
431 /* disabling preemptions is sufficient protection since
432 * SRP is strictly per CPU and we don't interfere with any
433 * interrupt handlers
434 */
435 preempt_disable();
436 TS_SRP_DOWN_START;
437
438 cpu = smp_processor_id();
439 sem = lookup_srp_sem(sem_od);
440 if (sem && sem->cpu == cpu) {
441 do_srp_down(sem);
442 ret = 0;
443 }
444
445 TS_SRP_DOWN_END;
446 preempt_enable();
447 return ret;
448}
449
450/* Adjust the system-wide priority ceiling if resource is freed. */
451asmlinkage long sys_srp_up(int sem_od)
452{
453 int cpu;
454 int ret = -EINVAL;
455 struct srp_semaphore* sem;
456
457 preempt_disable();
458 TS_SRP_UP_START;
459
460 cpu = smp_processor_id();
461 sem = lookup_srp_sem(sem_od);
462
463 if (sem && sem->cpu == cpu) {
464 do_srp_up(sem);
465 ret = 0;
466 }
467
468 TS_SRP_UP_END;
469 preempt_enable();
470 return ret;
471}
472
473/* Indicate that task will use a resource associated with a given
474 * semaphore. Should be done *a priori*, before the real-time task system
475 * is executed, so this does *not* update the system priority
476 * ceiling! (The ceiling would be meaningless anyway, as the SRP
477 * breaks without this a priori knowledge.)
478 */
479asmlinkage long sys_reg_task_srp_sem(int sem_od)
480{
481 /*
482 * FIXME: This whole concept is rather brittle!
483 * There must be a better solution. Maybe register on
484 * first reference?
485 */
486
487 struct task_struct *t = current;
488 struct srp_priority t_prio;
489 struct srp_semaphore* sem;
490
491 sem = lookup_srp_sem(sem_od);
492
493 if (!sem)
494 return -EINVAL;
495
496 if (!is_realtime(t))
497 return -EPERM;
498
499 if (sem->cpu != get_partition(t))
500 return -EINVAL;
501
502 preempt_disable();
503 t->rt_param.subject_to_srp = 1;
504 t_prio.period = get_rt_period(t);
505 t_prio.pid = t->pid;
506 if (srp_higher_prio(&t_prio, &sem->ceiling)) {
507 sem->ceiling.period = t_prio.period;
508 sem->ceiling.pid = t_prio.pid;
509 }
510
511 preempt_enable();
512
513 return 0;
514}
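
The SRP calls imply a two-phase user-space protocol: register every SRP resource a priori, then bracket each access with srp_down/srp_up at run time. A sketch with assumed __NR_* numbers:

	#include <unistd.h>
	#include <sys/syscall.h>

	/* Hypothetical SRP usage: registration must happen before the task's
	 * real-time phase (it raises the semaphore's priority ceiling), while
	 * down/up only adjust the per-CPU system ceiling at run time.
	 */
	int srp_register(int od)
	{
		return syscall(__NR_reg_task_srp_sem, od);
	}

	void srp_protected(int od, void (*critical_section)(void))
	{
		if (syscall(__NR_srp_down, od) != 0)
			return;		/* wrong CPU or invalid object descriptor */
		critical_section();
		syscall(__NR_srp_up, od);
	}
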
515
516static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
517 void *key)
518{
519 int cpu = smp_processor_id();
520 struct task_struct *tsk = wait->private;
521 if (cpu != get_partition(tsk))
522		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
523 get_partition(tsk));
524 else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
525 return default_wake_function(wait, mode, sync, key);
526 return 0;
527}
528
529
530/* Wait for current task priority to exceed system-wide priority ceiling.
531 * Can be used to determine when it is safe to run a job after its release.
532 */
533void srp_ceiling_block(void)
534{
535 struct task_struct *tsk = current;
536 wait_queue_t wait = {
537 .private = tsk,
538 .func = srp_wake_up,
539 .task_list = {NULL, NULL}
540 };
541
542 preempt_disable();
543 if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
544 tsk->state = TASK_UNINTERRUPTIBLE;
545 add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
546 TRACE_CUR("is priority ceiling blocked.\n");
547 preempt_enable_no_resched();
548 schedule();
549 /* Access to CPU var must occur with preemptions disabled,
550 * otherwise Linux debug code complains loudly, even if it is
551 * ok here.
552 */
553 preempt_disable();
554 TRACE_CUR("finally exceeds system ceiling.\n");
555 remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
556 preempt_enable();
557 } else {
558 TRACE_CUR("is not priority ceiling blocked\n");
559 preempt_enable();
560 }
561}
562
563/* ************************************************************************** */
564
565
566
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 0000000000..fe7bd29b19
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,130 @@
1/*
2 * litmus/rt_domain.c
3 *
4 * LITMUS real-time infrastructure. This file contains the
5 * functions that manipulate RT domains. RT domains are an abstraction
6 * of a ready queue and a release queue.
7 */
8
9#include <linux/percpu.h>
10#include <linux/sched.h>
11#include <linux/list.h>
12
13#include <litmus/litmus.h>
14#include <litmus/sched_plugin.h>
15#include <litmus/sched_trace.h>
16
17#include <litmus/rt_domain.h>
18
19
20static int dummy_resched(rt_domain_t *rt)
21{
22 return 0;
23}
24
25static int dummy_order(struct list_head* a, struct list_head* b)
26{
27 return 0;
28}
29
30int release_order(struct list_head* a, struct list_head* b)
31{
32 return earlier_release(
33 list_entry(a, struct task_struct, rt_list),
34 list_entry(b, struct task_struct, rt_list));
35}
36
37
38void rt_domain_init(rt_domain_t *rt,
39 check_resched_needed_t f,
40 list_cmp_t order)
41{
42 BUG_ON(!rt);
43 if (!f)
44 f = dummy_resched;
45 if (!order)
46 order = dummy_order;
47 INIT_LIST_HEAD(&rt->ready_queue);
48 INIT_LIST_HEAD(&rt->release_queue);
49 rt->ready_lock = RW_LOCK_UNLOCKED;
50 rt->release_lock = SPIN_LOCK_UNLOCKED;
51 rt->check_resched = f;
52 rt->order = order;
53}
54
55/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
56 * @new: the newly released task
57 */
58void __add_ready(rt_domain_t* rt, struct task_struct *new)
59{
60 TRACE("rt: adding %s/%d (%llu, %llu) to ready queue at %llu\n",
61 new->comm, new->pid, get_exec_cost(new), get_rt_period(new),
62 sched_clock());
63
64 if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order))
65 rt->check_resched(rt);
66}
67
68struct task_struct* __take_ready(rt_domain_t* rt)
69{
70 struct task_struct *t = __peek_ready(rt);
71
72 /* kick it out of the ready list */
73 if (t)
74 list_del(&t->rt_list);
75 return t;
76}
77
78struct task_struct* __peek_ready(rt_domain_t* rt)
79{
80 if (!list_empty(&rt->ready_queue))
81 return next_ready(rt);
82 else
83 return NULL;
84}
85
86/* add_release - add a real-time task to the rt release queue.
87 * @task: the sleeping task
88 */
89void __add_release(rt_domain_t* rt, struct task_struct *task)
90{
91 TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to release queue\n",
92 task->comm, task->pid, get_exec_cost(task), get_rt_period(task),
93 get_release(task));
94
95 list_insert(&task->rt_list, &rt->release_queue, release_order);
96}
97
98void __release_pending(rt_domain_t* rt)
99{
100 struct list_head *pos, *save;
101 struct task_struct *queued;
102 lt_t now = sched_clock();
103 list_for_each_safe(pos, save, &rt->release_queue) {
104 queued = list_entry(pos, struct task_struct, rt_list);
105 if (likely(is_released(queued, now))) {
106 /* this one is ready to go*/
107 list_del(pos);
108 set_rt_flags(queued, RT_F_RUNNING);
109
110 sched_trace_job_release(queued);
111
112 /* now it can be picked up */
113 barrier();
114 add_ready(rt, queued);
115 }
116 else
117 /* the release queue is ordered */
118 break;
119 }
120}
121
122void try_release_pending(rt_domain_t* rt)
123{
124 unsigned long flags;
125
126 if (spin_trylock_irqsave(&rt->release_lock, flags)) {
127 __release_pending(rt);
128 spin_unlock_irqrestore(&rt->release_lock, flags);
129 }
130}
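
For orientation, a scheduler plugin is expected to embed and initialize an rt_domain_t roughly as follows. This is a sketch only: the edf_ready_order() comparison is assumed to be provided by litmus/edf_common.c (part of this patch, but not shown in this hunk), and the check_resched callback is deliberately simplistic.

	#include <linux/sched.h>
	#include <litmus/rt_domain.h>
	#include <litmus/edf_common.h>

	static rt_domain_t demo_domain;

	/* Hypothetical check_resched callback: ask the local CPU to reschedule
	 * whenever a new task is added to the ready queue.
	 */
	static int demo_check_resched(rt_domain_t *rt)
	{
		set_tsk_need_resched(current);
		return 1;
	}

	static void demo_domain_setup(void)
	{
		rt_domain_init(&demo_domain, demo_check_resched, edf_ready_order);
	}
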
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
new file mode 100644
index 0000000000..e879b02888
--- /dev/null
+++ b/litmus/sched_gsn_edf.c
@@ -0,0 +1,719 @@
1/*
2 * litmus/sched_gsn_edf.c
3 *
4 * Implementation of the GSN-EDF scheduling algorithm.
5 *
6 * This version uses the simple approach and serializes all scheduling
7 * decisions by the use of a queue lock. This is probably not the
8 * best way to do it, but it should suffice for now.
9 */
10
11#include <linux/spinlock.h>
12#include <linux/percpu.h>
13#include <linux/sched.h>
14#include <linux/list.h>
15
16#include <litmus/litmus.h>
17#include <litmus/sched_plugin.h>
18#include <litmus/edf_common.h>
19#include <litmus/sched_trace.h>
20
21#include <linux/module.h>
22
23/* Overview of GSN-EDF operations.
24 *
25 * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
26 * description only covers how the individual operations are implemented in
27 * LITMUS.
28 *
29 * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
30 * structure (NOT the actually scheduled
31 * task). If there is another linked task To
32 * already it will set To->linked_on = NO_CPU
33 * (thereby removing its association with this
34 * CPU). However, it will not requeue the
35 * previously linked task (if any). It will set
36 * T's state to RT_F_RUNNING and check whether
37 * it is already running somewhere else. If T
38 * is scheduled somewhere else it will link
39 * it to that CPU instead (and pull the linked
40 * task to cpu). T may be NULL.
41 *
42 * unlink(T) - Unlink removes T from all scheduler data
43 * structures. If it is linked to some CPU it
44 * will link NULL to that CPU. If it is
45 * currently queued in the gsnedf queue it will
46 * be removed from the T->rt_list. It is safe to
47 * call unlink(T) if T is not linked. T may not
48 * be NULL.
49 *
50 * requeue(T) - Requeue will insert T into the appropriate
51 * queue. If the system is in real-time mode and
52 * the T is released already, it will go into the
53 * ready queue. If the system is not in
54 *			  real-time mode, then T will go into the
55 * release queue. If T's release time is in the
56 * future, it will go into the release
57 * queue. That means that T's release time/job
58 *			  no/etc. has to be updated before requeue(T) is
59 * called. It is not safe to call requeue(T)
60 * when T is already queued. T may not be NULL.
61 *
62 * gsnedf_job_arrival(T) - This is the catch all function when T enters
63 * the system after either a suspension or at a
64 * job release. It will queue T (which means it
65 * is not safe to call gsnedf_job_arrival(T) if
66 * T is already queued) and then check whether a
67 * preemption is necessary. If a preemption is
68 * necessary it will update the linkage
69 * accordingly and cause scheduled to be called
70 * (either with an IPI or need_resched). It is
71 * safe to call gsnedf_job_arrival(T) if T's
72 * next job has not been actually released yet
73 *				    (release time in the future). T will be put
74 * on the release queue in that case.
75 *
76 * job_completion(T) - Take care of everything that needs to be done
77 * to prepare T for its next release and place
78 * it in the right queue with
79 * gsnedf_job_arrival().
80 *
81 *
82 * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
83 * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
84 * the functions will automatically propagate pending tasks from the ready queue
85 * to a linked task. This is the job of the calling function (by means of
86 * __take_ready).
87 */
88
89
90/* cpu_entry_t - maintain the linked and scheduled state
91 */
92typedef struct {
93 int cpu;
94 struct task_struct* linked; /* only RT tasks */
95 struct task_struct* scheduled; /* only RT tasks */
96 struct list_head list;
97 atomic_t will_schedule; /* prevent unneeded IPIs */
98} cpu_entry_t;
99DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
100
101#define set_will_schedule() \
102 (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
103#define clear_will_schedule() \
104 (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
105#define test_will_schedule(cpu) \
106 (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
107
108
109#define NO_CPU 0xffffffff
110
111/* The gsnedf_lock is used to serialize all scheduling events.
112 * It protects the gsnedf rt_domain and the per-CPU linkage/queue state.
113 */
114static DEFINE_SPINLOCK(gsnedf_lock);
115/* the cpus queue themselves according to priority in here */
116static LIST_HEAD(gsnedf_cpu_queue);
117
118static rt_domain_t gsnedf;
119
120
121/* update_cpu_position - Move the cpu entry to the correct place to maintain
122 * order in the cpu queue. Caller must hold gsnedf lock.
123 *
124 * This really should be a heap.
125 */
126static void update_cpu_position(cpu_entry_t *entry)
127{
128 cpu_entry_t *other;
129 struct list_head *pos;
130
131 if (likely(in_list(&entry->list)))
132 list_del(&entry->list);
133 /* if we do not execute real-time jobs we just move
134 * to the end of the queue
135 */
136 if (entry->linked) {
137 list_for_each(pos, &gsnedf_cpu_queue) {
138 other = list_entry(pos, cpu_entry_t, list);
139 if (edf_higher_prio(entry->linked, other->linked)) {
140 __list_add(&entry->list, pos->prev, pos);
141 return;
142 }
143 }
144 }
145 /* if we get this far we have the lowest priority job */
146 list_add_tail(&entry->list, &gsnedf_cpu_queue);
147}
148
149/* link_task_to_cpu - Update the link of a CPU.
150 * Handles the case where the to-be-linked task is already
151 * scheduled on a different CPU.
152 */
153static noinline void link_task_to_cpu(struct task_struct* linked,
154 cpu_entry_t *entry)
155{
156 cpu_entry_t *sched;
157 struct task_struct* tmp;
158 int on_cpu;
159
160 BUG_ON(linked && !is_realtime(linked));
161
162 /* Currently linked task is set to be unlinked. */
163 if (entry->linked) {
164 entry->linked->rt_param.linked_on = NO_CPU;
165 }
166
167 /* Link new task to CPU. */
168 if (linked) {
169 set_rt_flags(linked, RT_F_RUNNING);
170		/* handle the case that the task is already scheduled somewhere else */
171 on_cpu = linked->rt_param.scheduled_on;
172 if (on_cpu != NO_CPU) {
173 sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
174 /* this should only happen if not linked already */
175 BUG_ON(sched->linked == linked);
176
177 /* If we are already scheduled on the CPU to which we
178 * wanted to link, we don't need to do the swap --
179 * we just link ourselves to the CPU and depend on
180 * the caller to get things right.
181 */
182 if (entry != sched) {
183 tmp = sched->linked;
184 linked->rt_param.linked_on = sched->cpu;
185 sched->linked = linked;
186 update_cpu_position(sched);
187 linked = tmp;
188 }
189 }
190 if (linked) /* might be NULL due to swap */
191 linked->rt_param.linked_on = entry->cpu;
192 }
193 entry->linked = linked;
194 update_cpu_position(entry);
195}
196
197/* unlink - Make sure a task is not linked any longer to an entry
198 * where it was linked before. Must hold gsnedf_lock.
199 */
200static noinline void unlink(struct task_struct* t)
201{
202 cpu_entry_t *entry;
203
204 if (unlikely(!t)) {
205 TRACE_BUG_ON(!t);
206 return;
207 }
208
209 if (t->rt_param.linked_on != NO_CPU) {
210 /* unlink */
211 entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
212 t->rt_param.linked_on = NO_CPU;
213 link_task_to_cpu(NULL, entry);
214 } else if (in_list(&t->rt_list)) {
215 /* This is an interesting situation: t is scheduled,
216 * but was just recently unlinked. It cannot be
217 * linked anywhere else (because then it would have
218 * been relinked to this CPU), thus it must be in some
219 * queue. We must remove it from the list in this
220 * case.
221 */
222 list_del(&t->rt_list);
223 }
224}
225
226
227/* preempt - force a CPU to reschedule
228 */
229static noinline void preempt(cpu_entry_t *entry)
230{
231 /* We cannot make the is_np() decision here if it is a remote CPU
232 * because requesting exit_np() requires that we currently use the
233 * address space of the task. Thus, in the remote case we just send
234 * the IPI and let schedule() handle the problem.
235 */
236
237 if (smp_processor_id() == entry->cpu) {
238 if (entry->scheduled && is_np(entry->scheduled))
239 request_exit_np(entry->scheduled);
240 else
241 set_tsk_need_resched(current);
242 } else
243		/* in case it is a remote CPU we have to defer the
244		 * decision to the remote CPU
245 * FIXME: We could save a few IPI's here if we leave the flag
246 * set when we are waiting for a np_exit().
247 */
248 if (!test_will_schedule(entry->cpu))
249 smp_send_reschedule(entry->cpu);
250}
251
252/* requeue - Put an unlinked task into gsn-edf domain.
253 * Caller must hold gsnedf_lock.
254 */
255static noinline void requeue(struct task_struct* task)
256{
257 BUG_ON(!task);
258 /* sanity check rt_list before insertion */
259 BUG_ON(in_list(&task->rt_list));
260
261 if (get_rt_flags(task) == RT_F_SLEEP) {
262		/* This task has expired:
263		 * _schedule has already taken care of updating
264		 * the release time and deadline. We only need to
265		 * check whether it has been released.
266 */
267 if (is_released(task, sched_clock()))
268 __add_ready(&gsnedf, task);
269 else {
270 /* it has got to wait */
271 __add_release(&gsnedf, task);
272 }
273
274 } else
275		/* this is a forced preemption:
276		 * the task stays in the ready_queue;
277		 * we only need to make it available to others again
278 */
279 __add_ready(&gsnedf, task);
280}
281
282/* gsnedf_job_arrival: task is either resumed or released */
283static noinline void gsnedf_job_arrival(struct task_struct* task)
284{
285 cpu_entry_t* last;
286
287 BUG_ON(list_empty(&gsnedf_cpu_queue));
288 BUG_ON(!task);
289
290 /* first queue arriving job */
291 requeue(task);
292
293 /* then check for any necessary preemptions */
294 last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list);
295 if (edf_preemption_needed(&gsnedf, last->linked)) {
296 /* preemption necessary */
297 task = __take_ready(&gsnedf);
298 TRACE("job_arrival: task %d linked to %d\n",
299 task->pid, last->cpu);
300 if (last->linked)
301 requeue(last->linked);
302
303 link_task_to_cpu(task, last);
304 preempt(last);
305 }
306}
307
308/* check for current job releases */
309static noinline void gsnedf_release_jobs(void)
310{
311 struct list_head *pos, *save;
312 struct task_struct *queued;
313 lt_t now = sched_clock();
314
315
316 list_for_each_safe(pos, save, &gsnedf.release_queue) {
317 queued = list_entry(pos, struct task_struct, rt_list);
318 if (likely(is_released(queued, now))) {
319 /* this one is ready to go*/
320 list_del(pos);
321 set_rt_flags(queued, RT_F_RUNNING);
322
323 sched_trace_job_release(queued);
324 gsnedf_job_arrival(queued);
325 }
326 else
327 /* the release queue is ordered */
328 break;
329 }
330}
331
332/* gsnedf_tick - this function is called for every local timer
333 * interrupt.
334 *
335 * checks whether the current task's budget has been exhausted and,
336 * if so, whether the preemption can take effect or must be delayed
337 */
338static void gsnedf_tick(struct task_struct* t)
339{
340 unsigned long flags;
341
342 if (is_realtime(t) && budget_exhausted(t)) {
343 if (!is_np(t)) {
344 /* np tasks will be preempted when they become
345 * preemptable again
346 */
347 set_tsk_need_resched(t);
348 set_will_schedule();
349 TRACE("gsnedf_scheduler_tick: "
350 "%d is preemptable "
351 " => FORCE_RESCHED\n", t->pid);
352 } else {
353 TRACE("gsnedf_scheduler_tick: "
354 "%d is non-preemptable, "
355 "preemption delayed.\n", t->pid);
356 request_exit_np(t);
357 }
358 }
359
360 /* only the first CPU needs to release jobs */
361 /* FIXME: drive this from a hrtimer */
362 if (smp_processor_id() == 0) {
363 spin_lock_irqsave(&gsnedf_lock, flags);
364
365 /* Try to release pending jobs */
366 gsnedf_release_jobs();
367
368 /* We don't need to check linked != scheduled since
369 * set_tsk_need_resched has been set by preempt() if necessary.
370 */
371
372 spin_unlock_irqrestore(&gsnedf_lock, flags);
373 }
374}
375
376/* caller holds gsnedf_lock */
377static noinline void job_completion(struct task_struct *t)
378{
379 BUG_ON(!t);
380
381 sched_trace_job_completion(t);
382
383 TRACE_TASK(t, "job_completion().\n");
384
385 /* set flags */
386 set_rt_flags(t, RT_F_SLEEP);
387 /* prepare for next period */
388 edf_prepare_for_next_period(t);
389 /* unlink */
390 unlink(t);
391 /* requeue
392 * But don't requeue a blocking task. */
393 if (is_running(t))
394 gsnedf_job_arrival(t);
395}
396
397
398/* Getting schedule() right is a bit tricky. schedule() may not make any
399 * assumptions on the state of the current task since it may be called for a
400 * number of reasons. The reasons include a scheduler_tick() determined that it
401 * was necessary, because sys_exit_np() was called, because some Linux
402 * subsystem determined so, or even (in the worst case) because there is a bug
403 * hidden somewhere. Thus, we must take extreme care to determine what the
404 * current state is.
405 *
406 * The CPU could currently be scheduling a task (or not), be linked (or not).
407 *
408 * The following assertions for the scheduled task could hold:
409 *
410 * - !is_running(scheduled) // the job blocks
411 * - scheduled->timeslice == 0 // the job completed (forcefully)
412 * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
413 * - linked != scheduled // we need to reschedule (for any reason)
414 * - is_np(scheduled) // rescheduling must be delayed,
415 * sys_exit_np must be requested
416 *
417 * Any of these can occur together.
418 */
419static struct task_struct* gsnedf_schedule(struct task_struct * prev)
420{
421 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
422 int out_of_time, sleep, preempt, np, exists, blocks;
423 struct task_struct* next = NULL;
424
425 /* Will be released in finish_switch. */
426 spin_lock(&gsnedf_lock);
427 clear_will_schedule();
428
429 /* sanity checking */
430 BUG_ON(entry->scheduled && entry->scheduled != prev);
431 BUG_ON(entry->scheduled && !is_realtime(prev));
432 BUG_ON(is_realtime(prev) && !entry->scheduled);
433
434 /* (0) Determine state */
435 exists = entry->scheduled != NULL;
436 blocks = exists && !is_running(entry->scheduled);
437 out_of_time = exists && budget_exhausted(entry->scheduled);
438 np = exists && is_np(entry->scheduled);
439 sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
440 preempt = entry->scheduled != entry->linked;
441
442 /* If a task blocks we have no choice but to reschedule.
443 */
444 if (blocks)
445 unlink(entry->scheduled);
446
447 /* Request a sys_exit_np() call if we would like to preempt but cannot.
448 * We need to make sure to update the link structure anyway in case
449 * that we are still linked. Multiple calls to request_exit_np() don't
450 * hurt.
451 */
452 if (np && (out_of_time || preempt || sleep)) {
453 unlink(entry->scheduled);
454 request_exit_np(entry->scheduled);
455 }
456
457 /* Any task that is preemptable and either exhausts its execution
458 * budget or wants to sleep completes. We may have to reschedule after
459 * this.
460 */
461 if (!np && (out_of_time || sleep))
462 job_completion(entry->scheduled);
463
464 /* Link pending task if we became unlinked.
465 */
466 if (!entry->linked)
467 link_task_to_cpu(__take_ready(&gsnedf), entry);
468
469 /* The final scheduling decision. Do we need to switch for some reason?
470 * If linked is different from scheduled, select linked as next.
471 */
472 if ((!np || blocks) &&
473 entry->linked != entry->scheduled) {
474 /* Schedule a linked job? */
475 if (entry->linked)
476 next = entry->linked;
477 } else
478 /* Only override Linux scheduler if we have real-time task
479 * scheduled that needs to continue.
480 */
481 if (exists)
482 next = prev;
483
484 spin_unlock(&gsnedf_lock);
485
486 /* don't race with a concurrent switch */
487 if (next && prev != next)
488 while (next->rt_param.scheduled_on != NO_CPU)
489 cpu_relax();
490 return next;
491}
492
493
494/* _finish_switch - we just finished the switch away from prev
495 */
496static void gsnedf_finish_switch(struct task_struct *prev)
497{
498 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
499
500 entry->scheduled = is_realtime(current) ? current : NULL;
501
502 prev->rt_param.scheduled_on = NO_CPU;
503 current->rt_param.scheduled_on = smp_processor_id();
504}
505
506
507/* Prepare a task for running in RT mode
508 */
509static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
510{
511 unsigned long flags;
512 cpu_entry_t* entry;
513
514 TRACE("gsn edf: task new %d\n", t->pid);
515
516 spin_lock_irqsave(&gsnedf_lock, flags);
517 if (running) {
518 entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
519 BUG_ON(entry->scheduled);
520 entry->scheduled = t;
521 t->rt_param.scheduled_on = task_cpu(t);
522 } else
523 t->rt_param.scheduled_on = NO_CPU;
524 t->rt_param.linked_on = NO_CPU;
525
526 /* setup job params */
527 edf_release_at(t, sched_clock());
528
529 gsnedf_job_arrival(t);
530 spin_unlock_irqrestore(&gsnedf_lock, flags);
531}
532
533static void gsnedf_task_wake_up(struct task_struct *task)
534{
535 unsigned long flags;
536 lt_t now;
537
538 spin_lock_irqsave(&gsnedf_lock, flags);
539 /* We need to take suspensions because of semaphores into
540 * account! If a job resumes after being suspended due to acquiring
541 * a semaphore, it should never be treated as a new job release.
542 */
543 if (get_rt_flags(task) == RT_F_EXIT_SEM) {
544 set_rt_flags(task, RT_F_RUNNING);
545 } else {
546 now = sched_clock();
547 if (is_tardy(task, now)) {
548 /* new sporadic release */
549 edf_release_at(task, now);
550 sched_trace_job_release(task);
551 }
552 else if (task->time_slice)
553 /* came back in time before deadline
554 */
555 set_rt_flags(task, RT_F_RUNNING);
556 }
557 gsnedf_job_arrival(task);
558 spin_unlock_irqrestore(&gsnedf_lock, flags);
559}
560
561static void gsnedf_task_block(struct task_struct *t)
562{
563 unsigned long flags;
564
565 /* unlink if necessary */
566 spin_lock_irqsave(&gsnedf_lock, flags);
567 unlink(t);
568 spin_unlock_irqrestore(&gsnedf_lock, flags);
569
570 BUG_ON(!is_realtime(t));
571 BUG_ON(t->rt_list.next != LIST_POISON1);
572 BUG_ON(t->rt_list.prev != LIST_POISON2);
573}
574
575
576static void gsnedf_task_exit(struct task_struct * t)
577{
578 unsigned long flags;
579
580 /* unlink if necessary */
581 spin_lock_irqsave(&gsnedf_lock, flags);
582 unlink(t);
583 spin_unlock_irqrestore(&gsnedf_lock, flags);
584
585 BUG_ON(!is_realtime(t));
586 TRACE_TASK(t, "RIP\n");
587 BUG_ON(t->rt_list.next != LIST_POISON1);
588 BUG_ON(t->rt_list.prev != LIST_POISON2);
589}
590
591static long gsnedf_pi_block(struct pi_semaphore *sem,
592 struct task_struct *new_waiter)
593{
594 /* This callback has to handle the situation where a new waiter is
595 * added to the wait queue of the semaphore.
596 *
597 * We must check if it has a higher priority than the currently
598 * highest-priority task, and then potentially reschedule.
599 */
600
601 BUG_ON(!new_waiter);
602
603 if (edf_higher_prio(new_waiter, sem->hp.task)) {
604 TRACE_TASK(new_waiter, " boosts priority\n");
605 /* called with IRQs disabled */
606 spin_lock(&gsnedf_lock);
607 /* store new highest-priority task */
608 sem->hp.task = new_waiter;
609 if (sem->holder) {
610 /* let holder inherit */
611 sem->holder->rt_param.inh_task = new_waiter;
612 unlink(sem->holder);
613 gsnedf_job_arrival(sem->holder);
614 }
615 spin_unlock(&gsnedf_lock);
616 }
617
618 return 0;
619}
620
621static long gsnedf_inherit_priority(struct pi_semaphore *sem,
622 struct task_struct *new_owner)
623{
624 /* We don't need to acquire the gsnedf_lock since at the time of this
625 * call new_owner isn't actually scheduled yet (it's still sleeping)
626 * and since the calling function already holds sem->wait.lock, which
627 * prevents concurrent sem->hp.task changes.
628 */
629
630 if (sem->hp.task && sem->hp.task != new_owner) {
631 new_owner->rt_param.inh_task = sem->hp.task;
632 TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
633 sem->hp.task->comm, sem->hp.task->pid);
634 } else
635 TRACE_TASK(new_owner,
636 "cannot inherit priority, "
637 "no higher priority job waits.\n");
638 return 0;
639}
640
641/* This function is called on a semaphore release, and assumes that
642 * the current task is also the semaphore holder.
643 */
644static long gsnedf_return_priority(struct pi_semaphore *sem)
645{
646 struct task_struct* t = current;
647 int ret = 0;
648
649 /* Find new highest-priority semaphore task
650 * if holder task is the current hp.task.
651 *
652 * Calling function holds sem->wait.lock.
653 */
654 if (t == sem->hp.task)
655 edf_set_hp_task(sem);
656
657 TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
658
659 if (t->rt_param.inh_task) {
660 /* interrupts already disabled by PI code */
661 spin_lock(&gsnedf_lock);
662
663 /* Reset inh_task to NULL. */
664 t->rt_param.inh_task = NULL;
665
666 /* Check if rescheduling is necessary */
667 unlink(t);
668 gsnedf_job_arrival(t);
669 spin_unlock(&gsnedf_lock);
670 }
671
672 return ret;
673}
674
675static long gsnedf_admit_task(struct task_struct* tsk)
676{
677 return 0;
678}
679
680
681/* Plugin object */
682static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
683 .plugin_name = "GSN-EDF",
684 .finish_switch = gsnedf_finish_switch,
685 .tick = gsnedf_tick,
686 .task_new = gsnedf_task_new,
687 .complete_job = edf_complete_job,
688 .task_exit = gsnedf_task_exit,
689 .schedule = gsnedf_schedule,
690 .task_wake_up = gsnedf_task_wake_up,
691 .task_block = gsnedf_task_block,
692 .pi_block = gsnedf_pi_block,
693 .inherit_priority = gsnedf_inherit_priority,
694 .return_priority = gsnedf_return_priority,
695 .admit_task = gsnedf_admit_task
696};
697
698
699static int __init init_gsn_edf(void)
700{
701 int cpu;
702 cpu_entry_t *entry;
703
704 /* initialize CPU state */
705 for (cpu = 0; cpu < NR_CPUS; cpu++) {
706 entry = &per_cpu(gsnedf_cpu_entries, cpu);
707 atomic_set(&entry->will_schedule, 0);
708 entry->linked = NULL;
709 entry->scheduled = NULL;
710 entry->cpu = cpu;
711 INIT_LIST_HEAD(&entry->list);
712 }
713
714 edf_domain_init(&gsnedf, NULL);
715 return register_sched_plugin(&gsn_edf_plugin);
716}
717
718
719module_init(init_gsn_edf);
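
For illustration, the final switch test in gsnedf_schedule() above reduces to
(!np || blocks) && linked != scheduled. The standalone sketch below (plain
userspace C with hypothetical names, not kernel code) makes the relevant
combinations explicit:

#include <stdio.h>

/* 1 = hand the linked job (or the CPU) back to Linux, 0 = prev keeps running.
 * Blocking, budget exhaustion (out_of_time) and job completion (sleep) all
 * unlink the scheduled task beforehand, which is what makes linked differ
 * from scheduled by the time this test runs.
 */
static int needs_switch(int np, int blocks, int linked_is_scheduled)
{
        /* a non-preemptable section defers the switch unless the task blocked */
        return (!np || blocks) && !linked_is_scheduled;
}

int main(void)
{
        printf("np section, still linked:  %d\n", needs_switch(1, 0, 1));
        printf("np section, now unlinked:  %d\n", needs_switch(1, 0, 0));
        printf("blocked in np section:     %d\n", needs_switch(1, 1, 0));
        printf("preemption by linked job:  %d\n", needs_switch(0, 0, 0));
        return 0;
}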
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
new file mode 100644
index 0000000000..89ae3941db
--- /dev/null
+++ b/litmus/sched_litmus.c
@@ -0,0 +1,149 @@
1/* This file is included from kernel/sched.c */
2
3#include <litmus/litmus.h>
4#include <litmus/sched_plugin.h>
5
6static void update_time_litmus(struct rq *rq, struct task_struct *p)
7{
8 lt_t now = sched_clock();
9 p->rt_param.job_params.exec_time +=
10 now - p->rt_param.job_params.exec_start;
11 p->rt_param.job_params.exec_start = now;
12}
13
14static void double_rq_lock(struct rq *rq1, struct rq *rq2);
15static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
16
17static void litmus_tick(struct rq *rq, struct task_struct *p)
18{
19 if (is_realtime(p))
20 update_time_litmus(rq, p);
21 litmus->tick(p);
22}
23
24static void litmus_schedule(struct rq *rq, struct task_struct *prev)
25{
26 struct rq* other_rq;
27 int success = 0;
28 /* WARNING: rq is _not_ locked! */
29 if (is_realtime(prev))
30 update_time_litmus(rq, prev);
31
32 while (!success) {
33 /* let the plugin schedule */
34 rq->litmus_next = litmus->schedule(prev);
35
36 /* check if a global plugin pulled a task from a different RQ */
37 if (rq->litmus_next && task_rq(rq->litmus_next) != rq) {
38 /* we need to migrate the task */
39 other_rq = task_rq(rq->litmus_next);
40 double_rq_lock(rq, other_rq);
41 /* now that we have the lock we need to make sure a
42 * couple of things still hold:
43 * - it is still a real-time task
44 * - it is still runnable (could have been stopped)
45 */
46 if (is_realtime(rq->litmus_next) &&
47 is_running(rq->litmus_next)) {
48 set_task_cpu(rq->litmus_next, smp_processor_id());
49 success = 1;
50 } /* else something raced, retry */
51 double_rq_unlock(rq, other_rq);
52 } else
53 success = 1;
54 }
55}
56
57static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, int wakeup)
58{
59 if (wakeup)
60 litmus->task_wake_up(p);
61}
62
63static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, int sleep)
64{
65 if (sleep)
66 litmus->task_block(p);
67}
68
69static void yield_task_litmus(struct rq *rq)
70{
71 BUG_ON(rq->curr != current);
72 litmus->complete_job();
73}
74
75/* Plugins are responsible for this.
76 */
77static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p)
78{
79}
80
81/* has already been taken care of */
82static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
83{
84}
85
86static struct task_struct *pick_next_task_litmus(struct rq *rq)
87{
88 struct task_struct* picked = rq->litmus_next;
89 rq->litmus_next = NULL;
90 if (picked)
91 picked->rt_param.job_params.exec_start = sched_clock();
92 return picked;
93}
94
95static void task_tick_litmus(struct rq *rq, struct task_struct *p)
96{
97}
98
99/* This is called when a task becomes a real-time task, either due
100 * to a SCHED_* class transition or due to PI mutex inheritance.
101 * We don't handle Linux PI mutex inheritance yet; use the LITMUS-provided
102 * synchronization primitives instead.
103 */
104static void set_curr_task_litmus(struct rq *rq)
105{
106 rq->curr->rt_param.job_params.exec_start = sched_clock();
107}
108
109
110#ifdef CONFIG_SMP
111
112/* we don't repartition at runtime */
113
114static unsigned long
115load_balance_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
116 unsigned long max_load_move,
117 struct sched_domain *sd, enum cpu_idle_type idle,
118 int *all_pinned, int *this_best_prio)
119{
120 return 0;
121}
122
123static int
124move_one_task_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
125 struct sched_domain *sd, enum cpu_idle_type idle)
126{
127 return 0;
128}
129#endif
130
131const struct sched_class litmus_sched_class = {
132 .next = &rt_sched_class,
133 .enqueue_task = enqueue_task_litmus,
134 .dequeue_task = dequeue_task_litmus,
135 .yield_task = yield_task_litmus,
136
137 .check_preempt_curr = check_preempt_curr_litmus,
138
139 .pick_next_task = pick_next_task_litmus,
140 .put_prev_task = put_prev_task_litmus,
141
142#ifdef CONFIG_SMP
143 .load_balance = load_balance_litmus,
144 .move_one_task = move_one_task_litmus,
145#endif
146
147 .set_curr_task = set_curr_task_litmus,
148 .task_tick = task_tick_litmus,
149};
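
For illustration, the ".next = &rt_sched_class" entry above is what places the
LITMUS class ahead of the stock real-time and fair classes: kernel/sched.c walks
the class chain and takes the first class whose pick_next_task() returns a task.
The minimal userspace sketch below (made-up types, not the kernel API) mimics
that iteration:

#include <stdio.h>
#include <stddef.h>

struct rq;                                  /* opaque in this sketch */

struct fake_class {
        const char *name;
        const struct fake_class *next;
        const char *(*pick)(struct rq *rq); /* NULL result = nothing to run */
};

static const char *pick_litmus(struct rq *rq) { (void)rq; return NULL; }
static const char *pick_rt(struct rq *rq)     { (void)rq; return NULL; }
static const char *pick_fair(struct rq *rq)   { (void)rq; return "cfs task"; }

static const struct fake_class fair_cls   = { "fair",   NULL,      pick_fair };
static const struct fake_class rt_cls     = { "rt",     &fair_cls, pick_rt };
static const struct fake_class litmus_cls = { "litmus", &rt_cls,   pick_litmus };

int main(void)
{
        const struct fake_class *c;
        const char *task = NULL;

        /* highest class first: LITMUS gets a chance before rt and fair */
        for (c = &litmus_cls; c && !task; c = c->next)
                task = c->pick(NULL);
        printf("picked: %s\n", task ? task : "(idle)");
        return 0;
}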
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 0000000000..f7eb116ee4
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,174 @@
1/* sched_plugin.c -- core infrastructure for the scheduler plugin system
2 *
3 * This file includes the initialization of the plugin system, the no-op Linux
4 * scheduler plugin and some dummy functions.
5 */
6
7#include <linux/list.h>
8#include <linux/spinlock.h>
9
10#include <litmus/litmus.h>
11#include <litmus/sched_plugin.h>
12
13
14/*************************************************************
15 * Dummy plugin functions *
16 *************************************************************/
17
18static void litmus_dummy_finish_switch(struct task_struct * prev)
19{
20}
21
22static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
23{
24 return NULL;
25}
26
27static void litmus_dummy_tick(struct task_struct* tsk)
28{
29}
30
31static long litmus_dummy_admit_task(struct task_struct* tsk)
32{
33 printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
34 tsk->comm, tsk->pid);
35 return -EINVAL;
36}
37
38static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
39{
40}
41
42static void litmus_dummy_task_wake_up(struct task_struct *task)
43{
44}
45
46static void litmus_dummy_task_block(struct task_struct *task)
47{
48}
49
50static void litmus_dummy_task_exit(struct task_struct *task)
51{
52}
53
54static long litmus_dummy_complete_job(void)
55{
56 return -ENOSYS;
57}
58
59static long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
60 struct task_struct *new_owner)
61{
62 return -ENOSYS;
63}
64
65static long litmus_dummy_return_priority(struct pi_semaphore *sem)
66{
67 return -ENOSYS;
68}
69
70static long litmus_dummy_pi_block(struct pi_semaphore *sem,
71 struct task_struct *new_waiter)
72{
73 return -ENOSYS;
74}
75
76
77
78/* The default scheduler plugin. It doesn't do anything and lets Linux do its
79 * job.
80 */
81struct sched_plugin linux_sched_plugin = {
82 .plugin_name = "Linux",
83 .tick = litmus_dummy_tick,
84 .task_new = litmus_dummy_task_new,
85 .task_exit = litmus_dummy_task_exit,
86 .task_wake_up = litmus_dummy_task_wake_up,
87 .task_block = litmus_dummy_task_block,
88 .complete_job = litmus_dummy_complete_job,
89 .schedule = litmus_dummy_schedule,
90 .finish_switch = litmus_dummy_finish_switch,
91 .inherit_priority = litmus_dummy_inherit_priority,
92 .return_priority = litmus_dummy_return_priority,
93 .pi_block = litmus_dummy_pi_block,
94 .admit_task = litmus_dummy_admit_task
95};
96
97/*
98 * The reference to current plugin that is used to schedule tasks within
99 * the system. It stores references to actual function implementations
100 * Should be initialized by calling "init_***_plugin()"
101 */
102struct sched_plugin *litmus = &linux_sched_plugin;
103
104/* the list of registered scheduling plugins */
105static LIST_HEAD(sched_plugins);
106static DEFINE_SPINLOCK(sched_plugins_lock);
107
108#define CHECK(func) {\
109 if (!plugin->func) \
110 plugin->func = litmus_dummy_ ## func;}
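/* Illustration (not part of the patch): CHECK(tick) expands to
 *   { if (!plugin->tick) plugin->tick = litmus_dummy_tick; }
 * so after register_sched_plugin() every callback pointer is non-NULL and
 * callers such as litmus_tick() can invoke litmus->tick(p) unconditionally.
 */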
111
112/* FIXME: get reference to module */
113int register_sched_plugin(struct sched_plugin* plugin)
114{
115 printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
116 plugin->plugin_name);
117
118 /* make sure we don't trip over null pointers later */
119 CHECK(finish_switch);
120 CHECK(schedule);
121 CHECK(tick);
122 CHECK(task_wake_up);
123 CHECK(task_exit);
124 CHECK(task_block);
125 CHECK(task_new);
126 CHECK(complete_job);
127 CHECK(inherit_priority);
128 CHECK(return_priority);
129 CHECK(pi_block);
130 CHECK(admit_task);
131
132 spin_lock(&sched_plugins_lock);
133 list_add(&plugin->list, &sched_plugins);
134 spin_unlock(&sched_plugins_lock);
135
136 return 0;
137}
138
139
140/* FIXME: reference counting, etc. */
141struct sched_plugin* find_sched_plugin(const char* name)
142{
143 struct list_head *pos;
144 struct sched_plugin *plugin;
145
146 spin_lock(&sched_plugins_lock);
147 list_for_each(pos, &sched_plugins) {
148 plugin = list_entry(pos, struct sched_plugin, list);
149 if (!strcmp(plugin->plugin_name, name))
150 goto out_unlock;
151 }
152 plugin = NULL;
153
154out_unlock:
155 spin_unlock(&sched_plugins_lock);
156 return plugin;
157}
158
159int print_sched_plugins(char* buf, int max)
160{
161 int count = 0;
162 struct list_head *pos;
163 struct sched_plugin *plugin;
164
165 spin_lock(&sched_plugins_lock);
166 list_for_each(pos, &sched_plugins) {
167 plugin = list_entry(pos, struct sched_plugin, list);
168 count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
169 if (max - count <= 0)
170 break;
171 }
172 spin_unlock(&sched_plugins_lock);
173 return count;
174}
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
new file mode 100644
index 0000000000..961680d0a6
--- /dev/null
+++ b/litmus/sched_psn_edf.c
@@ -0,0 +1,440 @@
1
2/*
3 * kernel/sched_psn_edf.c
4 *
5 * Implementation of the PSN-EDF scheduler plugin.
6 * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
7 *
8 * Suspensions and non-preemptable sections are supported.
9 * Priority inheritance is not supported.
10 */
11
12#include <linux/percpu.h>
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/spinlock.h>
16
17#include <linux/module.h>
18
19#include <litmus/litmus.h>
20#include <litmus/sched_plugin.h>
21#include <litmus/edf_common.h>
22
23
24typedef struct {
25 rt_domain_t domain;
26 int cpu;
27 struct task_struct* scheduled; /* only RT tasks */
28 spinlock_t lock; /* protects the domain and
29 * serializes scheduling decisions
30 */
31} psnedf_domain_t;
32
33DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
34
35#define local_edf (&__get_cpu_var(psnedf_domains).domain)
36#define local_pedf (&__get_cpu_var(psnedf_domains))
37#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
38#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
39#define task_edf(task) remote_edf(get_partition(task))
40#define task_pedf(task) remote_pedf(get_partition(task))
41
42
43static void psnedf_domain_init(psnedf_domain_t* pedf,
44 check_resched_needed_t check,
45 int cpu)
46{
47 edf_domain_init(&pedf->domain, check);
48 pedf->cpu = cpu;
49 pedf->lock = SPIN_LOCK_UNLOCKED;
50 pedf->scheduled = NULL;
51}
52
53static void requeue(struct task_struct* t, rt_domain_t *edf)
54{
55 /* only requeue if t is actually running */
56 BUG_ON(!is_running(t));
57
58 if (t->state != TASK_RUNNING)
59 TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
60
61 set_rt_flags(t, RT_F_RUNNING);
62 if (is_released(t, sched_clock()))
63 __add_ready(edf, t);
64 else
65 __add_release(edf, t); /* it has got to wait */
66}
67
68/* we assume the lock is being held */
69static void preempt(psnedf_domain_t *pedf)
70{
71 if (smp_processor_id() == pedf->cpu) {
72 if (pedf->scheduled && is_np(pedf->scheduled))
73 request_exit_np(pedf->scheduled);
74 else
75 set_tsk_need_resched(current);
76 } else
77 /* in case it is a remote CPU, we have to defer the
78 * decision to the remote CPU
79 */
80 smp_send_reschedule(pedf->cpu);
81}
82
83/* This check is trivial in partitioned systems as we only have to consider
84 * the CPU of the partition.
85 */
86static int psnedf_check_resched(rt_domain_t *edf)
87{
88 psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
89 int ret = 0;
90
91 /* because this is a callback from rt_domain_t we already hold
92 * the necessary lock for the ready queue
93 */
94 if (edf_preemption_needed(edf, pedf->scheduled)) {
95 preempt(pedf);
96 ret = 1;
97 }
98 return ret;
99}
100
101
102static void psnedf_tick(struct task_struct *t)
103{
104 unsigned long flags;
105 rt_domain_t *edf = local_edf;
106 psnedf_domain_t *pedf = local_pedf;
107
108 /* Check for inconsistency. We don't need the lock for this since
109 * ->scheduled is only changed in schedule, which obviously is not
110 * executing in parallel on this CPU
111 */
112 BUG_ON(is_realtime(t) && t != pedf->scheduled);
113
114 if (is_realtime(t) && budget_exhausted(t)) {
115 if (!is_np(t))
116 set_tsk_need_resched(t);
117 else {
118 TRACE("psnedf_scheduler_tick: "
119 "%d is non-preemptable, "
120 "preemption delayed.\n", t->pid);
121 request_exit_np(t);
122 }
123 }
124
125 spin_lock_irqsave(&pedf->lock, flags);
126 /* FIXME: release via hrtimer */
127 __release_pending(edf);
128 spin_unlock_irqrestore(&pedf->lock, flags);
129}
130
131static void job_completion(struct task_struct* t)
132{
133 TRACE_TASK(t, "job_completion().\n");
134 set_rt_flags(t, RT_F_SLEEP);
135 edf_prepare_for_next_period(t);
136}
137
138static struct task_struct* psnedf_schedule(struct task_struct * prev)
139{
140 psnedf_domain_t* pedf = local_pedf;
141 rt_domain_t* edf = &pedf->domain;
142 struct task_struct* next;
143
144 int out_of_time, sleep, preempt,
145 np, exists, blocks, resched;
146
147 spin_lock(&pedf->lock);
148
149 /* sanity checking */
150 BUG_ON(pedf->scheduled && pedf->scheduled != prev);
151 BUG_ON(pedf->scheduled && !is_realtime(prev));
152
153 /* (0) Determine state */
154 exists = pedf->scheduled != NULL;
155 blocks = exists && !is_running(pedf->scheduled);
156 out_of_time = exists && budget_exhausted(pedf->scheduled);
157 np = exists && is_np(pedf->scheduled);
158 sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
159 preempt = edf_preemption_needed(edf, prev);
160
161 /* If we need to preempt do so.
162 * The following checks set resched to 1 in case of special
163 * circumstances.
164 */
165 resched = preempt;
166
167 /* If a task blocks we have no choice but to reschedule.
168 */
169 if (blocks)
170 resched = 1;
171
172 /* Request a sys_exit_np() call if we would like to preempt but cannot.
173 * Multiple calls to request_exit_np() don't hurt.
174 */
175 if (np && (out_of_time || preempt || sleep))
176 request_exit_np(pedf->scheduled);
177
178 /* Any task that is preemptable and either exhausts its execution
179 * budget or wants to sleep completes. We may have to reschedule after
180 * this.
181 */
182 if (!np && (out_of_time || sleep)) {
183 job_completion(pedf->scheduled);
184 resched = 1;
185 }
186
187 /* The final scheduling decision. Do we need to switch for some reason?
188 * Switch if we are in RT mode and have no task or if we need to
189 * resched.
190 */
191 next = NULL;
192 if ((!np || blocks) && (resched || !exists)) {
193 /* Take care of a previously scheduled
194 * job by taking it out of the Linux runqueue.
195 */
196 if (pedf->scheduled && !blocks)
197 requeue(pedf->scheduled, edf);
198 next = __take_ready(edf);
199 } else
200 /* Only override Linux scheduler if we have a real-time task
201 * scheduled that needs to continue.
202 */
203 if (exists)
204 next = prev;
205
206 if (next)
207 set_rt_flags(next, RT_F_RUNNING);
208
209 pedf->scheduled = next;
210 spin_unlock(&pedf->lock);
211 return next;
212}
213
214
215/* Prepare a task for running in RT mode
216 * Enqueues the task into the master queue data structure.
217 */
218static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
219{
220 rt_domain_t* edf = task_edf(t);
221 psnedf_domain_t* pedf = task_pedf(t);
222 unsigned long flags;
223
224 TRACE("[%d] psn edf: prepare new %d on CPU %d\n",
225 smp_processor_id(), t->pid, get_partition(t));
226
227 /* setup job parameters */
228 edf_release_at(t, sched_clock());
229
230 /* The task should be running in the queue, otherwise signal
231 * code will try to wake it up with fatal consequences.
232 */
233 spin_lock_irqsave(&pedf->lock, flags);
234 if (running) {
235 /* there shouldn't be anything else running at the time */
236 BUG_ON(pedf->scheduled);
237 pedf->scheduled = t;
238 } else {
239 requeue(t, edf);
240 /* maybe we have to reschedule */
241 preempt(pedf);
242 }
243 spin_unlock_irqrestore(&pedf->lock, flags);
244}
245
246static void psnedf_task_wake_up(struct task_struct *task)
247{
248 unsigned long flags;
249 psnedf_domain_t* pedf = task_pedf(task);
250 rt_domain_t* edf = task_edf(task);
251 lt_t now;
252
253 spin_lock_irqsave(&pedf->lock, flags);
254 BUG_ON(in_list(&task->rt_list));
255 /* We need to take suspensions because of semaphores into
256 * account! If a job resumes after being suspended due to acquiring
257 * a semaphore, it should never be treated as a new job release.
258 *
259 * FIXME: This should be handled in a more predictable, userspace-controlled way.
260 */
261 now = sched_clock();
262 if (is_tardy(task, now) &&
263 get_rt_flags(task) != RT_F_EXIT_SEM) {
264 /* new sporadic release */
265 edf_release_at(task, now);
266 sched_trace_job_release(task);
267 }
268 requeue(task, edf);
269 spin_unlock_irqrestore(&pedf->lock, flags);
270}
271
272static void psnedf_task_block(struct task_struct *t)
273{
274 /* only running tasks can block, thus t is in no queue */
275 BUG_ON(!is_realtime(t));
276 BUG_ON(in_list(&t->rt_list));
277}
278
279static void psnedf_task_exit(struct task_struct * t)
280{
281 unsigned long flags;
282 psnedf_domain_t* pedf = task_pedf(t);
283
284 spin_lock_irqsave(&pedf->lock, flags);
285
286 if (in_list(&t->rt_list))
287 /* dequeue */
288 list_del(&t->rt_list);
289 preempt(pedf);
290 spin_unlock_irqrestore(&pedf->lock, flags);
291}
292
293static long psnedf_pi_block(struct pi_semaphore *sem,
294 struct task_struct *new_waiter)
295{
296 psnedf_domain_t* pedf;
297 rt_domain_t* edf;
298 struct task_struct* t;
299 int cpu = get_partition(new_waiter);
300
301 BUG_ON(!new_waiter);
302
303 if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
304 TRACE_TASK(new_waiter, " boosts priority\n");
305 pedf = task_pedf(new_waiter);
306 edf = task_edf(new_waiter);
307
308 /* interrupts already disabled */
309 spin_lock(&pedf->lock);
310
311 /* store new highest-priority task */
312 sem->hp.cpu_task[cpu] = new_waiter;
313 if (sem->holder &&
314 get_partition(sem->holder) == get_partition(new_waiter)) {
315 /* let holder inherit */
316 sem->holder->rt_param.inh_task = new_waiter;
317 t = sem->holder;
318 if (in_list(&t->rt_list)) {
319 /* queued in domain*/
320 list_del(&t->rt_list);
321 /* readd to make priority change take place */
322 if (is_released(t, sched_clock()))
323 __add_ready(edf, t);
324 else
325 __add_release(edf, t);
326 }
327 }
328
329 /* check if we need to reschedule */
330 if (edf_preemption_needed(edf, current))
331 preempt(pedf);
332
333 spin_unlock(&pedf->lock);
334 }
335
336 return 0;
337}
338
339static long psnedf_inherit_priority(struct pi_semaphore *sem,
340 struct task_struct *new_owner)
341{
342 int cpu = get_partition(new_owner);
343
344 new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
345 if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
346 TRACE_TASK(new_owner,
347 "inherited priority from %s/%d\n",
348 sem->hp.cpu_task[cpu]->comm,
349 sem->hp.cpu_task[cpu]->pid);
350 } else
351 TRACE_TASK(new_owner,
352 "cannot inherit priority: "
353 "no higher priority job waits on this CPU!\n");
354 /* make new owner non-preemptable as required by FMLP under
355 * PSN-EDF.
356 */
357 make_np(new_owner);
358 return 0;
359}
360
361
362/* This function is called on a semaphore release, and assumes that
363 * the current task is also the semaphore holder.
364 */
365static long psnedf_return_priority(struct pi_semaphore *sem)
366{
367 struct task_struct* t = current;
368 psnedf_domain_t* pedf = task_pedf(t);
369 rt_domain_t* edf = task_edf(t);
370 int ret = 0;
371 int cpu = get_partition(current);
372
373
374 /* Find new highest-priority semaphore task
375 * if holder task is the current hp.cpu_task[cpu].
376 *
377 * Calling function holds sem->wait.lock.
378 */
379 if (t == sem->hp.cpu_task[cpu])
380 edf_set_hp_cpu_task(sem, cpu);
381
382 take_np(t);
383 if (current->rt_param.inh_task) {
384 TRACE_CUR("return priority of %s/%d\n",
385 current->rt_param.inh_task->comm,
386 current->rt_param.inh_task->pid);
387 spin_lock(&pedf->lock);
388
389 /* Reset inh_task to NULL. */
390 current->rt_param.inh_task = NULL;
391
392 /* check if we need to reschedule */
393 if (edf_preemption_needed(edf, current))
394 preempt(pedf);
395
396 spin_unlock(&pedf->lock);
397 } else
398 TRACE_CUR(" no priority to return %p\n", sem);
399
400 return ret;
401}
402
403
404static long psnedf_admit_task(struct task_struct* tsk)
405{
406 return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
407}
408
409/* Plugin object */
410static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
411 .plugin_name = "PSN-EDF",
412 .tick = psnedf_tick,
413 .task_new = psnedf_task_new,
414 .complete_job = edf_complete_job,
415 .task_exit = psnedf_task_exit,
416 .schedule = psnedf_schedule,
417 .task_wake_up = psnedf_task_wake_up,
418 .task_block = psnedf_task_block,
419 .pi_block = psnedf_pi_block,
420 .inherit_priority = psnedf_inherit_priority,
421 .return_priority = psnedf_return_priority,
422 .admit_task = psnedf_admit_task
423};
424
425
426static int __init init_psn_edf(void)
427{
428 int i;
429
430 for (i = 0; i < NR_CPUS; i++)
431 {
432 psnedf_domain_init(remote_pedf(i),
433 psnedf_check_resched, i);
434 }
435 return register_sched_plugin(&psn_edf_plugin);
436}
437
438
439
440module_init(init_psn_edf);
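
For illustration, requeue() near the top of this file implements a simple
two-queue policy: a job whose release time has already passed is added to the
ready queue, while a future release waits on the release queue. The sketch
below (plain userspace C; it assumes, as the rt_domain code does, that
"released" means the job's release time is not later than the current time)
shows just that decision:

#include <stdio.h>

typedef unsigned long long lt_t;            /* stand-in for the kernel's lt_t */

struct job { lt_t release, deadline; };

static const char *requeue_target(const struct job *j, lt_t now)
{
        return j->release <= now ? "ready queue" : "release queue";
}

int main(void)
{
        struct job early = { .release = 100, .deadline = 200 };
        struct job late  = { .release = 900, .deadline = 1000 };
        lt_t now = 500;

        printf("early job -> %s\n", requeue_target(&early, now));
        printf("late job  -> %s\n", requeue_target(&late, now));
        return 0;
}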
diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
new file mode 100644
index 0000000000..0976e830ad
--- /dev/null
+++ b/litmus/sched_trace.c
@@ -0,0 +1,541 @@
1/* sched_trace.c -- record scheduling events to a byte stream.
2 *
3 * TODO: Move ring buffer to a lockfree implementation.
4 */
5
6#include <linux/spinlock.h>
7#include <linux/fs.h>
8#include <linux/cdev.h>
9#include <asm/semaphore.h>
10#include <asm/uaccess.h>
11#include <linux/module.h>
12
13#include <litmus/sched_trace.h>
14#include <litmus/litmus.h>
15
16
17typedef struct {
18 /* guard read and write pointers */
19 spinlock_t lock;
20 /* guard against concurrent freeing of buffer */
21 rwlock_t del_lock;
22
23 /* memory allocated for ring buffer */
24 unsigned long order;
25 char* buf;
26 char* end;
27
28 /* Read/write pointers. They may not cross.
29 * They point to the position of the next write and
30 * the last read.
31 */
32 char* writep;
33 char* readp;
34
35} ring_buffer_t;
36
37#define EMPTY_RING_BUFFER { \
38 .lock = SPIN_LOCK_UNLOCKED, \
39 .del_lock = RW_LOCK_UNLOCKED, \
40 .buf = NULL, \
41 .end = NULL, \
42 .writep = NULL, \
43 .readp = NULL \
44}
45
46void rb_init(ring_buffer_t* buf)
47{
48 *buf = (ring_buffer_t) EMPTY_RING_BUFFER;
49}
50
51int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
52{
53 unsigned long flags;
54 int error = 0;
55 char *mem;
56
57 /* do memory allocation while not atomic */
58 mem = (char *) __get_free_pages(GFP_KERNEL, order);
59 if (!mem)
60 return -ENOMEM;
61 write_lock_irqsave(&buf->del_lock, flags);
62 BUG_ON(buf->buf);
63 buf->buf = mem;
64 buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
65 memset(buf->buf, 0xff, buf->end - buf->buf);
66 buf->order = order;
67 buf->writep = buf->buf + 1;
68 buf->readp = buf->buf;
69 write_unlock_irqrestore(&buf->del_lock, flags);
70 return error;
71}
72
73int rb_free_buf(ring_buffer_t* buf)
74{
75 unsigned long flags;
76 int error = 0;
77 write_lock_irqsave(&buf->del_lock, flags);
78 BUG_ON(!buf->buf);
79 free_pages((unsigned long) buf->buf, buf->order);
80 buf->buf = NULL;
81 buf->end = NULL;
82 buf->writep = NULL;
83 buf->readp = NULL;
84 write_unlock_irqrestore(&buf->del_lock, flags);
85 return error;
86}
87
88/* Assumption: concurrent writes are serialized externally
89 *
90 * Will only succeed if there is enough space for all len bytes.
91 */
92int rb_put(ring_buffer_t* buf, char* mem, size_t len)
93{
94 unsigned long flags;
95 char* r , *w;
96 int error = 0;
97 read_lock_irqsave(&buf->del_lock, flags);
98 if (!buf->buf) {
99 error = -ENODEV;
100 goto out;
101 }
102 spin_lock(&buf->lock);
103 r = buf->readp;
104 w = buf->writep;
105 spin_unlock(&buf->lock);
106 if (r < w && buf->end - w >= len - 1) {
107 /* easy case: there is enough space in the buffer
108 * to write it in one continuous chunk */
109 memcpy(w, mem, len);
110 w += len;
111 if (w > buf->end)
112 /* special case: fit exactly into buffer
113 * w is now buf->end + 1
114 */
115 w = buf->buf;
116 } else if (w < r && r - w >= len) { /* >= len because may not cross */
117 /* we are constrained by the read pointer but there
118 * is enough space
119 */
120 memcpy(w, mem, len);
121 w += len;
122 } else if (r <= w && buf->end - w < len - 1) {
123 /* the wrap around case: there may or may not be space */
124 if ((buf->end - w) + (r - buf->buf) >= len - 1) {
125 /* copy chunk that fits at the end */
126 memcpy(w, mem, buf->end - w + 1);
127 mem += buf->end - w + 1;
128 len -= (buf->end - w + 1);
129 w = buf->buf;
130 /* copy the rest */
131 memcpy(w, mem, len);
132 w += len;
133 }
134 else
135 error = -ENOMEM;
136 } else {
137 error = -ENOMEM;
138 }
139 if (!error) {
140 spin_lock(&buf->lock);
141 buf->writep = w;
142 spin_unlock(&buf->lock);
143 }
144 out:
145 read_unlock_irqrestore(&buf->del_lock, flags);
146 return error;
147}
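/* Illustration (not part of the patch): a worked wrap-around example for
 * rb_put().  With an 8-byte buffer (buf = 0, end = 7), readp r = 5 and
 * writep w = 6, a 4-byte record does not fit contiguously, since
 * end - w = 1 < len - 1 = 3.  However, (end - w) + (r - buf) = 1 + 5 = 6
 * >= len - 1, so two bytes are copied into slots 6..7, w wraps to buf,
 * the remaining two bytes land in slots 0..1, and w ends up at 2, still
 * behind r = 5 -- the pointers never cross.
 */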
148
149/* Assumption: concurrent reads are serialized externally */
150int rb_get(ring_buffer_t* buf, char* mem, size_t len)
151{
152 unsigned long flags;
153 char* r , *w;
154 int error = 0;
155 read_lock_irqsave(&buf->del_lock, flags);
156 if (!buf->buf) {
157 error = -ENODEV;
158 goto out;
159 }
160 spin_lock(&buf->lock);
161 r = buf->readp;
162 w = buf->writep;
163 spin_unlock(&buf->lock);
164
165 if (w <= r && buf->end - r >= len) {
166 /* easy case: there is enough data in the buffer
167 * to get it in one chunk*/
168 memcpy(mem, r + 1, len);
169 r += len;
170 error = len;
171
172 } else if (r + 1 < w && w - r - 1 >= len) {
173 /* we are constrained by the write pointer but
174 * there is enough data
175 */
176 memcpy(mem, r + 1, len);
177 r += len;
178 error = len;
179
180 } else if (r + 1 < w && w - r - 1 < len) {
181 /* we are constrained by the write pointer and there
182 * is not enough data
183 */
184 memcpy(mem, r + 1, w - r - 1);
185 error = w - r - 1;
186 r += w - r - 1;
187
188 } else if (w <= r && buf->end - r < len) {
189 /* the wrap around case: there may or may not be enough data
190 * first let's get what is available
191 */
192 memcpy(mem, r + 1, buf->end - r);
193 error += (buf->end - r);
194 mem += (buf->end - r);
195 len -= (buf->end - r);
196 r += (buf->end - r);
197
198 if (w > buf->buf) {
199 /* there is more to get */
200 r = buf->buf - 1;
201 if (w - r >= len) {
202 /* plenty */
203 memcpy(mem, r + 1, len);
204 error += len;
205 r += len;
206 } else {
207 memcpy(mem, r + 1, w - r - 1);
208 error += w - r - 1;
209 r += w - r - 1;
210 }
211 }
212 } /* nothing available */
213
214 if (error > 0) {
215 spin_lock(&buf->lock);
216 buf->readp = r;
217 spin_unlock(&buf->lock);
218 }
219 out:
220 read_unlock_irqrestore(&buf->del_lock, flags);
221 return error;
222}
223
224
225
226/******************************************************************************/
227/* DEVICE FILE DRIVER */
228/******************************************************************************/
229
230
231
232/* Allocate a buffer of about 1 MB per CPU.
233 *
234 */
235#define BUFFER_ORDER 8
236
237typedef struct {
238 ring_buffer_t buf;
239 atomic_t reader_cnt;
240 struct semaphore reader_mutex;
241} trace_buffer_t;
242
243
244/* This does not initialize the semaphore!! */
245
246#define EMPTY_TRACE_BUFFER \
247 { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
248
249static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
250
251#ifdef CONFIG_SCHED_DEBUG_TRACE
252static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED;
253#endif
254static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER;
255
256static void init_buffers(void)
257{
258 int i;
259
260 for (i = 0; i < NR_CPUS; i++) {
261 rb_init(&per_cpu(trace_buffer, i).buf);
262 init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
263 atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
264 }
265 /* only initialize the mutex, the rest was initialized as part
266 * of the static initialization macro
267 */
268 init_MUTEX(&log_buffer.reader_mutex);
269}
270
271static int trace_release(struct inode *in, struct file *filp)
272{
273 int error = -EINVAL;
274 trace_buffer_t* buf = filp->private_data;
275
276 BUG_ON(!filp->private_data);
277
278 if (down_interruptible(&buf->reader_mutex)) {
279 error = -ERESTARTSYS;
280 goto out;
281 }
282
283 /* last release must deallocate buffers */
284 if (atomic_dec_return(&buf->reader_cnt) == 0) {
285 error = rb_free_buf(&buf->buf);
286 }
287
288 up(&buf->reader_mutex);
289 out:
290 return error;
291}
292
293static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
294 loff_t *f_pos)
295{
296 /* we ignore f_pos, this is strictly sequential */
297
298 ssize_t error = -EINVAL;
299 char* mem;
300 trace_buffer_t *buf = filp->private_data;
301
302 if (down_interruptible(&buf->reader_mutex)) {
303 error = -ERESTARTSYS;
304 goto out;
305 }
306
307 if (len > 64 * 1024)
308 len = 64 * 1024;
309 mem = kmalloc(len, GFP_KERNEL);
310 if (!mem) {
311 error = -ENOMEM;
312 goto out_unlock;
313 }
314
315 error = rb_get(&buf->buf, mem, len);
316 while (!error) {
317 set_current_state(TASK_INTERRUPTIBLE);
318 schedule_timeout(110);
319 if (signal_pending(current))
320 error = -ERESTARTSYS;
321 else
322 error = rb_get(&buf->buf, mem, len);
323 }
324
325 if (error > 0 && copy_to_user(to, mem, error))
326 error = -EFAULT;
327
328 kfree(mem);
329 out_unlock:
330 up(&buf->reader_mutex);
331 out:
332 return error;
333}
334
335
336/* trace_open - Open one of the per-CPU sched_trace buffers.
337 */
338static int trace_open(struct inode *in, struct file *filp)
339{
340 int error = -EINVAL;
341 int cpu = MINOR(in->i_rdev);
342 trace_buffer_t* buf;
343
344 if (!cpu_online(cpu)) {
345 printk(KERN_WARNING "sched trace: "
346 "CPU #%d is not online. (open failed)\n", cpu);
347 error = -ENODEV;
348 goto out;
349 }
350
351 buf = &per_cpu(trace_buffer, cpu);
352
353 if (down_interruptible(&buf->reader_mutex)) {
354 error = -ERESTARTSYS;
355 goto out;
356 }
357
358 /* first open must allocate buffers */
359 if (atomic_inc_return(&buf->reader_cnt) == 1) {
360 if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
361 {
362 atomic_dec(&buf->reader_cnt);
363 goto out_unlock;
364 }
365 }
366
367 error = 0;
368 filp->private_data = buf;
369
370 out_unlock:
371 up(&buf->reader_mutex);
372 out:
373 return error;
374}
375
376/* log_open - open the global log message ring buffer.
377 */
378static int log_open(struct inode *in, struct file *filp)
379{
380 int error = -EINVAL;
381 trace_buffer_t* buf;
382
383 buf = &log_buffer;
384
385 if (down_interruptible(&buf->reader_mutex)) {
386 error = -ERESTARTSYS;
387 goto out;
388 }
389
390 /* first open must allocate buffers */
391 if (atomic_inc_return(&buf->reader_cnt) == 1) {
392 if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
393 {
394 atomic_dec(&buf->reader_cnt);
395 goto out_unlock;
396 }
397 }
398
399 error = 0;
400 filp->private_data = buf;
401
402 out_unlock:
403 up(&buf->reader_mutex);
404 out:
405 return error;
406}
407
408/******************************************************************************/
409/* Device Registration */
410/******************************************************************************/
411
412/* the major numbers are from the unassigned/local use block
413 *
414 * This should be converted to dynamic allocation at some point...
415 */
416#define TRACE_MAJOR 250
417#define LOG_MAJOR 251
418
419/* trace_fops - The file operations for accessing the per-CPU scheduling event
420 * trace buffers.
421 */
422struct file_operations trace_fops = {
423 .owner = THIS_MODULE,
424 .open = trace_open,
425 .release = trace_release,
426 .read = trace_read,
427};
428
429/* log_fops - The file operations for accessing the global LITMUS log message
430 * buffer.
431 *
432 * Except for opening the device file it uses the same operations as trace_fops.
433 */
434struct file_operations log_fops = {
435 .owner = THIS_MODULE,
436 .open = log_open,
437 .release = trace_release,
438 .read = trace_read,
439};
440
441static int __init register_buffer_dev(const char* name,
442 struct file_operations* fops,
443 int major, int count)
444{
445 dev_t trace_dev;
446 struct cdev *cdev;
447 int error = 0;
448
449 trace_dev = MKDEV(major, 0);
450 error = register_chrdev_region(trace_dev, count, name);
451 if (error)
452 {
453 printk(KERN_WARNING "sched trace: "
454 "Could not register major/minor number %d\n", major);
455 return error;
456 }
457 cdev = cdev_alloc();
458 if (!cdev) {
459 printk(KERN_WARNING "sched trace: "
460 "Could not get a cdev for %s.\n", name);
461 return -ENOMEM;
462 }
463 cdev->owner = THIS_MODULE;
464 cdev->ops = fops;
465 error = cdev_add(cdev, trace_dev, count);
466 if (error) {
467 printk(KERN_WARNING "sched trace: "
468 "add_cdev failed for %s.\n", name);
469 return -ENOMEM;
470 }
471 return error;
472
473}
474
475static int __init init_sched_trace(void)
476{
477 int error1 = 0, error2 = 0;
478
479 printk("Initializing scheduler trace device\n");
480 init_buffers();
481
482 error1 = register_buffer_dev("schedtrace", &trace_fops,
483 TRACE_MAJOR, NR_CPUS);
484
485 error2 = register_buffer_dev("litmus_log", &log_fops,
486 LOG_MAJOR, 1);
487 if (error1 || error2)
488 return min(error1, error2);
489 else
490 return 0;
491}
492
493module_init(init_sched_trace);
494
495/******************************************************************************/
496/* KERNEL API */
497/******************************************************************************/
498
499/* The per-CPU LITMUS log buffer. Don't put it on the stack, it is too big for
500 * that and the kernel gets very picky with nested interrupts and small stacks.
501 */
502
503#ifdef CONFIG_SCHED_DEBUG_TRACE
504
505#define MSG_SIZE 255
506static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
507
508/* sched_trace_log_message - This is the only function that accesses the
509 * log buffer inside the kernel for writing.
510 * Concurrent access to it is serialized via the
511 * log_buffer_lock.
512 *
513 * The maximum length of a formatted message is 255.
514 */
515void sched_trace_log_message(const char* fmt, ...)
516{
517 unsigned long flags;
518 va_list args;
519 size_t len;
520 char* buf;
521
522 va_start(args, fmt);
523 local_irq_save(flags);
524
525 /* format message */
526 buf = __get_cpu_var(fmt_buffer);
527 len = vscnprintf(buf, MSG_SIZE, fmt, args);
528
529 spin_lock(&log_buffer_lock);
530 /* Don't copy the trailing null byte, we don't want null bytes
531 * in a text file.
532 */
533 rb_put(&log_buffer.buf, buf, len);
534 spin_unlock(&log_buffer_lock);
535
536 local_irq_restore(flags);
537 va_end(args);
538}
539
540#endif
541
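
For illustration, a minimal userspace reader for the log device registered
above might look as follows. It assumes a device node created by hand, e.g.
"mknod /dev/litmus_log c 251 0" (LOG_MAJOR is 251 with a single minor); the
path itself is the reader's choice, not something the kernel code mandates:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd = open("/dev/litmus_log", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/litmus_log");
                return 1;
        }
        /* log_fops.read blocks (polling in 110-jiffy steps) until the
         * in-kernel ring buffer has data, so a plain read loop suffices. */
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, (size_t)n, stdout);
        close(fd);
        return 0;
}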
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 0000000000..90ef443bd9
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,303 @@
1#include <linux/sched.h>
2#include <linux/fs.h>
3#include <linux/cdev.h>
4#include <asm/semaphore.h>
5#include <asm/uaccess.h>
6#include <linux/module.h>
7
8#include <litmus/trace.h>
9
10/******************************************************************************/
11/* Allocation */
12/******************************************************************************/
13
14struct ft_buffer* trace_ts_buf = NULL;
15
16static unsigned int ts_seq_no = 0;
17
18feather_callback void save_timestamp(unsigned long event)
19{
20 unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no);
21 struct timestamp *ts;
22 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
23 ts->event = event;
24 ts->timestamp = ft_read_tsc();
25 ts->seq_no = seq_no;
26 ts->cpu = raw_smp_processor_id();
27 ft_buffer_finish_write(trace_ts_buf, ts);
28 }
29}
30
31static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
32{
33 struct ft_buffer* buf;
34 size_t total = (size + 1) * count;
35 char* mem;
36 int order = 0, pages = 1;
37
38 buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL);
39 if (!buf)
40 return NULL;
41
42 total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
43 while (pages < total) {
44 order++;
45 pages *= 2;
46 }
47
48 mem = (char*) __get_free_pages(GFP_KERNEL, order);
49 if (!mem) {
50 kfree(buf);
51 return NULL;
52 }
53
54 if (!init_ft_buffer(buf, count, size,
55 mem + (count * size), /* markers at the end */
56 mem)) { /* buffer objects */
57 free_pages((unsigned long) mem, order);
58 kfree(buf);
59 return NULL;
60 }
61 return buf;
62}
63
64static void free_ft_buffer(struct ft_buffer* buf)
65{
66 int order = 0, pages = 1;
67 size_t total;
68
69 if (buf) {
70 total = (buf->slot_size + 1) * buf->slot_count;
71 total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
72 while (pages < total) {
73 order++;
74 pages *= 2;
75 }
76 free_pages((unsigned long) buf->buffer_mem, order);
77 kfree(buf);
78 }
79}
80
81
82/******************************************************************************/
83/* DEVICE FILE DRIVER */
84/******************************************************************************/
85
86#define NO_TIMESTAMPS 262144
87
88static DECLARE_MUTEX(feather_lock);
89static int use_count = 0;
90
91static int trace_release(struct inode *in, struct file *filp)
92{
93 int err = -EINVAL;
94
95 if (down_interruptible(&feather_lock)) {
96 err = -ERESTARTSYS;
97 goto out;
98 }
99
100 printk(KERN_ALERT "%s/%d disconnects from feather trace device. "
101 "use_count=%d\n",
102 current->comm, current->pid, use_count);
103
104 if (use_count == 1) {
105 /* disable events */
106 ft_disable_all_events();
107
108 /* wait for any pending events to complete */
109 set_current_state(TASK_UNINTERRUPTIBLE);
110 schedule_timeout(HZ);
111
112 printk(KERN_ALERT "Failed trace writes: %u\n",
113 trace_ts_buf->failed_writes);
114
115 free_ft_buffer(trace_ts_buf);
116 trace_ts_buf = NULL;
117 }
118
119 use_count--;
120 up(&feather_lock);
121out:
122 return err;
123}
124
125
126static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
127 loff_t *f_pos)
128{
129 /* we ignore f_pos, this is strictly sequential */
130 ssize_t error = 0;
131 struct timestamp ts;
132
133 if (down_interruptible(&feather_lock)) {
134 error = -ERESTARTSYS;
135 goto out;
136 }
137
138
139 while (len >= sizeof(struct timestamp)) {
140 if (ft_buffer_read(trace_ts_buf, &ts)) {
141 if (copy_to_user(to, &ts, sizeof(struct timestamp))) {
142 error = -EFAULT;
143 break;
144 } else {
145 len -= sizeof(struct timestamp);
146 to += sizeof(struct timestamp);
147 error += sizeof(struct timestamp);
148 }
149 } else {
150 set_current_state(TASK_INTERRUPTIBLE);
151 schedule_timeout(50);
152 if (signal_pending(current)) {
153 error = -ERESTARTSYS;
154 break;
155 }
156 }
157 }
158 up(&feather_lock);
159out:
160 return error;
161}
162
163#define ENABLE_CMD 0
164#define DISABLE_CMD 1
165
166static ssize_t trace_write(struct file *filp, const char __user *from,
167 size_t len, loff_t *f_pos)
168{
169 ssize_t error = -EINVAL;
170 unsigned long cmd;
171 unsigned long id;
172
173 if (len % sizeof(long) || len < 2 * sizeof(long))
174 goto out;
175
176 if (copy_from_user(&cmd, from, sizeof(long))) {
177 error = -EFAULT;
178 goto out;
179 }
180 len -= sizeof(long);
181 from += sizeof(long);
182
183 if (cmd != ENABLE_CMD && cmd != DISABLE_CMD)
184 goto out;
185
186 if (down_interruptible(&feather_lock)) {
187 error = -ERESTARTSYS;
188 goto out;
189 }
190
191 error = sizeof(long);
192 while (len) {
193 if (copy_from_user(&id, from, sizeof(long))) {
194 error = -EFAULT;
195 goto out;
196 }
197 len -= sizeof(long);
198 from += sizeof(long);
199 if (cmd) {
200 printk(KERN_INFO
201 "Disabling feather-trace event %lu.\n", id);
202 ft_disable_event(id);
203 } else {
204 printk(KERN_INFO
205 "Enabling feather-trace event %lu.\n", id);
206 ft_enable_event(id);
207 }
208 error += sizeof(long);
209 }
210
211 up(&feather_lock);
212 out:
213 return error;
214}
215
216static int trace_open(struct inode *in, struct file *filp)
217{
218 int err = 0;
219 unsigned int count = NO_TIMESTAMPS;
220
221 if (down_interruptible(&feather_lock)) {
222 err = -ERESTARTSYS;
223 goto out;
224 }
225
226 while (count && !trace_ts_buf) {
227 printk("trace: trying to allocate %u time stamps.\n", count);
228 trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp));
229 count /= 2;
230 }
231 if (!trace_ts_buf)
232 err = -ENOMEM;
233 else
234 use_count++;
235
236 up(&feather_lock);
237out:
238 return err;
239}
240
241/******************************************************************************/
242/* Device Registration */
243/******************************************************************************/
244
245#define FT_TRACE_MAJOR 252
246
247struct file_operations ft_trace_fops = {
248 .owner = THIS_MODULE,
249 .open = trace_open,
250 .release = trace_release,
251 .write = trace_write,
252 .read = trace_read,
253};
254
255
256static int __init register_buffer_dev(const char* name,
257 struct file_operations* fops,
258 int major, int count)
259{
260 dev_t trace_dev;
261 struct cdev *cdev;
262 int error = 0;
263
264 trace_dev = MKDEV(major, 0);
265 error = register_chrdev_region(trace_dev, count, name);
266 if (error)
267 {
268 printk(KERN_WARNING "trace: "
269 "Could not register major/minor number %d\n", major);
270 return error;
271 }
272 cdev = cdev_alloc();
273 if (!cdev) {
274 printk(KERN_WARNING "trace: "
275 "Could not get a cdev for %s.\n", name);
276 return -ENOMEM;
277 }
278 cdev->owner = THIS_MODULE;
279 cdev->ops = fops;
280 error = cdev_add(cdev, trace_dev, count);
281 if (error) {
282 printk(KERN_WARNING "trace: "
283 "add_cdev failed for %s.\n", name);
284 return -ENOMEM;
285 }
286 return error;
287
288}
289
290static int __init init_sched_trace(void)
291{
292 int error = 0;
293
294 printk("Initializing Feather-Trace device\n");
295 /* dummy entry to make linker happy */
296 ft_event0(666, save_timestamp);
297
298 error = register_buffer_dev("ft_trace", &ft_trace_fops,
299 FT_TRACE_MAJOR, 1);
300 return error;
301}
302
303module_init(init_sched_trace);
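
For illustration, the Feather-Trace device above is driven from userspace by
writing an array of longs (a command word, 0 = enable or 1 = disable, followed
by event ids) and by reading whole struct timestamp records back. The sketch
below assumes a node created with "mknod /dev/ft_trace c 252 0"; the struct
layout is an assumption for the example only, the authoritative definition
lives in include/litmus/trace.h:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

struct timestamp {                      /* assumed layout, see litmus/trace.h */
        unsigned long long timestamp;
        unsigned int       seq_no;
        int                cpu;
        unsigned long      event;
};

int main(void)
{
        long cmd[2] = { 0 /* ENABLE_CMD */, 666 /* dummy event from init */ };
        struct timestamp ts;
        int fd = open("/dev/ft_trace", O_RDWR);

        if (fd < 0) {
                perror("open /dev/ft_trace");
                return 1;
        }
        if (write(fd, cmd, sizeof(cmd)) != sizeof(cmd)) {
                perror("enable event");
                return 1;
        }
        /* trace_read() polls in 50-jiffy steps until timestamps arrive */
        if (read(fd, &ts, sizeof(ts)) == sizeof(ts))
                printf("event %lu on cpu %d, seq %u\n",
                       ts.event, ts.cpu, ts.seq_no);
        close(fd);
        return 0;
}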