author	Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu>	2008-01-28 14:13:24 -0500
committer	Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu>	2008-01-28 14:13:24 -0500
commit	d3605639a4e641ae7591734f9e8f836605e58f1c (patch)
tree	d45d12db3f6dc22412e137e835670c9a8779b215
parent	24a3d78b334a52123f168a451fa4a5db4bb157e0 (diff)
release LITMUS 2007.3
-rw-r--r--	download/2007.3/SHA256SUMS		|     3
-rw-r--r--	download/2007.3/liblitmus-2007.3.tgz	|   bin 0 -> 13351 bytes
-rw-r--r--	download/2007.3/libso-2007.3.tgz	|   bin 0 -> 14815 bytes
-rw-r--r--	download/2007.3/litmus-rt-2007.3.patch	| 12859
-rw-r--r--	index.html				|   107
5 files changed, 12935 insertions(+), 34 deletions(-)
diff --git a/download/2007.3/SHA256SUMS b/download/2007.3/SHA256SUMS
new file mode 100644
index 0000000..3adb07b
--- /dev/null
+++ b/download/2007.3/SHA256SUMS
@@ -0,0 +1,3 @@
18f1f5335de7a1aab158adf90aa7010eea066c5dd153b6f98b2d4bb1785682e3b liblitmus-2007.3.tgz
25989f228bdadfd52633344e55e1b6db49f44b72e699c05fdde96832e08bca47c libso-2007.3.tgz
34d51589a5cb92b9c7df11de4b550700493d067c2fa2b0228c464ff4c18436941 litmus-rt-2007.3.patch
diff --git a/download/2007.3/liblitmus-2007.3.tgz b/download/2007.3/liblitmus-2007.3.tgz
new file mode 100644
index 0000000..17ff20a
--- /dev/null
+++ b/download/2007.3/liblitmus-2007.3.tgz
Binary files differ
diff --git a/download/2007.3/libso-2007.3.tgz b/download/2007.3/libso-2007.3.tgz
new file mode 100644
index 0000000..97c6437
--- /dev/null
+++ b/download/2007.3/libso-2007.3.tgz
Binary files differ
diff --git a/download/2007.3/litmus-rt-2007.3.patch b/download/2007.3/litmus-rt-2007.3.patch
new file mode 100644
index 0000000..a81602a
--- /dev/null
+++ b/download/2007.3/litmus-rt-2007.3.patch
@@ -0,0 +1,12859 @@
1 arch/i386/Kconfig | 28 +
2 arch/i386/kernel/apic.c | 92 ++
3 arch/i386/kernel/i386_ksyms.c | 1 +
4 arch/i386/kernel/signal.c | 13 +
5 arch/i386/kernel/syscall_table.S | 27 +
6 fs/Makefile | 2 +-
7 fs/exec.c | 5 +-
8 fs/fdso.c | 281 +++++++
9 fs/inode.c | 2 +
10 include/asm-i386/thread_info.h | 2 +
11 include/asm-i386/unistd.h | 28 +-
12 include/linux/edf_common.h | 36 +
13 include/linux/fdso.h | 70 ++
14 include/linux/feather_buffer.h | 108 +++
15 include/linux/feather_trace.h | 93 ++
16 include/linux/fifo_common.h | 18 +
17 include/linux/fpmath.h | 111 +++
18 include/linux/fs.h | 5 +
19 include/linux/ics.h | 35 +
20 include/linux/list.h | 30 +
21 include/linux/litmus.h | 141 ++++
22 include/linux/pfair_common.h | 40 +
23 include/linux/pfair_math.h | 80 ++
24 include/linux/queuelock.h | 98 +++
25 include/linux/rt_domain.h | 98 +++
26 include/linux/rt_param.h | 277 ++++++
27 include/linux/sched.h | 14 +
28 include/linux/sched_plugin.h | 147 ++++
29 include/linux/sched_trace.h | 182 ++++
30 include/linux/trace.h | 74 ++
31 include/linux/uaccess.h | 16 +
32 include/linux/wait.h | 2 +
33 kernel/Makefile | 8 +-
34 kernel/edf_common.c | 135 +++
35 kernel/exit.c | 4 +
36 kernel/fifo_common.c | 86 ++
37 kernel/fork.c | 5 +
38 kernel/ft_event.c | 104 +++
39 kernel/ics.c | 229 +++++
40 kernel/litmus.c | 1034 +++++++++++++++++++++++
41 kernel/litmus_sem.c | 567 +++++++++++++
42 kernel/pfair_common.c | 237 ++++++
43 kernel/rt_domain.c | 185 ++++
44 kernel/sched.c | 204 ++++-
45 kernel/sched_adaptive.c | 1454 ++++++++++++++++++++++++++++++++
46 kernel/sched_edf_hsb.c | 1724 ++++++++++++++++++++++++++++++++++++++
47 kernel/sched_global_edf.c | 550 ++++++++++++
48 kernel/sched_gsn_edf.c | 816 ++++++++++++++++++
49 kernel/sched_part_edf.c | 340 ++++++++
50 kernel/sched_pfair.c | 503 +++++++++++
51 kernel/sched_plugin.c | 108 +++
52 kernel/sched_psn_edf.c | 523 ++++++++++++
53 kernel/sched_trace.c | 755 +++++++++++++++++
54 kernel/timer.c | 22 +
55 kernel/trace.c | 302 +++++++
56 lib/semaphore-sleepers.c | 2 +-
57 56 files changed, 12028 insertions(+), 25 deletions(-)
58
59diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
60index 0dfee81..da6f1e9 100644
61--- a/arch/i386/Kconfig
62+++ b/arch/i386/Kconfig
63@@ -1210,6 +1210,7 @@ config KPROBES
64 a probepoint and specifies the callback. Kprobes is useful
65 for kernel debugging, non-intrusive instrumentation and testing.
66 If in doubt, say "N".
67+
68 endmenu
69
70 source "arch/i386/Kconfig.debug"
71@@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE
72 config KTIME_SCALAR
73 bool
74 default y
75+
76+
77+menu "LITMUS^RT"
78+
79+
80+config SCHED_TASK_TRACE
81+ bool "Trace real-time tasks"
82+ default y
83+ help
84+ Include support for the sched_trace_XXX() tracing functions. This
85+ allows the collection of real-time task events such as job
86+ completions, job releases, early completions, etc. This results in a
87+ small overhead in the scheduling code. Disable if the overhead is not
88+ acceptable (e.g., benchmarking).
89+
90+config SCHED_DEBUG_TRACE
91+ bool "TRACE() debugging"
92+ default y
93+ help
94+	  Include support for sched_trace_log_message(), which is used to
95+ implement TRACE(). If disabled, no TRACE() messages will be included
96+ in the kernel, and no overheads due to debugging statements will be
97+ incurred by the scheduler. Disable if the overhead is not acceptable
98+ (e.g. benchmarking).
99+
100+
101+endmenu
102diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
103index 776d9be..2e8909f 100644
104--- a/arch/i386/kernel/apic.c
105+++ b/arch/i386/kernel/apic.c
106@@ -26,6 +26,7 @@
107 #include <linux/sysdev.h>
108 #include <linux/cpu.h>
109 #include <linux/module.h>
110+#include <linux/litmus.h>
111
112 #include <asm/atomic.h>
113 #include <asm/smp.h>
114@@ -43,6 +44,8 @@
115
116 #include "io_ports.h"
117
118+#include <linux/trace.h>
119+
120 /*
121 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
122 * IPIs in place of local APIC timers
123@@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi;
124 */
125 static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
126
127+/*
128+ * Definitions and variables related to quantum synchronization.
129+ */
130+#define WAIT_TO_SYNC 30000 /* time after boot until sync */
131+static int stagger = 0; /* are we using staggered quanta? */
132+static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES);
133+static atomic_t quantum_sync_barrier = ATOMIC_INIT(0);
134+static atomic_t sync_done = ATOMIC_INIT(0);
135+
136 static inline void lapic_disable(void)
137 {
138 enable_local_apic = -1;
139@@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str)
140
141 __setup("apic=", apic_set_verbosity);
142
143+/*
144+ * Determine whether to use aligned or staggered quanta.
145+ */
146+
147+static int __init apic_synch_type(char *str)
148+{
149+ if (strcmp("aligned", str) == 0)
150+ stagger = 0;
151+ else if (strcmp("staggered", str) == 0)
152+ stagger = 1;
153+ else
154+ stagger = 0; /* aligned quanta by default */
155+ return 1;
156+}
157+
158+__setup("quanta=", apic_synch_type);
159+
160 static int __init detect_init_APIC (void)
161 {
162 u32 h, l, features;
163@@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
164 #undef APIC_DIVISOR
165
166 /*
167+ * This function is called to align all quanta, and to stagger quanta if
168+ * necessary. It relies on a barrier to synchronize all processors, so
169+ * that they all reset their APIC timers at the same time. If quanta
170+ * should be staggered, the appropriate stagger delay is then added at
171+ * each processor.
172+ */
173+
174+void synchronize_quanta(void)
175+{
176+ int cpu = smp_processor_id();
177+ int total_cpus = num_online_cpus();
178+ int stagger_interval = jiffies_to_usecs(1) / total_cpus;
179+
180+ /*
181+ * Disable APIC timer, wait for all other processors to reach barrier,
182+ * and re-enable all timers concurrently.
183+ */
184+ disable_APIC_timer();
185+ atomic_inc(&quantum_sync_barrier);
186+ while (atomic_read(&quantum_sync_barrier) < total_cpus) {
187+ /* Delay, otherwise atomic_inc's cannot occur. */
188+ udelay(1);
189+ }
190+
191+ /* Add necessary stagger for this CPU, if required. */
192+ if (stagger) {
193+ int stagger_us = cpu * stagger_interval;
194+ udelay(stagger_us);
195+ }
196+
197+ /* Re-enable all timers. */
198+ __setup_APIC_LVTT(calibration_result);
199+ enable_APIC_timer();
200+
201+ /* The first CPU signals that quantum sync is complete. */
202+ if (cpu == 0)
203+ atomic_inc(&sync_done);
204+}
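/*
 * Editorial note (not part of the patch): a quick worked example of the
 * stagger computation in synchronize_quanta() above, assuming HZ=1000
 * (so jiffies_to_usecs(1) == 1000us) and 4 online CPUs:
 *
 *   stagger_interval = 1000us / 4 = 250us
 *   CPU 0 delays 0us, CPU 1 delays 250us, CPU 2 500us, CPU 3 750us
 *
 * That is, with "quanta=staggered" on the kernel command line the per-CPU
 * quantum boundaries are spread evenly across one 1ms quantum after the
 * barrier; with "quanta=aligned" (the default) all CPUs simply re-arm
 * their APIC timers together. The synchronization itself is triggered
 * roughly WAIT_TO_SYNC (30s) after boot from the timer interrupt below.
 */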
205+
206+
207+/*
208 * Local timer interrupt handler. It does both profiling and
209 * process statistics/rescheduling.
210 *
211@@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
212
213 inline void smp_local_timer_interrupt(void)
214 {
215+/* s64 offset; */
216+
217+ TS_TICK_START;
218+
219 profile_tick(CPU_PROFILING);
220 #ifdef CONFIG_SMP
221 update_process_times(user_mode_vm(get_irq_regs()));
222 #endif
223
224+ /* Print out timing data - can be commented out if necessary. */
225+/* offset = get_nsec_offset(); */
226+/* TRACE("%d\n", offset); */
227+
228+ /*
229+ * Synchronize quanta if we have reached qsync_time plus wait
230+ * interval. The synchronization code itself is placed in its own
231+ * (non-inline) function, to avoid issues with creating an inline
232+ * function that is too large.
233+ */
234+ if (unlikely(!atomic_read(&sync_done) &&
235+ time_after(jiffies,
236+ (unsigned long)(atomic_read(&qsync_time) +
237+ msecs_to_jiffies(WAIT_TO_SYNC))))) {
238+ synchronize_quanta();
239+ }
240+
241 /*
242 * We take the 'long' return path, and there every subsystem
243 * grabs the apropriate locks (kernel lock/ irq lock).
244@@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void)
245 * Currently this isn't too much of an issue (performance wise),
246 * we can take more than 100K local irqs per second on a 100 MHz P5.
247 */
248+ TS_TICK_END;
249 }
250
251 /*
252diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
253index e3d4b73..9670f77 100644
254--- a/arch/i386/kernel/i386_ksyms.c
255+++ b/arch/i386/kernel/i386_ksyms.c
256@@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed);
257 EXPORT_SYMBOL(__down_failed_interruptible);
258 EXPORT_SYMBOL(__down_failed_trylock);
259 EXPORT_SYMBOL(__up_wakeup);
260+
261 /* Networking helper routines. */
262 EXPORT_SYMBOL(csum_partial_copy_generic);
263
264diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
265index 65d7620..7415518 100644
266--- a/arch/i386/kernel/signal.c
267+++ b/arch/i386/kernel/signal.c
268@@ -27,6 +27,8 @@
269 #include <asm/i387.h>
270 #include "sigframe.h"
271
272+#include <linux/ics.h>
273+
274 #define DEBUG_SIG 0
275
276 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
277@@ -653,5 +655,16 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
278 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
279 do_signal(regs);
280
281+ if (thread_info_flags & _TIF_ROLLBACK_RCS) {
282+ long addr = (long) get_rollback_addr();
283+ if (addr) {
284+ ICS_DBG(KERN_DEBUG "do_notify_resume(): eip 0x%lx -> "
285+ "0x%lx\n", regs->eip, addr);
286+ regs->eip = addr;
287+
288+ }
289+ clear_thread_flag(TIF_ROLLBACK_RCS);
290+ }
291+
292 clear_thread_flag(TIF_IRET);
293 }
294diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
295index 2697e92..32f7d54 100644
296--- a/arch/i386/kernel/syscall_table.S
297+++ b/arch/i386/kernel/syscall_table.S
298@@ -319,3 +319,30 @@ ENTRY(sys_call_table)
299 .long sys_move_pages
300 .long sys_getcpu
301 .long sys_epoll_pwait
302+ /* LITMUS syscalls */
303+ .long sys_sched_setpolicy /* 320 */
304+ .long sys_sched_getpolicy
305+ .long sys_set_rt_mode
306+ .long sys_set_rt_task_param
307+ .long sys_get_rt_task_param
308+ .long sys_ni_syscall /* 325 */
309+ .long sys_sleep_next_period
310+ .long sys_scheduler_setup
311+ .long sys_register_np_flag
312+ .long sys_exit_np
313+ .long sys_od_open /* 330 */
314+ .long sys_od_close
315+ .long sys_pi_down
316+ .long sys_pi_up
317+ .long sys_srp_down
318+ .long sys_srp_up /* 335 */
319+ .long sys_reg_task_srp_sem
320+ .long sys_query_job_no
321+ .long sys_wait_for_job_release
322+ .long sys_set_service_levels
323+ .long sys_get_cur_service_level /* 340 */
324+ .long sys_reg_ics_cb
325+ .long sys_start_wcs
326+ .long sys_task_mode_transition /* 343 */
327+
328+
329diff --git a/fs/Makefile b/fs/Makefile
330index b9ffa63..318c0f7 100644
331--- a/fs/Makefile
332+++ b/fs/Makefile
333@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
334 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
335 seq_file.o xattr.o libfs.o fs-writeback.o \
336 pnode.o drop_caches.o splice.o sync.o utimes.o \
337- stack.o
338+ stack.o fdso.o
339
340 ifeq ($(CONFIG_BLOCK),y)
341 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
342diff --git a/fs/exec.c b/fs/exec.c
343index 11fe93f..29498a9 100644
344--- a/fs/exec.c
345+++ b/fs/exec.c
346@@ -54,6 +54,8 @@
347 #include <asm/uaccess.h>
348 #include <asm/mmu_context.h>
349
350+#include <linux/litmus.h>
351+
352 #ifdef CONFIG_KMOD
353 #include <linux/kmod.h>
354 #endif
355@@ -1140,7 +1142,8 @@ int do_execve(char * filename,
356 if (IS_ERR(file))
357 goto out_kfree;
358
359- sched_exec();
360+ sched_exec();
361+ litmus_exec();
362
363 bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
364
365diff --git a/fs/fdso.c b/fs/fdso.c
366new file mode 100644
367index 0000000..e639020
368--- /dev/null
369+++ b/fs/fdso.c
370@@ -0,0 +1,281 @@
371+/* fdso.c - file descriptor attached shared objects
372+ *
373+ * (c) 2007 B. Brandenburg, LITMUS^RT project
374+ *
375+ * Notes:
376+ *   - object descriptor (OD) tables are not cloned during a fork.
377+ * - objects are created on-demand, and freed after the last reference
378+ * is dropped.
379+ * - for now, object types are hard coded.
380+ * - As long as we have live objects, we keep a reference to the inode.
381+ */
382+
383+#include <linux/errno.h>
384+#include <linux/sched.h>
385+#include <linux/mutex.h>
386+#include <linux/file.h>
387+#include <asm/uaccess.h>
388+
389+#include <linux/fdso.h>
390+
391+extern struct fdso_ops pi_sem_ops;
392+extern struct fdso_ops srp_sem_ops;
393+extern struct fdso_ops ics_ops;
394+
395+static const struct fdso_ops* fdso_ops[] = {
396+ &pi_sem_ops,
397+ &srp_sem_ops,
398+ &ics_ops
399+};
400+
401+static void* fdso_create(obj_type_t type)
402+{
403+ return fdso_ops[type]->create();
404+}
405+
406+static void fdso_destroy(obj_type_t type, void* obj)
407+{
408+ fdso_ops[type]->destroy(obj);
409+}
410+
411+static int fdso_open(struct od_table_entry* entry, void* __user config)
412+{
413+ if (fdso_ops[entry->obj->type]->open)
414+ return fdso_ops[entry->obj->type]->open(entry, config);
415+ else
416+ return 0;
417+}
418+
419+static int fdso_close(struct od_table_entry* entry)
420+{
421+ if (fdso_ops[entry->obj->type]->close)
422+ return fdso_ops[entry->obj->type]->close(entry);
423+ else
424+ return 0;
425+}
426+
427+/* inode must be locked already */
428+static struct inode_obj_id* alloc_inode_obj(struct inode* inode,
429+ obj_type_t type,
430+ unsigned int id)
431+{
432+ struct inode_obj_id* obj;
433+ void* raw_obj;
434+
435+ raw_obj = fdso_create(type);
436+ if (!raw_obj)
437+ return NULL;
438+
439+ obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL);
440+ if (!obj)
441+ return NULL;
442+ INIT_LIST_HEAD(&obj->list);
443+ atomic_set(&obj->count, 1);
444+ obj->type = type;
445+ obj->id = id;
446+ obj->obj = raw_obj;
447+ obj->inode = inode;
448+
449+ list_add(&obj->list, &inode->i_obj_list);
450+ atomic_inc(&inode->i_count);
451+
452+ printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
453+ return obj;
454+}
455+
456+/* inode must be locked already */
457+static struct inode_obj_id* get_inode_obj(struct inode* inode,
458+ obj_type_t type,
459+ unsigned int id)
460+{
461+ struct list_head* pos;
462+ struct inode_obj_id* obj = NULL;
463+
464+ list_for_each(pos, &inode->i_obj_list) {
465+ obj = list_entry(pos, struct inode_obj_id, list);
466+ if (obj->id == id && obj->type == type) {
467+ atomic_inc(&obj->count);
468+ return obj;
469+ }
470+ }
471+ printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
472+ return NULL;
473+}
474+
475+
476+static void put_inode_obj(struct inode_obj_id* obj)
477+{
478+ struct inode* inode;
479+ int let_go = 0;
480+
481+ inode = obj->inode;
482+ if (atomic_dec_and_test(&obj->count)) {
483+
484+ mutex_lock(&inode->i_obj_mutex);
485+ /* no new references can be obtained */
486+ if (!atomic_read(&obj->count)) {
487+ list_del(&obj->list);
488+ fdso_destroy(obj->type, obj->obj);
489+ kfree(obj);
490+ let_go = 1;
491+ }
492+ mutex_unlock(&inode->i_obj_mutex);
493+ if (let_go)
494+ iput(inode);
495+ }
496+}
497+
498+static struct od_table_entry* get_od_entry(struct task_struct* t)
499+{
500+ struct od_table_entry* table;
501+ int i;
502+
503+
504+ table = t->od_table;
505+ if (!table) {
506+ table = (struct od_table_entry*)
507+ kzalloc(sizeof(struct od_table_entry) *
508+ MAX_OBJECT_DESCRIPTORS, GFP_KERNEL);
509+ t->od_table = table;
510+ }
511+
512+ for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
513+ if (!table[i].used) {
514+ table[i].used = 1;
515+ return table + i;
516+ }
517+ return NULL;
518+}
519+
520+static int put_od_entry(struct od_table_entry* od)
521+{
522+ put_inode_obj(od->obj);
523+ od->used = 0;
524+ return 0;
525+}
526+
527+void exit_od_table(struct task_struct* t)
528+{
529+ int i;
530+
531+ if (t->od_table) {
532+ for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
533+ if (t->od_table[i].used)
534+ put_od_entry(t->od_table + i);
535+ kfree(t->od_table);
536+ t->od_table = NULL;
537+ }
538+}
539+
540+static int do_sys_od_open(struct file* file, obj_type_t type, int id,
541+ void* __user config)
542+{
543+ int idx = 0, err;
544+ struct inode* inode;
545+ struct inode_obj_id* obj = NULL;
546+ struct od_table_entry* entry;
547+
548+ inode = file->f_dentry->d_inode;
549+
550+ entry = get_od_entry(current);
551+ if (!entry)
552+ return -ENOMEM;
553+
554+ mutex_lock(&inode->i_obj_mutex);
555+ obj = get_inode_obj(inode, type, id);
556+ if (!obj)
557+ obj = alloc_inode_obj(inode, type, id);
558+ if (!obj) {
559+ idx = -ENOMEM;
560+ entry->used = 0;
561+ } else {
562+ entry->obj = obj;
563+ entry->extra = NULL;
564+ idx = entry - current->od_table;
565+ }
566+
567+ mutex_unlock(&inode->i_obj_mutex);
568+
569+ err = fdso_open(entry, config);
570+ if (err < 0) {
571+ /* The class rejected the open call.
572+ * We need to clean up and tell user space.
573+ */
574+ put_od_entry(entry);
575+ idx = err;
576+ }
577+
578+ return idx;
579+}
580+
581+
582+struct od_table_entry* __od_lookup(int od)
583+{
584+ struct task_struct *t = current;
585+
586+ if (!t->od_table)
587+ return NULL;
588+ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
589+ return NULL;
590+ if (!t->od_table[od].used)
591+ return NULL;
592+ return t->od_table + od;
593+}
594+
595+
596+asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config)
597+{
598+ int ret = 0;
599+ struct file* file;
600+
601+ /*
602+ 1) get file from fd, get inode from file
603+ 2) lock inode
604+ 3) try to lookup object
605+ 4) if not present create and enqueue object, inc inode refcnt
606+ 5) increment refcnt of object
607+ 6) alloc od_table_entry, setup ptrs
608+ 7) unlock inode
609+ 8) return offset in od_table as OD
610+ */
611+
612+ if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
613+ ret = -EINVAL;
614+ goto out;
615+ }
616+
617+ file = fget(fd);
618+ if (!file) {
619+ ret = -EBADF;
620+ goto out;
621+ }
622+
623+ ret = do_sys_od_open(file, type, obj_id, config);
624+
625+ fput(file);
626+
627+out:
628+ return ret;
629+}
630+
631+
632+asmlinkage int sys_od_close(int od)
633+{
634+ int ret = -EINVAL;
635+ struct task_struct *t = current;
636+
637+ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
638+ return ret;
639+
640+ if (!t->od_table || !t->od_table[od].used)
641+ return ret;
642+
643+
644+ /* give the class a chance to reject the close
645+ */
646+ ret = fdso_close(t->od_table + od);
647+ if (ret == 0)
648+ ret = put_od_entry(t->od_table + od);
649+
650+ return ret;
651+}
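/*
 * Editorial sketch (not part of the patch): how user space is expected to
 * drive sys_od_open()/sys_od_close(). The raw syscall() usage and the file
 * name below are illustrative only; liblitmus (also part of this release)
 * provides the real wrappers.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_od_open	330	/* from include/asm-i386/unistd.h below */
#define __NR_od_close	331
#define PI_SEM		0	/* obj_type_t value from include/linux/fdso.h */

int main(void)
{
	/* any shared file serves as the namespace for object ids */
	int fd = open("shared-objects", O_RDONLY | O_CREAT, 0666);

	/* attach (or create) PI semaphore #1 on that inode; the return
	 * value is an object descriptor, i.e., an index into the calling
	 * task's od_table */
	int od = syscall(__NR_od_open, fd, PI_SEM, 1, NULL);

	/* ... pass 'od' to sys_pi_down()/sys_pi_up() ... */

	syscall(__NR_od_close, od);
	close(fd);
	return 0;
}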
652diff --git a/fs/inode.c b/fs/inode.c
653index bf21dc6..fcf8ce3 100644
654--- a/fs/inode.c
655+++ b/fs/inode.c
656@@ -205,6 +205,8 @@ void inode_init_once(struct inode *inode)
657 INIT_LIST_HEAD(&inode->inotify_watches);
658 mutex_init(&inode->inotify_mutex);
659 #endif
660+ INIT_LIST_HEAD(&inode->i_obj_list);
661+ mutex_init(&inode->i_obj_mutex);
662 }
663
664 EXPORT_SYMBOL(inode_init_once);
665diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
666index 4b187bb..fd9dd60 100644
667--- a/include/asm-i386/thread_info.h
668+++ b/include/asm-i386/thread_info.h
669@@ -131,6 +131,7 @@ static inline struct thread_info *current_thread_info(void)
670 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
671 #define TIF_SECCOMP 8 /* secure computing */
672 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */
673+#define TIF_ROLLBACK_RCS 10 /* set EIP to rollback addr */
674 #define TIF_MEMDIE 16
675 #define TIF_DEBUG 17 /* uses debug registers */
676 #define TIF_IO_BITMAP 18 /* uses I/O bitmap */
677@@ -146,6 +147,7 @@ static inline struct thread_info *current_thread_info(void)
678 #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
679 #define _TIF_SECCOMP (1<<TIF_SECCOMP)
680 #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
681+#define _TIF_ROLLBACK_RCS (1<<TIF_ROLLBACK_RCS)
682 #define _TIF_DEBUG (1<<TIF_DEBUG)
683 #define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
684 #define _TIF_FREEZE (1<<TIF_FREEZE)
685diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
686index 833fa17..ecc7490 100644
687--- a/include/asm-i386/unistd.h
688+++ b/include/asm-i386/unistd.h
689@@ -325,10 +325,36 @@
690 #define __NR_move_pages 317
691 #define __NR_getcpu 318
692 #define __NR_epoll_pwait 319
693+/* LITMUS */
694+#define __NR_sched_setpolicy 320
695+#define __NR_sched_getpolicy 321
696+/* Syscall definitions for mode change and task creation-manipulation */
697+#define __NR_set_rt_mode 322
698+#define __NR_set_rt_task_param 323
699+#define __NR_get_rt_task_param 324
700+#define __NR_prepare_rt_task 325
701+#define __NR_sleep_next_period 326
702+#define __NR_scheduler_setup 327
703+#define __NR_register_np_flag 328
704+#define __NR_exit_np 329
705+#define __NR_od_open 330
706+#define __NR_od_close 331
707+#define __NR_pi_down 332
708+#define __NR_pi_up 333
709+#define __NR_srp_down 334
710+#define __NR_srp_up 335
711+#define __NR_reg_task_srp_sem 336
712+#define __NR_query_job_no 337
713+#define __NR_wait_for_job_release 338
714+#define __NR_set_service_levels 339
715+#define __NR_get_cur_service_level 340
716+#define __NR_reg_ics_cb 341
717+#define __NR_start_wcs 342
718+
719
720 #ifdef __KERNEL__
721
722-#define NR_syscalls 320
723+#define NR_syscalls 343
724
725 #define __ARCH_WANT_IPC_PARSE_VERSION
726 #define __ARCH_WANT_OLD_READDIR
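/*
 * Editorial sketch (not part of the patch): the syscall numbers above are
 * normally reached through liblitmus (liblitmus-2007.3.tgz in this
 * release); a raw invocation would look roughly as follows. The
 * single-integer argument to sys_set_rt_mode() is an assumption based on
 * the rt_mode_t values in include/linux/litmus.h; its real prototype is
 * not shown in this excerpt.
 */
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_set_rt_mode	322
#define __NR_sleep_next_period	326

int main(void)
{
	syscall(__NR_set_rt_mode, 1);		/* MODE_RT_RUN */
	for (;;) {
		/* ... one job's worth of work ... */
		syscall(__NR_sleep_next_period);
	}
}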
727diff --git a/include/linux/edf_common.h b/include/linux/edf_common.h
728new file mode 100644
729index 0000000..f940308
730--- /dev/null
731+++ b/include/linux/edf_common.h
732@@ -0,0 +1,36 @@
733+/* EDF common data structures and utility functions shared by all EDF
734+ * based scheduler plugins
735+ */
736+
737+/* CLEANUP: Add comments and make it less messy.
738+ *
739+ */
740+
741+#ifndef __UNC_EDF_COMMON_H__
742+#define __UNC_EDF_COMMON_H__
743+
744+#include <linux/rt_domain.h>
745+
746+
747+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched);
748+
749+int edf_higher_prio(struct task_struct* first,
750+ struct task_struct* second);
751+
752+int edf_ready_order(struct list_head* a, struct list_head* b);
753+
754+void edf_release_at(struct task_struct *t, jiffie_t start);
755+#define edf_release_now(t) edf_release_at(t, jiffies)
756+
757+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
758+long edf_sleep_next_period(void);
759+
760+void edf_prepare_for_next_period(struct task_struct *t);
761+
762+#define job_completed(t) (!is_be(t) && \
763+ (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost)
764+
765+int edf_set_hp_task(struct pi_semaphore *sem);
766+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu);
767+
768+#endif
769diff --git a/include/linux/fdso.h b/include/linux/fdso.h
770new file mode 100644
771index 0000000..3e962fd
772--- /dev/null
773+++ b/include/linux/fdso.h
774@@ -0,0 +1,70 @@
775+/* fdso.h - file descriptor attached shared objects
776+ *
777+ * (c) 2007 B. Brandenburg, LITMUS^RT project
778+ */
779+
780+#ifndef _LINUX_FDSO_H_
781+#define _LINUX_FDSO_H_
782+
783+#include <linux/list.h>
784+#include <asm/atomic.h>
785+
786+#include <linux/fs.h>
787+
788+#define MAX_OBJECT_DESCRIPTORS 32
789+
790+typedef enum {
791+ MIN_OBJ_TYPE = 0,
792+
793+ PI_SEM = 0,
794+ SRP_SEM = 1,
795+ ICS_ID = 2,
796+
797+ MAX_OBJ_TYPE = 2
798+} obj_type_t;
799+
800+struct inode_obj_id {
801+ struct list_head list;
802+ atomic_t count;
803+ struct inode* inode;
804+
805+ obj_type_t type;
806+ void* obj;
807+ unsigned int id;
808+};
809+
810+
811+struct od_table_entry {
812+ unsigned int used;
813+
814+ struct inode_obj_id* obj;
815+ void* extra;
816+};
817+
818+struct fdso_ops {
819+ void* (*create) (void);
820+ void (*destroy)(void*);
821+ int (*open) (struct od_table_entry*, void* __user);
822+ int (*close) (struct od_table_entry*);
823+};
824+
825+/* translate a userspace supplied od into the raw table entry
826+ * returns NULL if od is invalid
827+ */
828+struct od_table_entry* __od_lookup(int od);
829+
830+/* translate a userspace supplied od into the associated object
831+ * returns NULL if od is invalid
832+ */
833+static inline void* od_lookup(int od, obj_type_t type)
834+{
835+ struct od_table_entry* e = __od_lookup(od);
836+ return e && e->obj->type == type ? e->obj->obj : NULL;
837+}
838+
839+#define lookup_pi_sem(od) ((struct pi_semaphore*) od_lookup(od, PI_SEM))
840+#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
841+#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID))
842+
843+
844+#endif
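/*
 * Editorial sketch (not part of the patch): typical kernel-side use of the
 * lookup macros above. The function name is made up; the real semaphore
 * syscalls that do this presumably live in kernel/litmus_sem.c, which is
 * part of this patch but not reproduced in this excerpt.
 */
asmlinkage long example_sys_pi_down(int sem_od)
{
	struct pi_semaphore *sem = lookup_pi_sem(sem_od);
	if (!sem)
		/* bad od, wrong object type, or unused table slot */
		return -EINVAL;
	/* ... suspend on / acquire the semaphore ... */
	return 0;
}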
845diff --git a/include/linux/feather_buffer.h b/include/linux/feather_buffer.h
846new file mode 100644
847index 0000000..c788227
848--- /dev/null
849+++ b/include/linux/feather_buffer.h
850@@ -0,0 +1,108 @@
851+#ifndef _FEATHER_BUFFER_H_
852+#define _FEATHER_BUFFER_H_
853+
854+/* requires UINT_MAX and memcpy */
855+
856+static inline int fetch_and_inc(int *val)
857+{
858+ int ret = 1;
859+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
860+ return ret;
861+}
862+
863+static inline int fetch_and_dec(int *val)
864+{
865+ int ret = -1;
866+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
867+ return ret;
868+}
869+
870+#define SLOT_FREE 0
871+#define SLOT_BUSY 1
872+#define SLOT_READY 2
873+
874+struct ft_buffer {
875+ unsigned int slot_count;
876+ unsigned int slot_size;
877+
878+ int free_count;
879+ unsigned int write_idx;
880+ unsigned int read_idx;
881+
882+ char* slots;
883+ void* buffer_mem;
884+ unsigned int failed_writes;
885+};
886+
887+static inline int init_ft_buffer(struct ft_buffer* buf,
888+ unsigned int slot_count,
889+ unsigned int slot_size,
890+ char* slots,
891+ void* buffer_mem)
892+{
893+ int i = 0;
894+ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
895+		/* The slot count must divide UINT_MAX + 1 so that when it
896+		 * wraps around, the index correctly points to 0.
897+ */
898+ return 0;
899+ } else {
900+ buf->slot_count = slot_count;
901+ buf->slot_size = slot_size;
902+ buf->slots = slots;
903+ buf->buffer_mem = buffer_mem;
904+ buf->free_count = slot_count;
905+ buf->write_idx = 0;
906+ buf->read_idx = 0;
907+ buf->failed_writes = 0;
908+ for (i = 0; i < slot_count; i++)
909+ buf->slots[i] = SLOT_FREE;
910+ return 1;
911+ }
912+}
913+
914+static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
915+{
916+ int free = fetch_and_dec(&buf->free_count);
917+ unsigned int idx;
918+ if (free <= 0) {
919+ fetch_and_inc(&buf->free_count);
920+ *ptr = 0;
921+ fetch_and_inc(&buf->failed_writes);
922+ return 0;
923+ } else {
924+ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
925+ buf->slots[idx] = SLOT_BUSY;
926+ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
927+ return 1;
928+ }
929+}
930+
931+static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
932+{
933+ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
934+ buf->slots[idx] = SLOT_READY;
935+}
936+
937+
938+/* exclusive reader access is assumed */
939+static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
940+{
941+ unsigned int idx;
942+ if (buf->free_count == buf->slot_count)
943+ /* nothing available */
944+ return 0;
945+ idx = buf->read_idx % buf->slot_count;
946+ if (buf->slots[idx] == SLOT_READY) {
947+ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
948+ buf->slot_size);
949+ buf->slots[idx] = SLOT_FREE;
950+ buf->read_idx++;
951+ fetch_and_inc(&buf->free_count);
952+ return 1;
953+ } else
954+ return 0;
955+}
956+
957+
958+#endif
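/*
 * Editorial sketch (not part of the patch): minimal use of the ft_buffer
 * ring buffer defined above. init_ft_buffer() only accepts slot counts
 * that divide UINT_MAX + 1, i.e., powers of two.
 */
#define EX_SLOTS	256
#define EX_SLOT_SIZE	sizeof(unsigned long long)

static char ex_slot_states[EX_SLOTS];
static unsigned long long ex_slot_data[EX_SLOTS];
static struct ft_buffer ex_buf;

static void ex_buf_init(void)
{
	init_ft_buffer(&ex_buf, EX_SLOTS, EX_SLOT_SIZE,
		       ex_slot_states, ex_slot_data);
}

/* producer side, e.g. called from a Feather-Trace callback */
static void ex_record(unsigned long long tsc)
{
	unsigned long long *slot;
	if (ft_buffer_start_write(&ex_buf, (void **) &slot)) {
		*slot = tsc;
		ft_buffer_finish_write(&ex_buf, slot);
	}
	/* otherwise the sample is dropped and counted in ex_buf.failed_writes */
}

/* single consumer side */
static int ex_drain_one(unsigned long long *dest)
{
	return ft_buffer_read(&ex_buf, dest);
}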
959diff --git a/include/linux/feather_trace.h b/include/linux/feather_trace.h
960new file mode 100644
961index 0000000..5c37ea7
962--- /dev/null
963+++ b/include/linux/feather_trace.h
964@@ -0,0 +1,93 @@
965+#ifndef _FEATHER_TRACE_H_
966+#define _FEATHER_TRACE_H_
967+
968+#define feather_callback __attribute__((regparm(0)))
969+
970+/* make the compiler reload any register that is not saved in
971+ * a cdecl function call
972+ */
973+#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
974+
975+#define ft_event(id, callback) \
976+ __asm__ __volatile__( \
977+ "1: jmp 2f \n\t" \
978+ " call " #callback " \n\t" \
979+ ".section __event_table, \"aw\" \n\t" \
980+ ".long " #id ", 0, 1b, 2f \n\t" \
981+ ".previous \n\t" \
982+ "2: \n\t" \
983+ : : : CLOBBER_LIST)
984+
985+#define ft_event0(id, callback) \
986+ __asm__ __volatile__( \
987+ "1: jmp 2f \n\t" \
988+ " subl $4, %%esp \n\t" \
989+ " movl $" #id ", (%%esp) \n\t" \
990+ " call " #callback " \n\t" \
991+ " addl $4, %%esp \n\t" \
992+ ".section __event_table, \"aw\" \n\t" \
993+ ".long " #id ", 0, 1b, 2f \n\t" \
994+ ".previous \n\t" \
995+ "2: \n\t" \
996+ : : : CLOBBER_LIST)
997+
998+#define ft_event1(id, callback, param) \
999+ __asm__ __volatile__( \
1000+ "1: jmp 2f \n\t" \
1001+ " subl $8, %%esp \n\t" \
1002+ " movl %0, 4(%%esp) \n\t" \
1003+ " movl $" #id ", (%%esp) \n\t" \
1004+ " call " #callback " \n\t" \
1005+ " addl $8, %%esp \n\t" \
1006+ ".section __event_table, \"aw\" \n\t" \
1007+ ".long " #id ", 0, 1b, 2f \n\t" \
1008+ ".previous \n\t" \
1009+ "2: \n\t" \
1010+ : : "r" (param) : CLOBBER_LIST)
1011+
1012+#define ft_event2(id, callback, param, param2) \
1013+ __asm__ __volatile__( \
1014+ "1: jmp 2f \n\t" \
1015+ " subl $12, %%esp \n\t" \
1016+ " movl %1, 8(%%esp) \n\t" \
1017+ " movl %0, 4(%%esp) \n\t" \
1018+ " movl $" #id ", (%%esp) \n\t" \
1019+ " call " #callback " \n\t" \
1020+ " addl $12, %%esp \n\t" \
1021+ ".section __event_table, \"aw\" \n\t" \
1022+ ".long " #id ", 0, 1b, 2f \n\t" \
1023+ ".previous \n\t" \
1024+ "2: \n\t" \
1025+ : : "r" (param), "r" (param2) : CLOBBER_LIST)
1026+
1027+
1028+#define ft_event3(id, callback, p, p2, p3) \
1029+ __asm__ __volatile__( \
1030+ "1: jmp 2f \n\t" \
1031+ " subl $16, %%esp \n\t" \
1032+        " movl %2, 12(%%esp)                        \n\t" \
1033+ " movl %1, 8(%%esp) \n\t" \
1034+ " movl %0, 4(%%esp) \n\t" \
1035+ " movl $" #id ", (%%esp) \n\t" \
1036+ " call " #callback " \n\t" \
1037+ " addl $16, %%esp \n\t" \
1038+ ".section __event_table, \"aw\" \n\t" \
1039+ ".long " #id ", 0, 1b, 2f \n\t" \
1040+ ".previous \n\t" \
1041+ "2: \n\t" \
1042+ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST)
1043+
1044+
1045+static inline unsigned long long ft_read_tsc(void)
1046+{
1047+ unsigned long long ret;
1048+ __asm__ __volatile__("rdtsc" : "=A" (ret));
1049+ return ret;
1050+}
1051+
1052+int ft_enable_event(unsigned long id);
1053+int ft_disable_event(unsigned long id);
1054+int ft_is_event_enabled(unsigned long id);
1055+int ft_disable_all_events(void);
1056+
1057+#endif
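/*
 * Editorial sketch (not part of the patch): a trace point and its callback.
 * The event id and all names are made up; the ids actually used by the
 * patch are defined in include/linux/trace.h. While an event is disabled,
 * ft_event0() costs only the "jmp 2f"; ft_enable_event() is expected to
 * patch the jump at runtime so that the callback gets invoked.
 */
#define EX_EVENT	100

feather_callback void ex_save_timestamp(unsigned long id)
{
	unsigned long long now = ft_read_tsc();
	/* ... store (id, now) into an ft_buffer ... */
	(void) now;
}

static void ex_traced_function(void)
{
	ft_event0(EX_EVENT, ex_save_timestamp);
	/* ... code being timed ... */
}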
1058diff --git a/include/linux/fifo_common.h b/include/linux/fifo_common.h
1059new file mode 100644
1060index 0000000..0883226
1061--- /dev/null
1062+++ b/include/linux/fifo_common.h
1063@@ -0,0 +1,18 @@
1064+/* FIFO common definitions and utility functions.
1065+ */
1066+#ifndef __UNC_SCHED_FIFO_H__
1067+#define __UNC_SCHED_FIFO_H__
1068+
1069+#include <linux/rt_domain.h>
1070+
1071+
1072+int fifo_higher_prio(struct task_struct* first,
1073+ struct task_struct* second);
1074+
1075+int fifo_ready_order(struct list_head* a, struct list_head* b);
1076+
1077+
1078+void fifo_domain_init(rt_domain_t* fifo, check_resched_needed_t resched);
1079+
1080+
1081+#endif
1082diff --git a/include/linux/fpmath.h b/include/linux/fpmath.h
1083new file mode 100644
1084index 0000000..a15c239
1085--- /dev/null
1086+++ b/include/linux/fpmath.h
1087@@ -0,0 +1,111 @@
1088+#ifndef __FP_MATH_H__
1089+#define __FP_MATH_H__
1090+
1091+#define FP_SHIFT 10
1092+#define ROUND_BIT (FP_SHIFT - 1)
1093+#define ONE FP(1)
1094+
1095+#define _fp(x) ((fp_t) {x})
1096+
1097+static inline long _point(fp_t x)
1098+{
1099+ return (x.val % (1 << FP_SHIFT));
1100+
1101+}
1102+
1103+#define fp2str(x) x.val
1104+/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */
1105+#define _FP_ "%ld/1024"
1106+
1107+
1108+static inline fp_t FP(long x)
1109+{
1110+ return _fp(((long) x) << FP_SHIFT);
1111+}
1112+
1113+static inline long _floor(fp_t x)
1114+{
1115+ return x.val >> FP_SHIFT;
1116+}
1117+
1118+/* FIXME: negative rounding */
1119+static inline long _round(fp_t x)
1120+{
1121+ return _floor(x) + ((x.val >> ROUND_BIT) & 1);
1122+}
1123+
1124+/* divide two integers to obtain a fixed point value */
1125+static inline fp_t _frac(long a, long b)
1126+{
1127+ return _fp(FP(a).val / (b));
1128+}
1129+
1130+/* multiply two fixed point values */
1131+static inline fp_t _mul(fp_t a, fp_t b)
1132+{
1133+ return _fp((a.val * b.val) >> FP_SHIFT);
1134+}
1135+
1136+static inline fp_t _div(fp_t a, fp_t b)
1137+{
1138+ /* try not to overflow */
1139+ if (unlikely(a.val > 2 << (BITS_PER_LONG - FP_SHIFT)))
1140+ return _fp((a.val / b.val) << FP_SHIFT);
1141+ else
1142+ return _fp((a.val << FP_SHIFT) / b.val);
1143+}
1144+
1145+static inline fp_t _add(fp_t a, fp_t b)
1146+{
1147+ return _fp(a.val + b.val);
1148+}
1149+
1150+static inline fp_t _sub(fp_t a, fp_t b)
1151+{
1152+ return _fp(a.val - b.val);
1153+}
1154+
1155+static inline fp_t _neg(fp_t x)
1156+{
1157+ return _fp(-x.val);
1158+}
1159+
1160+static inline fp_t _abs(fp_t x)
1161+{
1162+ return _fp(abs(x.val));
1163+}
1164+
1165+static inline int _leq(fp_t a, fp_t b)
1166+{
1167+ return a.val <= b.val;
1168+}
1169+
1170+static inline int _geq(fp_t a, fp_t b)
1171+{
1172+ return a.val >= b.val;
1173+}
1174+
1175+static inline int _lt(fp_t a, fp_t b)
1176+{
1177+ return a.val < b.val;
1178+}
1179+
1180+static inline int _gt(fp_t a, fp_t b)
1181+{
1182+ return a.val > b.val;
1183+}
1184+
1185+static inline int _eq(fp_t a, fp_t b)
1186+{
1187+ return a.val == b.val;
1188+}
1189+
1190+static inline fp_t _max(fp_t a, fp_t b)
1191+{
1192+ if (a.val < b.val)
1193+ return b;
1194+ else
1195+ return a;
1196+}
1197+
1198+#endif
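/*
 * Editorial note (not part of the patch): with FP_SHIFT = 10 an fp_t
 * stores its value scaled by 1024, so for example:
 *
 *   FP(1).val                     == 1024
 *   _frac(1, 4).val               == 256     (0.25)
 *   _mul(_frac(1, 2), FP(6)).val  == 3072    == FP(3).val
 *   _div(FP(3), FP(4)).val        == 768     == _frac(3, 4).val
 *   _floor(_frac(3, 4)) == 0,  _round(_frac(3, 4)) == 1
 *
 * Values of this kind (e.g. the per-service-level weights in
 * include/linux/rt_param.h below) are what the adaptive code computes with.
 */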
1199diff --git a/include/linux/fs.h b/include/linux/fs.h
1200index 1410e53..4e1117c 100644
1201--- a/include/linux/fs.h
1202+++ b/include/linux/fs.h
1203@@ -524,6 +524,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
1204 #define i_size_ordered_init(inode) do { } while (0)
1205 #endif
1206
1207+struct inode_obj_id_table;
1208+
1209 struct inode {
1210 struct hlist_node i_hash;
1211 struct list_head i_list;
1212@@ -589,6 +591,9 @@ struct inode {
1213 void *i_security;
1214 #endif
1215 void *i_private; /* fs or device private pointer */
1216+
1217+ struct list_head i_obj_list;
1218+ struct mutex i_obj_mutex;
1219 };
1220
1221 /*
1222diff --git a/include/linux/ics.h b/include/linux/ics.h
1223new file mode 100644
1224index 0000000..f19534f
1225--- /dev/null
1226+++ b/include/linux/ics.h
1227@@ -0,0 +1,35 @@
1228+#ifndef _LINUX_ICS_H_
1229+#define _LINUX_ICS_H_
1230+
1231+#include <asm/atomic.h>
1232+#include <linux/mutex.h>
1233+
1234+#define MAX_ICS_NESTING 16
1235+
1236+struct ics_descriptor {
1237+ /* ICS id, only read by kernel */
1238+ int id;
1239+ /* rollback program counter, only read by kernel */
1240+ void* pc;
1241+ /* rollback stack pointer, not used by kernel */
1242+ void* sp;
1243+ /* retry flag, not used by kernel */
1244+ int* retry;
1245+};
1246+
1247+/* ICS control block */
1248+struct ics_cb {
1249+ /* Points to the top-most valid entry.
1250+ * -1 indicates an empty stack.
1251+ * Read and written by kernel.
1252+ */
1253+ int top;
1254+ struct ics_descriptor ics_stack[MAX_ICS_NESTING];
1255+};
1256+
1257+/* get rollback addr for current task */
1258+void* get_rollback_addr(void);
1259+
1260+#define ICS_DBG(x, args...) printk(x, ## args)
1261+
1262+#endif
1263diff --git a/include/linux/list.h b/include/linux/list.h
1264index 611059d..319c5ed 100644
1265--- a/include/linux/list.h
1266+++ b/include/linux/list.h
1267@@ -898,6 +898,36 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
1268 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
1269 pos = pos->next)
1270
1271+
1272+typedef int (*list_cmp_t)(struct list_head*, struct list_head*);
1273+
1274+static inline unsigned int list_insert(struct list_head* new,
1275+ struct list_head* head,
1276+ list_cmp_t order_before)
1277+{
1278+ struct list_head *pos;
1279+ unsigned int passed = 0;
1280+
1281+ BUG_ON(!new);
1282+
1283+ /* find a spot where the new entry is less than the next */
1284+ list_for_each(pos, head) {
1285+ if (unlikely(order_before(new, pos))) {
1286+ /* pos is not less than new, thus insert here */
1287+ __list_add(new, pos->prev, pos);
1288+ goto out;
1289+ }
1290+ passed++;
1291+ }
1292+	/* if we get to this point either the list is empty or every
1293+	 * queued element is less than new.
1294+ * Let's add new to the end. */
1295+ list_add_tail(new, head);
1296+ out:
1297+ return passed;
1298+}
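/*
 * Editorial sketch (not part of the patch): keeping a list sorted with
 * list_insert(). The element type and its priority field are made up.
 */
struct ex_item {
	int prio;
	struct list_head link;
};

/* "order_before" comparator: nonzero if a should be queued ahead of b */
static int ex_order(struct list_head *a, struct list_head *b)
{
	return list_entry(a, struct ex_item, link)->prio <
	       list_entry(b, struct ex_item, link)->prio;
}

static void ex_enqueue(struct list_head *head, struct ex_item *it)
{
	/* returns the number of queued elements passed before inserting */
	unsigned int passed = list_insert(&it->link, head, ex_order);
	(void) passed;
}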
1299+
1300+
1301 #else
1302 #warning "don't include kernel headers in userspace"
1303 #endif /* __KERNEL__ */
1304diff --git a/include/linux/litmus.h b/include/linux/litmus.h
1305new file mode 100644
1306index 0000000..858b2c3
1307--- /dev/null
1308+++ b/include/linux/litmus.h
1309@@ -0,0 +1,141 @@
1310+/*
1311+ * Constant definitions related to
1312+ * scheduling policy.
1313+ */
1314+
1315+#ifndef _LINUX_LITMUS_H_
1316+#define _LINUX_LITMUS_H_
1317+
1318+#include <linux/jiffies.h>
1319+#include <linux/sched_trace.h>
1320+
1321+typedef enum {
1322+ SCHED_BEG = 0,
1323+ SCHED_LINUX = 0,
1324+ SCHED_PFAIR = 1,
1325+ SCHED_PFAIR_STAGGER = 2,
1326+ SCHED_PART_EDF = 3,
1327+ SCHED_PART_EEVDF = 4,
1328+ SCHED_GLOBAL_EDF = 5,
1329+ SCHED_PFAIR_DESYNC = 6,
1330+ SCHED_GLOBAL_EDF_NP = 7,
1331+ SCHED_CUSTOM = 8,
1332+ SCHED_EDF_HSB = 9,
1333+ SCHED_GSN_EDF = 10,
1334+ SCHED_PSN_EDF = 11,
1335+ SCHED_ADAPTIVE = 12,
1336+ /* Add your scheduling policy here */
1337+
1338+ SCHED_END = 12,
1339+ SCHED_DEFAULT = 0,
1340+ SCHED_INVALID = -1,
1341+} spolicy;
1342+
1343+
1344+typedef enum {
1345+ LITMUS_RESERVED_RANGE = 1024,
1346+
1347+} sched_setup_cmd_t;
1348+
1349+/* System-wide runtime modes */
1350+enum rt_mode_t {
1351+ MODE_NON_RT = 0,
1352+ MODE_RT_RUN = 1
1353+};
1354+
1355+/* per-task modes */
1356+enum rt_task_mode_t {
1357+ BACKGROUND_TASK = 0,
1358+ LITMUS_RT_TASK = 1
1359+};
1360+
1361+
1362+/* Plugin boot options, for convenience */
1363+#define PLUGIN_LINUX "linux"
1364+#define PLUGIN_PFAIR "pfair"
1365+#define PLUGIN_PART_EDF "part_edf"
1366+#define PLUGIN_GLOBAL_EDF "global_edf"
1367+#define PLUGIN_GLOBAL_EDF_NP "global_edf_np"
1368+#define PLUGIN_EDF_HSB "edf_hsb"
1369+#define PLUGIN_GSN_EDF "gsn_edf"
1370+#define PLUGIN_PSN_EDF "psn_edf"
1371+#define PLUGIN_ADAPTIVE "adaptive"
1372+
1373+extern spolicy sched_policy;
1374+
1375+/* RT mode start time */
1376+extern volatile unsigned long rt_start_time;
1377+
1378+/* Here we store the current mode of the system */
1379+extern atomic_t rt_mode;
1380+
1381+#define get_rt_mode() (atomic_read(&rt_mode))
1382+#define set_rt_mode(a) atomic_set(&rt_mode,(a))
1383+
1384+#define TRACE(fmt, args...) \
1385+ sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args)
1386+
1387+#define TRACE_TASK(t, fmt, args...) \
1388+ TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
1389+
1390+#define TRACE_CUR(fmt, args...) \
1391+ TRACE_TASK(current, fmt, ## args)
1392+
1393+#define TRACE_BUG_ON(cond) \
1394+ do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
1395+ "called from %p current=%s/%d state=%d " \
1396+ "flags=%x mode=%d partition=%d cpu=%d rtflags=%d"\
1397+ " job=%u knp=%d timeslice=%u\n", \
1398+ #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
1399+ current->pid, current->state, current->flags, get_rt_mode(), \
1400+ get_partition(current), smp_processor_id(), get_rt_flags(current), \
1401+ current->rt_param.times.job_no, current->rt_param.kernel_np, \
1402+ current->time_slice\
1403+ ); } while(0);
1404+
1405+
1406+/* in_list - is a given list_head queued on some list?
1407+ */
1408+static inline int in_list(struct list_head* list)
1409+{
1410+ return !( /* case 1: deleted */
1411+ (list->next == LIST_POISON1 &&
1412+ list->prev == LIST_POISON2)
1413+ ||
1414+ /* case 2: initialized */
1415+ (list->next == list &&
1416+ list->prev == list)
1417+ );
1418+}
1419+
1420+void list_qsort(struct list_head* list, list_cmp_t less_than);
1421+
1422+
1423+#define RT_PREEMPTIVE 0x2050 /* = NP */
1424+#define RT_NON_PREEMPTIVE 0x4e50 /* = P */
1425+#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */
1426+
1427+/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE
1428+ */
1429+int is_np(struct task_struct *t);
1430+
1431+/* request that the task should call sys_exit_np()
1432+ */
1433+void request_exit_np(struct task_struct *t);
1434+
1435+/* kill naughty tasks
1436+ */
1437+void scheduler_signal(struct task_struct *t, unsigned int signal);
1438+void send_scheduler_signals(void);
1439+void np_mem_kill(struct task_struct *t);
1440+
1441+void litmus_fork(struct task_struct *tsk);
1442+void litmus_exec(void);
1443+/* clean up real-time state of a task */
1444+void exit_litmus(struct task_struct *dead_tsk);
1445+
1446+long transition_to_rt(struct task_struct* tsk);
1447+long transition_to_be(struct task_struct* tsk);
1448+
1449+
1450+#endif
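/*
 * Editorial sketch (not part of the patch): typical use of the TRACE()
 * helpers by a scheduler plugin. Per the Kconfig help earlier in the patch,
 * the messages are compiled in only when CONFIG_SCHED_DEBUG_TRACE is set.
 */
static void ex_on_release(struct task_struct *t)
{
	TRACE_TASK(t, "released job %u\n", t->rt_param.times.job_no);
	if (get_rt_mode() != MODE_RT_RUN)
		TRACE("release while the system is not in RT mode\n");
}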
1451diff --git a/include/linux/pfair_common.h b/include/linux/pfair_common.h
1452new file mode 100644
1453index 0000000..67e18c6
1454--- /dev/null
1455+++ b/include/linux/pfair_common.h
1456@@ -0,0 +1,40 @@
1457+/* PFAIR common data structures and utility functions shared by all PFAIR
1458+ * based scheduler plugins
1459+ */
1460+
1461+#ifndef __UNC_PFAIR_COMMON_H__
1462+#define __UNC_PFAIR_COMMON_H__
1463+
1464+#include <linux/queuelock.h>
1465+#include <linux/cpumask.h>
1466+
1467+typedef struct _pfair_domain {
1468+ /* Global lock to protect the data structures */
1469+ queuelock_t pfair_lock;
1470+ /* runnable rt tasks are in here */
1471+ struct list_head ready_queue;
1472+
1473+ /* real-time tasks waiting for release are in here */
1474+ struct list_head release_queue;
1475+
1476+ /* CPU's in the domain */
1477+ cpumask_t domain_cpus;
1478+
1479+} pfair_domain_t;
1480+
1481+#define next_ready(pfair) \
1482+ (list_entry((pfair)->ready_queue.next, struct task_struct, rt_list))
1483+void pfair_domain_init(pfair_domain_t *pfair);
1484+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new);
1485+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair);
1486+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task);
1487+void pfair_try_release_pending(pfair_domain_t* pfair);
1488+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start);
1489+
1490+void pfair_prepare_next_job(struct task_struct *t);
1491+void pfair_prepare_next_subtask(struct task_struct *t);
1492+
1493+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start);
1494+
1495+#endif
1496+
1497diff --git a/include/linux/pfair_math.h b/include/linux/pfair_math.h
1498new file mode 100644
1499index 0000000..b2a14e4
1500--- /dev/null
1501+++ b/include/linux/pfair_math.h
1502@@ -0,0 +1,80 @@
1503+/* PFAIR Mathematical functions */
1504+#ifndef __UNC_PFAIR_MATH_H__
1505+#define __UNC_PFAIR_MATH_H__
1506+
1507+#include <linux/rt_param.h>
1508+#include <asm/div64.h>
1509+#include <linux/litmus.h>
1510+#include <linux/sched.h>
1511+
1512+/* Type definition for our quantums */
1513+typedef unsigned long long quantum_t;
1514+
1515+/*
1516+* This file defines mathematical functions "ceiling", "floor",
1517+* and PFAIR specific functions for computing the release and
1518+* the deadline of a subtask, as well as tie breakers:
1519+* b-bit and group deadline.
1520+*/
1521+static inline quantum_t FLOOR(quantum_t a, unsigned long b)
1522+{
1523+ BUG_ON( b == 0);
1524+ do_div(a, b);
1525+ return a;
1526+}
1527+static inline quantum_t CEIL(quantum_t a, unsigned long b)
1528+{
1529+ quantum_t t = FLOOR(a, b);
1530+ return (quantum_t)((t * b == a) ? t : (t + 1));
1531+}
1532+
1533+
1534+/*
1535+* invariant - i-1=get_passed_quanta(t)
1536+*
1537+* release time of i-th subtask of j-th job is
1538+* r_{ij}+\lfloor i-1/wt(T) \rfloor
1539+* This operation should be robust to wrap-around
1540+* so we can compare the result with jiffies safely
1541+*/
1542+static inline quantum_t release_time(struct task_struct * t)
1543+{
1544+ quantum_t e = get_exec_cost(t);
1545+ quantum_t p = get_rt_period(t);
1546+ return FLOOR((get_passed_quanta(t)) * p, e);
1547+}
1548+/*
1549+* deadline time of i-th subtask of j-th job is
1550+* r_{ij}+\lceil i/wt(T) \rceil
1551+* This operation should be robust to wrap-around
1552+* so we can compare the result with jiffies safely
1553+*/
1554+static inline quantum_t pfair_deadline(struct task_struct * t)
1555+{
1556+ quantum_t e = get_exec_cost(t);
1557+ quantum_t p = get_rt_period(t);
1558+ return CEIL((get_passed_quanta(t) + 1) * p, e);
1559+}
1560+/* In PFAIR b-bit is defined as
1561+* \lceil i/wt(T) \rceil-\lfloor i/wt(T) \rfloor
1562+*/
1563+static inline int b_bit(struct task_struct *t)
1564+{
1565+ quantum_t e = get_exec_cost(t);
1566+ quantum_t p = get_rt_period(t);
1567+ return CEIL((get_passed_quanta(t) + 1) * p, e)-
1568+ FLOOR((get_passed_quanta(t) + 1) * p, e);
1569+}
1570+/*
1571+* Group deadline
1572+*/
1573+static inline quantum_t group_deadline(struct task_struct * t)
1574+{
1575+ quantum_t p = get_rt_period(t);
1576+ quantum_t e = get_exec_cost(t);
1577+ quantum_t stage1 = CEIL((get_passed_quanta(t) + 1) * p, e);
1578+ quantum_t stage2 = CEIL(stage1 * (p - e), p);
1579+ return CEIL(stage2 * p, p - e);
1580+}
1581+
1582+#endif /* __UNC_PFAIR_MATH_H__ */
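/*
 * Editorial note (not part of the patch): a worked example for a task with
 * exec cost e = 2 and period p = 5 (weight 2/5), using the helpers above.
 * With get_passed_quanta(t) = 0 (first subtask of the job):
 *
 *   release_time(t)   = FLOOR(0 * 5, 2) = 0
 *   pfair_deadline(t) = CEIL(1 * 5, 2)  = 3
 *   b_bit(t)          = CEIL(5, 2) - FLOOR(5, 2) = 3 - 2 = 1
 *
 * and with get_passed_quanta(t) = 1 (second subtask):
 *
 *   release_time(t)   = FLOOR(1 * 5, 2) = 2
 *   pfair_deadline(t) = CEIL(2 * 5, 2)  = 5
 *   b_bit(t)          = 5 - 5 = 0
 *
 * i.e., the two subtask windows are [0, 3) and [2, 5), measured relative
 * to the job's release.
 */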
1583diff --git a/include/linux/queuelock.h b/include/linux/queuelock.h
1584new file mode 100644
1585index 0000000..c289c21
1586--- /dev/null
1587+++ b/include/linux/queuelock.h
1588@@ -0,0 +1,98 @@
1589+#ifndef _UNC_QUEUELOCK_H_
1590+#define _UNC_QUEUELOCK_H_
1591+/**
1592+* Queue lock
1593+*
1594+* This is an implementation of T. Anderson's queue lock.
1595+* It strives to follow the normal Linux locking conventions
1596+* as much as possible. The rules for acquiring a lock are:
1597+*
1598+* 1) The caller must ensure interrupts and preemptions are disabled.
1599+*
1600+* 2) The caller _cannot_ recursively acquire the lock.
1601+*
1602+* 3) The caller may not sleep while holding the lock. This is currently
1603+* not enforced, but it will not work.
1604+*/
1605+
1606+#include <linux/cache.h>
1607+#include <asm/atomic.h>
1608+#include <linux/smp.h>
1609+
1610+typedef struct {
1611+ /* pad the values being spun on to make sure
1612+ that they are cache local
1613+ */
1614+ union {
1615+ volatile enum {
1616+ MUST_WAIT,
1617+ HAS_LOCK
1618+ } val;
1619+ char padding[SMP_CACHE_BYTES];
1620+ } slots[NR_CPUS];
1621+
1622+ /* since spin_slot is not being spun on it can be
1623+ * in a shared cache line. next_slot will be evicted
1624+ * anyway on every attempt to acquire the lock.
1625+ */
1626+ int spin_slot[NR_CPUS];
1627+
1628+ /* The next slot that will be available.
1629+ */
1630+ atomic_t next_slot;
1631+} queuelock_t;
1632+
1633+
1634+static inline void queue_lock_init(queuelock_t *lock)
1635+{
1636+ int i;
1637+ for (i = 0; i < NR_CPUS; i++) {
1638+ lock->slots[i].val = MUST_WAIT;
1639+ lock->spin_slot[i] = i;
1640+ }
1641+ lock->slots[0].val = HAS_LOCK;
1642+ atomic_set(&lock->next_slot, 0);
1643+}
1644+
1645+
1646+static inline void queue_lock(queuelock_t *lock)
1647+{
1648+ int me = smp_processor_id();
1649+ volatile int* spin_var;
1650+ /* Get slot to spin on. atomic_inc_return() returns the incremented
1651+	 * value, so take one off again
1652+ */
1653+ lock->spin_slot[me] = atomic_inc_return(&lock->next_slot) - 1;
1654+ /* check for wrap-around
1655+ * This could probably optimized away if we ensure that NR_CPUS divides
1656+ * INT_MAX...
1657+ */
1658+ if (unlikely(lock->spin_slot[me] == NR_CPUS - 1))
1659+ atomic_add(-NR_CPUS, &lock->next_slot);
1660+ /* range limit*/
1661+ lock->spin_slot[me] %= NR_CPUS;
1662+ /* spin until you acquire the lock */
1663+ spin_var = (int*) &lock->slots[lock->spin_slot[me]].val;
1664+ while (*spin_var == MUST_WAIT)
1665+ cpu_relax();
1666+
1667+ /* reset the lock */
1668+ lock->slots[lock->spin_slot[me]].val = MUST_WAIT;
1669+ barrier();
1670+}
1671+
1672+
1673+static inline void queue_unlock(queuelock_t *lock)
1674+{
1675+ int me = smp_processor_id();
1676+ barrier();
1677+ lock->slots[(lock->spin_slot[me] + 1) % NR_CPUS].val = HAS_LOCK;
1678+}
1679+
1680+#define queue_lock_irqsave(lock, flags) \
1681+ do { local_irq_save(flags); queue_lock(lock); } while (0);
1682+
1683+#define queue_unlock_irqrestore(lock, flags) \
1684+ do { queue_unlock(lock); local_irq_restore(flags); } while (0);
1685+
1686+#endif /* _UNC_QUEUELOCK_H_ */
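/*
 * Editorial sketch (not part of the patch): basic use of the queue lock.
 * queue_lock_irqsave() disables local interrupts, which also keeps the
 * holder from being preempted on this CPU; the caller must still never
 * sleep inside the critical section.
 */
static queuelock_t ex_lock;	/* initialize once with queue_lock_init() */
static int ex_shared_counter;

static void ex_lock_init(void)
{
	queue_lock_init(&ex_lock);
}

static void ex_critical_section(void)
{
	unsigned long flags;

	queue_lock_irqsave(&ex_lock, flags);
	/* FIFO-ordered: each CPU spins on its own cache-line-padded slot */
	ex_shared_counter++;
	queue_unlock_irqrestore(&ex_lock, flags);
}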
1687diff --git a/include/linux/rt_domain.h b/include/linux/rt_domain.h
1688new file mode 100644
1689index 0000000..3305159
1690--- /dev/null
1691+++ b/include/linux/rt_domain.h
1692@@ -0,0 +1,98 @@
1693+/* CLEANUP: Add comments and make it less messy.
1694+ *
1695+ */
1696+
1697+#ifndef __UNC_RT_DOMAIN_H__
1698+#define __UNC_RT_DOMAIN_H__
1699+
1700+struct _rt_domain;
1701+
1702+typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
1703+typedef void (*release_at_t)(struct task_struct *t, jiffie_t start);
1704+
1705+typedef struct _rt_domain {
1706+ /* runnable rt tasks are in here */
1707+ rwlock_t ready_lock;
1708+ struct list_head ready_queue;
1709+
1710+ /* real-time tasks waiting for release are in here */
1711+ spinlock_t release_lock;
1712+ struct list_head release_queue;
1713+
1714+ /* how do we check if we need to kick another CPU? */
1715+ check_resched_needed_t check_resched;
1716+
1717+ /* how are tasks ordered in the ready queue? */
1718+ list_cmp_t order;
1719+} rt_domain_t;
1720+
1721+#define next_ready(rt) \
1722+ (list_entry((rt)->ready_queue.next, struct task_struct, rt_list))
1723+
1724+#define ready_jobs_pending(rt) \
1725+ (!list_empty(&(rt)->ready_queue))
1726+
1727+void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f,
1728+ list_cmp_t order);
1729+
1730+void __add_ready(rt_domain_t* rt, struct task_struct *new);
1731+void __add_release(rt_domain_t* rt, struct task_struct *task);
1732+
1733+struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu);
1734+struct task_struct* __take_ready(rt_domain_t* rt);
1735+struct task_struct* __peek_ready(rt_domain_t* rt);
1736+
1737+void try_release_pending(rt_domain_t* rt);
1738+void __release_pending(rt_domain_t* rt);
1739+
1740+void rerelease_all(rt_domain_t *rt, release_at_t release);
1741+void __rerelease_all(rt_domain_t *rt, release_at_t release);
1742+
1743+static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
1744+{
1745+ unsigned long flags;
1746+ /* first we need the write lock for rt_ready_queue */
1747+ write_lock_irqsave(&rt->ready_lock, flags);
1748+ __add_ready(rt, new);
1749+ write_unlock_irqrestore(&rt->ready_lock, flags);
1750+}
1751+
1752+static inline struct task_struct* take_ready(rt_domain_t* rt)
1753+{
1754+ unsigned long flags;
1755+ struct task_struct* ret;
1756+ /* first we need the write lock for rt_ready_queue */
1757+ write_lock_irqsave(&rt->ready_lock, flags);
1758+ ret = __take_ready(rt);
1759+ write_unlock_irqrestore(&rt->ready_lock, flags);
1760+ return ret;
1761+}
1762+
1763+
1764+static inline void add_release(rt_domain_t* rt, struct task_struct *task)
1765+{
1766+ unsigned long flags;
1767+ /* first we need the write lock for rt_ready_queue */
1768+ spin_lock_irqsave(&rt->release_lock, flags);
1769+ __add_release(rt, task);
1770+ spin_unlock_irqrestore(&rt->release_lock, flags);
1771+}
1772+
1773+static inline int __jobs_pending(rt_domain_t* rt)
1774+{
1775+ return !list_empty(&rt->ready_queue);
1776+}
1777+
1778+static inline int jobs_pending(rt_domain_t* rt)
1779+{
1780+ unsigned long flags;
1781+ int ret;
1782+ /* first we need the write lock for rt_ready_queue */
1783+ read_lock_irqsave(&rt->ready_lock, flags);
1784+ ret = __jobs_pending(rt);
1785+ read_unlock_irqrestore(&rt->ready_lock, flags);
1786+ return ret;
1787+}
1788+
1789+
1790+#endif
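/*
 * Editorial sketch (not part of the patch): how a plugin wires up an
 * rt_domain_t with an EDF ready-queue order (edf_ready_order() is declared
 * in include/linux/edf_common.h earlier in the patch); the resched
 * callback below is made up.
 */
static rt_domain_t ex_domain;

static int ex_check_resched(rt_domain_t *rt)
{
	/* kick another CPU if the new ready-queue head should preempt
	 * what is running there; return nonzero if a reschedule was
	 * triggered */
	return 0;
}

static void ex_domain_setup(void)
{
	rt_domain_init(&ex_domain, ex_check_resched, edf_ready_order);
}

static void ex_release_and_pick(struct task_struct *t)
{
	add_ready(&ex_domain, t);	/* locking wrapper around __add_ready()  */
	t = take_ready(&ex_domain);	/* locking wrapper around __take_ready() */
}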
1791diff --git a/include/linux/rt_param.h b/include/linux/rt_param.h
1792new file mode 100644
1793index 0000000..4ebab77
1794--- /dev/null
1795+++ b/include/linux/rt_param.h
1796@@ -0,0 +1,277 @@
1797+/*
1798+ * Definition of the scheduler plugin interface.
1799+ *
1800+ */
1801+#ifndef _LINUX_RT_PARAM_H_
1802+#define _LINUX_RT_PARAM_H_
1803+
1804+#include <linux/wait.h>
1805+
1806+typedef unsigned long jiffie_t;
1807+
1808+/* different types of clients */
1809+typedef enum {
1810+ RT_CLASS_HARD,
1811+ RT_CLASS_SOFT,
1812+ RT_CLASS_BEST_EFFORT
1813+} task_class_t;
1814+
1815+typedef struct rt_param {
1816+ unsigned long exec_cost;
1817+ unsigned long period;
1818+ unsigned int cpu;
1819+ task_class_t class;
1820+} rt_param_t;
1821+
1822+/* fixed point wrapper to force compiler
1823+ * errors in case of misuse of a fixed point value
1824+ */
1825+typedef struct
1826+{
1827+ long val;
1828+} fp_t;
1829+
1830+typedef struct {
1831+ fp_t weight;
1832+ unsigned long period;
1833+ fp_t value;
1834+} service_level_t;
1835+
1836+typedef struct {
1837+ fp_t estimate;
1838+ fp_t accumulated;
1839+} predictor_state_t;
1840+
1841+typedef struct {
1842+	/* when will this task be released the next time? */
1843+ jiffie_t release;
1844+ /* time instant the last job was released */
1845+ jiffie_t last_release;
1846+ /* what is the current deadline? */
1847+ jiffie_t deadline;
1848+ /* b-bit tie breaker for PFAIR, it is ignored in EDF */
1849+ int b_bit;
1850+ /* group deadline tie breaker, it is ignored in EDF */
1851+ jiffie_t group_deadline;
1852+ /* how long has this task executed so far?
1853+ * In case of capacity sharing a job completion cannot be
1854+ * detected by checking time_slice == 0 as the job may have
1855+ * executed while using another capacity. Use this counter
1856+ * to keep track of the time spent on a CPU by a job.
1857+ *
1858+ * In other words: The number of consumed quanta since the
1859+ * last job release.
1860+ */
1861+ unsigned int exec_time;
1862+
1863+ /* Which job is this. This is used to let user space
1864+ * specify which job to wait for, which is important if jobs
1865+ * overrun. If we just call sys_sleep_next_period() then we
1866+ * will unintentionally miss jobs after an overrun.
1867+ *
1868+ * Increase this sequence number when a job is released.
1869+ */
1870+ unsigned int job_no;
1871+} rt_times_t;
1872+
1873+
1874+/* RT task parameters for scheduling extensions
1875+ * These parameters are inherited during clone and therefore must
1876+ * be explicitly set up before the task set is launched.
1877+ */
1878+typedef struct task_rt_param {
1879+ /* is the task sleeping? */
1880+ unsigned int flags:8;
1881+
1882+ /* Real-time marker: 1 iff it is a LITMUS real-time task.
1883+ */
1884+ unsigned int is_realtime:1;
1885+
1886+ /* is a BE->RT or RT->BE transition pending? */
1887+ unsigned int transition_pending:1;
1888+
1889+ /* is this task under control of litmus?
1890+ *
1891+ * this is necessary because otherwise signal delivery code
1892+ * may try to wake up a task that is already queued in plugin
1893+ * data structures.
1894+ */
1895+ unsigned int litmus_controlled:1;
1896+
1897+ /* Did this task register any SRP controlled resource accesses?
1898+ * This, of course, should only ever be true under partitioning.
1899+ * However, this limitation is not currently enforced.
1900+ */
1901+ unsigned int subject_to_srp:1;
1902+
1903+ /* if a BE->RT transition failed, then this field contains the error */
1904+ unsigned long transition_error;
1905+
1906+ /* user controlled parameters */
1907+ rt_param_t basic_params;
1908+
1909+ /* task representing the current "inherited" task
1910+ * priority, assigned by inherit_priority and
1911+	 * return_priority in the scheduler plugins.
1912+	 * Could point to self if PI does not result in
1913+ * an increased task priority.
1914+ */
1915+ struct task_struct* inh_task;
1916+
1917+ /* Don't just dereference this pointer in kernel space!
1918+ * It might very well point to junk or nothing at all.
1919+ * NULL indicates that the task has not requested any non-preemptable
1920+ * section support.
1921+ * Not inherited upon fork.
1922+ */
1923+ __user short* np_flag;
1924+
1925+ /* For the FMLP under PSN-EDF, it is required to make the task
1926+ * non-preemptive from kernel space. In order not to interfere with
1927+ * user space, this counter indicates the kernel space np setting.
1928+ * kernel_np > 0 => task is non-preemptive
1929+ */
1930+ unsigned int kernel_np;
1931+
1932+ /* timing parameters */
1933+ rt_times_t times;
1934+
1935+ /* This is currently only used by the PFAIR code
1936+ * and a prime candidate for cleanup.
1937+ */
1938+ rt_times_t backup;
1939+
1940+ /* This field can be used by plugins to store where the task
1941+ * is currently scheduled. It is the responsibility of the
1942+ * plugin to avoid race conditions.
1943+ *
1944+ * Used by GSN-EDF.
1945+ */
1946+ int scheduled_on;
1947+
1948+ /* This field can be used by plugins to store where the task
1949+ * is currently linked. It is the responsibility of the plugin
1950+ * to avoid race conditions.
1951+ *
1952+ * Used by GSN-EDF.
1953+ */
1954+ int linked_on;
1955+
1956+ /* Adaptive support. Adaptive tasks will store service levels
1957+ * in this (dynamically allocated) structure.
1958+ */
1959+ service_level_t* service_level;
1960+ unsigned int no_service_levels;
1961+ unsigned int cur_service_level;
1962+
1963+ /* Adaptive support. Store state for weight estimation.
1964+ */
1965+ predictor_state_t predictor_state;
1966+
1967+ /* Adaptive support. Optimizer fields.
1968+ */
1969+ struct list_head opt_list;
1970+ fp_t opt_order;
1971+ fp_t opt_dw;
1972+ fp_t opt_nw;
1973+ unsigned int opt_level;
1974+ jiffie_t opt_change;
1975+
1976+ /* Fields saved before BE->RT transition.
1977+ */
1978+ int old_policy;
1979+ int old_prio;
1980+} task_rt_param_t;
1981+
1982+/* Possible RT flags */
1983+#define RT_F_RUNNING 0x00000000
1984+#define RT_F_SLEEP 0x00000001
1985+#define RT_F_EXP_QUANTA 0x00000002
1986+#define RT_F_NON_PREEMTABLE 0x00000004
1987+#define RT_F_EXIT_SEM 0x00000008
1988+
1989+#define is_realtime(t) ((t)->rt_param.is_realtime)
1990+#define rt_transition_pending(t) \
1991+ ((t)->rt_param.transition_pending)
1992+
1993+/* Realtime utility macros */
1994+#define get_passed_quanta(t) ((t)->rt_param.times.exec_time)
1995+#define inc_passed_quanta(t) ((t)->rt_param.times.exec_time += 1)
1996+#define get_rt_flags(t) ((t)->rt_param.flags)
1997+#define set_rt_flags(t,f) (t)->rt_param.flags=(f)
1998+#define get_exec_cost(t) ((t)->rt_param.basic_params.exec_cost)
1999+#define get_rt_period(t) ((t)->rt_param.basic_params.period)
2000+#define set_rt_period(t,p) (t)->rt_param.basic_params.period=(p)
2001+#define set_exec_cost(t,e) (t)->rt_param.basic_params.exec_cost=(e)
2002+#define get_partition(t) (t)->rt_param.basic_params.cpu
2003+#define get_deadline(t) ((t)->rt_param.times.deadline)
2004+#define get_last_release(t) ((t)->rt_param.times.last_release)
2005+#define get_class(t) ((t)->rt_param.basic_params.class)
2006+
2007+#define has_active_job(t) \
2008+ (time_before(get_last_release(t), jiffies) \
2009+ && time_before_eq(jiffies, get_deadline(t)))
2010+
2011+#define get_est_weight(t) ((t)->rt_param.predictor_state.estimate)
2012+#define get_sl(t, l) \
2013+ ((t)->rt_param.service_level[l])
2014+#define get_cur_sl(t) ((t)->rt_param.cur_service_level)
2015+#define get_max_sl(t) ((t)->rt_param.no_service_levels - 1)
2016+#define get_opt_sl(t) ((t)->rt_param.opt_level)
2017+
2018+
2019+#define is_subject_to_srp(t) ((t)->rt_param.subject_to_srp)
2020+#define is_hrt(t) \
2021+ ((t)->rt_param.basic_params.class == RT_CLASS_HARD)
2022+#define is_srt(t) \
2023+ ((t)->rt_param.basic_params.class == RT_CLASS_SOFT)
2024+#define is_be(t) \
2025+ ((t)->rt_param.basic_params.class == RT_CLASS_BEST_EFFORT)
2026+
2027+#define clear_rt_params(t) \
2028+memset(&(t)->rt_param,0, sizeof(struct task_rt_param))
2029+
2030+#define get_release(t) ((t)->rt_param.times.release)
2031+#define set_release(t,r) ((t)->rt_param.times.release=(r))
2032+
2033+/* honor the flag that is set when scheduling is in progress
2034+ * This is some dirty hack in Linux that creates race conditions in our code
2035+ * if we don't pay attention to it.
2036+ */
2037+#define is_running(t) \
2038+ ((t)->state == TASK_RUNNING || \
2039+ (t)->thread_info->preempt_count & PREEMPT_ACTIVE)
2040+
2041+#define is_blocked(t) (!is_running(t))
2042+#define is_released(t) (time_before_eq((t)->rt_param.times.release, jiffies))
2043+#define is_tardy(t) (time_before_eq((t)->rt_param.times.deadline, jiffies))
2044+#define task_slack(t) ( (int) (t)->rt_param.times.deadline - (int) jiffies - \
2045+ (int) ((t)->rt_param.basic_params.exec_cost - \
2046+ (t)->rt_param.times.exec_time))
2047+
2048+
2049+/* real-time comparison macros */
2050+#define earlier_deadline(a, b) (time_before(\
2051+ (a)->rt_param.times.deadline,\
2052+ (b)->rt_param.times.deadline))
2053+#define earlier_release(a, b) (time_before(\
2054+ (a)->rt_param.times.release,\
2055+ (b)->rt_param.times.release))
2056+
2057+#define earlier_last_release(a, b) (time_before(\
2058+ (a)->rt_param.times.last_release,\
2059+ (b)->rt_param.times.last_release))
2060+
2061+
2062+#define make_np(t) do {t->rt_param.kernel_np++;} while(0);
2063+#define take_np(t) do {t->rt_param.kernel_np--;} while(0);
2064+
2065+#define backup_times(t) do { (t)->rt_param.backup=(t)->rt_param.times; \
2066+ } while(0);
2067+#define restore_times(t) do { (t)->rt_param.times=(t)->rt_param.backup; \
2068+ } while(0);
2069+
2070+
2071+#define rt_list2task(p) list_entry(p, struct task_struct, rt_list)
2072+
2073+#endif
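Editorial aside, not part of the patch: a minimal sketch of how the accessor macros above are meant to be combined in plugin code. It assumes the usual <linux/sched.h> context; edf_prefer() and job_in_trouble() are hypothetical helper names, not symbols from this patch.

/* Return the task that should run first under plain EDF rules.
 * a must be a real-time task; b may be NULL or a non-rt task.
 */
static inline struct task_struct* edf_prefer(struct task_struct* a,
					     struct task_struct* b)
{
	if (!b || !is_realtime(b) || earlier_deadline(a, b))
		return a;
	return b;
}

/* A released job needs attention if it is already tardy or has no slack left. */
static inline int job_in_trouble(struct task_struct* t)
{
	return is_released(t) && (is_tardy(t) || task_slack(t) <= 0);
}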
2074diff --git a/include/linux/sched.h b/include/linux/sched.h
2075index 4463735..f590e28 100644
2076--- a/include/linux/sched.h
2077+++ b/include/linux/sched.h
2078@@ -3,6 +3,8 @@
2079
2080 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
2081
2082+#include <linux/rt_param.h>
2083+
2084 /*
2085 * cloning flags:
2086 */
2087@@ -796,6 +798,9 @@ enum sleep_type {
2088 SLEEP_INTERRUPTED,
2089 };
2090
2091+struct od_table_entry;
2092+struct ics_cb;
2093+
2094 struct prio_array;
2095
2096 struct task_struct {
2097@@ -1051,6 +1056,15 @@ struct task_struct {
2098 #ifdef CONFIG_FAULT_INJECTION
2099 int make_it_fail;
2100 #endif
2101+ /* litmus parameters and state */
2102+ task_rt_param_t rt_param;
2103+
2104+ /* allow scheduler plugins to queue in release lists, etc. */
2105+ struct list_head rt_list;
2106+
2107+ /* references to PI semaphores, etc. */
2108+ struct od_table_entry* od_table;
2109+ struct ics_cb* ics_cb;
2110 };
2111
2112 static inline pid_t process_group(struct task_struct *tsk)
2113diff --git a/include/linux/sched_plugin.h b/include/linux/sched_plugin.h
2114new file mode 100644
2115index 0000000..e22722c
2116--- /dev/null
2117+++ b/include/linux/sched_plugin.h
2118@@ -0,0 +1,147 @@
2119+/*
2120+ * Definition of the scheduler plugin interface.
2121+ *
2122+ */
2123+#ifndef _LINUX_SCHED_PLUGIN_H_
2124+#define _LINUX_SCHED_PLUGIN_H_
2125+
2126+#include <linux/sched.h>
2127+
2128+/* struct for semaphore with priority inheritance */
2129+struct pi_semaphore {
2130+ atomic_t count;
2131+ int sleepers;
2132+ wait_queue_head_t wait;
2133+ union {
2134+ /* highest-prio holder/waiter */
2135+ struct task_struct *task;
2136+ struct task_struct* cpu_task[NR_CPUS];
2137+ } hp;
2138+ /* current lock holder */
2139+ struct task_struct *holder;
2140+};
2141+
2142+
2143+/* Enforce runqueues to be opaque objects.
2144+ *
2145+ * This allows us to pass around pointers to runqueues,
2146+ * without actually having to rip it out of sched.c. It
2147+ * also discourages plugins from trying to be
2148+ * overly clever.
2149+ */
2150+typedef void runqueue_t;
2151+
2152+
2153+/********************* scheduler invocation ******************/
2154+
2155+typedef enum {
2156+ NO_RESCHED = 0,
2157+ FORCE_RESCHED = 1
2158+} reschedule_check_t;
2159+
2160+
2161+/* Plugin-specific realtime tick handler */
2162+typedef reschedule_check_t (*scheduler_tick_t) (void);
2163+/* Novel make-scheduling-decision function */
2164+typedef int (*schedule_t) (struct task_struct * prev,
2165+ struct task_struct ** next,
2166+ runqueue_t * rq);
2167+/* Clean up after the task switch has occurred.
2168+ * This function is called after every (even non-rt) task switch.
2169+ */
2170+typedef void (*finish_switch_t)(struct task_struct *prev);
2171+
2172+
2173+/********************* task state changes ********************/
2174+
2175+/* called to setup a new real-time task */
2176+typedef long (*prepare_task_t) (struct task_struct *task);
2177+/* called to re-introduce a task after blocking */
2178+typedef void (*wake_up_task_t) (struct task_struct *task);
2179+/* called to notify the plugin of a blocking real-time task
2180+ * it will only be called for real-time tasks and before schedule is called */
2181+typedef void (*task_blocks_t) (struct task_struct *task);
2182+/* called when a real-time task exits. Free any allocated resources */
2183+typedef long (*tear_down_t) (struct task_struct *);
2184+
2185+/* Called when new_owner is released from the wait queue.
2186+ * It should inherit the priority from sem _before_ it is re-added
2187+ * to any queue.
2188+ */
2189+typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
2190+ struct task_struct *new_owner);
2191+
2192+/* Called when the current task releases a semaphore from which it
2193+ * might have inherited a priority
2194+ */
2195+typedef long (*return_priority_t) (struct pi_semaphore *sem);
2196+
2197+/* Called when a task tries to acquire a semaphore and fails. Check if its
2198+ * priority is higher than that of the current holder.
2199+ */
2200+typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
2201+
2202+
2203+/********************* sys call backends ********************/
2204+/* This function causes the caller to sleep until the next release */
2205+typedef long (*sleep_next_period_t) (void);
2206+
2207+typedef int (*scheduler_setup_t) (int cmd, void __user *parameter);
2208+
2209+typedef int (*mode_change_t) (int);
2210+
2211+struct sched_plugin {
2212+ /* basic info */
2213+ char *plugin_name;
2214+ int ready_to_use;
2215+
2216+ /* management interface */
2217+ mode_change_t mode_change;
2218+
2219+ /* scheduler invocation */
2220+ scheduler_tick_t scheduler_tick;
2221+ schedule_t schedule;
2222+ finish_switch_t finish_switch;
2223+
2224+ /* syscall backend */
2225+ sleep_next_period_t sleep_next_period;
2226+ scheduler_setup_t scheduler_setup;
2227+
2228+ /* task state changes */
2229+ prepare_task_t prepare_task;
2230+ wake_up_task_t wake_up_task;
2231+ task_blocks_t task_blocks;
2232+ tear_down_t tear_down;
2233+
2234+ /* priority inheritance */
2235+ inherit_priority_t inherit_priority;
2236+ return_priority_t return_priority;
2237+ pi_block_t pi_block;
2238+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
2239+
2240+typedef struct sched_plugin sched_plugin_t;
2241+
2242+extern sched_plugin_t *curr_sched_plugin;
2243+
2244+
2245+/* common scheduler tick */
2246+reschedule_check_t rt_scheduler_tick(void);
2247+
2248+
2249+/* Don't pull in our definitions on top of the real ones
2250+ * in sched.c!
2251+ */
2252+#ifndef __SCHED_C__
2253+
2254+/* External linux scheduler facilities */
2255+void deactivate_task(struct task_struct *, runqueue_t *);
2256+/* This function is defined in sched.c. We need access to it for
2257+ * indirect switching.
2258+ */
2259+void __activate_task(struct task_struct *, runqueue_t *);
2260+void __setscheduler(struct task_struct *, int, int);
2261+
2262+#endif
2263+
2264+extern int get_sched_options(void);
2265+#endif
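For orientation only (this skeleton is not part of the patch; the real plugins live in kernel/sched_*.c): a minimal plugin instance wired into the interface above could look as follows. The zero return values are placeholders, since the exact return conventions are plugin-specific and not documented in this header.

static reschedule_check_t demo_scheduler_tick(void)
{
	return NO_RESCHED;	/* never request a forced reschedule */
}

static int demo_schedule(struct task_struct *prev,
			 struct task_struct **next,
			 runqueue_t *rq)
{
	*next = NULL;		/* never pick a real-time task */
	return 0;
}

static long demo_prepare_task(struct task_struct *t)
{
	return 0;
}

static long demo_tear_down(struct task_struct *t)
{
	return 0;
}

sched_plugin_t demo_plugin = {
	.plugin_name	= "DEMO",
	.ready_to_use	= 1,
	.scheduler_tick	= demo_scheduler_tick,
	.schedule	= demo_schedule,
	.prepare_task	= demo_prepare_task,
	.tear_down	= demo_tear_down,
	/* callbacks that are not supplied remain NULL */
};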
2266diff --git a/include/linux/sched_trace.h b/include/linux/sched_trace.h
2267new file mode 100644
2268index 0000000..01f21c6
2269--- /dev/null
2270+++ b/include/linux/sched_trace.h
2271@@ -0,0 +1,182 @@
2272+/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
2273+ */
2274+#ifndef _LINUX_SCHED_TRACE_H_
2275+#define _LINUX_SCHED_TRACE_H_
2276+
2277+#include <linux/sched.h>
2278+
2279+typedef enum {
2280+ ST_INVOCATION = 0,
2281+ ST_ARRIVAL = 1,
2282+ ST_DEPARTURE = 2,
2283+ ST_PREEMPTION = 3,
2284+ ST_SCHEDULED = 4,
2285+ ST_JOB_RELEASE = 5,
2286+ ST_JOB_COMPLETION = 6,
2287+ ST_CAPACITY_RELEASE = 7,
2288+ ST_CAPACITY_ALLOCATION = 8,
2289+ ST_SERVICE_LEVEL_CHANGE = 9,
2290+ ST_WEIGHT_ERROR = 10,
2291+} trace_type_t;
2292+
2293+typedef struct {
2294+ trace_type_t trace:8;
2295+ unsigned int size:24;
2296+ unsigned long long timestamp;
2297+} trace_header_t;
2298+
2299+
2300+typedef struct {
2301+ unsigned int is_rt:1;
2302+ unsigned int is_server:1;
2303+ task_class_t class:4;
2304+ unsigned int budget:24;
2305+ u32 deadline;
2306+
2307+ pid_t pid;
2308+} task_info_t;
2309+
2310+typedef struct {
2311+ trace_header_t header;
2312+ unsigned long flags;
2313+} invocation_record_t;
2314+
2315+typedef struct {
2316+ trace_header_t header;
2317+ task_info_t task;
2318+} arrival_record_t;
2319+
2320+typedef struct {
2321+ trace_header_t header;
2322+ task_info_t task;
2323+} departure_record_t;
2324+
2325+typedef struct {
2326+ trace_header_t header;
2327+ task_info_t task;
2328+ task_info_t by;
2329+} preemption_record_t;
2330+
2331+typedef struct {
2332+ trace_header_t header;
2333+ task_info_t task;
2334+} scheduled_record_t;
2335+
2336+typedef struct {
2337+ trace_header_t header;
2338+ task_info_t task;
2339+ u16 period;
2340+ u16 wcet;
2341+} release_record_t;
2342+
2343+typedef struct {
2344+ trace_header_t header;
2345+ task_info_t task;
2346+ u16 period;
2347+ u16 wcet;
2348+ int tardiness;
2349+ unsigned int job_no;
2350+} completion_record_t;
2351+
2352+typedef struct {
2353+ trace_header_t header;
2354+ task_info_t task;
2355+} cap_release_record_t;
2356+
2357+typedef struct {
2358+ trace_header_t header;
2359+ task_info_t task;
2360+ u16 budget;
2361+ u32 deadline;
2362+ pid_t donor;
2363+} cap_allocation_record_t;
2364+
2365+typedef struct {
2366+ trace_header_t header;
2367+ task_info_t task;
2368+ unsigned int from:16;
2369+ unsigned int to:16;
2370+ service_level_t new_level;
2371+ service_level_t old_level;
2372+} service_level_change_record_t;
2373+
2374+typedef struct {
2375+ trace_header_t header;
2376+ pid_t task;
2377+ fp_t estimate;
2378+ fp_t actual;
2379+} weight_error_record_t;
2380+
2381+#ifdef CONFIG_SCHED_TASK_TRACE
2382+void sched_trace_scheduler_invocation(void);
2383+
2384+void sched_trace_task_arrival(struct task_struct *t);
2385+void sched_trace_task_departure(struct task_struct *t);
2386+void sched_trace_task_preemption(struct task_struct *t,
2387+ struct task_struct* by);
2388+void sched_trace_task_scheduled(struct task_struct *);
2389+
2390+void sched_trace_job_release(struct task_struct *t);
2391+void sched_trace_job_completion(struct task_struct *t);
2392+
2393+void sched_trace_capacity_release(struct task_struct *t);
2394+void sched_trace_capacity_allocation(struct task_struct *t,
2395+ u16 budget, u32 deadline, pid_t donor);
2396+
2397+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
2398+ u16 srv_budget,
2399+ u16 budget, u32 deadline, pid_t donor);
2400+
2401+void sched_trace_server_release(int id, unsigned int wcet,
2402+ unsigned int period,
2403+ task_class_t class);
2404+
2405+void sched_trace_server_completion(int id, unsigned int budget,
2406+ jiffie_t deadline,
2407+ task_class_t class);
2408+
2409+void sched_trace_server_scheduled(int id, task_class_t class,
2410+ unsigned int budget, jiffie_t deadline);
2411+
2412+void sched_trace_service_level_change(struct task_struct* t,
2413+ unsigned int from,
2414+ unsigned int to);
2415+
2416+void sched_trace_weight_error(struct task_struct* t, fp_t actual);
2417+
2418+#else
2419+#define sched_trace_scheduler_invocation(x)
2420+
2421+#define sched_trace_task_arrival(t)
2422+#define sched_trace_task_departure(t)
2423+#define sched_trace_task_preemption(t, by)
2424+#define sched_trace_task_scheduled(t)
2425+#define sched_trace_job_release(t)
2426+#define sched_trace_job_completion(t)
2427+#define sched_trace_capacity_release(t)
2428+#define sched_trace_capacity_allocation(t, budget, deadline, donor)
2429+#define sched_trace_capacity_alloc_srv(srv, srv_dl, cls, srv_budget,\
2430+ budget, deadline, donor)
2431+#define sched_trace_server_release(id, wcet, period, class)
2432+#define sched_trace_server_completion(id, budget, deadline, class)
2433+#define sched_trace_server_scheduled(id, class, budget, deadline)
2434+
2435+#define sched_trace_service_level_change(t, a, b)
2436+
2437+#define sched_trace_weight_error(x, y)
2438+
2439+
2440+#endif
2441+
2442+
2443+#ifdef CONFIG_SCHED_DEBUG_TRACE
2444+void sched_trace_log_message(const char* fmt, ...);
2445+
2446+#else
2447+
2448+#define sched_trace_log_message(fmt, ...)
2449+
2450+#endif
2451+
2452+
2453+#endif
2454diff --git a/include/linux/trace.h b/include/linux/trace.h
2455new file mode 100644
2456index 0000000..9e457aa
2457--- /dev/null
2458+++ b/include/linux/trace.h
2459@@ -0,0 +1,74 @@
2460+
2461+#ifndef _SYS_TRACE_H_
2462+#define _SYS_TRACE_H_
2463+
2464+#include <linux/feather_trace.h>
2465+#include <linux/feather_buffer.h>
2466+
2467+
2468+/*********************** TIMESTAMPS ************************/
2469+
2470+struct timestamp {
2471+ unsigned long event;
2472+ unsigned long long timestamp;
2473+ unsigned int seq_no;
2474+ int cpu;
2475+};
2476+
2477+
2478+/* buffer holding time stamps - will be provided by driver */
2479+extern struct ft_buffer* trace_ts_buf;
2480+
2481+/* save_timestamp: stores current time as struct timestamp
2482+ * in trace_ts_buf
2483+ */
2484+asmlinkage void save_timestamp(unsigned long event);
2485+
2486+#define TIMESTAMP(id) ft_event0(id, save_timestamp)
2487+
2488+/* Convention for timestamps
2489+ * =========================
2490+ *
2491+ * In order to process the trace files with a common tool, we use the following
2492+ * convention to measure execution times: The end time id of a code segment is
2493+ * always the next number after the start time event id.
2494+ */
2495+
2496+#define TS_SCHED_START TIMESTAMP(100)
2497+#define TS_SCHED_END TIMESTAMP(101)
2498+#define TS_CXS_START TIMESTAMP(102)
2499+#define TS_CXS_END TIMESTAMP(103)
2500+
2501+#define TS_TICK_START TIMESTAMP(110)
2502+#define TS_TICK_END TIMESTAMP(111)
2503+
2504+#define TS_PLUGIN_SCHED_START TIMESTAMP(120)
2505+#define TS_PLUGIN_SCHED_END TIMESTAMP(121)
2506+
2507+#define TS_PLUGIN_TICK_START TIMESTAMP(130)
2508+#define TS_PLUGIN_TICK_END TIMESTAMP(131)
2509+
2510+#define TS_ENTER_NP_START TIMESTAMP(140)
2511+#define TS_ENTER_NP_END TIMESTAMP(141)
2512+
2513+#define TS_EXIT_NP_START TIMESTAMP(150)
2514+#define TS_EXIT_NP_END TIMESTAMP(151)
2515+
2516+#define TS_SRP_UP_START TIMESTAMP(160)
2517+#define TS_SRP_UP_END TIMESTAMP(161)
2518+#define TS_SRP_DOWN_START TIMESTAMP(162)
2519+#define TS_SRP_DOWN_END TIMESTAMP(163)
2520+
2521+#define TS_PI_UP_START TIMESTAMP(170)
2522+#define TS_PI_UP_END TIMESTAMP(171)
2523+#define TS_PI_DOWN_START TIMESTAMP(172)
2524+#define TS_PI_DOWN_END TIMESTAMP(173)
2525+
2526+#define TS_FIFO_UP_START TIMESTAMP(180)
2527+#define TS_FIFO_UP_END TIMESTAMP(181)
2528+#define TS_FIFO_DOWN_START TIMESTAMP(182)
2529+#define TS_FIFO_DOWN_END TIMESTAMP(183)
2530+
2531+
2532+
2533+#endif /* !_SYS_TRACE_H_ */
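To illustrate the convention above (editorial sketch; the event IDs 190/191 are invented here and do not occur in the patch), a new measurement pair is simply two consecutive IDs wrapped around the code segment of interest:

#define TS_DEMO_START	TIMESTAMP(190)
#define TS_DEMO_END	TIMESTAMP(191)	/* end id = start id + 1 */

static void demo_measured_section(void)
{
	TS_DEMO_START;
	/* ... the code segment whose execution cost is being measured ... */
	TS_DEMO_END;
}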
2534diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
2535index 975c963..6ae0ff9 100644
2536--- a/include/linux/uaccess.h
2537+++ b/include/linux/uaccess.h
2538@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to,
2539 ret; \
2540 })
2541
2542+/* This is a naive attempt at a write version of the above native Linux macro.
2543+ */
2544+#define poke_kernel_address(val, addr) \
2545+ ({ \
2546+ long ret; \
2547+ mm_segment_t old_fs = get_fs(); \
2548+ \
2549+ set_fs(KERNEL_DS); \
2550+ pagefault_disable(); \
2551+ ret = __put_user(val, (__force typeof(val) __user *)(addr)); \
2552+ pagefault_enable(); \
2553+ set_fs(old_fs); \
2554+ ret; \
2555+ })
2556+
2557+
2558 #endif /* __LINUX_UACCESS_H__ */
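A sketch of the intended use of poke_kernel_address(), mirroring the probe performed by sys_register_np_flag() later in this patch (the helper name is made up; RT_PREEMPTIVE is defined elsewhere in the patch):

/* Check whether a user-supplied np-flag address is writable without
 * risking an unhandled page fault.
 */
static int np_flag_writable(short __user *flag)
{
	short test_val = RT_PREEMPTIVE;

	return poke_kernel_address(test_val, flag) == 0;
}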
2559diff --git a/include/linux/wait.h b/include/linux/wait.h
2560index e820d00..c7e96b6 100644
2561--- a/include/linux/wait.h
2562+++ b/include/linux/wait.h
2563@@ -161,6 +161,8 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int));
2564 #define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)
2565 #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
2566
2567+#define pi_wake_up(x) __pi_wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
2568+
2569 #define __wait_event(wq, condition) \
2570 do { \
2571 DEFINE_WAIT(__wait); \
2572diff --git a/kernel/Makefile b/kernel/Makefile
2573index 14f4d45..c3d8b0d 100644
2574--- a/kernel/Makefile
2575+++ b/kernel/Makefile
2576@@ -8,7 +8,13 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
2577 signal.o sys.o kmod.o workqueue.o pid.o \
2578 rcupdate.o extable.o params.o posix-timers.o \
2579 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
2580- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
2581+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
2582+ sched_plugin.o litmus.o sched_trace.o \
2583+ edf_common.o fifo_common.o pfair_common.o\
2584+ sched_global_edf.o sched_part_edf.o sched_edf_hsb.o sched_pfair.o \
2585+ sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \
2586+ trace.o ft_event.o rt_domain.o sched_adaptive.o \
2587+ ics.o
2588
2589 obj-$(CONFIG_STACKTRACE) += stacktrace.o
2590 obj-y += time/
2591diff --git a/kernel/edf_common.c b/kernel/edf_common.c
2592new file mode 100644
2593index 0000000..4746c66
2594--- /dev/null
2595+++ b/kernel/edf_common.c
2596@@ -0,0 +1,135 @@
2597+/*
2598+ * kernel/edf_common.c
2599+ *
2600+ * Common functions for EDF based scheduler.
2601+ */
2602+
2603+#include <linux/percpu.h>
2604+#include <linux/sched.h>
2605+#include <linux/list.h>
2606+
2607+#include <linux/litmus.h>
2608+#include <linux/sched_plugin.h>
2609+#include <linux/sched_trace.h>
2610+
2611+
2612+#include <linux/edf_common.h>
2613+
2614+/* edf_higher_prio - returns true if first has a higher EDF priority
2615+ * than second. Deadline ties are broken by PID.
2616+ *
2617+ * first must not be NULL and must be a real-time task.
2618+ * second may be NULL or a non-rt task.
2619+ */
2620+int edf_higher_prio(struct task_struct* first,
2621+ struct task_struct* second)
2622+{
2623+ struct task_struct *first_task = first;
2624+ struct task_struct *second_task = second;
2625+
2626+ /* Check for inherited priorities. Change task
2627+ * used for comparison in such a case.
2628+ */
2629+ if (first && first->rt_param.inh_task)
2630+ first_task = first->rt_param.inh_task;
2631+ if (second && second->rt_param.inh_task)
2632+ second_task = second->rt_param.inh_task;
2633+
2634+ return
2635+ /* does the second task exist and is it a real-time task? If
2636+ * not, the first task (which is a RT task) has higher
2637+ * priority.
2638+ */
2639+ !second_task || !is_realtime(second_task) ||
2640+
2641+ /* is the deadline of the first task earlier?
2642+ * Then it has higher priority.
2643+ */
2644+ earlier_deadline(first_task, second_task) ||
2645+
2646+ /* Do we have a deadline tie?
2647+ * Then break by PID.
2648+ */
2649+ (get_deadline(first_task) == get_deadline(second_task) &&
2650+ (first_task->pid < second_task->pid ||
2651+
2652+ /* If the PIDs are the same then the task with the inherited
2653+ * priority wins.
2654+ */
2655+ (first_task->pid == second_task->pid &&
2656+ !second->rt_param.inh_task)));
2657+}
2658+
2659+int edf_ready_order(struct list_head* a, struct list_head* b)
2660+{
2661+ return edf_higher_prio(
2662+ list_entry(a, struct task_struct, rt_list),
2663+ list_entry(b, struct task_struct, rt_list));
2664+}
2665+
2666+void edf_release_at(struct task_struct *t, jiffie_t start)
2667+{
2668+ t->rt_param.times.deadline = start;
2669+ edf_prepare_for_next_period(t);
2670+ t->rt_param.times.last_release = start;
2671+ set_rt_flags(t, RT_F_RUNNING);
2672+}
2673+
2674+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
2675+{
2676+ rt_domain_init(rt, resched, edf_ready_order);
2677+}
2678+
2679+void edf_prepare_for_next_period(struct task_struct *t)
2680+{
2681+ BUG_ON(!t);
2682+ /* prepare next release */
2683+ t->rt_param.times.release = t->rt_param.times.deadline;
2684+ t->rt_param.times.deadline += get_rt_period(t);
2685+ t->rt_param.times.exec_time = 0;
2686+ /* update job sequence number */
2687+ t->rt_param.times.job_no++;
2688+
2689+ t->time_slice = get_exec_cost(t);
2690+
2691+ /* who uses this? statistics? */
2692+ t->first_time_slice = 0;
2693+}
2694+
2695+/* edf_preemption_needed - check whether the task t needs to be preempted
2696+ * call only with irqs disabled and with ready_lock acquired
2697+ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
2698+ */
2699+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
2700+{
2701+ /* we need the read lock for edf_ready_queue */
2702+ /* no need to preempt if there is nothing pending */
2703+ if (!ready_jobs_pending(rt))
2704+ return 0;
2705+ /* we need to reschedule if t doesn't exist */
2706+ if (!t)
2707+ return 1;
2708+
2709+ /* NOTE: We cannot check for non-preemptibility since we
2710+ * don't know what address space we're currently in.
2711+ */
2712+
2713+ /* make sure to get non-rt stuff out of the way */
2714+ return !is_realtime(t) || edf_higher_prio(next_ready(rt), t);
2715+}
2716+
2717+
2718+/*
2719+ * Deactivate current task until the beginning of the next period.
2720+ */
2721+long edf_sleep_next_period(void)
2722+{
2723+	/* Mark that we do not execute anymore */
2724+ set_rt_flags(current, RT_F_SLEEP);
2725+	/* call schedule; this will return when a new job arrives.
2726+	 * It also takes care of preparing for the next release.
2727+ */
2728+ schedule();
2729+ return 0;
2730+}
2731+
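A sketch of how a plugin is expected to drive these helpers from its tick handler (illustrative only; demo_domain and demo_scheduled are placeholders, and the locking requirement stated above edf_preemption_needed() is assumed to be satisfied by the caller):

static rt_domain_t demo_domain;
static struct task_struct* demo_scheduled;	/* task currently on this CPU */

static reschedule_check_t demo_edf_tick(void)
{
	/* assumes irqs are off and the domain's ready lock is held */
	if (edf_preemption_needed(&demo_domain, demo_scheduled))
		return FORCE_RESCHED;
	return NO_RESCHED;
}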
2732diff --git a/kernel/exit.c b/kernel/exit.c
2733index fec12eb..8a0eb79 100644
2734--- a/kernel/exit.c
2735+++ b/kernel/exit.c
2736@@ -50,6 +50,8 @@
2737
2738 extern void sem_exit (void);
2739
2740+extern void exit_od_table(struct task_struct* t);
2741+
2742 static void exit_mm(struct task_struct * tsk);
2743
2744 static void __unhash_process(struct task_struct *p)
2745@@ -916,6 +918,8 @@ fastcall NORET_TYPE void do_exit(long code)
2746 if (unlikely(tsk->audit_context))
2747 audit_free(tsk);
2748
2749+ exit_od_table(tsk);
2750+
2751 taskstats_exit(tsk, group_dead);
2752
2753 exit_mm(tsk);
2754diff --git a/kernel/fifo_common.c b/kernel/fifo_common.c
2755new file mode 100644
2756index 0000000..c1641a1
2757--- /dev/null
2758+++ b/kernel/fifo_common.c
2759@@ -0,0 +1,86 @@
2760+/*
2761+ * kernel/fifo_common.c
2762+ *
2763+ * Fifo helper functions. Could one day be a FIFO plugin if someone
2764+ * is interested.
2765+ *
2766+ * The current FIFO implementation automatically chops Linux tasks into
2767+ * smaller jobs by assigning a fixed time slice. Once that time slice expires,
2768+ * it is treated as a new job release (that is queued in the back).
2769+ *
2770+ * The result is that it provides FIFO properties on a job level and round-robin
2771+ * on a task level if the tasks execute continuously.
2772+ */
2773+
2774+#include <asm/uaccess.h>
2775+#include <linux/percpu.h>
2776+#include <linux/sched.h>
2777+#include <linux/list.h>
2778+
2779+#include <linux/litmus.h>
2780+#include <linux/sched_plugin.h>
2781+#include <linux/sched_trace.h>
2782+#include <linux/fifo_common.h>
2783+
2784+/* This function is defined in sched.c. We need access to it for
2785+ * indirect switching.
2786+ */
2787+void __activate_task(struct task_struct *p, runqueue_t *rq);
2788+
2789+/* fifo_higher_prio - returns true if first has a higher FIFO priority
2790+ * than second. Release time ties are broken by PID.
2791+ *
2792+ * first must not be NULL and must be a real-time task.
2793+ * second may be NULL or a non-rt task.
2794+ */
2795+int fifo_higher_prio(struct task_struct* first,
2796+ struct task_struct* second)
2797+{
2798+ struct task_struct *first_task = first;
2799+ struct task_struct *second_task = second;
2800+
2801+ /* Check for inherited priorities. Change task
2802+ * used for comparison in such a case.
2803+ */
2804+ if (first && first->rt_param.inh_task)
2805+ first_task = first->rt_param.inh_task;
2806+ if (second && second->rt_param.inh_task)
2807+ second_task = second->rt_param.inh_task;
2808+
2809+ return
2810+ /* does the second task exist and is it a real-time task? If
2811+ * not, the first task (which is a RT task) has higher
2812+ * priority.
2813+ */
2814+ !second_task || !is_realtime(second_task) ||
2815+
2816+ /* is the release of the first task earlier?
2817+ * Then it has higher priority.
2818+ */
2819+ earlier_last_release(first_task, second_task) ||
2820+
2821+ /* Do we have a release time tie?
2822+ * Then break by PID.
2823+ */
2824+ (get_last_release(first_task) ==
2825+ get_last_release(second_task) &&
2826+ (first_task->pid < second_task->pid ||
2827+
2828+ /* If the PIDs are the same then the task with the inherited
2829+ * priority wins.
2830+ */
2831+ (first_task->pid == second_task->pid &&
2832+ !second->rt_param.inh_task)));
2833+}
2834+
2835+int fifo_ready_order(struct list_head* a, struct list_head* b)
2836+{
2837+ return fifo_higher_prio(
2838+ list_entry(a, struct task_struct, rt_list),
2839+ list_entry(b, struct task_struct, rt_list));
2840+}
2841+
2842+void fifo_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
2843+{
2844+ rt_domain_init(rt, resched, fifo_ready_order);
2845+}
2846diff --git a/kernel/fork.c b/kernel/fork.c
2847index d57118d..be824d4 100644
2848--- a/kernel/fork.c
2849+++ b/kernel/fork.c
2850@@ -57,6 +57,9 @@
2851 #include <asm/cacheflush.h>
2852 #include <asm/tlbflush.h>
2853
2854+#include <linux/litmus.h>
2855+#include <linux/sched_plugin.h>
2856+
2857 /*
2858 * Protected counters by write_lock_irq(&tasklist_lock)
2859 */
2860@@ -118,6 +121,8 @@ void __put_task_struct(struct task_struct *tsk)
2861 WARN_ON(atomic_read(&tsk->usage));
2862 WARN_ON(tsk == current);
2863
2864+ exit_litmus(tsk);
2865+
2866 security_task_free(tsk);
2867 free_uid(tsk->user);
2868 put_group_info(tsk->group_info);
2869diff --git a/kernel/ft_event.c b/kernel/ft_event.c
2870new file mode 100644
2871index 0000000..10318ee
2872--- /dev/null
2873+++ b/kernel/ft_event.c
2874@@ -0,0 +1,104 @@
2875+#include <linux/types.h>
2876+
2877+#include <linux/feather_trace.h>
2878+
2879+/* the feather trace management functions assume
2880+ * exclusive access to the event table
2881+ */
2882+
2883+
2884+#define BYTE_JUMP 0xeb
2885+#define BYTE_JUMP_LEN 0x02
2886+
2887+/* for each event, there is an entry in the event table */
2888+struct trace_event {
2889+ long id;
2890+ long count;
2891+ long start_addr;
2892+ long end_addr;
2893+};
2894+
2895+extern struct trace_event __start___event_table[];
2896+extern struct trace_event __stop___event_table[];
2897+
2898+int ft_enable_event(unsigned long id)
2899+{
2900+ struct trace_event* te = __start___event_table;
2901+ int count = 0;
2902+ char* delta;
2903+ unsigned char* instr;
2904+
2905+ while (te < __stop___event_table) {
2906+ if (te->id == id && ++te->count == 1) {
2907+ instr = (unsigned char*) te->start_addr;
2908+			/* make sure we don't clobber something we shouldn't */
2909+ if (*instr == BYTE_JUMP) {
2910+ delta = (((unsigned char*) te->start_addr) + 1);
2911+ *delta = 0;
2912+ }
2913+ }
2914+ if (te->id == id)
2915+ count++;
2916+ te++;
2917+ }
2918+ return count;
2919+}
2920+
2921+int ft_disable_event(unsigned long id)
2922+{
2923+ struct trace_event* te = __start___event_table;
2924+ int count = 0;
2925+ char* delta;
2926+ unsigned char* instr;
2927+
2928+ while (te < __stop___event_table) {
2929+ if (te->id == id && --te->count == 0) {
2930+ instr = (unsigned char*) te->start_addr;
2931+ if (*instr == BYTE_JUMP) {
2932+ delta = (((unsigned char*) te->start_addr) + 1);
2933+ *delta = te->end_addr - te->start_addr -
2934+ BYTE_JUMP_LEN;
2935+ }
2936+ }
2937+ if (te->id == id)
2938+ count++;
2939+ te++;
2940+ }
2941+ return count;
2942+}
2943+
2944+int ft_disable_all_events(void)
2945+{
2946+ struct trace_event* te = __start___event_table;
2947+ int count = 0;
2948+ char* delta;
2949+ unsigned char* instr;
2950+
2951+ while (te < __stop___event_table) {
2952+ if (te->count) {
2953+ instr = (unsigned char*) te->start_addr;
2954+ if (*instr == BYTE_JUMP) {
2955+ delta = (((unsigned char*) te->start_addr)
2956+ + 1);
2957+ *delta = te->end_addr - te->start_addr -
2958+ BYTE_JUMP_LEN;
2959+ te->count = 0;
2960+ count++;
2961+ }
2962+ }
2963+ te++;
2964+ }
2965+ return count;
2966+}
2967+
2968+int ft_is_event_enabled(unsigned long id)
2969+{
2970+ struct trace_event* te = __start___event_table;
2971+
2972+ while (te < __stop___event_table) {
2973+ if (te->id == id)
2974+ return te->count;
2975+ te++;
2976+ }
2977+ return 0;
2978+}
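The counters implement nested enable/disable semantics: an event is patched in on its first ft_enable_event() and patched back out once the count drops to zero (ft_disable_all_events() force-resets everything). A usage sketch, using the scheduler timestamp IDs from include/linux/trace.h:

static void demo_trace_scheduling_cost(void)
{
	ft_enable_event(100);	/* TS_SCHED_START */
	ft_enable_event(101);	/* TS_SCHED_END   */

	/* ... run the workload; timestamps accumulate in trace_ts_buf
	 * and are drained by the trace device driver ...
	 */

	ft_disable_event(101);
	ft_disable_event(100);
}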
2979diff --git a/kernel/ics.c b/kernel/ics.c
2980new file mode 100644
2981index 0000000..a016033
2982--- /dev/null
2983+++ b/kernel/ics.c
2984@@ -0,0 +1,229 @@
2985+/* ics.c - interruptible critical sections
2986+ *
2987+ * (c) 2007 Bjoern Brandenburg, LITMUS^RT project
2988+ *
2989+ * This file contains the platform-independent parts to support ICSs on top of
2990+ * the FDSO layer.
2991+ */
2992+
2993+#include <linux/slab.h>
2994+#include <linux/sched.h>
2995+#include <linux/smp.h>
2996+#include <linux/errno.h>
2997+#include <asm/uaccess.h>
2998+
2999+
3000+#include <linux/fdso.h>
3001+#include <linux/ics.h>
3002+
3003+#define ics_stack_empty(top) (top == -1)
3004+
3005+struct ics {
3006+ struct mutex writer_mutex;
3007+};
3008+
3009+static void* create_ics(void)
3010+{
3011+ struct ics* ics;
3012+
3013+ ics = kmalloc(sizeof(struct ics), GFP_KERNEL);
3014+ if (!ics)
3015+ return NULL;
3016+ mutex_init(&ics->writer_mutex);
3017+ ICS_DBG("allocated ics/%p\n", ics);
3018+ return ics;
3019+}
3020+
3021+static void destroy_ics(void* ics)
3022+{
3023+ ICS_DBG("freeing ics/%p\n", ics);
3024+ kfree(ics);
3025+}
3026+
3027+static int open_ics(struct od_table_entry* entry, void* __user mapping)
3028+{
3029+ if (!access_ok(VERIFY_WRITE, mapping, sizeof(int *)))
3030+ return -EFAULT;
3031+
3032+ entry->extra = (void*) mapping;
3033+ return 0;
3034+}
3035+
3036+static int close_ics(struct od_table_entry* entry)
3037+{
3038+ return 0;
3039+}
3040+
3041+struct fdso_ops ics_ops = {
3042+ .create = create_ics,
3043+ .destroy = destroy_ics,
3044+ .open = open_ics,
3045+ .close = close_ics
3046+};
3047+
3048+
3049+static int get_ics_stack_top(void)
3050+{
3051+ int err = 0;
3052+ int top = -1;
3053+ struct task_struct* t = current;
3054+
3055+ if (t->ics_cb) {
3056+ err = get_user(top, &t->ics_cb->top);
3057+ ICS_DBG("%d stack_top() -> err=%d top=%d\n", t->pid, err, top);
3058+ }
3059+ if (err != 0 || top < -1 || top >= MAX_ICS_NESTING)
3060+ return -1;
3061+ else
3062+ return top;
3063+}
3064+
3065+void* get_rollback_addr(void)
3066+{
3067+ int err = 0;
3068+ int top;
3069+ void* addr = NULL;
3070+ struct task_struct* t = current;
3071+
3072+ /* we implicitly roll back to the top address */
3073+
3074+ top = get_ics_stack_top();
3075+ if (!ics_stack_empty(top))
3076+ err = get_user(addr, &t->ics_cb->ics_stack[top].pc);
3077+
3078+ if (err != 0)
3079+ addr = NULL;
3080+ return addr;
3081+}
3082+
3083+static int get_ics_stack(int idx)
3084+{
3085+ int err;
3086+ struct task_struct* t = current;
3087+ int od;
3088+
3089+ err = get_user(od, &t->ics_cb->ics_stack[idx].id);
3090+
3091+ if (!err)
3092+ return od;
3093+ else
3094+ return -1;
3095+}
3096+
3097+
3098+static void abort_local_ics_reader(void* _ics)
3099+{
3100+ struct task_struct* t = current;
3101+ int i, od, top, err1, err2;
3102+ int retry = 1;
3103+ struct ics *stacked, *ics;
3104+ struct od_table_entry* entry;
3105+
3106+ ICS_DBG(KERN_DEBUG "abort_local_ics_reader() on %d, examining %s/%d\n",
3107+ raw_smp_processor_id(), t->comm, t->pid);
3108+
3109+ ics = (struct ics*) _ics;
3110+
3111+ /* things to check
3112+ *
3113+ * 1) if local task has no ics_cb then return
3114+ * 2) if local task has no ics on stack then return
3115+ * 3) if local task has <ics> not in ics stack then return
3116+ * 4) otherwise rollback local task and set retry flag
3117+ */
3118+
3119+ if (!t->ics_cb) {
3120+ ICS_DBG("%d no ics_cb\n", t->pid);
3121+ return;
3122+ }
3123+
3124+ top = get_ics_stack_top();
3125+ if (ics_stack_empty(top)) {
3126+ ICS_DBG("%d stack empty\n", t->pid);
3127+ return;
3128+ }
3129+
3130+ for (i = 0; i <= top; i++) {
3131+ od = get_ics_stack(i);
3132+ if (od < 0) {
3133+ ICS_DBG("%d garbage od=%d\n", t->pid, od);
3134+ /* end of stack or garbage */
3135+ return;
3136+ }
3137+ stacked = lookup_ics(od);
3138+ entry = __od_lookup(od);
3139+ if (!stacked) {
3140+ ICS_DBG("%d garbage lookup od=%d\n", t->pid, od);
3141+ /* garbage on stack */
3142+ return;
3143+ }
3144+ if (ics == stacked) {
3145+ ICS_DBG(KERN_DEBUG "ICS: aborting %s/%d\n",
3146+ t->comm, t->pid);
3147+ /* set retry flag */
3148+ err1 = put_user(retry, (int*)entry->extra);
3149+ /* set ics stack pointer */
3150+ err2 = put_user(i, &t->ics_cb->top);
3151+ if (likely(err1 == 0 && err2 == 0)) {
3152+ set_tsk_thread_flag(t, TIF_ROLLBACK_RCS);
3153+ ICS_DBG(KERN_DEBUG "%s/%d aborted.\n",
3154+ t->comm, t->pid);
3155+ } else
3156+ printk(KERN_INFO "ICS: could not roll back "
3157+ "%s/%d state=%d err1=%d err2=%d i=%d extra=%p &top=%p\n",
3158+ t->comm, t->pid, t->state, err1, err2, i, entry->extra, &t->ics_cb->top);
3159+ return;
3160+ }
3161+ }
3162+}
3163+
3164+static void abort_ics_readers(struct ics* ics)
3165+{
3166+ ICS_DBG(KERN_DEBUG "abort_ics_readers() on %d\n",
3167+ raw_smp_processor_id());
3168+
3169+ smp_call_function(abort_local_ics_reader, ics, 0, 1);
3170+}
3171+
3172+static int do_start_wcs(struct ics* ics)
3173+{
3174+ mutex_lock(&ics->writer_mutex);
3175+
3176+ abort_ics_readers(ics);
3177+
3178+ mutex_unlock(&ics->writer_mutex);
3179+
3180+ return 0;
3181+}
3182+
3183+
3184+asmlinkage long sys_start_wcs(int ics_od)
3185+{
3186+ long ret = 0;
3187+ struct ics * ics;
3188+
3189+ ics = lookup_ics(ics_od);
3190+ if (ics)
3191+ ret = do_start_wcs(ics);
3192+ else
3193+ ret = -EINVAL;
3194+
3195+ ICS_DBG(KERN_DEBUG "%s/%d sys_start_wcs(%d) -> %ld\n",
3196+ current->comm, current->pid, ics_od, ret);
3197+
3198+ return ret;
3199+}
3200+
3201+
3202+asmlinkage long sys_reg_ics_cb(struct ics_cb* __user ics_cb)
3203+{
3204+ long ret = -EFAULT;
3205+ struct task_struct *t = current;
3206+
3207+ if (access_ok(VERIFY_WRITE, ics_cb, sizeof(*ics_cb))) {
3208+ t->ics_cb = ics_cb;
3209+ ret = 0;
3210+ }
3211+
3212+ return ret;
3213+}
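The intended division of labour, as far as it can be read off the code above (editorial sketch; the user-space syscall wrappers are hypothetical, and struct ics_cb is the control block declared in include/linux/ics.h with the fields the kernel accesses above): a reader registers its control block once and maintains the ICS stack in user space, while a writer calls sys_start_wcs(), which takes the per-ICS writer mutex and rolls back any reader currently inside that ICS via the retry flag.

struct ics_cb cb;

static void reader_setup(void)
{
	cb.top = -1;			/* empty ICS stack, see ics_stack_empty() */
	sys_reg_ics_cb(&cb);		/* hypothetical wrapper for the syscall */
}

static void writer_update(int ics_od)
{
	sys_start_wcs(ics_od);		/* aborts readers inside this ICS */
	/* ... perform the update; aborted readers will retry ... */
}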
3214diff --git a/kernel/litmus.c b/kernel/litmus.c
3215new file mode 100644
3216index 0000000..8ebb9c9
3217--- /dev/null
3218+++ b/kernel/litmus.c
3219@@ -0,0 +1,1034 @@
3220+/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS initialization,
3221+ * and the common tick function.
3222+ */
3223+#include <asm/uaccess.h>
3224+#include <linux/uaccess.h>
3225+#include <linux/sysrq.h>
3226+
3227+#include <linux/queuelock.h>
3228+#include <linux/litmus.h>
3229+#include <linux/sched.h>
3230+#include <linux/sched_plugin.h>
3231+#include <linux/fpmath.h>
3232+
3233+#include <linux/trace.h>
3234+
3235+#define MAX_SERVICE_LEVELS 10
3236+
3237+/* Variables that govern the scheduling process */
3238+spolicy sched_policy = SCHED_DEFAULT;
3239+int sched_options = 0;
3240+
3241+
3242+/* This is a flag for switching the system into RT mode when it is booted up.
3243+ * In RT mode, non-real-time tasks are scheduled as background tasks.
3244+ */
3245+
3246+/* The system is booting in non-realtime mode */
3247+atomic_t rt_mode = ATOMIC_INIT(MODE_NON_RT);
3248+/* Here we specify a mode change to be made */
3249+atomic_t new_mode = ATOMIC_INIT(MODE_NON_RT);
3250+/* Number of RT tasks that exist in the system */
3251+atomic_t n_rt_tasks = ATOMIC_INIT(0);
3252+
3253+/* Only one CPU may perform a mode change. */
3254+static queuelock_t mode_change_lock;
3255+
3256+/* The time instant when we switched to RT mode */
3257+volatile jiffie_t rt_start_time = 0;
3258+
3259+/* To send signals from the scheduler
3260+ * Must drop locks first.
3261+ */
3262+static LIST_HEAD(sched_sig_list);
3263+static DEFINE_SPINLOCK(sched_sig_list_lock);
3264+
3265+/**
3266+ * sys_set_rt_mode
3267+ * @newmode: new mode the scheduler must be switched to
3268+ * External syscall for setting the RT mode flag
3269+ * Returns EINVAL if mode is not recognized or mode transition is
3270+ * not permitted
3271+ * On success 0 is returned
3272+ *
3273+ * FIXME: In a "real" OS we cannot just let any user switch the mode...
3274+ */
3275+asmlinkage long sys_set_rt_mode(int newmode)
3276+{
3277+ if ((newmode == MODE_NON_RT) || (newmode == MODE_RT_RUN)) {
3278+ printk(KERN_INFO "real-time mode switch to %s\n",
3279+ (newmode == MODE_RT_RUN ? "rt" : "non-rt"));
3280+ atomic_set(&new_mode, newmode);
3281+ return 0;
3282+ }
3283+ return -EINVAL;
3284+}
3285+
3286+/*
3287+ * sys_set_task_rt_param
3288+ * @pid: Pid of the task which scheduling parameters must be changed
3289+ * @param: New real-time extension parameters such as the execution cost and
3290+ * period
3291+ * Syscall for manipulating a task's RT extension params
3292+ * Returns EFAULT if param is NULL.
3293+ *	   ESRCH if pid does not correspond
3294+ * to a valid task.
3295+ * EINVAL if either period or execution cost is <=0
3296+ *	   EBUSY if pid is already a real-time task
3297+ * 0 if success
3298+ *
3299+ * Only non-real-time tasks may be configured with this system call
3300+ * to avoid races with the scheduler. In practice, this means that a
3301+ * task's parameters must be set _before_ calling sys_prepare_rt_task()
3302+ */
3303+asmlinkage long sys_set_rt_task_param(pid_t pid, rt_param_t __user * param)
3304+{
3305+ rt_param_t tp;
3306+ struct task_struct *target;
3307+ int retval = -EINVAL;
3308+
3309+ printk("Setting up rt task parameters for process %d.\n", pid);
3310+
3311+ if (pid < 0 || param == 0) {
3312+ goto out;
3313+ }
3314+ if (copy_from_user(&tp, param, sizeof(tp))) {
3315+ retval = -EFAULT;
3316+ goto out;
3317+ }
3318+
3319+ /* Task search and manipulation must be protected */
3320+ read_lock_irq(&tasklist_lock);
3321+ if (!(target = find_task_by_pid(pid))) {
3322+ retval = -ESRCH;
3323+ goto out_unlock;
3324+ }
3325+
3326+ if (is_realtime(target)) {
3327+ /* The task is already a real-time task.
3328+		 * We cannot allow parameter changes at this point.
3329+ */
3330+ retval = -EBUSY;
3331+ goto out_unlock;
3332+ }
3333+
3334+ if (tp.exec_cost <= 0)
3335+ goto out_unlock;
3336+ if (tp.period <= 0)
3337+ goto out_unlock;
3338+ if (!cpu_online(tp.cpu))
3339+ goto out_unlock;
3340+ if (tp.period < tp.exec_cost)
3341+ {
3342+ printk(KERN_INFO "litmus: real-time task %d rejected "
3343+ "because wcet > period\n", pid);
3344+ goto out_unlock;
3345+ }
3346+
3347+ /* Assign params */
3348+ target->rt_param.basic_params = tp;
3349+
3350+ retval = 0;
3351+ out_unlock:
3352+ read_unlock_irq(&tasklist_lock);
3353+ out:
3354+ return retval;
3355+}
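From user space, the comment above implies a fixed order: configure the parameters while the task is still best-effort, then switch it to real-time. A hedged sketch (the wrapper functions are hypothetical stand-ins for whatever invokes these syscalls; LITMUS_RT_TASK and RT_CLASS_SOFT come from the LITMUS headers):

#include <unistd.h>

/* hypothetical wrappers around the corresponding syscalls */
int set_rt_task_param(pid_t pid, rt_param_t *param);
int task_mode_transition(int target_mode);

static int become_rt_task(void)
{
	rt_param_t params = {
		.exec_cost = 10,	/* worst-case cost, in the time unit the plugin uses */
		.period    = 100,
		.cpu       = 0,		/* only relevant for partitioned plugins */
		.class     = RT_CLASS_SOFT,
	};

	if (set_rt_task_param(getpid(), &params) < 0)
		return -1;
	/* only now may the task become real-time */
	return task_mode_transition(LITMUS_RT_TASK);
}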
3356+
3357+/* Getter of task's RT params
3358+ * returns EINVAL if param or pid is NULL
3359+ * returns ESRCH if pid does not correspond to a valid task
3360+ * returns EFAULT if copying of parameters has failed.
3361+ */
3362+asmlinkage long sys_get_rt_task_param(pid_t pid, rt_param_t __user * param)
3363+{
3364+ int retval = -EINVAL;
3365+ struct task_struct *source;
3366+ rt_param_t lp;
3367+ if (param == 0 || pid < 0)
3368+ goto out;
3369+ read_lock(&tasklist_lock);
3370+ if (!(source = find_task_by_pid(pid))) {
3371+ retval = -ESRCH;
3372+ goto out_unlock;
3373+ }
3374+ lp = source->rt_param.basic_params;
3375+ read_unlock(&tasklist_lock);
3376+ /* Do copying outside the lock */
3377+ retval =
3378+ copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
3379+ return retval;
3380+ out_unlock:
3381+ read_unlock(&tasklist_lock);
3382+ out:
3383+ return retval;
3384+
3385+}
3386+
3387+/*
3388+ * sys_set_service_levels
3389+ * @pid: Pid of the task that is to be configured
3390+ * @count: The number of service levels
3391+ * @levels: The new service levels.
3392+ *
3393+ * Returns EFAULT if levels is not a valid address.
3394+ *	   ESRCH if pid does not correspond
3395+ * to a valid task.
3396+ * EINVAL if either period or execution cost is <=0 for any level,
3397+ *	   or if utility is not increasing.
3398+ *	   EBUSY if pid is already a real-time task
3399+ * ENOMEM if there is insufficient memory available
3400+ * 0 if success
3401+ *
3402+ * May not be used on RT tasks to avoid races.
3403+ */
3404+asmlinkage long sys_set_service_levels(pid_t pid,
3405+ unsigned int count,
3406+ service_level_t __user *levels)
3407+{
3408+ struct task_struct *target;
3409+ service_level_t level, *klevels;
3410+ int retval = -EINVAL, i;
3411+ fp_t last_value = FP(0);
3412+ fp_t last_weight = FP(0);
3413+
3414+ TRACE("Setting up service levels for process %d.\n", pid);
3415+
3416+ if (pid < 0 || count > MAX_SERVICE_LEVELS) {
3417+ goto out;
3418+ }
3419+
3420+ /* Task search and manipulation must be protected */
3421+ read_lock_irq(&tasklist_lock);
3422+ if (!(target = find_task_by_pid(pid))) {
3423+ retval = -ESRCH;
3424+ read_unlock_irq(&tasklist_lock);
3425+ goto out;
3426+ }
3427+ read_unlock_irq(&tasklist_lock);
3428+
3429+ if (is_realtime(target)) {
3430+ /* The task is already a real-time task.
3431+		 * We cannot allow parameter changes at this point.
3432+ */
3433+ retval = -EBUSY;
3434+ goto out;
3435+ }
3436+
3437+ /* get rid of old service levels, if any */
3438+ kfree(target->rt_param.service_level);
3439+ target->rt_param.service_level = NULL;
3440+ target->rt_param.no_service_levels = 0;
3441+
3442+ /* count == 0 means tear down service levels*/
3443+ if (count == 0) {
3444+ retval = 0;
3445+ goto out;
3446+ }
3447+
3448+ klevels = kmalloc(sizeof(service_level_t) * count, GFP_KERNEL);
3449+ if (!klevels) {
3450+ retval = -ENOMEM;
3451+ goto out;
3452+ }
3453+
3454+ for (i = 0; i < count; i++) {
3455+ if (copy_from_user(&level, levels + i, sizeof(level))) {
3456+ retval = -EFAULT;
3457+ kfree(klevels);
3458+ goto out;
3459+ }
3460+ if (level.period <= 0) {
3461+ TRACE("service level %d period <= 0\n", i);
3462+			kfree(klevels); goto out;
3463+ }
3464+ if (_leq(level.weight, last_weight)) {
3465+ TRACE("service level %d weight non-increase\n", i);
3466+			kfree(klevels); goto out;
3467+ }
3468+ if (_leq(level.value, last_value)) {
3469+ TRACE("service level %d value non-increase\n", i);
3470+			kfree(klevels); goto out;
3471+ }
3472+ last_value = level.value;
3473+ last_weight = level.weight;
3474+ klevels[i] = level;
3475+ }
3476+ target->rt_param.basic_params.exec_cost =
3477+ _round(_mul(klevels[0].weight,
3478+ FP(klevels[0].period)));
3479+ target->rt_param.basic_params.period = klevels[0].period;
3480+ target->rt_param.service_level = klevels;
3481+ target->rt_param.no_service_levels = count;
3482+ retval = 0;
3483+
3484+ out:
3485+ return retval;
3486+}
3487+
3488+asmlinkage long sys_get_cur_service_level(void)
3489+{
3490+ long level;
3491+
3492+ if (!is_realtime(current))
3493+ return -EINVAL;
3494+
3495+ /* block scheduler that might cause reweighting to happen */
3496+ local_irq_disable();
3497+ level = current->rt_param.cur_service_level;
3498+ local_irq_enable();
3499+ return level;
3500+}
3501+
3502+
3503+/* sys_task_mode_transition
3504+ * @target_mode: The desired execution mode after the system call completes.
3505+ * Either BACKGROUND_TASK or LITMUS_RT_TASK.
3506+ * Allow a normal task to become a real-time task, and vice versa.
3507+ * Returns EINVAL if illegal transition requested.
3508+ * 0 if task mode was changed successfully
3509+ * other if plugin failed.
3510+ */
3511+asmlinkage long sys_task_mode_transition(int target_mode)
3512+{
3513+ int retval = -EINVAL;
3514+ struct task_struct *t = current;
3515+
3516+ if (( is_realtime(t) && target_mode == BACKGROUND_TASK) ||
3517+ (!is_realtime(t) && target_mode == LITMUS_RT_TASK)) {
3518+ TRACE_TASK(t, "attempts mode transition to %s\n",
3519+ is_realtime(t) ? "best-effort" : "real-time");
3520+ preempt_disable();
3521+ t->rt_param.transition_pending = 1;
3522+ t->state = TASK_STOPPED;
3523+ preempt_enable_no_resched();
3524+
3525+ schedule();
3526+
3527+ retval = t->rt_param.transition_error;
3528+ }
3529+ return retval;
3530+}
3531+
3532+/* implemented in kernel/litmus_sem.c */
3533+void srp_ceiling_block(void);
3534+
3535+/*
3536+ * This is the crucial function for the periodic task implementation.
3537+ * It checks whether the task is periodic, whether such a sleep
3538+ * is permitted, and calls the plugin-specific sleep, which puts the
3539+ * task into a wait array.
3540+ * returns 0 on successful wakeup
3541+ * returns EPERM if current conditions do not permit such sleep
3542+ * returns EINVAL if current task is not able to go to sleep
3543+ */
3544+asmlinkage long sys_sleep_next_period(void)
3545+{
3546+ int retval = -EPERM;
3547+ if (!is_realtime(current)) {
3548+ retval = -EINVAL;
3549+ goto out;
3550+ }
3551+ /* Task with negative or zero period cannot sleep */
3552+ if (get_rt_period(current) <= 0) {
3553+ retval = -EINVAL;
3554+ goto out;
3555+ }
3556+ /* The plugin has to put the task into an
3557+ * appropriate queue and call schedule
3558+ */
3559+ retval = curr_sched_plugin->sleep_next_period();
3560+ if (!retval && is_subject_to_srp(current))
3561+ srp_ceiling_block();
3562+ out:
3563+ return retval;
3564+}
3565+
3566+/* This is an "improved" version of sys_sleep_next_period() that
3567+ * addresses the problem of unintentionally missing a job after
3568+ * an overrun.
3569+ *
3570+ * returns 0 on successful wakeup
3571+ * returns EPERM if current conditions do not permit such sleep
3572+ * returns EINVAL if current task is not able to go to sleep
3573+ */
3574+asmlinkage long sys_wait_for_job_release(unsigned int job)
3575+{
3576+ int retval = -EPERM;
3577+ if (!is_realtime(current)) {
3578+ retval = -EINVAL;
3579+ goto out;
3580+ }
3581+
3582+ /* Task with negative or zero period cannot sleep */
3583+ if (get_rt_period(current) <= 0) {
3584+ retval = -EINVAL;
3585+ goto out;
3586+ }
3587+
3588+ retval = 0;
3589+
3590+ /* first wait until we have "reached" the desired job
3591+ *
3592+ * This implementation has at least two problems:
3593+ *
3594+ * 1) It doesn't gracefully handle the wrap around of
3595+ * job_no. Since LITMUS is a prototype, this is not much
3596+ * of a problem right now.
3597+ *
3598+ * 2) It is theoretically racy if a job release occurs
3599+ * between checking job_no and calling sleep_next_period().
3600+	 * A proper solution would require adding another callback
3601+ * in the plugin structure and testing the condition with
3602+ * interrupts disabled.
3603+ *
3604+ * FIXME: At least problem 2 should be taken care of eventually.
3605+ */
3606+ while (!retval && job > current->rt_param.times.job_no)
3607+ /* If the last job overran then job <= job_no and we
3608+ * don't send the task to sleep.
3609+ */
3610+ retval = curr_sched_plugin->sleep_next_period();
3611+
3612+ /* We still have to honor the SRP after the actual release.
3613+ */
3614+ if (!retval && is_subject_to_srp(current))
3615+ srp_ceiling_block();
3616+ out:
3617+ return retval;
3618+}
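Putting the two calls together, a periodic user-space main loop that tolerates overruns could be structured as follows (sketch; the wrappers are hypothetical and job refers to the job_no sequence described in rt_param.h):

/* hypothetical wrappers for sys_query_job_no() and sys_wait_for_job_release() */
int query_job_no(unsigned int *job);
int wait_for_job_release(unsigned int job);

static void periodic_main_loop(void)
{
	unsigned int job;

	query_job_no(&job);
	for (;;) {
		/* ... do the work of the current job ... */

		/* if this job overran, the next call returns immediately
		 * instead of skipping a job
		 */
		job++;
		wait_for_job_release(job);
	}
}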
3619+
3620+/* This is a helper syscall to query the current job sequence number.
3621+ *
3622+ * returns 0 on successful query
3623+ * returns EPERM if task is not a real-time task.
3624+ * returns EFAULT if &job is not a valid pointer.
3625+ */
3626+asmlinkage long sys_query_job_no(unsigned int __user *job)
3627+{
3628+ int retval = -EPERM;
3629+ if (is_realtime(current))
3630+ retval = put_user(current->rt_param.times.job_no, job);
3631+
3632+ return retval;
3633+}
3634+
3635+
3636+/* The LITMUS tick function. It manages the change to and from real-time mode
3637+ * and then calls the plugin's tick function.
3638+ */
3639+reschedule_check_t __sched rt_scheduler_tick(void)
3640+{
3641+ /* Check for mode change */
3642+ if ((get_rt_mode() != atomic_read(&new_mode))) {
3643+ queue_lock(&mode_change_lock);
3644+ // If the mode is already changed, proceed
3645+ if (get_rt_mode() == atomic_read(&new_mode)) {
3646+ queue_unlock(&mode_change_lock);
3647+ goto proceed;
3648+ }
3649+ // change the mode
3650+ if ((atomic_read(&new_mode) == MODE_RT_RUN)) {
3651+ /* The deferral of entering real-time mode should be
3652+ * handled by deferring task releases in the plugin.
3653+ * The plugin interface does not really need to know
3654+ * about quanta, that is the plugin's job.
3655+ */
3656+
3657+ /* update rt start time */
3658+ rt_start_time = jiffies;
3659+ printk(KERN_INFO "Real-Time mode enabled at %ld "
3660+ "on %d\n",
3661+ jiffies, smp_processor_id());
3662+ } else
3663+ printk(KERN_INFO "Real-Time mode disabled at %ld "
3664+ "on %d\n",
3665+ jiffies, smp_processor_id());
3666+ if (curr_sched_plugin->mode_change)
3667+ curr_sched_plugin->
3668+ mode_change(atomic_read(&new_mode));
3669+ printk(KERN_INFO "Plugin mode change done at %ld\n",
3670+ jiffies);
3671+ set_rt_mode(atomic_read(&new_mode));
3672+ queue_unlock(&mode_change_lock);
3673+ }
3674+
3675+ proceed:
3676+ /* Call plugin-defined tick handler
3677+ *
3678+	 * It is the plugin's tick handler's job to detect quantum
3679+ * boundaries in pfair.
3680+ */
3681+ return curr_sched_plugin->scheduler_tick();
3682+}
3683+
3684+asmlinkage spolicy sys_sched_setpolicy(spolicy newpolicy)
3685+{
3686+ /* Dynamic policy change is disabled at the moment */
3687+ return SCHED_INVALID;
3688+}
3689+
3690+asmlinkage spolicy sys_sched_getpolicy(void)
3691+{
3692+ return sched_policy;
3693+}
3694+
3695+
3696+asmlinkage int sys_scheduler_setup(int cmd, void __user *parameter)
3697+{
3698+ int ret = -EINVAL;
3699+
3700+ ret = curr_sched_plugin->scheduler_setup(cmd, parameter);
3701+ return ret;
3702+}
3703+
3704+struct sched_sig {
3705+ struct list_head list;
3706+ struct task_struct* task;
3707+ unsigned int signal:31;
3708+ int force:1;
3709+};
3710+
3711+static void __scheduler_signal(struct task_struct *t, unsigned int signo,
3712+ int force)
3713+{
3714+ struct sched_sig* sig;
3715+
3716+	sig = kmalloc(sizeof(struct sched_sig), GFP_ATOMIC);
3717+ if (!sig) {
3718+		TRACE_TASK(t, "dropping signal: %u\n", signo);
3719+ return;
3720+ }
3721+
3722+ spin_lock(&sched_sig_list_lock);
3723+
3724+ sig->signal = signo;
3725+ sig->force = force;
3726+ sig->task = t;
3727+ get_task_struct(t);
3728+ list_add(&sig->list, &sched_sig_list);
3729+
3730+ spin_unlock(&sched_sig_list_lock);
3731+}
3732+
3733+void scheduler_signal(struct task_struct *t, unsigned int signo)
3734+{
3735+ __scheduler_signal(t, signo, 0);
3736+}
3737+
3738+void force_scheduler_signal(struct task_struct *t, unsigned int signo)
3739+{
3740+ __scheduler_signal(t, signo, 1);
3741+}
3742+
3743+/* FIXME: get rid of the locking and do this on a per-processor basis */
3744+void send_scheduler_signals(void)
3745+{
3746+ unsigned long flags;
3747+ struct list_head *p, *extra;
3748+ struct siginfo info;
3749+ struct sched_sig* sig;
3750+ struct task_struct* t;
3751+ struct list_head claimed;
3752+
3753+ if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) {
3754+ if (list_empty(&sched_sig_list))
3755+ p = NULL;
3756+ else {
3757+ p = sched_sig_list.next;
3758+ list_del(&sched_sig_list);
3759+ INIT_LIST_HEAD(&sched_sig_list);
3760+ }
3761+ spin_unlock_irqrestore(&sched_sig_list_lock, flags);
3762+
3763+ /* abort if there are no signals */
3764+ if (!p)
3765+ return;
3766+
3767+ /* take signal list we just obtained */
3768+ list_add(&claimed, p);
3769+
3770+ list_for_each_safe(p, extra, &claimed) {
3771+ list_del(p);
3772+ sig = list_entry(p, struct sched_sig, list);
3773+ t = sig->task;
3774+ info.si_signo = sig->signal;
3775+ info.si_errno = 0;
3776+ info.si_code = SI_KERNEL;
3777+ info.si_pid = 1;
3778+ info.si_uid = 0;
3779+ TRACE("sending signal %d to %d\n", info.si_signo,
3780+ t->pid);
3781+ if (sig->force)
3782+ force_sig_info(sig->signal, &info, t);
3783+ else
3784+ send_sig_info(sig->signal, &info, t);
3785+ put_task_struct(t);
3786+ kfree(sig);
3787+ }
3788+ }
3789+
3790+}
3791+
3792+static inline void np_mem_error(struct task_struct* t, const char* reason)
3793+{
3794+ if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) {
3795+ TRACE("np section: %s => %s/%d killed\n",
3796+ reason, t->comm, t->pid);
3797+ force_scheduler_signal(t, SIGKILL);
3798+ }
3799+}
3800+
3801+/* sys_register_np_flag() allows real-time tasks to register an
3802+ * np section indicator.
3803+ * returns 0 if the flag was successfully registered
3804+ * returns EINVAL if current task is not a real-time task
3805+ * returns EFAULT if *flag couldn't be written
3806+ */
3807+asmlinkage long sys_register_np_flag(short __user *flag)
3808+{
3809+ int retval = -EINVAL;
3810+ short test_val = RT_PREEMPTIVE;
3811+
3812+ /* avoid races with the scheduler */
3813+ preempt_disable();
3814+ TRACE("reg_np_flag(%p) for %s/%d\n", flag,
3815+ current->comm, current->pid);
3816+
3817+ /* Let's first try to write to the address.
3818+ * That way it is initialized and any bugs
3819+	 * involving dangling pointers will be caught
3820+ * early.
3821+ * NULL indicates disabling np section support
3822+ * and should not be tested.
3823+ */
3824+ if (flag)
3825+ retval = poke_kernel_address(test_val, flag);
3826+ else
3827+ retval = 0;
3828+ TRACE("reg_np_flag: retval=%d\n", retval);
3829+ if (unlikely(0 != retval))
3830+ np_mem_error(current, "np flag: not writable");
3831+ else
3832+ /* the pointer is ok */
3833+ current->rt_param.np_flag = flag;
3834+
3835+ preempt_enable();
3836+ return retval;
3837+}
3838+
3839+
3840+void request_exit_np(struct task_struct *t)
3841+{
3842+ int ret;
3843+ short flag;
3844+
3845+ /* We can only do this if t is actually currently scheduled on this CPU
3846+ * because otherwise we are in the wrong address space. Thus make sure
3847+ * to check.
3848+ */
3849+ BUG_ON(t != current);
3850+
3851+ if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) {
3852+ TRACE_TASK(t, "request_exit_np(): BAD TASK!\n");
3853+ return;
3854+ }
3855+
3856+ flag = RT_EXIT_NP_REQUESTED;
3857+ ret = poke_kernel_address(flag, t->rt_param.np_flag + 1);
3858+ TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid);
3859+ if (unlikely(0 != ret))
3860+ np_mem_error(current, "request_exit_np(): flag not writable");
3861+
3862+}
3863+
3864+
3865+int is_np(struct task_struct* t)
3866+{
3867+ int ret;
3868+ unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/
3869+
3870+ BUG_ON(t != current);
3871+
3872+ if (unlikely(t->rt_param.kernel_np))
3873+ return 1;
3874+ else if (unlikely(t->rt_param.np_flag == NULL) ||
3875+ t->flags & PF_EXITING ||
3876+ t->state == TASK_DEAD)
3877+ return 0;
3878+ else {
3879+ /* This is the tricky part. The process has registered a
3880+ * non-preemptive section marker. We now need to check whether
3881+	 * it is set to RT_NON_PREEMPTIVE. Along the way we could
3882+ * discover that the pointer points to an unmapped region (=>
3883+ * kill the task) or that the location contains some garbage
3884+ * value (=> also kill the task). Killing the task in any case
3885+ * forces userspace to play nicely. Any bugs will be discovered
3886+ * immediately.
3887+ */
3888+ ret = probe_kernel_address(t->rt_param.np_flag, flag);
3889+ if (0 == ret && (flag == RT_NON_PREEMPTIVE ||
3890+ flag == RT_PREEMPTIVE))
3891+ return flag != RT_PREEMPTIVE;
3892+ else {
3893+ /* either we could not read from the address or
3894+ * it contained garbage => kill the process
3895+ * FIXME: Should we cause a SEGFAULT instead?
3896+ */
3897+ TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret,
3898+ flag & 0xff, (flag >> 8) & 0xff, flag);
3899+ np_mem_error(t, "is_np() could not read");
3900+ return 0;
3901+ }
3902+ }
3903+}
3904+
3905+/*
3906+ * sys_exit_np() allows a real-time task to signal that it has left a
3907+ * non-preemptable section. It will be called after the kernel requested a
3908+ * callback in the preemption indicator flag.
3909+ * returns 0 if the signal was valid and processed.
3910+ * returns EINVAL if current task is not a real-time task
3911+ */
3912+asmlinkage long sys_exit_np(void)
3913+{
3914+ int retval = -EINVAL;
3915+
3916+ TS_EXIT_NP_START;
3917+
3918+ if (!is_realtime(current))
3919+ goto out;
3920+
3921+ TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid);
3922+ /* force rescheduling so that we can be preempted */
3923+ set_tsk_need_resched(current);
3924+ retval = 0;
3925+ out:
3926+
3927+ TS_EXIT_NP_END;
3928+ return retval;
3929+}
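+
+/* Illustrative sketch (not part of this patch): how a user-space real-time
+ * task might drive the np-flag protocol implemented above. The wrapper names
+ * register_np_flag() and exit_np() are assumptions standing in for user-space
+ * syscall stubs; the RT_* values and the two-short layout (section state at
+ * flag[0], exit request at flag[1]) follow the kernel code above.
+ */
+#if 0
+static short np_flag[2];	/* should be accessed atomically in real code */
+
+static void example_np_section(void)
+{
+	register_np_flag(np_flag);	/* assumed stub for sys_register_np_flag() */
+	np_flag[0] = RT_NON_PREEMPTIVE;	/* enter non-preemptive section */
+	/* ... short critical work ... */
+	np_flag[0] = RT_PREEMPTIVE;	/* leave non-preemptive section */
+	if (np_flag[1] == RT_EXIT_NP_REQUESTED)
+		exit_np();		/* assumed stub for sys_exit_np() */
+}
+#endif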
3930+
3931+long transition_to_rt(struct task_struct* tsk)
3932+{
3933+ long retval;
3934+
3935+ BUG_ON(is_realtime(tsk));
3936+
3937+ if (get_rt_period(tsk) == 0 ||
3938+ get_exec_cost(tsk) > get_rt_period(tsk)) {
3939+ TRACE_TASK(tsk, "litmus prepare: invalid task parameters "
3940+ "(%lu, %lu)\n",
3941+ get_exec_cost(tsk), get_rt_period(tsk));
3942+ return -EINVAL;
3943+ }
3944+
3945+ if (!cpu_online(get_partition(tsk)))
3946+ {
3947+ TRACE_TASK(tsk, "litmus prepare: cpu %d is not online\n",
3948+ get_partition(tsk));
3949+ return -EINVAL;
3950+ }
3951+
3952+ tsk->rt_param.old_prio = tsk->rt_priority;
3953+ tsk->rt_param.old_policy = tsk->policy;
3954+ INIT_LIST_HEAD(&tsk->rt_list);
3955+
3956+ retval = curr_sched_plugin->prepare_task(tsk);
3957+
3958+ if (!retval) {
3959+ atomic_inc(&n_rt_tasks);
3960+ tsk->rt_param.is_realtime = 1;
3961+ tsk->rt_param.litmus_controlled = 1;
3962+ }
3963+
3964+ return retval;
3965+}
3966+
3967+/* p is a real-time task. Re-init its state as a best-effort task. */
3968+static void reinit_litmus_state(struct task_struct* p, int restore)
3969+{
3970+ rt_param_t user_config;
3971+ __user short *np_flag;
3972+
3973+ if (restore) {
3974+		/* Save the user-space provided configuration data.
3975+ * FIXME: This is missing service levels for adaptive tasks.
3976+ */
3977+ user_config = p->rt_param.basic_params;
3978+ np_flag = p->rt_param.np_flag;
3979+ }
3980+
3981+ /* We probably should not be inheriting any task's priority
3982+ * at this point in time.
3983+ */
3984+ WARN_ON(p->rt_param.inh_task);
3985+
3986+ /* We need to restore the priority of the task. */
3987+ __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio);
3988+
3989+ /* Cleanup everything else. */
3990+ memset(&p->rt_param, 0, sizeof(task_rt_param_t));
3991+
3992+ /* Restore preserved fields. */
3993+ if (restore) {
3994+ p->rt_param.basic_params = user_config;
3995+ p->rt_param.np_flag = np_flag;
3996+ }
3997+}
3998+
3999+long transition_to_be(struct task_struct* tsk)
4000+{
4001+ BUG_ON(!is_realtime(tsk));
4002+
4003+ curr_sched_plugin->tear_down(tsk);
4004+ atomic_dec(&n_rt_tasks);
4005+ reinit_litmus_state(tsk, 1);
4006+ return 0;
4007+}
4008+
4009+/* Called upon fork.
4010+ * p is the newly forked task.
4011+ */
4012+void litmus_fork(struct task_struct* p)
4013+{
4014+ if (is_realtime(p))
4015+ /* clean out any litmus related state, don't preserve anything*/
4016+ reinit_litmus_state(p, 0);
4017+}
4018+
4019+/* Called upon execve().
4020+ * current is doing the exec.
4021+ * Don't let address space specific stuff leak.
4022+ */
4023+void litmus_exec(void)
4024+{
4025+ struct task_struct* p = current;
4026+
4027+ if (is_realtime(p)) {
4028+ WARN_ON(p->rt_param.inh_task);
4029+ p->rt_param.np_flag = NULL;
4030+ }
4031+}
4032+
4033+void exit_litmus(struct task_struct *dead_tsk)
4034+{
4035+ if (is_realtime(dead_tsk))
4036+ transition_to_be(dead_tsk);
4037+ kfree(dead_tsk->rt_param.service_level);
4038+}
4039+
4040+
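+/* list_qsort() - recursive quicksort over a list_head list.
+ * The first element is used as the pivot, the remaining elements are
+ * partitioned into "less than" and "greater or equal" sublists, both
+ * sublists are sorted recursively, and the result is spliced back together
+ * as lt ++ pivot ++ geq, i.e., in ascending less_than() order.
+ */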
4041+void list_qsort(struct list_head* list, list_cmp_t less_than)
4042+{
4043+ struct list_head lt;
4044+ struct list_head geq;
4045+ struct list_head *pos, *extra, *pivot;
4046+ int n_lt = 0, n_geq = 0;
4047+ BUG_ON(!list);
4048+
4049+ if (list->next == list)
4050+ return;
4051+
4052+ INIT_LIST_HEAD(&lt);
4053+ INIT_LIST_HEAD(&geq);
4054+
4055+ pivot = list->next;
4056+ list_del(pivot);
4057+ list_for_each_safe(pos, extra, list) {
4058+ list_del(pos);
4059+ if (less_than(pos, pivot)) {
4060+ list_add(pos, &lt);
4061+ n_lt++;
4062+ } else {
4063+ list_add(pos, &geq);
4064+ n_geq++;
4065+ }
4066+ }
4067+ if (n_lt < n_geq) {
4068+ list_qsort(&lt, less_than);
4069+ list_qsort(&geq, less_than);
4070+ } else {
4071+ list_qsort(&geq, less_than);
4072+ list_qsort(&lt, less_than);
4073+ }
4074+ list_splice(&geq, list);
4075+ list_add(pivot, list);
4076+ list_splice(&lt, list);
4077+}
4078+
4079+#ifdef CONFIG_MAGIC_SYSRQ
4080+/* We offer the possibility to change the real-time mode of the system
4081+ * with a magic SysRq request. This helps with debugging in case the system
4082+ * fails to perform its planned switch back to normal mode. This may happen if
4083+ * total system utilization is reached and the task that is supposed to trigger
4084+ * the switch is always preempted (if it is not a real-time task).
4085+ */
4086+int sys_kill(int pid, int sig);
4087+
4088+
4089+static void sysrq_handle_toGgle_rt_mode(int key, struct tty_struct *tty)
4090+{
4091+ sys_set_rt_mode(get_rt_mode() == MODE_NON_RT);
4092+}
4093+
4094+static struct sysrq_key_op sysrq_toGgle_rt_mode_op = {
4095+ .handler = sysrq_handle_toGgle_rt_mode,
4096+ .help_msg = "toGgle-rt-mode",
4097+ .action_msg = "real-time mode changed",
4098+};
4099+
4100+static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
4101+{
4102+ struct task_struct *t;
4103+ read_lock(&tasklist_lock);
4104+ for_each_process(t) {
4105+ if (is_realtime(t)) {
4106+ sys_kill(t->pid, SIGKILL);
4107+ }
4108+ }
4109+ read_unlock(&tasklist_lock);
4110+}
4111+
4112+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
4113+ .handler = sysrq_handle_kill_rt_tasks,
4114+ .help_msg = "Quit-rt-tasks",
4115+ .action_msg = "sent SIGKILL to all real-time tasks",
4116+};
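+
+/* The handlers above are registered in boot_sched_setup() below on the
+ * 'g' (toggle real-time mode) and 'q' (kill all real-time tasks) SysRq keys,
+ * so that e.g. Alt-SysRq-g on a console toggles the system-wide real-time
+ * mode.
+ */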
4117+#endif
4118+
4119+/*
4120+ * Scheduler initialization so that a customized scheduler is enabled
4121+ * at boot time by setting the boot option "rtsched=plugin_name",
4122+ * e.g. "rtsched=pfair".
4123+ */
4124+
4125+/* All we need to know about other plugins is their initialization
4126+ * functions. These functions initialize internal data structures of a
4127+ * scheduler and return a pointer to initialized sched_plugin data
4128+ * structure with pointers to scheduling function implementations.
4129+ * If called repeatedly these init functions just return an existing
4130+ * plugin pointer.
4131+ */
4132+sched_plugin_t *init_global_edf_plugin(void);
4133+sched_plugin_t *init_global_edf_np_plugin(void);
4134+sched_plugin_t *init_part_edf_plugin(void);
4135+sched_plugin_t *init_edf_hsb_plugin(void);
4136+sched_plugin_t *init_pfair_plugin(void);
4137+sched_plugin_t *init_gsn_edf_plugin(void);
4138+sched_plugin_t *init_psn_edf_plugin(void);
4139+sched_plugin_t *init_adaptive_plugin(void);
4140+
4141+/* keep everything needed to setup plugins in one place */
4142+
4143+/* we are lazy, so we use a convention for function naming to fill
4144+ * a table
4145+ */
4146+#define PLUGIN(caps, small) \
4147+ {PLUGIN_ ## caps, SCHED_ ## caps, init_ ## small ## _plugin}
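+/* For example, PLUGIN(GSN_EDF, gsn_edf) expands to
+ *	{PLUGIN_GSN_EDF, SCHED_GSN_EDF, init_gsn_edf_plugin},
+ * so the table below relies on the naming convention described above.
+ */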
4148+
4149+#define init_nosetup_plugin 0
4150+
4151+static struct {
4152+ const char *name;
4153+ const spolicy policy_id;
4154+ sched_plugin_t *(*init) (void);
4155+} available_plugins[] = {
4156+ PLUGIN(LINUX, nosetup),
4157+ PLUGIN(GLOBAL_EDF_NP, global_edf_np),
4158+ PLUGIN(GLOBAL_EDF, global_edf),
4159+ PLUGIN(PART_EDF, part_edf),
4160+ PLUGIN(EDF_HSB, edf_hsb),
4161+ PLUGIN(PFAIR, pfair),
4162+ PLUGIN(GSN_EDF, gsn_edf),
4163+ PLUGIN(PSN_EDF, psn_edf),
4164+ PLUGIN(ADAPTIVE, adaptive),
4165+ /*********************************************
4166+ * Add your custom plugin here
4167+ **********************************************/
4168+};
4169+
4170+/* Some plugins may leave important functions unused. We define dummies
4171+ * so that we don't have to check for null pointers all over the place.
4172+ */
4173+void litmus_dummy_finish_switch(struct task_struct * prev);
4174+int litmus_dummy_schedule(struct task_struct * prev, struct task_struct** next,
4175+ runqueue_t* q);
4176+reschedule_check_t litmus_dummy_scheduler_tick(void);
4177+long litmus_dummy_prepare_task(struct task_struct *t);
4178+void litmus_dummy_wake_up_task(struct task_struct *task);
4179+void litmus_dummy_task_blocks(struct task_struct *task);
4180+long litmus_dummy_tear_down(struct task_struct *task);
4181+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter);
4182+long litmus_dummy_sleep_next_period(void);
4183+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
4184+ struct task_struct *new_owner);
4185+long litmus_dummy_return_priority(struct pi_semaphore *sem);
4186+long litmus_dummy_pi_block(struct pi_semaphore *sem,
4187+ struct task_struct *t);
4188+
4189+#define CHECK(func) {\
4190+ if (!curr_sched_plugin->func) \
4191+ curr_sched_plugin->func = litmus_dummy_ ## func;}
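+/* For example, CHECK(schedule) substitutes litmus_dummy_schedule for the
+ * plugin's schedule() callback if the selected plugin left it NULL.
+ */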
4192+
4193+static int boot_sched_setup(char *plugin_name)
4194+{
4195+ int i = 0;
4196+
4197+	/* Common initializers.
4198+	 * The mode change lock is used to serialize mode change
4199+	 * operations.
4200+ */
4201+ queue_lock_init(&mode_change_lock);
4202+
4203+ printk("Starting LITMUS^RT kernel\n");
4204+
4205+ /* Look for a matching plugin.
4206+ */
4207+ for (i = 0; i < ARRAY_SIZE(available_plugins); i++) {
4208+ if (!strcmp(plugin_name, available_plugins[i].name)) {
4209+ printk("Using %s scheduler plugin\n", plugin_name);
4210+ sched_policy = available_plugins[i].policy_id;
4211+ if (available_plugins[i].init)
4212+ curr_sched_plugin = available_plugins[i].init();
4213+ goto out;
4214+ }
4215+ }
4216+
4217+
4218+	/* Otherwise we fall back to the default Linux scheduler. */
4219+ printk("Plugin name %s is unknown, using default %s\n", plugin_name,
4220+ curr_sched_plugin->plugin_name);
4221+
4222+out:
4223+ /* make sure we don't trip over null pointers later */
4224+ CHECK(finish_switch);
4225+ CHECK(schedule);
4226+ CHECK(scheduler_tick);
4227+ CHECK(wake_up_task);
4228+ CHECK(tear_down);
4229+ CHECK(task_blocks);
4230+ CHECK(prepare_task);
4231+ CHECK(scheduler_setup);
4232+ CHECK(sleep_next_period);
4233+ CHECK(inherit_priority);
4234+ CHECK(return_priority);
4235+ CHECK(pi_block);
4236+
4237+#ifdef CONFIG_MAGIC_SYSRQ
4238+ /* offer some debugging help */
4239+ if (!register_sysrq_key('g', &sysrq_toGgle_rt_mode_op))
4240+		printk("Registered toggle real-time mode magic sysrq.\n");
4241+	else
4242+		printk("Could not register toggle real-time mode magic sysrq.\n");
4243+ if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op))
4244+ printk("Registered kill rt tasks magic sysrq.\n");
4245+ else
4246+ printk("Could not register kill rt tasks magic sysrq.\n");
4247+#endif
4248+	printk("Litmus setup complete.\n");
4249+ return 1;
4250+}
4251+
4252+/* Register for boot option */
4253+__setup("rtsched=", boot_sched_setup);
4254diff --git a/kernel/litmus_sem.c b/kernel/litmus_sem.c
4255new file mode 100644
4256index 0000000..53da534
4257--- /dev/null
4258+++ b/kernel/litmus_sem.c
4259@@ -0,0 +1,567 @@
4260+/*
4261+ * PI semaphores and SRP implementations.
4262+ * Much of the code here is borrowed from include/asm-i386/semaphore.h.
4263+ *
4264+ * NOTE: This implementation is very much a prototype and horribly insecure. It
4265+ * is intended to be a proof of concept, not a feature-complete solution.
4266+ */
4267+
4268+#include <asm/atomic.h>
4269+#include <asm/semaphore.h>
4270+#include <linux/sched.h>
4271+#include <linux/wait.h>
4272+#include <linux/spinlock.h>
4273+#include <linux/queuelock.h>
4274+#include <linux/litmus.h>
4275+#include <linux/sched_plugin.h>
4276+#include <linux/edf_common.h>
4277+
4278+#include <linux/fdso.h>
4279+
4280+#include <linux/trace.h>
4281+
4282+/* ************************************************************************** */
4283+/* PRIORITY INHERITANCE */
4284+/* ************************************************************************** */
4285+
4286+static void* create_pi_semaphore(void)
4287+{
4288+ struct pi_semaphore* sem;
4289+ int i;
4290+
4291+ sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL);
4292+ if (!sem)
4293+ return NULL;
4294+ atomic_set(&sem->count, 1);
4295+ sem->sleepers = 0;
4296+ init_waitqueue_head(&sem->wait);
4297+ sem->hp.task = NULL;
4298+ sem->holder = NULL;
4299+ for (i = 0; i < NR_CPUS; i++)
4300+ sem->hp.cpu_task[i] = NULL;
4301+ return sem;
4302+}
4303+
4304+static void destroy_pi_semaphore(void* sem)
4305+{
4306+ /* XXX assert invariants */
4307+ kfree(sem);
4308+}
4309+
4310+struct fdso_ops pi_sem_ops = {
4311+ .create = create_pi_semaphore,
4312+ .destroy = destroy_pi_semaphore
4313+};
4314+
4315+struct wq_pair {
4316+ struct task_struct* tsk;
4317+ struct pi_semaphore* sem;
4318+};
4319+
4320+static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
4321+ void *key)
4322+{
4323+ struct wq_pair* wqp = (struct wq_pair*) wait->private;
4324+ set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
4325+ curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk);
4326+ TRACE_TASK(wqp->tsk,
4327+ "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
4328+ /* point to task for default_wake_function() */
4329+ wait->private = wqp->tsk;
4330+ default_wake_function(wait, mode, sync, key);
4331+
4332+ /* Always return true since we know that if we encountered a task
4333+ * that was already running the wake_up raced with the schedule in
4334+ * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
4335+ * immediately and own the lock. We must not wake up another task in
4336+ * any case.
4337+ */
4338+ return 1;
4339+}
4340+
4341+/* caller is responsible for locking */
4342+int edf_set_hp_task(struct pi_semaphore *sem)
4343+{
4344+ struct list_head *tmp, *next;
4345+ struct task_struct *queued;
4346+ int ret = 0;
4347+
4348+ sem->hp.task = NULL;
4349+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
4350+ queued = ((struct wq_pair*)
4351+ list_entry(tmp, wait_queue_t,
4352+ task_list)->private)->tsk;
4353+
4354+ /* Compare task prios, find high prio task. */
4355+ if (edf_higher_prio(queued, sem->hp.task)) {
4356+ sem->hp.task = queued;
4357+ ret = 1;
4358+ }
4359+ }
4360+ return ret;
4361+}
4362+
4363+/* caller is responsible for locking */
4364+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
4365+{
4366+ struct list_head *tmp, *next;
4367+ struct task_struct *queued;
4368+ int ret = 0;
4369+
4370+ sem->hp.cpu_task[cpu] = NULL;
4371+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
4372+ queued = ((struct wq_pair*)
4373+ list_entry(tmp, wait_queue_t,
4374+ task_list)->private)->tsk;
4375+
4376+ /* Compare task prios, find high prio task. */
4377+ if (get_partition(queued) == cpu &&
4378+ edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
4379+ sem->hp.cpu_task[cpu] = queued;
4380+ ret = 1;
4381+ }
4382+ }
4383+ return ret;
4384+}
4385+
4386+int do_pi_down(struct pi_semaphore* sem)
4387+{
4388+ unsigned long flags;
4389+ struct task_struct *tsk = current;
4390+ struct wq_pair pair;
4391+ int suspended = 1;
4392+ wait_queue_t wait = {
4393+ .private = &pair,
4394+ .func = rt_pi_wake_up,
4395+ .task_list = {NULL, NULL}
4396+ };
4397+
4398+ pair.tsk = tsk;
4399+ pair.sem = sem;
4400+ spin_lock_irqsave(&sem->wait.lock, flags);
4401+
4402+ if (atomic_dec_return(&sem->count) < 0 ||
4403+ waitqueue_active(&sem->wait)) {
4404+ /* we need to suspend */
4405+ tsk->state = TASK_UNINTERRUPTIBLE;
4406+ add_wait_queue_exclusive_locked(&sem->wait, &wait);
4407+
4408+ TRACE_CUR("suspends on PI lock %p\n", sem);
4409+ curr_sched_plugin->pi_block(sem, tsk);
4410+
4411+ /* release lock before sleeping */
4412+ spin_unlock_irqrestore(&sem->wait.lock, flags);
4413+
4414+ TS_PI_DOWN_END;
4415+ preempt_enable_no_resched();
4416+
4417+
4418+		/* We depend on the FIFO wake-up order.
4419+		 * Thus, we don't need to re-check when we wake up; we
4420+		 * are guaranteed to hold the lock since there is only one
4421+		 * wake-up per release.
4422+ */
4423+ schedule();
4424+
4425+ TRACE_CUR("woke up, now owns PI lock %p\n", sem);
4426+
4427+ /* try_to_wake_up() set our state to TASK_RUNNING,
4428+ * all we need to do is to remove our wait queue entry
4429+ */
4430+ remove_wait_queue(&sem->wait, &wait);
4431+ } else {
4432+ /* no priority inheritance necessary, since there are no queued
4433+ * tasks.
4434+ */
4435+ suspended = 0;
4436+ TRACE_CUR("acquired PI lock %p, no contention\n", sem);
4437+ sem->holder = tsk;
4438+ sem->hp.task = tsk;
4439+ curr_sched_plugin->inherit_priority(sem, tsk);
4440+ spin_unlock_irqrestore(&sem->wait.lock, flags);
4441+ }
4442+ return suspended;
4443+}
4444+
4445+void do_pi_up(struct pi_semaphore* sem)
4446+{
4447+ unsigned long flags;
4448+
4449+ spin_lock_irqsave(&sem->wait.lock, flags);
4450+
4451+ TRACE_CUR("releases PI lock %p\n", sem);
4452+ curr_sched_plugin->return_priority(sem);
4453+ sem->holder = NULL;
4454+ if (atomic_inc_return(&sem->count) < 1)
4455+ /* there is a task queued */
4456+ wake_up_locked(&sem->wait);
4457+
4458+ spin_unlock_irqrestore(&sem->wait.lock, flags);
4459+}
4460+
4461+asmlinkage long sys_pi_down(int sem_od)
4462+{
4463+ long ret = 0;
4464+ struct pi_semaphore * sem;
4465+ int suspended = 0;
4466+
4467+ preempt_disable();
4468+ TS_PI_DOWN_START;
4469+
4470+ sem = lookup_pi_sem(sem_od);
4471+ if (sem)
4472+ suspended = do_pi_down(sem);
4473+ else
4474+ ret = -EINVAL;
4475+
4476+ if (!suspended) {
4477+ TS_PI_DOWN_END;
4478+ preempt_enable();
4479+ }
4480+
4481+ return ret;
4482+}
4483+
4484+asmlinkage long sys_pi_up(int sem_od)
4485+{
4486+ long ret = 0;
4487+ struct pi_semaphore * sem;
4488+
4489+ preempt_disable();
4490+ TS_PI_UP_START;
4491+
4492+ sem = lookup_pi_sem(sem_od);
4493+ if (sem)
4494+ do_pi_up(sem);
4495+ else
4496+ ret = -EINVAL;
4497+
4498+
4499+ TS_PI_UP_END;
4500+ preempt_enable();
4501+
4502+ return ret;
4503+}
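+
+/* Illustrative sketch (not part of this patch): user-space use of the PI
+ * semaphore system calls. The pi_down()/pi_up() stubs and the way a
+ * semaphore object descriptor (sem_od) is obtained via the FDSO layer are
+ * assumptions; only the suspend-on-contention / FIFO wake-up semantics are
+ * taken from do_pi_down() and do_pi_up() above.
+ */
+#if 0
+void example_pi_critical_section(int sem_od)
+{
+	pi_down(sem_od);	/* assumed stub for sys_pi_down(); may suspend */
+	/* ... access the shared resource ... */
+	pi_up(sem_od);		/* assumed stub for sys_pi_up(); wakes one waiter */
+}
+#endif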
4504+
4505+/* Clear wait queue and wakeup waiting tasks, and free semaphore. */
4506+/*
4507+asmlinkage long sys_pi_sema_free(int sem_id)
4508+{
4509+ struct list_head *tmp, *next;
4510+ unsigned long flags;
4511+
4512+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
4513+ return -EINVAL;
4514+
4515+ if (!pi_sems[sem_id].used)
4516+ return -EINVAL;
4517+
4518+ spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags);
4519+ if (waitqueue_active(&pi_sems[sem_id].wait)) {
4520+ list_for_each_safe(tmp, next,
4521+ &pi_sems[sem_id].wait.task_list) {
4522+ wait_queue_t *curr = list_entry(tmp, wait_queue_t,
4523+ task_list);
4524+ list_del(tmp);
4525+ set_rt_flags((struct task_struct*)curr->private,
4526+ RT_F_EXIT_SEM);
4527+ curr->func(curr,
4528+ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
4529+ 0, NULL);
4530+ }
4531+ }
4532+
4533+ spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags);
4534+ pi_sems[sem_id].used = 0;
4535+
4536+ return 0;
4537+}
4538+*/
4539+
4540+
4541+
4542+/* ************************************************************************** */
4543+/* STACK RESOURCE POLICY */
4544+/* ************************************************************************** */
4545+
4546+
4547+struct srp_priority {
4548+ struct list_head list;
4549+ unsigned int period;
4550+ pid_t pid;
4551+};
4552+
4553+#define list2prio(l) list_entry(l, struct srp_priority, list)
4554+
4555+/* SRP task priority comparison function. Smaller periods have higher
4556+ * priority; ties are broken by PID. Special case: period == 0 <=> no priority
4557+ */
4558+static int srp_higher_prio(struct srp_priority* first,
4559+ struct srp_priority* second)
4560+{
4561+ if (!first->period)
4562+ return 0;
4563+ else
4564+ return !second->period ||
4565+ first->period < second->period || (
4566+ first->period == second->period &&
4567+ first->pid < second->pid);
4568+}
4569+
4570+struct srp {
4571+ struct list_head ceiling;
4572+ wait_queue_head_t ceiling_blocked;
4573+};
4574+
4575+
4576+DEFINE_PER_CPU(struct srp, srp);
4577+
4578+#define system_ceiling(srp) list2prio(srp->ceiling.next)
4579+
4580+static int srp_exceeds_ceiling(struct task_struct* first,
4581+ struct srp* srp)
4582+{
4583+ return list_empty(&srp->ceiling) ||
4584+ get_rt_period(first) < system_ceiling(srp)->period ||
4585+ (get_rt_period(first) == system_ceiling(srp)->period &&
4586+ first->pid < system_ceiling(srp)->pid);
4587+}
4588+
4589+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
4590+{
4591+ struct list_head *pos;
4592+ if (in_list(&prio->list)) {
4593+ TRACE_CUR("WARNING: SRP violation detected, prio is already in "
4594+ "ceiling list!\n");
4595+ return;
4596+ }
4597+ list_for_each(pos, &srp->ceiling)
4598+ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
4599+ __list_add(&prio->list, pos->prev, pos);
4600+ return;
4601+ }
4602+
4603+ list_add_tail(&prio->list, &srp->ceiling);
4604+}
4605+
4606+/* struct for uniprocessor SRP "semaphore" */
4607+struct srp_semaphore {
4608+ struct srp_priority ceiling;
4609+ int cpu; /* cpu associated with this "semaphore" and resource */
4610+ int claimed; /* is the resource claimed (ceiling should be used)? */
4611+};
4612+
4613+
4614+static void* create_srp_semaphore(void)
4615+{
4616+ struct srp_semaphore* sem;
4617+
4618+ if (!is_realtime(current))
4619+ /* XXX log error */
4620+ return NULL;
4621+
4622+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
4623+ if (!sem)
4624+ return NULL;
4625+
4626+ INIT_LIST_HEAD(&sem->ceiling.list);
4627+ sem->ceiling.period = 0;
4628+ sem->claimed = 0;
4629+ sem->cpu = get_partition(current);
4630+ return sem;
4631+}
4632+
4633+static void destroy_srp_semaphore(void* sem)
4634+{
4635+ /* XXX invariants */
4636+ kfree(sem);
4637+}
4638+
4639+struct fdso_ops srp_sem_ops = {
4640+ .create = create_srp_semaphore,
4641+ .destroy = destroy_srp_semaphore
4642+};
4643+
4644+/* Initialize SRP semaphores at boot time. */
4645+static int __init srp_sema_boot_init(void)
4646+{
4647+ int i;
4648+
4649+ printk("Initializing SRP per-CPU ceilings...");
4650+ for (i = 0; i < NR_CPUS; i++) {
4651+ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
4652+ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
4653+ }
4654+ printk(" done!\n");
4655+
4656+ return 0;
4657+}
4658+__initcall(srp_sema_boot_init);
4659+
4660+
4661+void do_srp_down(struct srp_semaphore* sem)
4662+{
4663+ /* claim... */
4664+ sem->claimed = 1;
4665+ /* ...and update ceiling */
4666+ srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
4667+}
4668+
4669+void do_srp_up(struct srp_semaphore* sem)
4670+{
4671+ sem->claimed = 0;
4672+
4673+ /* Determine new system priority ceiling for this CPU. */
4674+ if (in_list(&sem->ceiling.list))
4675+ list_del(&sem->ceiling.list);
4676+ else
4677+ TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling"
4678+ " list!\n");
4679+
4680+ /* Wake tasks on this CPU, if they exceed current ceiling. */
4681+ wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
4682+}
4683+
4684+/* Adjust the system-wide priority ceiling if resource is claimed. */
4685+asmlinkage long sys_srp_down(int sem_od)
4686+{
4687+ int cpu;
4688+ int ret = -EINVAL;
4689+ struct srp_semaphore* sem;
4690+
4691+ /* disabling preemptions is sufficient protection since
4692+ * SRP is strictly per CPU and we don't interfere with any
4693+ * interrupt handlers
4694+ */
4695+ preempt_disable();
4696+ TS_SRP_DOWN_START;
4697+
4698+ cpu = smp_processor_id();
4699+ sem = lookup_srp_sem(sem_od);
4700+ if (sem && sem->cpu == cpu) {
4701+ do_srp_down(sem);
4702+ ret = 0;
4703+ }
4704+
4705+ TS_SRP_DOWN_END;
4706+ preempt_enable();
4707+ return ret;
4708+}
4709+
4710+/* Adjust the system-wide priority ceiling if resource is freed. */
4711+asmlinkage long sys_srp_up(int sem_od)
4712+{
4713+ int cpu;
4714+ int ret = -EINVAL;
4715+ struct srp_semaphore* sem;
4716+
4717+ preempt_disable();
4718+ TS_SRP_UP_START;
4719+
4720+ cpu = smp_processor_id();
4721+ sem = lookup_srp_sem(sem_od);
4722+
4723+ if (sem && sem->cpu == cpu) {
4724+ do_srp_up(sem);
4725+ ret = 0;
4726+ }
4727+
4728+ TS_SRP_UP_END;
4729+ preempt_enable();
4730+ return ret;
4731+}
4732+
4733+/* Indicate that task will use a resource associated with a given
4734+ * semaphore. Should be done *a priori* before RT task system is
4735+ * executed, so this does *not* update the system priority
4736+ * ceiling! (The ceiling would be meaningless anyway, as the SRP
4737+ * breaks without this a priori knowledge.)
4738+ */
4739+asmlinkage long sys_reg_task_srp_sem(int sem_od)
4740+{
4741+ /*
4742+ * FIXME: This whole concept is rather brittle!
4743+ * There must be a better solution. Maybe register on
4744+ * first reference?
4745+ */
4746+
4747+ struct task_struct *t = current;
4748+ struct srp_priority t_prio;
4749+ struct srp_semaphore* sem;
4750+
4751+ sem = lookup_srp_sem(sem_od);
4752+
4753+ if (!sem)
4754+ return -EINVAL;
4755+
4756+ if (!is_realtime(t))
4757+ return -EPERM;
4758+
4759+ if (sem->cpu != get_partition(t))
4760+ return -EINVAL;
4761+
4762+ preempt_disable();
4763+ t->rt_param.subject_to_srp = 1;
4764+ t_prio.period = get_rt_period(t);
4765+ t_prio.pid = t->pid;
4766+ if (srp_higher_prio(&t_prio, &sem->ceiling)) {
4767+ sem->ceiling.period = t_prio.period;
4768+ sem->ceiling.pid = t_prio.pid;
4769+ }
4770+
4771+ preempt_enable();
4772+
4773+ return 0;
4774+}
4775+
4776+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
4777+ void *key)
4778+{
4779+ int cpu = smp_processor_id();
4780+ struct task_struct *tsk = wait->private;
4781+ if (cpu != get_partition(tsk))
4782+		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
4783+ get_partition(tsk));
4784+ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
4785+ return default_wake_function(wait, mode, sync, key);
4786+ return 0;
4787+}
4788+
4789+
4790+/* Wait for current task priority to exceed system-wide priority ceiling.
4791+ * Can be used to determine when it is safe to run a job after its release.
4792+ */
4793+void srp_ceiling_block(void)
4794+{
4795+ struct task_struct *tsk = current;
4796+ wait_queue_t wait = {
4797+ .private = tsk,
4798+ .func = srp_wake_up,
4799+ .task_list = {NULL, NULL}
4800+ };
4801+
4802+ preempt_disable();
4803+ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
4804+ tsk->state = TASK_UNINTERRUPTIBLE;
4805+ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
4806+ TRACE_CUR("is priority ceiling blocked.\n");
4807+ preempt_enable_no_resched();
4808+ schedule();
4809+ /* Access to CPU var must occur with preemptions disabled,
4810+ * otherwise Linux debug code complains loudly, even if it is
4811+ * ok here.
4812+ */
4813+ preempt_disable();
4814+ TRACE_CUR("finally exceeds system ceiling.\n");
4815+ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
4816+ preempt_enable();
4817+ } else {
4818+ TRACE_CUR("is not priority ceiling blocked\n");
4819+ preempt_enable();
4820+ }
4821+}
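+
+/* Illustrative sketch (not part of this patch): the intended per-job SRP
+ * usage, pieced together from the comments above. The srp_down(), srp_up()
+ * and reg_task_srp_sem() wrappers are assumed user-space stubs for the
+ * corresponding system calls; the requirement to register the semaphore
+ * before the real-time task system starts comes from sys_reg_task_srp_sem().
+ */
+#if 0
+void example_srp_usage(int sem_od)
+{
+	reg_task_srp_sem(sem_od);	/* a priori, before the task set runs */
+
+	/* per job: */
+	srp_down(sem_od);		/* claim resource, raise the ceiling */
+	/* ... use the resource ... */
+	srp_up(sem_od);			/* release resource, lower the ceiling */
+}
+#endif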
4822+
4823+/* ************************************************************************** */
4824+
4825+
4826+
4827diff --git a/kernel/pfair_common.c b/kernel/pfair_common.c
4828new file mode 100644
4829index 0000000..c50fdab
4830--- /dev/null
4831+++ b/kernel/pfair_common.c
4832@@ -0,0 +1,237 @@
4833+/*
4834+ * Common functions for PFAIR based scheduler.
4835+ */
4836+
4837+#include <linux/percpu.h>
4838+#include <linux/sched.h>
4839+#include <linux/list.h>
4840+
4841+#include <linux/litmus.h>
4842+#include <linux/sched_plugin.h>
4843+#include <linux/sched_trace.h>
4844+
4845+#include <linux/pfair_common.h>
4846+#include <linux/pfair_math.h>
4847+/* Compare two tasks: returns 1 if lhs has higher PFAIR priority
4848+ * than rhs, 0 otherwise. */
4849+int is_pfair_hp(struct task_struct *lhs, struct task_struct *rhs)
4850+{
4851+ /* Favor subtasks with earlier deadlines */
4852+ if(time_before(get_deadline(lhs), get_deadline(rhs)))
4853+ return 1;
4854+ if(get_deadline(lhs) == get_deadline(rhs)) {
4855+ /* If deadlines are equal,
4856+ * favor non-zero b-bit (a heavy task) */
4857+ if(lhs->rt_param.times.b_bit > rhs->rt_param.times.b_bit)
4858+ return 1;
4859+
4860+ if(lhs->rt_param.times.b_bit == rhs->rt_param.times.b_bit &&
4861+ lhs->rt_param.times.b_bit == 1)
4862+ /* If b-bit is 1, favor tasks with later
4863+ * group deadline */
4864+ return time_after(lhs->rt_param.times.group_deadline,
4865+ rhs->rt_param.times.group_deadline);
4866+
4867+ }
4868+ return 0;
4869+}
4870+
4871+void pfair_domain_init(pfair_domain_t *pfair)
4872+{
4873+ BUG_ON(!pfair);
4874+ INIT_LIST_HEAD(&pfair->ready_queue);
4875+ INIT_LIST_HEAD(&pfair->release_queue);
4876+ queue_lock_init(&pfair->pfair_lock);
4877+ cpus_setall(pfair->domain_cpus);
4878+ /* Use cpu 0 to keep the system alive
4879+ * TODO: Remove later or make it configurable
4880+	 */
4881+ cpu_clear(0, pfair->domain_cpus);
4882+}
4883+
4884+
4885+/* add_ready - add a real-time task to the PFAIR ready queue.
4886+ * It must be runnable. Global domain lock must be held before
4887+ * calling this function.
4888+ *
4889+ * @new: the newly released task
4890+ */
4891+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new)
4892+{
4893+ struct list_head *pos;
4894+ struct task_struct *queued;
4895+
4896+ BUG_ON(!new);
4897+ /* find a spot where our deadline is earlier than the next */
4898+ list_for_each(pos, &pfair->ready_queue) {
4899+ queued = list_entry(pos, struct task_struct, rt_list);
4900+ if (unlikely(is_pfair_hp(new, queued))) {
4901+ /* the task at pos has a later deadline */
4902+ /* insert the new task in front of it */
4903+ __list_add(&new->rt_list, pos->prev, pos);
4904+ return;
4905+ }
4906+ }
4907+ /* if we get to this point either the list is empty or new has the
4908+ * lowest priority. Let's add it to the end. */
4909+ list_add_tail(&new->rt_list, &pfair->ready_queue);
4910+}
4911+/**
4912+ * Extraction function: take the highest-priority ready task off the queue, or NULL if it is empty.
4913+ */
4914+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair)
4915+{
4916+ struct task_struct *t = NULL;
4917+ /* either not yet released, preempted, or non-rt */
4918+ if (!list_empty(&pfair->ready_queue)) {
4919+
4920+ /* take next rt task */
4921+ t = list_entry(pfair->ready_queue.next, struct task_struct,
4922+ rt_list);
4923+
4924+ /* kick it out of the ready list */
4925+ list_del(&t->rt_list);
4926+ }
4927+ return t;
4928+}
4929+
4930+
4931+/* add_release - add a real-time task to the PFAIR release queue.
4932+ * Domain lock must be acquired before the function is called.
4933+ *
4934+ * @task: the sleeping task
4935+ */
4936+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task)
4937+{
4938+ struct list_head *pos;
4939+ struct task_struct *queued;
4940+
4941+ BUG_ON(!task);
4942+ /* find a spot where our deadline is earlier than the next */
4943+ list_for_each_prev(pos, &pfair->release_queue) {
4944+ queued = list_entry(pos, struct task_struct, rt_list);
4945+ if ((unlikely(time_before(queued->rt_param.times.release,
4946+ task->rt_param.times.release)))) {
4947+ /* the task at pos has an earlier release */
4948+			/* insert the new task behind it */
4949+ __list_add(&task->rt_list, pos, pos->next);
4950+ return;
4951+ }
4952+ }
4953+ /* if we get to this point either the list is empty or task has the
4954+ * earliest release. Let's add it to the front. */
4955+ list_add(&task->rt_list, &pfair->release_queue);
4956+}
4957+/**
4958+ * This function is called from the tick handler; it acquires the lock
4959+ * automatically. Only one processor effectively merges the queues.
4960+ */
4961+void pfair_try_release_pending(pfair_domain_t* pfair)
4962+{
4963+ unsigned long flags;
4964+ struct list_head *pos, *save;
4965+ struct task_struct *queued;
4966+ queue_lock_irqsave(&pfair->pfair_lock, flags);
4967+
4968+ list_for_each_safe(pos, save, &pfair->release_queue) {
4969+ queued = list_entry(pos, struct task_struct, rt_list);
4970+ if (likely(time_before_eq(
4971+ queued->rt_param.times.release, jiffies))) {
4972+ /* this one is ready to go*/
4973+ list_del(pos);
4974+ set_rt_flags(queued, RT_F_RUNNING);
4975+
4976+ sched_trace_job_release(queued);
4977+ /* now it can be picked up */
4978+ barrier();
4979+ pfair_add_ready(pfair, queued);
4980+ }
4981+ else
4982+ /* the release queue is ordered */
4983+ break;
4984+ }
4985+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
4986+}
4987+/*
4988+ * Subtask preparation. Assuming that last_release
4989+ * denotes the time when the job was released.
4990+ */
4991+void pfair_prepare_next_subtask(struct task_struct *t)
4992+{
4993+ BUG_ON(!t);
4994+ /* assign subtask release time, deadline, b-bit,
4995+ * and group deadline
4996+ */
4997+ t->rt_param.times.release = t->rt_param.times.last_release
4998+ +release_time(t);
4999+ t->rt_param.times.deadline = t->rt_param.times.last_release
5000+ +pfair_deadline(t);
5001+ t->rt_param.times.b_bit = b_bit(t);
5002+ t->rt_param.times.group_deadline = t->rt_param.times.last_release
5003+ +group_deadline(t);
5004+}
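+
+/* For reference, the helpers used above presumably implement the standard
+ * PFAIR subtask windows (Baruah et al.): for a task with weight
+ * wt = exec_cost/period and subtask index i (starting at 1, relative to the
+ * job release),
+ *	pseudo-release  r(i) = floor((i - 1) / wt)
+ *	pseudo-deadline d(i) = ceil(i / wt)
+ *	b-bit           b(i) = ceil(i / wt) - floor(i / wt)
+ * (b(i) = 1 iff the windows of subtasks i and i+1 overlap), and the group
+ * deadline extends the deadline of a heavy task (wt >= 1/2) across
+ * consecutive overlapping windows. This summarizes the theory and is not a
+ * specification of the exact encoding in pfair_math.h.
+ */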
5005+
5006+void pfair_prepare_next_job(struct task_struct *t)
5007+{
5008+ BUG_ON(!t);
5009+
5010+ /* prepare next job release */
5011+	/* reset the consumed quanta so that we can compute new release times
5012+	 * and deadlines for subtasks correctly
5013+ */
5014+ t->rt_param.times.exec_time = 0;
5015+ /* assign job-wide release time,
5016+ * this is the starting point to
5017+ * compute subtask releases, deadlines and group deadlines
5018+ */
5019+ t->rt_param.times.last_release = t->rt_param.times.last_release
5020+ +get_rt_period(t);
5021+ /* Release the first subtask. */
5022+ pfair_prepare_next_subtask(t);
5023+ t->first_time_slice = 0;
5024+ /* Increase job sequence number */
5025+ t->rt_param.times.job_no++;
5026+}
5027+
5028+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start)
5029+{
5030+ t->rt_param.times.release = start;
5031+ t->rt_param.times.last_release = start;
5032+ t->rt_param.times.exec_time = 0;
5033+ t->first_time_slice = 0;
5034+ pfair_prepare_next_subtask(t);
5035+ set_rt_flags(t, RT_F_RUNNING);
5036+}
5037+
5038+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start)
5039+{
5040+ unsigned long flags;
5041+ struct list_head tmp_list;
5042+ struct list_head *pos, *n;
5043+ struct task_struct *t;
5044+
5045+ INIT_LIST_HEAD(&tmp_list);
5046+
5047+ queue_lock_irqsave(&pfair->pfair_lock, flags);
5048+
5049+
5050+ while (!list_empty(&pfair->release_queue)) {
5051+ pos = pfair->release_queue.next;
5052+ list_del(pos);
5053+ list_add(pos, &tmp_list);
5054+ }
5055+ while (!list_empty(&pfair->ready_queue)) {
5056+ pos = pfair->ready_queue.next;
5057+ list_del(pos);
5058+ list_add(pos, &tmp_list);
5059+ }
5060+
5061+ list_for_each_safe(pos, n, &tmp_list) {
5062+ t = list_entry(pos, struct task_struct, rt_list);
5063+ list_del(pos);
5064+ __pfair_prepare_new_release(t, start);
5065+ pfair_add_release(pfair, t);
5066+ }
5067+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
5068+}
5069+
5070diff --git a/kernel/rt_domain.c b/kernel/rt_domain.c
5071new file mode 100644
5072index 0000000..4875c53
5073--- /dev/null
5074+++ b/kernel/rt_domain.c
5075@@ -0,0 +1,185 @@
5076+/*
5077+ * kernel/rt_domain.c
5078+ *
5079+ * LITMUS real-time infrastructure. This file contains the
5080+ * functions that manipulate RT domains. RT domains are an abstraction
5081+ * of a ready queue and a release queue.
5082+ */
5083+
5084+#include <linux/percpu.h>
5085+#include <linux/sched.h>
5086+#include <linux/list.h>
5087+
5088+#include <linux/litmus.h>
5089+#include <linux/sched_plugin.h>
5090+#include <linux/sched_trace.h>
5091+
5092+#include <linux/rt_domain.h>
5093+
5094+
5095+static int dummy_resched(rt_domain_t *rt)
5096+{
5097+ return 0;
5098+}
5099+
5100+static int dummy_order(struct list_head* a, struct list_head* b)
5101+{
5102+ return 0;
5103+}
5104+
5105+int release_order(struct list_head* a, struct list_head* b)
5106+{
5107+ return earlier_release(
5108+ list_entry(a, struct task_struct, rt_list),
5109+ list_entry(b, struct task_struct, rt_list));
5110+}
5111+
5112+
5113+void rt_domain_init(rt_domain_t *rt,
5114+ check_resched_needed_t f,
5115+ list_cmp_t order)
5116+{
5117+ BUG_ON(!rt);
5118+ if (!f)
5119+ f = dummy_resched;
5120+ if (!order)
5121+ order = dummy_order;
5122+ INIT_LIST_HEAD(&rt->ready_queue);
5123+ INIT_LIST_HEAD(&rt->release_queue);
5124+ rt->ready_lock = RW_LOCK_UNLOCKED;
5125+ rt->release_lock = SPIN_LOCK_UNLOCKED;
5126+ rt->check_resched = f;
5127+ rt->order = order;
5128+}
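+
+/* Illustrative sketch (not part of this patch): how a scheduler plugin might
+ * set up and use an rt_domain_t. The my_check_resched() and my_ready_order()
+ * callbacks are hypothetical; the locking convention for the
+ * double-underscore helpers (caller holds the respective ready/release lock)
+ * is assumed from the naming convention.
+ */
+#if 0
+static rt_domain_t my_domain;
+
+static void my_plugin_init(void)
+{
+	rt_domain_init(&my_domain, my_check_resched, my_ready_order);
+}
+
+static struct task_struct* my_pick_next(void)
+{
+	struct task_struct *t;
+	unsigned long flags;
+
+	try_release_pending(&my_domain);	/* move released jobs to the ready queue */
+	write_lock_irqsave(&my_domain.ready_lock, flags);
+	t = __take_ready(&my_domain);
+	write_unlock_irqrestore(&my_domain.ready_lock, flags);
+	return t;
+}
+#endif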
5129+
5130+/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
5131+ * @new: the newly released task
5132+ */
5133+void __add_ready(rt_domain_t* rt, struct task_struct *new)
5134+{
5135+ TRACE("rt: adding %s/%d (%u, %u) to ready queue\n",
5136+ new->comm, new->pid, get_exec_cost(new), get_rt_period(new));
5137+
5138+ if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order))
5139+ rt->check_resched(rt);
5140+}
5141+
5142+struct task_struct* __take_ready(rt_domain_t* rt)
5143+{
5144+ struct task_struct *t = __peek_ready(rt);
5145+
5146+ /* kick it out of the ready list */
5147+ if (t)
5148+ list_del(&t->rt_list);
5149+ return t;
5150+}
5151+
5152+struct task_struct* __peek_ready(rt_domain_t* rt)
5153+{
5154+ if (!list_empty(&rt->ready_queue))
5155+ return next_ready(rt);
5156+ else
5157+ return NULL;
5158+}
5159+
5160+struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu)
5161+{
5162+ struct task_struct *task = __take_ready(rt);
5163+
5164+ if (task) {
5165+ set_task_cpu(task, cpu);
5166+ __activate_task(task, rq);
5167+ }
5168+ return task;
5169+}
5170+
5171+/* add_release - add a real-time task to the rt release queue.
5172+ * @task: the sleeping task
5173+ */
5174+void __add_release(rt_domain_t* rt, struct task_struct *task)
5175+{
5176+ TRACE("rt: adding %s/%d (%u, %u) rel=%d to release queue\n",
5177+ task->comm, task->pid, get_exec_cost(task), get_rt_period(task),
5178+ get_release(task));
5179+
5180+ list_insert(&task->rt_list, &rt->release_queue, release_order);
5181+}
5182+
5183+void __release_pending(rt_domain_t* rt)
5184+{
5185+ struct list_head *pos, *save;
5186+ struct task_struct *queued;
5187+ list_for_each_safe(pos, save, &rt->release_queue) {
5188+ queued = list_entry(pos, struct task_struct, rt_list);
5189+ if (likely(is_released(queued))) {
5190+ /* this one is ready to go*/
5191+ list_del(pos);
5192+ set_rt_flags(queued, RT_F_RUNNING);
5193+
5194+ sched_trace_job_release(queued);
5195+
5196+ /* now it can be picked up */
5197+ barrier();
5198+ add_ready(rt, queued);
5199+ }
5200+ else
5201+ /* the release queue is ordered */
5202+ break;
5203+ }
5204+}
5205+
5206+void try_release_pending(rt_domain_t* rt)
5207+{
5208+ unsigned long flags;
5209+
5210+ if (spin_trylock_irqsave(&rt->release_lock, flags)) {
5211+ __release_pending(rt);
5212+ spin_unlock_irqrestore(&rt->release_lock, flags);
5213+ }
5214+}
5215+
5216+void rerelease_all(rt_domain_t *rt,
5217+ release_at_t release)
5218+{
5219+ unsigned long flags;
5220+
5221+ spin_lock_irqsave(&rt->release_lock, flags);
5222+ write_lock(&rt->ready_lock);
5223+
5224+ __rerelease_all(rt, release);
5225+
5226+ write_unlock(&rt->ready_lock);
5227+ spin_unlock_irqrestore(&rt->release_lock, flags);
5228+}
5229+
5230+void __rerelease_all(rt_domain_t *rt,
5231+ release_at_t release)
5232+{
5233+ jiffie_t start = jiffies + 10;
5234+ struct list_head tmp_list;
5235+ struct list_head *pos, *n;
5236+ struct task_struct *t;
5237+
5238+ INIT_LIST_HEAD(&tmp_list);
5239+
5240+ while (!list_empty(&rt->release_queue)) {
5241+ pos = rt->release_queue.next;
5242+ list_del(pos);
5243+ list_add(pos, &tmp_list);
5244+ }
5245+ while (!list_empty(&rt->ready_queue)) {
5246+ pos = rt->ready_queue.next;
5247+ list_del(pos);
5248+ list_add(pos, &tmp_list);
5249+ }
5250+
5251+ list_for_each_safe(pos, n, &tmp_list) {
5252+ t = list_entry(pos, struct task_struct, rt_list);
5253+ list_del(pos);
5254+ release(t, start);
5255+ __add_release(rt, t);
5256+ }
5257+
5258+}
5259+
5260+
5261diff --git a/kernel/sched.c b/kernel/sched.c
5262index cca93cc..47f16cc 100644
5263--- a/kernel/sched.c
5264+++ b/kernel/sched.c
5265@@ -56,6 +56,16 @@
5266
5267 #include <asm/unistd.h>
5268
5269+#include <linux/litmus.h>
5270+#define __SCHED_C__
5271+#include <linux/sched_plugin.h>
5272+#include <linux/sched_trace.h>
5273+#include <linux/rt_param.h>
5274+#include <linux/trace.h>
5275+
5276+/* LITMUS: avoid races with multiple task wake-ups */
5277+DEFINE_SPINLOCK(litmus_task_set_lock);
5278+
5279 /*
5280 * Convert user-nice values [ -20 ... 0 ... 19 ]
5281 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
5282@@ -836,7 +846,7 @@ static int effective_prio(struct task_struct *p)
5283 * keep the priority unchanged. Otherwise, update priority
5284 * to the normal priority:
5285 */
5286- if (!rt_prio(p->prio))
5287+ if (!rt_prio(p->prio) && !is_realtime(p))
5288 return p->normal_prio;
5289 return p->prio;
5290 }
5291@@ -844,7 +854,7 @@ static int effective_prio(struct task_struct *p)
5292 /*
5293 * __activate_task - move a task to the runqueue.
5294 */
5295-static void __activate_task(struct task_struct *p, struct rq *rq)
5296+void __activate_task(struct task_struct *p, struct rq *rq)
5297 {
5298 struct prio_array *target = rq->active;
5299
5300@@ -999,7 +1009,7 @@ out:
5301 /*
5302 * deactivate_task - remove a task from the runqueue.
5303 */
5304-static void deactivate_task(struct task_struct *p, struct rq *rq)
5305+void deactivate_task(struct task_struct *p, struct rq *rq)
5306 {
5307 dec_nr_running(p, rq);
5308 dequeue_task(p, p->array);
5309@@ -1408,13 +1418,44 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
5310 #endif
5311
5312 rq = task_rq_lock(p, &flags);
5313+
5314+ if (is_realtime(p))
5315+ TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid);
5316+
5317 old_state = p->state;
5318 if (!(old_state & state))
5319- goto out;
5320+ goto out;
5321
5322 if (p->array)
5323 goto out_running;
5324
5325+
5326+ spin_lock(&litmus_task_set_lock);
5327+ if (p->rt_param.litmus_controlled) {
5328+ /* Already included. This can happen
5329+ * if the task dropped all locks to call
5330+ * schedule() but a wake up raced and came in
5331+ * early.
5332+ */
5333+
5334+ spin_unlock(&litmus_task_set_lock);
5335+ goto out_running;
5336+ }
5337+
5338+ sched_trace_task_arrival(p);
5339+ if (is_realtime(p)) {
5340+ p->rt_param.litmus_controlled = 1;
5341+ curr_sched_plugin->wake_up_task(p);
5342+
5343+ spin_unlock(&litmus_task_set_lock);
5344+ goto out_running;
5345+ }
5346+
5347+ p->rt_param.litmus_controlled = 0;
5348+ spin_unlock(&litmus_task_set_lock);
5349+
5350+
5351+
5352 cpu = task_cpu(p);
5353 this_cpu = smp_processor_id();
5354
5355@@ -1575,11 +1616,14 @@ static void task_running_tick(struct rq *rq, struct task_struct *p);
5356 void fastcall sched_fork(struct task_struct *p, int clone_flags)
5357 {
5358 int cpu = get_cpu();
5359+
5360+ litmus_fork(p);
5361
5362 #ifdef CONFIG_SMP
5363 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
5364 #endif
5365 set_task_cpu(p, cpu);
5366+ clear_rt_params(p);
5367
5368 /*
5369 * We mark the process as running here, but have not actually
5370@@ -1730,6 +1774,9 @@ void fastcall sched_exit(struct task_struct *p)
5371 unsigned long flags;
5372 struct rq *rq;
5373
5374+ if (is_realtime(p))
5375+ return;
5376+
5377 /*
5378 * If the child was a (relative-) CPU hog then decrease
5379 * the sleep_avg of the parent as well.
5380@@ -1765,6 +1812,31 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
5381 prepare_arch_switch(next);
5382 }
5383
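+/* litmus_transition() - complete a pending RT <-> BE transition for a stopped
+ * task while the runqueue lock is held. A successful RT -> BE transition and
+ * a rejected BE -> RT request both leave the task runnable as a best-effort
+ * task, so it is reactivated here; a successful BE -> RT transition does not
+ * reactivate the task at this point.
+ */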
5384+static void litmus_transition(struct task_struct *tsk, struct rq *rq)
5385+{
5386+ int wakeup = 0;
5387+ WARN_ON(tsk->state != TASK_STOPPED);
5388+
5389+ tsk->rt_param.transition_pending = 0;
5390+ if (is_realtime(tsk)) {
5391+ /* RT -> BE transition */
5392+ tsk->rt_param.transition_error = transition_to_be(tsk);
5393+ wakeup = tsk->rt_param.transition_error == 0;
5394+ } else {
5395+ /* BE -> RT transition */
5396+ tsk->rt_param.transition_error = transition_to_rt(tsk);
5397+ /* If it was rejected as a real-time task, then
5398+ * keep it running as a best-effort task.
5399+ */
5400+ wakeup = tsk->rt_param.transition_error != 0;
5401+ }
5402+ if (wakeup) {
5403+ /* we still hold the runqueue lock */
5404+ tsk->state = TASK_RUNNING;
5405+ __activate_task(tsk, rq);
5406+ }
5407+}
5408+
5409 /**
5410 * finish_task_switch - clean up after a task-switch
5411 * @rq: runqueue associated with task-switch
5412@@ -1801,6 +1873,15 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
5413 */
5414 prev_state = prev->state;
5415 finish_arch_switch(prev);
5416+	/* Requeue the previous real-time task before we drop the rq lock, because
5417+	 * dropping it may lead to a preemption.
5418+ */
5419+ curr_sched_plugin->finish_switch(prev);
5420+ sched_trace_task_scheduled(current);
5421+ if (rt_transition_pending(prev))
5422+ litmus_transition(prev, rq);
5423+ /* trace before IRQs are enabled */
5424+ TS_CXS_END;
5425 finish_lock_switch(rq, prev);
5426 if (mm)
5427 mmdrop(mm);
5428@@ -1811,7 +1892,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
5429 */
5430 kprobe_flush_task(prev);
5431 put_task_struct(prev);
5432- }
5433+ }
5434 }
5435
5436 /**
5437@@ -2990,7 +3071,7 @@ static inline void idle_balance(int cpu, struct rq *rq)
5438 static inline void wake_priority_sleeper(struct rq *rq)
5439 {
5440 #ifdef CONFIG_SCHED_SMT
5441- if (!rq->nr_running)
5442+ if (!rq->nr_running || get_rt_mode() == MODE_RT_RUN)
5443 return;
5444
5445 spin_lock(&rq->lock);
5446@@ -3220,14 +3301,30 @@ void scheduler_tick(void)
5447
5448 update_cpu_clock(p, rq, now);
5449
5450- if (p == rq->idle)
5451- /* Task on the idle queue */
5452- wake_priority_sleeper(rq);
5453- else
5454- task_running_tick(rq, p);
5455+ /* check whether the RT scheduler plugin requires a call to
5456+ * schedule
5457+ */
5458+ TS_PLUGIN_TICK_START;
5459+ if (rt_scheduler_tick() == FORCE_RESCHED)
5460+ set_tsk_need_resched(p);
5461+ TS_PLUGIN_TICK_END;
5462+
5463+	/* real-time accounting is done by the plugin;
5464+	 * call the Linux functions only for background tasks
5465+ */
5466+ if (!is_realtime(p)) {
5467+ if (p == rq->idle)
5468+ /* Task on the idle queue */
5469+ wake_priority_sleeper(rq);
5470+ else
5471+ task_running_tick(rq, p);
5472+ }
5473+ send_scheduler_signals();
5474+
5475 #ifdef CONFIG_SMP
5476 update_load(rq);
5477- if (time_after_eq(jiffies, rq->next_balance))
5478+ if (time_after_eq(jiffies, rq->next_balance) &&
5479+ get_rt_mode() == MODE_NON_RT)
5480 raise_softirq(SCHED_SOFTIRQ);
5481 #endif
5482 }
5483@@ -3406,6 +3503,7 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
5484 sleep_type == SLEEP_INTERRUPTED);
5485 }
5486
5487+
5488 /*
5489 * schedule() is the main scheduler function.
5490 */
5491@@ -3420,6 +3518,7 @@ asmlinkage void __sched schedule(void)
5492 long *switch_count;
5493 struct rq *rq;
5494
5495+
5496 /*
5497 * Test if we are atomic. Since do_exit() needs to call into
5498 * schedule() atomically, we ignore that path for now.
5499@@ -3427,8 +3526,9 @@ asmlinkage void __sched schedule(void)
5500 */
5501 if (unlikely(in_atomic() && !current->exit_state)) {
5502 printk(KERN_ERR "BUG: scheduling while atomic: "
5503- "%s/0x%08x/%d\n",
5504- current->comm, preempt_count(), current->pid);
5505+ "%s/0x%08x/%d %s\n",
5506+ current->comm, preempt_count(), current->pid,
5507+ is_realtime(current) ? "rt" : "non-rt");
5508 debug_show_held_locks(current);
5509 if (irqs_disabled())
5510 print_irqtrace_events(current);
5511@@ -3438,6 +3538,7 @@ asmlinkage void __sched schedule(void)
5512
5513 need_resched:
5514 preempt_disable();
5515+ TS_SCHED_START;
5516 prev = current;
5517 release_kernel_lock(prev);
5518 need_resched_nonpreemptible:
5519@@ -3470,6 +3571,7 @@ need_resched_nonpreemptible:
5520 spin_lock_irq(&rq->lock);
5521
5522 switch_count = &prev->nivcsw;
5523+ /* check for blocking tasks */
5524 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5525 switch_count = &prev->nvcsw;
5526 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
5527@@ -3478,13 +3580,65 @@ need_resched_nonpreemptible:
5528 else {
5529 if (prev->state == TASK_UNINTERRUPTIBLE)
5530 rq->nr_uninterruptible++;
5531+			/* we need to remove real-time tasks from the runqueue */
5532+
5533+ /* protect against races with signal delivery and IO
5534+ * interrupts on other CPUs
5535+ *
5536+ * FIXME: This is probably not sufficient,
5537+ * as (in theory) after
5538+ * unlocking the task_set_lock this task could
5539+			 * be scheduled elsewhere before we switched away
5540+ * from it. This has not been observed
5541+ * yet. To get this locking right is tricky.
5542+ */
5543+ spin_lock(&litmus_task_set_lock);
5544+ if (prev->rt_param.litmus_controlled)
5545+ prev->rt_param.litmus_controlled = 0;
5546+ spin_unlock(&litmus_task_set_lock);
5547+
5548+ if (is_realtime(prev)) {
5549+ TRACE("schedule: %s/%d blocks. state = %d\n",
5550+ prev->comm, prev->pid, prev->state);
5551+ curr_sched_plugin->task_blocks(prev);
5552+ /* Enable this for all tasks to get _a lot_ of
5553+ * data. Can be helpful for debugging.
5554+ */
5555+ sched_trace_task_departure(prev);
5556+ }
5557+ /* only indirect switching is supported in the current
5558+ * version of LITMUS
5559+ */
5560 deactivate_task(prev, rq);
5561 }
5562 }
5563
5564+ next = NULL;
5565+
5566+ /* consult the real-time plugin */
5567+ TS_PLUGIN_SCHED_START;
5568+ curr_sched_plugin->schedule(prev, &next, rq);
5569+ TS_PLUGIN_SCHED_END;
5570+ /* If the real-time plugin wants to switch to a specific task
5571+ * it'll be on the rq and have the highest priority. There will
5572+ * be exaclty one such task, thus the selection of the next task
5573+	 * be exactly one such task, thus the selection of the next task
5574+ * triggered if there are no RT tasks pending (on this CPU). Thus,
5575+ * we may as well skip it.
5576+ */
5577+ if (next)
5578+ goto switch_tasks;
5579+
5580 cpu = smp_processor_id();
5581 if (unlikely(!rq->nr_running)) {
5582- idle_balance(cpu, rq);
5583+ /* only load-balance if we are not in RT mode
5584+ *
5585+		 * TODO: Maybe this can be relaxed by modifying the
5586+ * load-balancing routines in such a way that they never touch
5587+ * real-time tasks.
5588+ */
5589+ if (get_rt_mode() == MODE_NON_RT)
5590+ idle_balance(cpu, rq);
5591 if (!rq->nr_running) {
5592 next = rq->idle;
5593 rq->expired_timestamp = 0;
5594@@ -3528,7 +3682,7 @@ need_resched_nonpreemptible:
5595 }
5596 }
5597 next->sleep_type = SLEEP_NORMAL;
5598- if (dependent_sleeper(cpu, rq, next))
5599+ if (get_rt_mode() == MODE_NON_RT && dependent_sleeper(cpu, rq, next))
5600 next = rq->idle;
5601 switch_tasks:
5602 if (next == rq->idle)
5603@@ -3546,7 +3700,11 @@ switch_tasks:
5604 prev->timestamp = prev->last_ran = now;
5605
5606 sched_info_switch(prev, next);
5607+ TS_SCHED_END;
5608 if (likely(prev != next)) {
5609+ TS_CXS_START;
5610+ if (is_running(prev))
5611+ sched_trace_task_preemption(prev, next);
5612 next->timestamp = now;
5613 rq->nr_switches++;
5614 rq->curr = next;
5615@@ -3560,9 +3718,12 @@ switch_tasks:
5616 * CPUs since it called schedule(), thus the 'rq' on its stack
5617 * frame will be invalid.
5618 */
5619- finish_task_switch(this_rq(), prev);
5620- } else
5621+ finish_task_switch(this_rq(), prev);
5622+ } else {
5623 spin_unlock_irq(&rq->lock);
5624+ }
5625+
5626+ send_scheduler_signals();
5627
5628 prev = current;
5629 if (unlikely(reacquire_kernel_lock(prev) < 0))
5630@@ -3691,6 +3852,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5631 }
5632 }
5633
5634+
5635 /**
5636 * __wake_up - wake up threads blocked on a waitqueue.
5637 * @q: the waitqueue
5638@@ -3709,6 +3871,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
5639 }
5640 EXPORT_SYMBOL(__wake_up);
5641
5642+
5643 /*
5644 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5645 */
5646@@ -3717,6 +3880,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5647 __wake_up_common(q, mode, 1, 0, NULL);
5648 }
5649
5650+
5651 /**
5652 * __wake_up_sync - wake up threads blocked on a waitqueue.
5653 * @q: the waitqueue
5654@@ -4175,7 +4339,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
5655 }
5656
5657 /* Actually do priority change: must hold rq lock. */
5658-static void __setscheduler(struct task_struct *p, int policy, int prio)
5659+void __setscheduler(struct task_struct *p, int policy, int prio)
5660 {
5661 BUG_ON(p->array);
5662
5663@@ -6877,7 +7041,7 @@ void __init sched_init_smp(void)
5664 BUG();
5665 }
5666 #else
5667-void __init sched_init_smp(void)
5668+void __init linux_sched_init_smp(void)
5669 {
5670 }
5671 #endif /* CONFIG_SMP */
5672diff --git a/kernel/sched_adaptive.c b/kernel/sched_adaptive.c
5673new file mode 100644
5674index 0000000..44ce924
5675--- /dev/null
5676+++ b/kernel/sched_adaptive.c
5677@@ -0,0 +1,1454 @@
5678+
5679+
5680+/*
5681+ * kernel/sched_adaptive.c
5682+ *
5683+ * Implementation of Aaron's adaptive global EDF scheduling algorithm. It is
5684+ * based on the GSN-EDF scheduler. However, it does not support synchronization
5685+ * primitives.
5686+ *
5687+ * It implements a version of FC-GEDF with a bunch of linearity assumptions for
5688+ * the optimizer and the the weight-transfer function. The code is meant to be
5689+ * the optimizer and the weight-transfer function. The code is meant to be
5690+ * clear; however, you really need to read the paper if you want to understand
5691+ *
5692+ * Block et al., "Feedback-Controlled Adaptive Multiprocessor Real-Time
5693+ * Systems", submitted to RTAS 2008.
5694+ */
5695+
5696+#include <linux/percpu.h>
5697+#include <linux/sched.h>
5698+#include <linux/list.h>
5699+
5700+#include <linux/queuelock.h>
5701+#include <linux/litmus.h>
5702+#include <linux/sched_plugin.h>
5703+#include <linux/edf_common.h>
5704+#include <linux/sched_trace.h>
5705+#include <asm/uaccess.h>
5706+
5707+#include <linux/fpmath.h>
5708+
5709+/* Overview of GSN-EDF operations.
5710+ *
5711+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
5712+ * description only covers how the individual operations are implemented in
5713+ * LITMUS.
5714+ *
5715+ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
5716+ * structure (NOT the actually scheduled
5717+ * task). If there is another linked task To
5718+ * already it will set To->linked_on = NO_CPU
5719+ * (thereby removing its association with this
5720+ * CPU). However, it will not requeue the
5721+ * previously linked task (if any). It will set
5722+ * T's state to RT_F_RUNNING and check whether
5723+ * it is already running somewhere else. If T
5724+ * is scheduled somewhere else it will link
5725+ * it to that CPU instead (and pull the linked
5726+ * task to cpu). T may be NULL.
5727+ *
5728+ * unlink(T) - Unlink removes T from all scheduler data
5729+ * structures. If it is linked to some CPU it
5730+ * will link NULL to that CPU. If it is
5731+ * currently queued in the gsnedf queue it will
5732+ * be removed from the T->rt_list. It is safe to
5733+ * call unlink(T) if T is not linked. T may not
5734+ * be NULL.
5735+ *
5736+ * requeue(T) - Requeue will insert T into the appropriate
5737+ * queue. If the system is in real-time mode and
5738+ * the T is released already, it will go into the
5739+ * ready queue. If the system is not in
5740+ *                        real-time mode, then T will go into the
5741+ * release queue. If T's release time is in the
5742+ * future, it will go into the release
5743+ * queue. That means that T's release time/job
5744+ *                        no/etc. has to be updated before requeue(T) is
5745+ * called. It is not safe to call requeue(T)
5746+ * when T is already queued. T may not be NULL.
5747+ *
5748+ * gsnedf_job_arrival(T) - This is the catch all function when T enters
5749+ * the system after either a suspension or at a
5750+ * job release. It will queue T (which means it
5751+ * is not safe to call gsnedf_job_arrival(T) if
5752+ * T is already queued) and then check whether a
5753+ * preemption is necessary. If a preemption is
5754+ * necessary it will update the linkage
5755+ * accordingly and cause scheduled to be called
5756+ * (either with an IPI or need_resched). It is
5757+ * safe to call gsnedf_job_arrival(T) if T's
5758+ * next job has not been actually released yet
5759+ *                        (release time in the future). T will be put
5760+ * on the release queue in that case.
5761+ *
5762+ * job_completion(T) - Take care of everything that needs to be done
5763+ * to prepare T for its next release and place
5764+ * it in the right queue with
5765+ * gsnedf_job_arrival().
5766+ *
5767+ *
5768+ * When we know that T is linked to a CPU then link_task_to_cpu(NULL, CPU) is
5769+ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
5770+ * the functions will automatically propagate a pending task from the ready queue
5771+ * to a linked task. This is the job of the calling function (by means of
5772+ * __take_ready).
5773+ */
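/* Illustrative sketch in plain C, not part of the patch: the difference
 * between "linked" and "scheduled" described above, using a made-up toy
 * per-CPU record instead of the real cpu_entry_t.
 */
#include <stdio.h>
#include <stddef.h>

struct toy_cpu {
	const char *linked;    /* task the plugin wants on this CPU   */
	const char *scheduled; /* task actually running on this CPU   */
};

static void toy_link(struct toy_cpu *e, const char *task)
{
	e->linked = task;      /* like link_task_to_cpu(task, e)      */
}

static void toy_unlink(struct toy_cpu *e)
{
	e->linked = NULL;      /* like link_task_to_cpu(NULL, e)      */
}

int main(void)
{
	struct toy_cpu cpu0 = { NULL, NULL };

	toy_link(&cpu0, "T1");   /* scheduler decision: T1 should run */
	cpu0.scheduled = "T1";   /* schedule() later enacts the link  */
	toy_unlink(&cpu0);       /* T1 is preempted or blocks...      */
	printf("linked=%s scheduled=%s\n",
	       cpu0.linked ? cpu0.linked : "(none)",
	       cpu0.scheduled);  /* ...but stays "scheduled" until the
	                          * next context switch takes place   */
	return 0;
}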
5774+
5775+static void unlink(struct task_struct* t);
5776+static void adaptive_job_arrival(struct task_struct* task);
5777+
5778+/* cpu_entry_t - maintain the linked and scheduled state
5779+ */
5780+typedef struct {
5781+ int cpu;
5782+ struct task_struct* linked; /* only RT tasks */
5783+ struct task_struct* scheduled; /* only RT tasks */
5784+ struct list_head list;
5785+ atomic_t will_schedule; /* prevent unneeded IPIs */
5786+} cpu_entry_t;
5787+DEFINE_PER_CPU(cpu_entry_t, adaptive_cpu_entries);
5788+
5789+#define set_will_schedule() \
5790+ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 1))
5791+#define clear_will_schedule() \
5792+ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 0))
5793+#define test_will_schedule(cpu) \
5794+ (atomic_read(&per_cpu(adaptive_cpu_entries, cpu).will_schedule))
5795+
5796+
5797+#define NO_CPU 0xffffffff
5798+
5799+/* The adaptive_lock is used to serialize all scheduling events.
5800+ * It protects all of the plugin's scheduling state.
5801+ */
5802+static queuelock_t adaptive_lock;
5803+/* the cpus queue themselves according to priority in here */
5804+static LIST_HEAD(adaptive_cpu_queue);
5805+
5806+static rt_domain_t adaptive;
5807+
5808+/* feedback control parameters */
5809+static fp_t fc_a, fc_b;
5810+
5811+/* optimizer trigger */
5812+static jiffie_t last_optimizer_run;
5813+static jiffie_t optimizer_min_invocation_sep;
5814+static jiffie_t optimizer_period;
5815+static fp_t task_error_threshold;
5816+
5817+static fp_t system_capacity;
5818+/* total actual weight of the task system */
5819+static fp_t total_weight;
5820+
5821+/* optimizer time snapshot */
5822+jiffie_t opt_time;
5823+
5824+/* Delayed weight increase notification list.
5825+ * This list gets clobbered on each optimizer run.
5826+ */
5827+static LIST_HEAD(adaptive_inc_list);
5828+
5829+/* comment out to disable optimizer debugging */
5830+#define ENABLE_OPTIMIZER_DEBUGGING
5831+
5832+#ifdef ENABLE_OPTIMIZER_DEBUGGING
5833+#define OPT_DBG TRACE
5834+#define OPT_DBG_T TRACE_TASK
5835+#else
5836+#define OPT_DBG
5837+#define OPT_DBG_T
5838+#endif
5839+
5840+/******************************************************************************/
5841+/* OPTIMIZER MATH */
5842+/******************************************************************************/
5843+
5844+/* All time-dependent functions
5845+ * rely on opt_time.
5846+ * Update it in the optimizer before use!
5847+ */
5848+
5849+static inline fp_t ideal(fp_t weight, jiffie_t delta_t)
5850+{
5851+ return _mul(weight, FP(delta_t));
5852+}
5853+
5854+static noinline long ideal_exec_time(struct task_struct* t)
5855+{
5856+ jiffie_t delta = opt_time - get_last_release(t);
5857+ return _round(ideal(get_est_weight(t), delta));
5858+}
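/* Illustrative sketch in plain C, not part of the patch: the ideal
 * allocation above with floating point instead of fpmath.  A task of
 * estimated weight w released delta_t jiffies ago should ideally have
 * received w * delta_t units of service; a task that received less is
 * under-allocated (cf. is_under_allocated() further down).
 */
#include <stdio.h>

static double ideal_alloc(double weight, long delta_t)
{
	return weight * delta_t;
}

int main(void)
{
	double w  = 0.25;  /* estimated weight               */
	long dt   = 40;    /* jiffies since the last release */
	long exec = 8;     /* execution time received so far */

	printf("ideal=%.1f actual=%ld under-allocated=%d\n",
	       ideal_alloc(w, dt), exec, ideal_alloc(w, dt) >= exec);
	return 0;
}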
5859+
5860+/* this makes a whole bunch of linearity assumptions */
5861+static noinline fp_t weight_transfer(struct task_struct* t,
5862+ unsigned int from, unsigned int to,
5863+ fp_t act_weight)
5864+{
5865+ fp_t rel_from, rel_to, ret;
5866+ rel_from = get_sl(t, from).weight;
5867+ rel_to = get_sl(t, to).weight;
5868+ ret.val = (act_weight.val * rel_to.val) / rel_from.val;
5869+ OPT_DBG("weight_transfer(%ld, %ld, %ld) => %ld to=%u from=%u\n",
5870+ rel_from.val, rel_to.val, act_weight.val, ret.val, from, to);
5871+
5872+ return ret;
5873+}
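/* Illustrative sketch in plain C, not part of the patch: the linear
 * weight-transfer rule above with doubles.  An actual weight is scaled
 * by the ratio of the target level's nominal weight to the current
 * level's nominal weight.
 */
#include <stdio.h>

static double transfer(double rel_from, double rel_to, double act_weight)
{
	return act_weight * rel_to / rel_from;
}

int main(void)
{
	/* nominal weight 0.2 at the current level, 0.4 at the target
	 * level: an actual weight of 0.25 scales by 2x to 0.5
	 */
	printf("%.2f\n", transfer(0.2, 0.4, 0.25));
	return 0;
}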
5874+
5875+static noinline fp_t est_weight_at(struct task_struct* t, unsigned int level)
5876+{
5877+ if (t->rt_param.no_service_levels)
5878+ return weight_transfer(t, get_cur_sl(t), level,
5879+ get_est_weight(t));
5880+ else
5881+ return get_est_weight(t);
5882+
5883+}
5884+
5885+static noinline void update_estimate(predictor_state_t *state, fp_t actual_weight,
5886+ fp_t a, fp_t b)
5887+{
5888+ fp_t err, new;
5889+
5890+ OPT_DBG("OLD ESTIMATE Weight" _FP_ " ActWt " _FP_ " A:" _FP_ ", B:" _FP_
5891+ "\n", fp2str(state->estimate), fp2str(actual_weight), fp2str(a),
5892+ fp2str(b));
5893+ err = _sub(actual_weight, state->estimate);
5894+ new = _add(_mul(a, err),
5895+ _mul(b, state->accumulated));
5896+
5897+ total_weight = _sub(total_weight, state->estimate);
5898+ state->estimate = new;
5899+ total_weight = _add(total_weight, state->estimate);
5900+
5901+ state->accumulated = _add(state->accumulated, err);
5902+ OPT_DBG("ERROR " _FP_ ", NEW " _FP_ ", ACC" _FP_ "\n", fp2str(err),
5903+ fp2str(new), fp2str(state->accumulated));
5904+
5905+}
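/* Illustrative sketch in plain C, not part of the patch: the feedback
 * update above with doubles.  err is the estimation error of the job
 * that just completed, accumulated is the running error sum; the new
 * estimate is a * err + b * accumulated (the a and b used by this
 * plugin are set in init_adaptive_plugin() below).
 */
#include <stdio.h>

struct predictor { double estimate, accumulated; };

static void update(struct predictor *p, double actual, double a, double b)
{
	double err = actual - p->estimate;
	p->estimate = a * err + b * p->accumulated;
	p->accumulated += err;
}

int main(void)
{
	struct predictor p = { 0.20, 0.0 };

	update(&p, 0.30, 0.102, 0.303);
	printf("estimate=%.4f accumulated=%.4f\n",
	       p.estimate, p.accumulated);   /* 0.0102 and 0.1000 */
	return 0;
}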
5906+
5907+static noinline fp_t linear_metric(struct task_struct* t)
5908+{
5909+ fp_t v1, vmax, g1, gmax;
5910+ fp_t est_w;
5911+ unsigned int l = t->rt_param.no_service_levels;
5912+ unsigned int lcur;
5913+
5914+ if (l <= 1)
5915+ return FP(0);
5916+
5917+	lcur = get_cur_sl(t);
5918+ est_w = get_est_weight(t);
5919+
5920+ OPT_DBG_T(t, " linear_metric: lcur=%u l=%u est_w=" _FP_ "\n",
5921+ lcur, l, est_w);
5922+ OPT_DBG_T(t, " linear_metric: est_w.val=%ld\n", est_w.val);
5923+
5924+
5925+ v1 = t->rt_param.service_level[0].value;
5926+ vmax = t->rt_param.service_level[l - 1].value;
5927+
5928+ OPT_DBG_T(t, " linear_metric: v1=" _FP_ " vmax=" _FP_ "\n", v1, vmax);
5929+ OPT_DBG_T(t, " linear_metric: v1=%ld vmax=%ld\n", v1.val, vmax.val);
5930+
5931+
5932+ g1 = weight_transfer(t, lcur, 0, est_w);
5933+ gmax = weight_transfer(t, lcur, l - 1, est_w);
5934+
5935+ OPT_DBG_T(t, " linear_metric: g1=" _FP_ " gmax=" _FP_ "\n", g1, gmax);
5936+	OPT_DBG_T(t, " linear_metric: g1=%ld gmax=%ld\n", g1.val, gmax.val);
5937+
5938+
5939+ TRACE_BUG_ON(_eq(_sub(gmax, g1), FP(0)));
5940+ if (_eq(_sub(gmax, g1), FP(0)))
5941+ return FP(0);
5942+ return _div(_sub(vmax, v1),
5943+ _sub(gmax, g1));
5944+}
5945+
5946+static noinline unsigned long reweighted_period(fp_t ow, fp_t nw,
5947+ unsigned long alloc,
5948+ jiffie_t deadline,
5949+ jiffie_t release)
5950+{
5951+ fp_t dl;
5952+ dl = _mul(FP(deadline - release), ow);
5953+ dl = _sub(dl, FP(alloc));
5954+ if(_eq(nw, FP(0)))
5955+ return 0;
5956+ dl = _div(dl, nw);
5957+ return _round(dl);
5958+}
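/* Illustrative sketch in plain C, not part of the patch: the reweighted
 * period above with doubles.  The allocation still owed under the old
 * weight, (deadline - release) * ow - alloc, is stretched or compressed
 * by the new weight nw.
 */
#include <stdio.h>

static double reweighted(double ow, double nw, long alloc,
			 long deadline, long release)
{
	if (nw == 0.0)
		return 0.0;
	return ((deadline - release) * ow - alloc) / nw;
}

int main(void)
{
	/* old weight 0.5 over a 100-jiffy window with 20 jiffies already
	 * consumed leaves 30 units owed; at new weight 0.25 that takes
	 * 120 jiffies
	 */
	printf("%.0f\n", reweighted(0.5, 0.25, 20, 100, 0));
	return 0;
}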
5959+
5960+static noinline int is_under_allocated(struct task_struct* t)
5961+{
5962+ return ideal_exec_time(t) >= t->rt_param.times.exec_time;
5963+}
5964+
5965+static noinline jiffie_t dec_equal_point_delay(struct task_struct* t)
5966+{
5967+ if (_lt(FP(0), get_est_weight(t)))
5968+ /* when t was released plus time needed to equalize
5969+ * minus now
5970+ */
5971+ return get_last_release(t) +
5972+ _round(_div( FP(t->rt_param.times.exec_time),
5973+ get_est_weight(t))) -
5974+ opt_time;
5975+ else
5976+ /* if the weight is zero we just take the
5977+ * deadline
5978+ */
5979+ return t->rt_param.times.deadline;
5980+}
5981+
5982+static noinline jiffie_t inc_equal_point_delay(struct task_struct* t)
5983+{
5984+ if (_lt(FP(0), t->rt_param.opt_nw))
5985+ /* when t was released plus time needed to equalize
5986+ * minus now
5987+ */
5988+ return get_last_release(t) +
5989+ _round(_div( FP(t->rt_param.times.exec_time),
5990+ t->rt_param.opt_nw)) -
5991+ opt_time;
5992+ else
5993+ /* if the weight is zero we just take the
5994+ * deadline
5995+ */
5996+ return t->rt_param.times.deadline;
5997+}
5998+
5999+static noinline jiffie_t decrease_delay(struct task_struct* t)
6000+{
6001+ if (has_active_job(t) && !is_under_allocated(t))
6002+ return dec_equal_point_delay(t);
6003+ return 0;
6004+}
6005+
6006+
6007+
6008+/******************************************************************************/
6009+/* SORT ORDERS */
6010+/******************************************************************************/
6011+
6012+static int by_linear_metric(struct list_head* a, struct list_head* b)
6013+{
6014+ struct task_struct *ta, *tb;
6015+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
6016+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
6017+ return _gt(ta->rt_param.opt_order, tb->rt_param.opt_order);
6018+}
6019+
6020+static int by_delta_weight(struct list_head* a, struct list_head* b)
6021+{
6022+ struct task_struct *ta, *tb;
6023+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
6024+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
6025+ return _lt(ta->rt_param.opt_dw, tb->rt_param.opt_dw);
6026+}
6027+
6028+static int by_enactment_time(struct list_head* a, struct list_head* b)
6029+{
6030+ struct task_struct *ta, *tb;
6031+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
6032+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
6033+ return ta->rt_param.opt_change < tb->rt_param.opt_change;
6034+}
6035+
6036+/******************************************************************************/
6037+/* WEIGHT CHANGE MECHANICS */
6038+/******************************************************************************/
6039+
6040+static void set_service_level(struct task_struct* t, unsigned int level)
6041+{
6042+ service_level_t *new;
6043+ unsigned int old;
6044+ BUG_ON(!t);
6045+ BUG_ON(t->rt_param.no_service_levels <= level);
6046+
6047+ old = t->rt_param.cur_service_level;
6048+ t->rt_param.cur_service_level = level;
6049+ new = t->rt_param.service_level + level;
6050+ t->rt_param.basic_params.period = new->period;
6051+ t->rt_param.basic_params.exec_cost = _round(_mul(new->weight,
6052+ FP(new->period)));
6053+
6054+ scheduler_signal(t, SIGUSR1);
6055+
6056+ sched_trace_service_level_change(t, old, level);
6057+ OPT_DBG_T(t, "service level %u activated\n", level);
6058+}
6059+
6060+/* call this _before_ updating deadline and release of t */
6061+static void update_weight_estimate(struct task_struct* t)
6062+{
6063+ fp_t nw, ow;
6064+ jiffie_t sl_period, exec_time;
6065+
6066+ ow = get_est_weight(t);
6067+ nw = t->rt_param.opt_nw;
6068+ exec_time = t->rt_param.times.exec_time;
6069+ sl_period = get_sl(t, get_opt_sl(t)).period;
6070+
6071+ OPT_DBG("ow=" _FP_ " nw=" _FP_ ", r-d " _FP_
6072+ ", deadline %d, release %d, exec_time=%ld sl_period=%lu\n",
6073+ fp2str(ow), fp2str(nw),
6074+ fp2str(FP(get_deadline(t) - get_last_release(t))),
6075+ get_deadline(t), get_last_release(t), exec_time, sl_period);
6076+
6077+ total_weight = _sub(total_weight, get_est_weight(t));
6078+ t->rt_param.predictor_state.estimate = nw;
6079+ OPT_DBG_T(t, "update_weight_estimate from " _FP_ " to "_FP_"\n",
6080+ fp2str(ow), fp2str(nw));
6081+ total_weight = _add(total_weight, get_est_weight(t));
6082+
6083+ OPT_DBG_T(t, " update_weight_estimate: " _FP_ " => " _FP_ "\n",
6084+ fp2str(ow), fp2str(get_est_weight(t)));
6085+}
6086+
6087+
6088+static void decrease_weight(struct task_struct* t)
6089+{
6090+ fp_t ow, nw;
6091+ jiffie_t last, period, delay;
6092+
6093+ ow = get_sl(t, get_cur_sl(t)).weight;
6094+ nw = get_sl(t, get_opt_sl(t)).weight;
6095+ last = t->rt_param.times.last_release;
6096+ period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
6097+ t->rt_param.times.deadline, last);
6098+
6099+ /* necessary delay has already been computed by optimizer */
6100+ delay = t->rt_param.opt_change;
6101+
6102+ update_weight_estimate(t);
6103+
6104+ if (!delay)
6105+ t->rt_param.times.last_release = opt_time;
6106+ t->rt_param.times.release = opt_time + delay;
6107+ t->rt_param.times.deadline = opt_time + delay + period;
6108+
6109+ set_service_level(t, get_opt_sl(t));
6110+
6111+ /* take out of queue/link structure */
6112+ unlink(t);
6113+ /* present as a new job */
6114+ adaptive_job_arrival(t);
6115+}
6116+
6117+
6118+static void increase_weight(struct task_struct* t)
6119+{
6120+ fp_t ow, nw;
6121+ jiffie_t last, period, delay;
6122+
6123+ ow = get_sl(t, get_cur_sl(t)).weight;
6124+ nw = get_sl(t, get_opt_sl(t)).weight;
6125+ last = t->rt_param.times.last_release;
6126+ period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
6127+ t->rt_param.times.deadline, last);
6128+
6129+ if (t->rt_param.opt_change == 0) {
6130+ /* can be enacted now */
6131+ if (is_under_allocated(t) ||
6132+ time_before(opt_time + period, get_deadline(t)))
6133+ /* do it now */
6134+ delay = 0;
6135+ else {
6136+ if (is_under_allocated(t)) {
6137+ t->rt_param.opt_change += opt_time;
6138+ /* The next job release will notice that opt !=
6139+ * sl and initiate a weight change.
6140+ */
6141+ return;
6142+ } else
6143+ /* nope, wait for equal point */
6144+ delay = inc_equal_point_delay(t);
6145+ }
6146+
6147+ update_weight_estimate(t);
6148+
6149+ if (!delay)
6150+ t->rt_param.times.last_release = opt_time;
6151+ t->rt_param.times.release = opt_time + delay;
6152+ t->rt_param.times.deadline = opt_time + delay + period;
6153+
6154+ set_service_level(t, get_opt_sl(t));
6155+
6156+ /* take out of queue/link structure */
6157+ unlink(t);
6158+ /* present as a new job */
6159+ adaptive_job_arrival(t);
6160+
6161+ } else {
6162+ /* must wait until capacity is released */
6163+ t->rt_param.opt_change += opt_time;
6164+ list_insert(&t->rt_param.opt_list, &adaptive_inc_list,
6165+ by_enactment_time);
6166+ }
6167+}
6168+
6169+static void delayed_increase_weight(void)
6170+{
6171+ struct list_head *p, *extra;
6172+ struct task_struct* t;
6173+
6174+ opt_time = jiffies;
6175+ list_for_each_safe(p, extra, &adaptive_inc_list) {
6176+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6177+ if (time_before_eq(t->rt_param.opt_change, opt_time)) {
6178+ list_del(p);
6179+ /* prevent recursion */
6180+ t->rt_param.opt_change = 0;
6181+ /* this takes care of everything */
6182+ increase_weight(t);
6183+ } else
6184+ /* list is sorted */
6185+ break;
6186+ }
6187+}
6188+
6189+static void change_weight(struct task_struct* t)
6190+{
6191+ if (get_cur_sl(t) < get_opt_sl(t))
6192+ increase_weight(t);
6193+ else
6194+ decrease_weight(t);
6195+ OPT_DBG_T(t, "after change_weight: last_rel:%d rel:%d dl:%d\n",
6196+ get_last_release(t),
6197+ get_release(t),
6198+ get_deadline(t));
6199+}
6200+
6201+/******************************************************************************/
6202+/* OPTIMIZER */
6203+/******************************************************************************/
6204+
6205+/* only invoke with adaptive_lock held */
6206+void adaptive_optimize(void)
6207+{
6208+ struct list_head list;
6209+ struct list_head inc, dec;
6210+ struct list_head *p, *extra;
6211+ cpu_entry_t *cpu;
6212+ struct task_struct* t;
6213+ fp_t M = FP(0), w0, wl, tmp, estU = FP(0);
6214+ unsigned int l;
6215+ jiffie_t enactment_time;
6216+
6217+ if (time_before(jiffies,
6218+ last_optimizer_run + optimizer_min_invocation_sep))
6219+ return;
6220+
6221+ OPT_DBG(":::::: running adaptive optimizer\n");
6222+ opt_time = jiffies;
6223+
6224+ INIT_LIST_HEAD(&list);
6225+
6226+ /* 1) gather all tasks */
6227+ list_for_each(p, &adaptive.ready_queue)
6228+ list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
6229+ list_for_each(p, &adaptive.release_queue)
6230+ list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
6231+ list_for_each(p, &adaptive_cpu_queue) {
6232+ cpu = list_entry(p, cpu_entry_t, list);
6233+ if (cpu->linked)
6234+ list_add(&cpu->linked->rt_param.opt_list, &list);
6235+ }
6236+
6237+ /* 2) determine current system capacity */
6238+ M = system_capacity;
6239+ OPT_DBG("opt: system capacity: " _FP_ "\n", fp2str(M));
6240+
6241+ /* 3) Compute L value for all tasks,
6242+ * and set tasks to service level 0,
6243+ * also account for weight.
6244+ * Also establish current estimated utilization
6245+ */
6246+ list_for_each_safe(p, extra, &list) {
6247+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6248+ if (time_before(opt_time, get_last_release(t))) {
6249+ list_del(p);
6250+ continue;
6251+ }
6252+ t->rt_param.opt_order = linear_metric(t);
6253+ OPT_DBG_T(t, "est_w = " _FP_ " L = " _FP_ "\n",
6254+ get_est_weight(t),
6255+ fp2str(t->rt_param.opt_order));
6256+ t->rt_param.opt_level = 0;
6257+ M = _sub(M, est_weight_at(t, 0));
6258+ estU = _add(estU, get_est_weight(t));
6259+ }
6260+ OPT_DBG("opt: estimated utilization: " _FP_ "\n", fp2str(estU));
6261+ OPT_DBG("opt: estimated capacity at all sl=0: " _FP_ "\n", fp2str(M));
6262+
6263+
6264+ /* 4) sort list by decreasing linear metric */
6265+ list_qsort(&list, by_linear_metric);
6266+
6267+ /* 5) assign each task a service level */
6268+ list_for_each(p, &list) {
6269+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6270+ l = t->rt_param.no_service_levels;
6271+ w0 = est_weight_at(t, 0);
6272+ while (l > 1) {
6273+ l--;
6274+ wl = est_weight_at(t, l);
6275+ tmp = _sub(M, _sub(wl, w0));
6276+ if (_leq(FP(0), tmp)) {
6277+ /* this level fits in */
6278+ M = tmp;
6279+ t->rt_param.opt_level = l;
6280+ t->rt_param.opt_dw = _sub(wl,
6281+ get_est_weight(t));
6282+ t->rt_param.opt_nw = wl;
6283+ break; /* proceed to next task */
6284+ }
6285+ }
6286+ OPT_DBG_T(t, " will run at sl=%u, prior=%u dw=" _FP_ "\n",
6287+ l, get_cur_sl(t), fp2str(t->rt_param.opt_dw));
6288+
6289+ }
6290+
6291+ /* 6) filter tasks that reweight */
6292+ INIT_LIST_HEAD(&inc);
6293+ INIT_LIST_HEAD(&dec);
6294+ list_for_each_safe(p, extra, &list) {
6295+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6296+ list_del(p);
6297+ if (t->rt_param.opt_level < get_cur_sl(t)) {
6298+ list_add(p, &dec);
6299+ t->rt_param.opt_change = decrease_delay(t);
6300+ } else if (t->rt_param.opt_level > get_cur_sl(t)) {
6301+ list_add(p, &inc);
6302+ t->rt_param.opt_change = 0;
6303+ }
6304+ /* if t doesn't change we can ignore it from now on */
6305+ }
6306+
6307+ /* 7) sort dec and inc list */
6308+ list_qsort(&dec, by_enactment_time);
6309+ list_qsort(&inc, by_delta_weight);
6310+
6311+ /* 8) now figure out when we can enact weight increases
6312+ * It works like this: We know the current system utilization.
6313+ * Thus, we know the remaining capacity. We also know when
6314+ * decreases are going to be enacted (=> capacity increases).
6315+ * Now we only need to find a spot where the weight increase will
6316+ * not drive the system into overload.
6317+ */
6318+
6319+ /* Very ugly jump, but we need to force enactment_time = 0
6320+ * during the first iteration.
6321+ */
6322+ M = system_capacity;
6323+ enactment_time = 0;
6324+ goto first_iteration;
6325+
6326+ while (!list_empty(&inc)) {
6327+ enactment_time = list_entry(dec.next, struct task_struct,
6328+ rt_param.opt_list)
6329+ ->rt_param.opt_change;
6330+ first_iteration:
6331+ /* Start by collapsing the next decrease.
6332+ * Except for in the first iteration, it will always
6333+ * pick off at least one task.
6334+ */
6335+ list_for_each_safe(p, extra, &dec) {
6336+ t = list_entry(p, struct task_struct,
6337+ rt_param.opt_list);
6338+ if (t->rt_param.opt_change == enactment_time) {
6339+ list_del(p);
6340+ /* opt_dw is negative */
6341+ estU = _add(estU, t->rt_param.opt_dw);
6342+ list_add(p, &list);
6343+
6344+ OPT_DBG_T(t, " weight decrease at %ld => estU="
6345+ _FP_ "\n", enactment_time,
6346+ fp2str(estU));
6347+
6348+ } else
6349+ /* stop decrease loop */
6350+ break;
6351+ }
6352+
6353+ /* now start setting enactment times for increases */
6354+ while (!list_empty(&inc)) {
6355+ p = inc.next;
6356+ t = list_entry(p, struct task_struct,
6357+ rt_param.opt_list);
6358+ tmp = _add(estU, t->rt_param.opt_dw);
6359+ if (_leq(tmp, M)) {
6360+ /* it fits */
6361+ estU = tmp;
6362+ t->rt_param.opt_change = enactment_time;
6363+ list_del(p);
6364+ list_add(p, &list);
6365+
6366+ OPT_DBG_T(t, " weight increase at %ld => estU="
6367+ _FP_ "\n", enactment_time,
6368+ fp2str(estU));
6369+
6370+ } else
6371+ /* stop increase loop */
6372+ break;
6373+ }
6374+
6375+ TRACE_BUG_ON(list_empty(&dec) && !list_empty(&inc));
6376+ if (list_empty(&dec) && !list_empty(&inc))
6377+ /* break out in case of bug */
6378+ break;
6379+ }
6380+
6381+	/* 9) Wow. We made it. Every task now has a new service level
6382+	 *    assigned, together with a correct (earliest) enactment time.
6383+	 *    All we have left to do now is to enact changes that did not get
6384+	 *    delayed. Also convert change fields to actual timestamps to be
6385+	 *    nice to scheduler_tick().
6386+	 */
6387+ INIT_LIST_HEAD(&adaptive_inc_list);
6388+ list_for_each_safe(p, extra, &list) {
6389+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6390+ list_del(p);
6391+ change_weight(t);
6392+ }
6393+
6394+ last_optimizer_run = jiffies;
6395+ OPT_DBG(":::::: optimizer run complete\n");
6396+}
6397+
6398+/* update_cpu_position - Move the cpu entry to the correct place to maintain
6399+ * order in the cpu queue. Caller must hold adaptive lock.
6400+ */
6401+static void update_cpu_position(cpu_entry_t *entry)
6402+{
6403+ cpu_entry_t *other;
6404+ struct list_head *pos;
6405+ list_del(&entry->list);
6406+ /* if we do not execute real-time jobs we just move
6407+ * to the end of the queue
6408+ */
6409+ if (entry->linked) {
6410+ list_for_each(pos, &adaptive_cpu_queue) {
6411+ other = list_entry(pos, cpu_entry_t, list);
6412+ if (edf_higher_prio(entry->linked, other->linked)) {
6413+ __list_add(&entry->list, pos->prev, pos);
6414+ return;
6415+ }
6416+ }
6417+ }
6418+ /* if we get this far we have the lowest priority job */
6419+ list_add_tail(&entry->list, &adaptive_cpu_queue);
6420+}
6421+
6422+/* link_task_to_cpu - Update the link of a CPU.
6423+ * Handles the case where the to-be-linked task is already
6424+ * scheduled on a different CPU.
6425+ */
6426+static noinline void link_task_to_cpu(struct task_struct* linked,
6427+ cpu_entry_t *entry)
6428+
6429+{
6430+ cpu_entry_t *sched;
6431+ struct task_struct* tmp;
6432+ int on_cpu;
6433+
6434+ BUG_ON(linked && !is_realtime(linked));
6435+
6436+ /* Currently linked task is set to be unlinked. */
6437+ if (entry->linked)
6438+ entry->linked->rt_param.linked_on = NO_CPU;
6439+
6440+ /* Link new task to CPU. */
6441+ if (linked) {
6442+ set_rt_flags(linked, RT_F_RUNNING);
6443+ /* handle task is already scheduled somewhere! */
6444+ on_cpu = linked->rt_param.scheduled_on;
6445+ if (on_cpu != NO_CPU) {
6446+ sched = &per_cpu(adaptive_cpu_entries, on_cpu);
6447+ /* this should only happen if not linked already */
6448+ BUG_ON(sched->linked == linked);
6449+
6450+ /* If we are already scheduled on the CPU to which we
6451+ * wanted to link, we don't need to do the swap --
6452+ * we just link ourselves to the CPU and depend on
6453+ * the caller to get things right.
6454+ */
6455+ if (entry != sched) {
6456+ tmp = sched->linked;
6457+ linked->rt_param.linked_on = sched->cpu;
6458+ sched->linked = linked;
6459+ update_cpu_position(sched);
6460+ linked = tmp;
6461+ }
6462+ }
6463+ if (linked) /* might be NULL due to swap */
6464+ linked->rt_param.linked_on = entry->cpu;
6465+ }
6466+ entry->linked = linked;
6467+ update_cpu_position(entry);
6468+}
6469+
6470+/* unlink - Make sure a task is not linked any longer to an entry
6471+ * where it was linked before. Must hold adaptive_lock.
6472+ */
6473+static void unlink(struct task_struct* t)
6474+{
6475+ cpu_entry_t *entry;
6476+
6477+ if (unlikely(!t)) {
6478+ TRACE_BUG_ON(!t);
6479+ return;
6480+ }
6481+
6482+ if (t->rt_param.linked_on != NO_CPU) {
6483+ /* unlink */
6484+ entry = &per_cpu(adaptive_cpu_entries, t->rt_param.linked_on);
6485+ t->rt_param.linked_on = NO_CPU;
6486+ link_task_to_cpu(NULL, entry);
6487+ } else if (in_list(&t->rt_list)) {
6488+ /* This is an interesting situation: t is scheduled,
6489+ * but was just recently unlinked. It cannot be
6490+ * linked anywhere else (because then it would have
6491+ * been relinked to this CPU), thus it must be in some
6492+ * queue. We must remove it from the list in this
6493+ * case.
6494+ */
6495+ list_del(&t->rt_list);
6496+ }
6497+}
6498+
6499+
6500+/* preempt - force a CPU to reschedule
6501+ */
6502+static noinline void preempt(cpu_entry_t *entry)
6503+{
6504+ /* We cannot make the is_np() decision here if it is a remote CPU
6505+ * because requesting exit_np() requires that we currently use the
6506+ * address space of the task. Thus, in the remote case we just send
6507+ * the IPI and let schedule() handle the problem.
6508+ */
6509+
6510+ if (smp_processor_id() == entry->cpu) {
6511+ if (entry->scheduled && is_np(entry->scheduled))
6512+ request_exit_np(entry->scheduled);
6513+ else
6514+ set_tsk_need_resched(current);
6515+ } else
6516+		/* in case it is a remote CPU we have to defer
6517+		 * the decision to the remote CPU
6518+ */
6519+ if (!test_will_schedule(entry->cpu))
6520+ smp_send_reschedule(entry->cpu);
6521+}
6522+
6523+/* requeue - Put an unlinked task into the adaptive (GSN-EDF-based) domain.
6524+ * Caller must hold adaptive_lock.
6525+ */
6526+static noinline void requeue(struct task_struct* task)
6527+{
6528+ BUG_ON(!task);
6529+ /* sanity check rt_list before insertion */
6530+ BUG_ON(in_list(&task->rt_list));
6531+
6532+ if (get_rt_flags(task) == RT_F_SLEEP ||
6533+ get_rt_mode() != MODE_RT_RUN) {
6534+		/* this task has expired
6535+		 * _schedule has already taken care of updating
6536+		 * the release and deadline. We just need to
6537+		 * check if it has been released.
6538+		 */
6539+ if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
6540+ __add_ready(&adaptive, task);
6541+ else {
6542+ /* it has got to wait */
6543+ __add_release(&adaptive, task);
6544+ }
6545+
6546+ } else
6547+	} else
6548+		/* this is a forced preemption
6549+		 * thus the task stays in the ready_queue
6550+		 * we only need to make it available to others
6551+ __add_ready(&adaptive, task);
6552+}
6553+
6554+/* adaptive_job_arrival: task is either resumed or released */
6555+static void adaptive_job_arrival(struct task_struct* task)
6556+{
6557+ cpu_entry_t* last;
6558+
6559+ BUG_ON(list_empty(&adaptive_cpu_queue));
6560+ BUG_ON(!task);
6561+
6562+ TRACE_TASK(task, "job_arrival: last_rel=%d rel=%d dl=%d now=%d\n",
6563+ get_last_release(task), get_release(task),
6564+ get_deadline(task),
6565+ jiffies);
6566+
6567+
6568+ /* first queue arriving job */
6569+ requeue(task);
6570+
6571+ /* then check for any necessary preemptions */
6572+ last = list_entry(adaptive_cpu_queue.prev, cpu_entry_t, list);
6573+ if (edf_preemption_needed(&adaptive, last->linked)) {
6574+ /* preemption necessary */
6575+ task = __take_ready(&adaptive);
6576+
6577+ TRACE("job_arrival: task %d linked to %d\n",
6578+ task->pid, last->cpu);
6579+
6580+ if (last->linked)
6581+ requeue(last->linked);
6582+
6583+ link_task_to_cpu(task, last);
6584+ preempt(last);
6585+ }
6586+}
6587+
6588+/* check for current job releases */
6589+static noinline void adaptive_release_jobs(void)
6590+{
6591+ struct list_head *pos, *save;
6592+ struct task_struct *queued;
6593+
6594+ list_for_each_safe(pos, save, &adaptive.release_queue) {
6595+ queued = list_entry(pos, struct task_struct, rt_list);
6596+ if (likely(is_released(queued))) {
6597+ TRACE_TASK(queued, "released rel=%d now=%d\n",
6598+ get_release(queued), jiffies);
6599+ /* this one is ready to go*/
6600+ list_del(pos);
6601+ set_rt_flags(queued, RT_F_RUNNING);
6602+ queued->rt_param.times.last_release =
6603+ queued->rt_param.times.release;
6604+
6605+ /* check for delayed weight increase */
6606+ if (get_opt_sl(queued) != get_cur_sl(queued) &&
6607+ time_before_eq(queued->rt_param.opt_change, jiffies)) {
6608+ opt_time = jiffies;
6609+ set_service_level(queued, get_opt_sl(queued));
6610+ queued->rt_param.times.deadline =
6611+ get_last_release(queued) +
6612+ get_rt_period(queued);
6613+ total_weight = _sub(total_weight, get_est_weight(queued));
6614+ queued->rt_param.predictor_state.estimate =
6615+ queued->rt_param.opt_nw;
6616+ total_weight = _add(total_weight, get_est_weight(queued));
6617+ }
6618+
6619+ sched_trace_job_release(queued);
6620+ adaptive_job_arrival(queued);
6621+ }
6622+ else
6623+ /* the release queue is ordered */
6624+ break;
6625+ }
6626+}
6627+
6628+/* adaptive_scheduler_tick - this function is called for every local timer
6629+ * interrupt.
6630+ *
6631+ * checks whether the current task has expired and checks
6632+ * whether we need to preempt it if it has not expired
6633+ */
6634+static reschedule_check_t adaptive_scheduler_tick(void)
6635+{
6636+ unsigned long flags;
6637+ struct task_struct* t = current;
6638+ reschedule_check_t want_resched = NO_RESCHED;
6639+
6640+ /* Account for exec time.
6641+ * Since we don't preempt forcefully, nothing else needs to be done.
6642+ */
6643+ if (is_realtime(t))
6644+ t->rt_param.times.exec_time++;
6645+
6646+	/* job releases and optimizer runs only happen in real-time mode */
6647+ if (get_rt_mode() == MODE_RT_RUN) {
6648+ queue_lock_irqsave(&adaptive_lock, flags);
6649+
6650+ /* (1) run the optimizer if it did not trigger often enough */
6651+ if (time_before_eq(last_optimizer_run + optimizer_period, jiffies)) {
6652+
6653+ OPT_DBG("adaptive: optimizing due to period threshold\n");
6654+
6655+ adaptive_optimize();
6656+ }
6657+
6658+ /* (2) enact delayed weight increases */
6659+ delayed_increase_weight();
6660+
6661+ /* (3) try to release pending jobs */
6662+ adaptive_release_jobs();
6663+
6664+		/* we don't need to check linked != scheduled since
6665+		 * preempt() has already called set_tsk_need_resched() if necessary
6666+ */
6667+
6668+ queue_unlock_irqrestore(&adaptive_lock, flags);
6669+ }
6670+
6671+ return want_resched;
6672+}
6673+
6674+/* caller holds adaptive_lock */
6675+static noinline void job_completion(struct task_struct *t)
6676+{
6677+ long delta;
6678+ fp_t actual_weight, old_estimate;
6679+ unsigned int lcurr = get_cur_sl(t);
6680+ fp_t v = t->rt_param.service_level[lcurr].value;
6681+
6682+ int non_zero_weight;
6683+ fp_t error_percentage;
6684+ int exceeds_threshold;
6685+
6686+ BUG_ON(!t);
6687+
6688+ TRACE_TASK(t, " completion, last_rel=%d rel=%d dl=%d now=%d "
6689+ "period=%d\n",
6690+ get_last_release(t), get_release(t), get_deadline(t),
6691+ jiffies, get_rt_period(t));
6692+
6693+ sched_trace_job_completion(t);
6694+ delta = t->rt_param.times.exec_time -
6695+ t->rt_param.basic_params.exec_cost;
6696+
6697+ OPT_DBG_T(t, "job %d completes, delta WCET = %d\n",
6698+ t->rt_param.times.job_no, delta);
6699+
6700+ actual_weight = _frac(t->rt_param.times.exec_time,
6701+ t->rt_param.basic_params.period);
6702+ sched_trace_weight_error(t, actual_weight);
6703+ old_estimate = get_est_weight(t);
6704+ update_estimate(&t->rt_param.predictor_state, actual_weight,
6705+ fc_a, fc_b);
6706+
6707+ OPT_DBG_T(t, "Job %d completes. Current value " _FP_
6708+ ", Weight estimation: error=" _FP_ " weight="
6709+ _FP_ " => " _FP_ "\n",t->rt_param.times.job_no, v,
6710+ _sub(get_est_weight(t), old_estimate),
6711+ old_estimate, get_est_weight(t));
6712+
6713+ /* Now we have determined the task error.
6714+ * Next we release the next job.
6715+ * Then we optimize. It's easier for the optimizer to deal
6716+ * with just-released jobs.
6717+ */
6718+
6719+ /* prepare for next period */
6720+ edf_prepare_for_next_period(t);
6721+
6722+ TRACE_TASK(t, " prepped, last_rel=%d rel=%d dl=%d now=%d\n",
6723+ get_last_release(t), get_release(t), get_deadline(t),
6724+ jiffies);
6725+
6726+ if (is_released(t)) {
6727+ /* set flags */
6728+ /* prevent fake completions */
6729+ set_rt_flags(t, RT_F_RUNNING);
6730+ t->rt_param.times.last_release =
6731+ t->rt_param.times.release;
6732+ }
6733+
6734+
6735+ non_zero_weight = !_eq(get_est_weight(t),FP(0));
6736+ if (non_zero_weight)
6737+ error_percentage = _div(_abs(_sub(get_est_weight(t),
6738+ old_estimate)),
6739+ get_est_weight(t));
6740+ else
6741+ error_percentage = FP(0);
6742+ exceeds_threshold = _gt(error_percentage, task_error_threshold);
6743+
6744+
6745+ if (exceeds_threshold) {
6746+ OPT_DBG("adaptive: optimizing due to task error threshold\n");
6747+ adaptive_optimize();
6748+ } else if (_gt(total_weight, system_capacity)) {
6749+ OPT_DBG("adaptive: optimizing due to system capacity exceeded\n");
6750+ adaptive_optimize();
6751+ }
6752+
6753+
6754+ /* unlink */
6755+ unlink(t);
6756+ /* requeue
6757+ * But don't requeue a blocking task. */
6758+ if (is_running(t))
6759+ adaptive_job_arrival(t);
6760+}
6761+
6762+
6763+/* Getting schedule() right is a bit tricky. schedule() may not make any
6764+ * assumptions on the state of the current task since it may be called for a
6765+ * number of reasons. The reasons include a scheduler_tick() determined that it
6766+ * was necessary, because sys_exit_np() was called, because some Linux
6767+ * subsystem determined so, or even (in the worst case) because there is a bug
6768+ * hidden somewhere. Thus, we must take extreme care to determine what the
6769+ * current state is.
6770+ *
6771+ * The CPU could currently be scheduling a task (or not), be linked (or not).
6772+ *
6773+ * The following assertions for the scheduled task could hold:
6774+ *
6775+ * - !is_running(scheduled) // the job blocks
6776+ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
6777+ * - linked != scheduled // we need to reschedule (for any reason)
6778+ *
6779+ * Any of these can occur together.
6780+ */
6781+static int adaptive_schedule(struct task_struct * prev,
6782+ struct task_struct ** next,
6783+ runqueue_t * rq)
6784+{
6785+ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
6786+ int sleep, preempt, exists,
6787+ rt, blocks;
6788+ struct task_struct* linked;
6789+
6790+ /* Will be released in finish_switch. */
6791+ queue_lock(&adaptive_lock);
6792+ clear_will_schedule();
6793+
6794+ /* sanity checking */
6795+ BUG_ON(entry->scheduled && entry->scheduled != prev);
6796+ BUG_ON(entry->scheduled && !is_realtime(prev));
6797+
6798+ /* (0) Determine state */
6799+ exists = entry->scheduled != NULL;
6800+ blocks = exists && !is_running(entry->scheduled);
6801+ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
6802+ preempt = entry->scheduled != entry->linked;
6803+ rt = get_rt_mode() == MODE_RT_RUN;
6804+
6805+ /* If a task blocks we have no choice but to reschedule.
6806+ */
6807+ if (blocks)
6808+ unlink(entry->scheduled);
6809+
6810+ /* Task wants to sleep -> job is done.
6811+ */
6812+ if (sleep)
6813+ job_completion(entry->scheduled);
6814+
6815+ /* Stop real-time tasks when we leave real-time mode
6816+ */
6817+ if (!rt && entry->linked) {
6818+ /* task will be preempted once it is preemptable
6819+ * (which it may be already)
6820+ */
6821+ linked = entry->linked;
6822+ unlink(linked);
6823+ requeue(linked);
6824+ }
6825+
6826+ /* Link pending task if we became unlinked.
6827+ */
6828+ if (rt && !entry->linked)
6829+ link_task_to_cpu(__take_ready(&adaptive), entry);
6830+
6831+ /* The final scheduling decision. Do we need to switch for some reason?
6832+	 * If linked is different from scheduled, select linked as next.
6833+ */
6834+ if (entry->linked != entry->scheduled) {
6835+ /* Take care of a previously scheduled
6836+ * job by taking it out of the Linux runqueue.
6837+ */
6838+ if (entry->scheduled)
6839+ if (prev->array)
6840+ /* take it out of the run queue */
6841+ deactivate_task(prev, rq);
6842+
6843+ /* Schedule a linked job? */
6844+ if (entry->linked) {
6845+ *next = entry->linked;
6846+ /* mark the task as executing on this cpu */
6847+ set_task_cpu(*next, smp_processor_id());
6848+ /* stick the task into the runqueue */
6849+ __activate_task(*next, rq);
6850+ }
6851+ } else
6852+ /* Only override Linux scheduler if we have real-time task
6853+ * scheduled that needs to continue.
6854+ */
6855+ if (exists)
6856+ *next = prev;
6857+
6858+ /* Unlock in case that we don't affect real-time tasks or
6859+ * if nothing changed and finish_switch won't be called.
6860+ */
6861+ if (prev == *next || (!is_realtime(prev) && !*next))
6862+ queue_unlock(&adaptive_lock);
6863+
6864+ return 0;
6865+}
6866+
6867+
6868+/* _finish_switch - we just finished the switch away from prev
6869+ */
6870+static void adaptive_finish_switch(struct task_struct *prev)
6871+{
6872+ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
6873+
6874+ if (is_realtime(current))
6875+ entry->scheduled = current;
6876+ else
6877+ entry->scheduled = NULL;
6878+
6879+ prev->rt_param.scheduled_on = NO_CPU;
6880+ current->rt_param.scheduled_on = smp_processor_id();
6881+
6882+ /* unlock in case schedule() left it locked */
6883+ if (is_realtime(current) || is_realtime(prev))
6884+ queue_unlock(&adaptive_lock);
6885+}
6886+
6887+
6888+/* Prepare a task for running in RT mode.
6889+ * Enqueues the task into the master queue data structure.
6890+ * Returns
6891+ * -EPERM if the task is not TASK_STOPPED.
6892+ */
6893+static long adaptive_prepare_task(struct task_struct * t)
6894+{
6895+ unsigned long flags;
6896+
6897+ TRACE("adaptive: prepare task %d\n", t->pid);
6898+
6899+ if (t->state == TASK_STOPPED) {
6900+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
6901+
6902+ t->rt_param.scheduled_on = NO_CPU;
6903+ t->rt_param.linked_on = NO_CPU;
6904+ if (t->rt_param.no_service_levels) {
6905+ t->rt_param.predictor_state.estimate =
6906+ get_sl(t, 0).weight;
6907+ } else
6908+ t->rt_param.predictor_state.estimate =
6909+ _frac(get_exec_cost(t), get_rt_period(t));
6910+
6911+ TRACE_TASK(t, "est_weight=" _FP_ "\n", get_est_weight(t));
6912+
6913+ if (get_rt_mode() == MODE_RT_RUN)
6914+			/* Real-time mode is already active.
6915+			 * Prepare an immediate release.
6916+ */
6917+ edf_release_now(t);
6918+ /* The task should be running in the queue, otherwise signal
6919+ * code will try to wake it up with fatal consequences.
6920+ */
6921+ t->state = TASK_RUNNING;
6922+
6923+ queue_lock_irqsave(&adaptive_lock, flags);
6924+ total_weight = _add(total_weight, get_est_weight(t));
6925+ requeue(t);
6926+ queue_unlock_irqrestore(&adaptive_lock, flags);
6927+ return 0;
6928+ }
6929+ else
6930+ return -EPERM;
6931+}
6932+
6933+static void adaptive_wake_up_task(struct task_struct *task)
6934+{
6935+ unsigned long flags;
6936+ /* We must determine whether task should go into the release
6937+ * queue or into the ready queue. It may enter the ready queue
6938+ * if it has credit left in its time slice and has not yet reached
6939+	 * its deadline. If it is now past its deadline we assume this is the
6940+	 * arrival of a new sporadic job and thus put it in the ready queue
6941+	 * anyway. If it has zero budget and the next release is in the future
6942+ * it has to go to the release queue.
6943+ */
6944+
6945+ TRACE("adaptive: %d unsuspends\n", task->pid);
6946+
6947+ task->state = TASK_RUNNING;
6948+
6949+ if (is_tardy(task)) {
6950+ /* new sporadic release */
6951+ edf_release_now(task);
6952+ sched_trace_job_release(task);
6953+ }
6954+ else if (task->time_slice)
6955+ /* came back in time before deadline */
6956+ set_rt_flags(task, RT_F_RUNNING);
6957+
6958+ queue_lock_irqsave(&adaptive_lock, flags);
6959+ total_weight = _add(total_weight, get_est_weight(task));
6960+ adaptive_job_arrival(task);
6961+ queue_unlock_irqrestore(&adaptive_lock, flags);
6962+}
6963+
6964+static void adaptive_task_blocks(struct task_struct *t)
6965+{
6966+ unsigned long flags;
6967+
6968+ /* unlink if necessary */
6969+ queue_lock_irqsave(&adaptive_lock, flags);
6970+ total_weight = _sub(total_weight, get_est_weight(t));
6971+ unlink(t);
6972+ queue_unlock_irqrestore(&adaptive_lock, flags);
6973+
6974+ BUG_ON(!is_realtime(t));
6975+
6976+ TRACE("task %d suspends\n", t->pid);
6977+
6978+ BUG_ON(t->rt_list.next != LIST_POISON1);
6979+ BUG_ON(t->rt_list.prev != LIST_POISON2);
6980+}
6981+
6982+
6983+/* When _tear_down is called, the task should not be in any queue any more
6984+ * as it must have blocked first. We don't have any internal state for the task;
6985+ * it is all in the task_struct.
6986+ */
6987+static long adaptive_tear_down(struct task_struct * t)
6988+{
6989+ BUG_ON(!is_realtime(t));
6990+ TRACE_TASK(t, "RIP\n");
6991+ BUG_ON(t->array);
6992+ BUG_ON(t->rt_list.next != LIST_POISON1);
6993+ BUG_ON(t->rt_list.prev != LIST_POISON2);
6994+ return 0;
6995+}
6996+
6997+static int adaptive_mode_change(int new_mode)
6998+{
6999+ unsigned long flags;
7000+ int cpu;
7001+ cpu_entry_t *entry;
7002+ struct task_struct* t;
7003+ struct list_head* pos;
7004+
7005+ if (new_mode == MODE_RT_RUN) {
7006+ queue_lock_irqsave(&adaptive_lock, flags);
7007+
7008+ system_capacity = FP(0);
7009+ for_each_online_cpu(cpu)
7010+ system_capacity = _add(system_capacity, FP(1));
7011+
7012+ __rerelease_all(&adaptive, edf_release_at);
7013+
7014+ total_weight = FP(0);
7015+ list_for_each(pos, &adaptive.release_queue) {
7016+ t = list_entry(pos, struct task_struct, rt_list);
7017+ total_weight = _add(total_weight, get_est_weight(t));
7018+ }
7019+ TRACE("adaptive: total weight: " _FP_
7020+ " (at mode change)\n", total_weight);
7021+
7022+
7023+ /* get old cruft out of the way in case we reenter real-time
7024+ * mode for a second time
7025+ */
7026+ while (!list_empty(&adaptive_cpu_queue))
7027+ list_del(adaptive_cpu_queue.next);
7028+ /* reinitialize */
7029+ for_each_online_cpu(cpu) {
7030+ entry = &per_cpu(adaptive_cpu_entries, cpu);
7031+ atomic_set(&entry->will_schedule, 0);
7032+ entry->linked = NULL;
7033+ entry->scheduled = NULL;
7034+ list_add(&entry->list, &adaptive_cpu_queue);
7035+ }
7036+
7037+ adaptive_optimize();
7038+
7039+ queue_unlock_irqrestore(&adaptive_lock, flags);
7040+
7041+ }
7042+ return 0;
7043+}
7044+
7045+
7046+typedef enum {
7047+ ADAPTIVE_SET_MIN_OPT_SEP = 1
7048+} adaptive_cmds_t;
7049+
7050+
7051+static int adaptive_setup(int cmd, void __user *up)
7052+{
7053+ unsigned int error = -EINVAL;
7054+ unsigned int val;
7055+
7056+ if (copy_from_user(&val, up, sizeof(unsigned int))) {
7057+ error = -EFAULT;
7058+ goto out;
7059+ }
7060+
7061+ switch (cmd) {
7062+ case ADAPTIVE_SET_MIN_OPT_SEP:
7063+ optimizer_min_invocation_sep = val;
7064+ TRACE("adaptive: min opt sep set to %d\n",
7065+ optimizer_min_invocation_sep);
7066+ return 0;
7067+ break;
7068+ }
7069+
7070+out:
7071+ return error;
7072+}
7073+
7074+
7075+/* Plugin object */
7076+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
7077+ .ready_to_use = 0
7078+};
7079+
7080+
7081+/*
7082+ * Plugin initialization code.
7083+ */
7084+#define INIT_SCHED_PLUGIN (struct sched_plugin){ \
7085+ .plugin_name = "ADAPTIVE", \
7086+ .ready_to_use = 1, \
7087+ .scheduler_tick = adaptive_scheduler_tick, \
7088+ .prepare_task = adaptive_prepare_task, \
7089+ .sleep_next_period = edf_sleep_next_period, \
7090+ .tear_down = adaptive_tear_down, \
7091+ .schedule = adaptive_schedule, \
7092+ .finish_switch = adaptive_finish_switch, \
7093+ .mode_change = adaptive_mode_change, \
7094+ .wake_up_task = adaptive_wake_up_task, \
7095+ .task_blocks = adaptive_task_blocks, \
7096+ .scheduler_setup = adaptive_setup \
7097+}
7098+
7099+
7100+sched_plugin_t *__init init_adaptive_plugin(void)
7101+{
7102+ int cpu;
7103+ cpu_entry_t *entry;
7104+
7105+ /* magic values given in the paper */
7106+ fc_a = _frac( 102, 1000);
7107+ fc_b = _frac( 303, 1000);
7108+
7109+ optimizer_period = 1000;
7110+ optimizer_min_invocation_sep = 200;
7111+ task_error_threshold = _frac(1, 2);
7112+
7113+ if (!s_plugin.ready_to_use)
7114+ {
7115+ /* initialize CPU state */
7116+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
7117+ entry = &per_cpu(adaptive_cpu_entries, cpu);
7118+ atomic_set(&entry->will_schedule, 0);
7119+ entry->linked = NULL;
7120+ entry->scheduled = NULL;
7121+ entry->cpu = cpu;
7122+ }
7123+
7124+ queue_lock_init(&adaptive_lock);
7125+ edf_domain_init(&adaptive, NULL);
7126+ s_plugin = INIT_SCHED_PLUGIN;
7127+ }
7128+ return &s_plugin;
7129+}
7130+
7131+
7132diff --git a/kernel/sched_edf_hsb.c b/kernel/sched_edf_hsb.c
7133new file mode 100644
7134index 0000000..b888e17
7135--- /dev/null
7136+++ b/kernel/sched_edf_hsb.c
7137@@ -0,0 +1,1724 @@
7138+/*
7139+ * kernel/sched_edf_hsb.c
7140+ *
7141+ * Implementation of the EDF-HSB scheduler plugin.
7142+ *
7143+ */
7144+
7145+#include <asm/uaccess.h>
7146+#include <linux/percpu.h>
7147+#include <linux/sched.h>
7148+#include <linux/list.h>
7149+
7150+#include <linux/litmus.h>
7151+#include <linux/sched_plugin.h>
7152+#include <linux/edf_common.h>
7153+#include <linux/fifo_common.h>
7154+#include <linux/sched_trace.h>
7155+
7156+/* undefine to remove capacity sharing */
7157+#define HSB_CAP_SHARE_ENABLED
7158+
7159+/* fake server PIDs */
7160+#define HRT_BASE_PID 50000
7161+#define SRT_BASE_PID 60000
7162+
7163+
7164+/******************************************************************************/
7165+/* Capacity queue */
7166+/******************************************************************************/
7167+
7168+int cap_check_resched(jiffie_t deadline);
7169+
7170+typedef struct {
7171+ int budget;
7172+ jiffie_t deadline;
7173+ pid_t donor;
7174+
7175+ struct list_head list;
7176+} capacity_t;
7177+
7178+typedef struct {
7179+ spinlock_t lock;
7180+ struct list_head queue;
7181+} capacity_queue_t;
7182+
7183+#define next_cap(q) list_entry((q)->queue.next, capacity_t, list)
7184+
7185+void capacity_queue_init(capacity_queue_t* queue)
7186+{
7187+ queue->lock = SPIN_LOCK_UNLOCKED;
7188+ INIT_LIST_HEAD(&queue->queue);
7189+}
7190+
7191+void __add_capacity(capacity_queue_t* queue, capacity_t *cap)
7192+{
7193+ struct list_head* pos;
7194+ capacity_t* queued;
7195+
7196+ list_for_each_prev(pos, &queue->queue) {
7197+ queued = list_entry(pos, capacity_t, list);
7198+ if ( time_before_eq(queued->deadline, cap->deadline)) {
7199+ __list_add(&cap->list, pos, pos->next);
7200+ return;
7201+ }
7202+ }
7203+ list_add(&cap->list, &queue->queue);
7204+}
7205+
7206+int __capacity_available(capacity_queue_t* queue)
7207+{
7208+ capacity_t *cap;
7209+
7210+ while (!list_empty(&queue->queue)) {
7211+ cap = list_entry(queue->queue.next, capacity_t, list);
7212+
7213+
7214+ if (time_before_eq(cap->deadline, jiffies)) {
7215+ list_del(queue->queue.next);
7216+ kfree(cap);
7217+ cap = NULL;
7218+ } else
7219+ break;
7220+ }
7221+
7222+ return !list_empty(&queue->queue);
7223+}
7224+
7225+void __return_capacity(capacity_queue_t* queue, capacity_t *cap)
7226+{
7227+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
7228+ kfree(cap);
7229+ else
7230+ __add_capacity(queue, cap);
7231+}
7232+
7233+
7234+void return_capacity(capacity_queue_t* queue, capacity_t *cap)
7235+
7236+{
7237+ unsigned long flags;
7238+
7239+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
7240+ kfree(cap);
7241+ else {
7242+ spin_lock_irqsave(&queue->lock, flags);
7243+ __add_capacity(queue, cap);
7244+ spin_unlock_irqrestore(&queue->lock, flags);
7245+ }
7246+}
7247+
7248+
7249+#define MIN_TIME_DELTA 1
7250+#define MIN_BUDGET 1
7251+
7252+#ifdef HSB_CAP_SHARE_ENABLED
7253+void release_capacity(capacity_queue_t* queue, unsigned int budget,
7254+ jiffie_t deadline, struct task_struct* t)
7255+{
7256+ capacity_t* cap;
7257+ unsigned long flags;
7258+
7259+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
7260+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
7261+ if (cap) {
7262+ cap->budget = budget;
7263+ cap->deadline = deadline;
7264+ if (t)
7265+ cap->donor = t->pid;
7266+ else
7267+ cap->donor = 0;
7268+ spin_lock_irqsave(&queue->lock, flags);
7269+ __add_capacity(queue, cap);
7270+ cap_check_resched(next_cap(queue)->deadline);
7271+ spin_unlock_irqrestore(&queue->lock, flags);
7272+ if (t)
7273+ sched_trace_capacity_release(t);
7274+ }
7275+ }
7276+}
7277+
7278+void __release_capacity(capacity_queue_t* queue, unsigned int budget,
7279+ jiffie_t deadline, struct task_struct* t)
7280+{
7281+ capacity_t* cap;
7282+
7283+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
7284+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
7285+ if (cap) {
7286+ cap->budget = budget;
7287+ cap->deadline = deadline;
7288+ if (t)
7289+ cap->donor = t->pid;
7290+ else
7291+ cap->donor = 0;
7292+ /* no locking, no resched check -- called from schedule */
7293+ __add_capacity(queue, cap);
7294+ if (t)
7295+ sched_trace_capacity_release(t);
7296+ }
7297+ }
7298+}
7299+
7300+
7301+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
7302+{
7303+ capacity_t* cap = NULL;
7304+
7305+ while (!list_empty(&queue->queue)) {
7306+ cap = list_entry(queue->queue.next, capacity_t, list);
7307+
7308+ if (deadline_matters && time_before(deadline, cap->deadline)) {
7309+ cap = NULL;
7310+ break;
7311+ }
7312+
7313+ list_del(queue->queue.next);
7314+ if (cap->deadline > jiffies) {
7315+ if (cap->deadline - jiffies < cap->budget)
7316+ cap->budget = cap->deadline - jiffies;
7317+ break;
7318+ }
7319+ kfree(cap);
7320+ cap = NULL;
7321+ }
7322+
7323+ return cap;
7324+}
7325+#else
7326+
7327+/* no capacity sharing */
7328+void release_capacity(capacity_queue_t* queue, unsigned int budget,
7329+ jiffie_t deadline, struct task_struct* t)
7330+{
7331+}
7332+
7333+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
7334+{
7335+ return NULL;
7336+}
7337+#endif
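/* Illustrative sketch in plain C, not part of the patch: the budget
 * clamp applied by __take_capacity() above.  A donated capacity can
 * never be consumed past its deadline, so the usable budget is capped
 * by the time remaining until that deadline (and is zero once expired).
 */
#include <stdio.h>

static int usable_budget(long deadline, long now, int budget)
{
	long remaining = deadline - now;

	if (remaining <= 0)
		return 0;                /* expired: would be freed */
	return remaining < budget ? (int)remaining : budget;
}

int main(void)
{
	printf("%d\n", usable_budget(108, 100, 5));  /* 5: fits    */
	printf("%d\n", usable_budget(103, 100, 5));  /* 3: clamped */
	printf("%d\n", usable_budget( 99, 100, 5));  /* 0: expired */
	return 0;
}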
7338+
7339+
7340+/******************************************************************************/
7341+/* server abstractions */
7342+/******************************************************************************/
7343+
7344+
7345+/* hrt_server_t - Abstraction of a hard real-time server.
7346+ *
7347+ * One HRT server per CPU. If it is unused, period and wcet may be zero.
7348+ * HRT servers are strictly periodic and retain their budget.
7349+ */
7350+typedef struct {
7351+ rt_domain_t domain;
7352+
7353+ unsigned int period;
7354+ unsigned int wcet;
7355+
7356+ jiffie_t deadline;
7357+ int budget;
7358+} hrt_server_t;
7359+
7360+/* be_server_t - Abstraction of best-effort server.
7361+ *
7362+ * This is pretty much only an accounting abstraction.
7363+ */
7364+typedef struct {
7365+ unsigned int period;
7366+ unsigned int wcet;
7367+
7368+ jiffie_t deadline;
7369+ jiffie_t release;
7370+ int budget;
7371+
7372+ struct list_head list;
7373+ pid_t pid;
7374+} be_server_t;
7375+
7376+/* cast to int to allow for negative slack, i.e. tardiness */
7377+#define server_slack(srv) \
7378+ ( ((int) (srv)->deadline - (int) jiffies) - (int) (srv)->budget )
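/* Illustrative sketch in plain C, not part of the patch: the slack
 * computation above.  A server whose deadline is 10 jiffies away with
 * 4 jiffies of budget left has 6 jiffies of slack; negative slack means
 * the remaining budget no longer fits before the deadline.
 */
#include <stdio.h>

static int slack(long deadline, long now, int budget)
{
	return (int)(deadline - now) - budget;
}

int main(void)
{
	printf("%d\n", slack(110, 100, 4));   /* prints 6  */
	printf("%d\n", slack(103, 100, 4));   /* prints -1 */
	return 0;
}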
7379+
7380+typedef struct {
7381+ int cpu;
7382+
7383+ hrt_server_t hrt;
7384+ be_server_t* be;
7385+ capacity_t* cap;
7386+
7387+ task_class_t exec_class;
7388+ jiffie_t cur_deadline;
7389+ atomic_t will_schedule;
7390+
7391+ struct list_head list;
7392+ spinlock_t lock;
7393+} cpu_state_t;
7394+
7395+
7396+DEFINE_PER_CPU(cpu_state_t, hsb_cpu_state);
7397+
7398+#define hrt_dom(cpu) (&per_cpu(hsb_cpu_state, cpu).hrt.domain)
7399+
7400+#define set_will_schedule() \
7401+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 1))
7402+#define clear_will_schedule() \
7403+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 0))
7404+#define test_will_schedule(cpu) \
7405+ (atomic_read(&per_cpu(hsb_cpu_state, cpu).will_schedule))
7406+
7407+
7408+static void prepare_hrt_release(hrt_server_t *srv, jiffie_t start)
7409+{
7410+ if (srv->period && srv->wcet) {
7411+ srv->deadline = start;
7412+ srv->budget = 0;
7413+ }
7414+}
7415+
7416+static void check_for_hrt_release(hrt_server_t *srv) {
7417+ if (srv->wcet && srv->period &&
7418+ time_before_eq(srv->deadline, jiffies)) {
7419+ srv->deadline += srv->period;
7420+ srv->budget = srv->wcet;
7421+ sched_trace_server_release(HRT_BASE_PID + smp_processor_id(),
7422+ srv->budget, srv->period, RT_CLASS_HARD);
7423+ }
7424+}
7425+
7426+/* A HRT client is eligible if either its deadline is before the
7427+ * server deadline or if the server has no slack left. The server
7428+ * must have budget left.
7429+ */
7430+static inline int hrt_client_eligible(hrt_server_t *srv)
7431+{
7432+ if (!list_empty(&srv->domain.ready_queue))
7433+ return srv->budget && (
7434+ time_before(get_deadline(next_ready(&srv->domain)),
7435+ srv->deadline)
7436+ || server_slack(srv) <= 0);
7437+ else
7438+ return 0;
7439+}
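/* Illustrative sketch in plain C, not part of the patch: the HRT
 * eligibility rule above with plain types.  The client may run only if
 * the server has budget and either the client's deadline precedes the
 * server's or the server has no slack left.
 */
#include <stdio.h>

static int eligible(int budget, long client_dl, long server_dl, int slack)
{
	return budget && (client_dl < server_dl || slack <= 0);
}

int main(void)
{
	printf("%d\n", eligible(3, 105, 110, 2));  /* earlier deadline: 1 */
	printf("%d\n", eligible(3, 115, 110, 0));  /* no slack left:    1 */
	printf("%d\n", eligible(0, 105, 110, 0));  /* no budget:        0 */
	return 0;
}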
7440+
7441+static void hsb_cpu_state_init(cpu_state_t* cpu_state,
7442+ check_resched_needed_t check,
7443+ int cpu)
7444+{
7445+ edf_domain_init(&cpu_state->hrt.domain, check);
7446+ cpu_state->hrt.budget = 0;
7447+ cpu_state->hrt.deadline = 0;
7448+ cpu_state->hrt.period = 0;
7449+ cpu_state->hrt.wcet = 0;
7450+
7451+ cpu_state->be = NULL;
7452+ cpu_state->cap = NULL;
7453+
7454+ cpu_state->cur_deadline = 0;
7455+ cpu_state->cpu = cpu;
7456+ cpu_state->lock = SPIN_LOCK_UNLOCKED;
7457+ cpu_state->exec_class = RT_CLASS_BEST_EFFORT;
7458+
7459+ atomic_set(&cpu_state->will_schedule, 0);
7460+ INIT_LIST_HEAD(&cpu_state->list);
7461+}
7462+
7463+/******************************************************************************/
7464+/* BE queue functions - mostly like edf_common.c */
7465+/******************************************************************************/
7466+
7467+#define be_earlier_deadline(a, b) (time_before(\
7468+ (a)->deadline, (b)->deadline))
7469+#define be_earlier_release(a, b) (time_before(\
7470+ (a)->release, (b)->release))
7471+
7472+
7473+static void be_add_ready(rt_domain_t* edf, be_server_t *new)
7474+{
7475+ unsigned long flags;
7476+ struct list_head *pos;
7477+ be_server_t *queued;
7478+ unsigned int passed = 0;
7479+
7480+ BUG_ON(!new);
7481+ /* first we need the write lock for rt_ready_queue */
7482+ write_lock_irqsave(&edf->ready_lock, flags);
7483+ /* find a spot where our deadline is earlier than the next */
7484+ list_for_each(pos, &edf->ready_queue) {
7485+ queued = list_entry(pos, be_server_t, list);
7486+ if (unlikely(be_earlier_deadline(new, queued))) {
7487+ __list_add(&new->list, pos->prev, pos);
7488+ goto out;
7489+ }
7490+ passed++;
7491+ }
7492+ /* if we get to this point either the list is empty or new has the
7493+ * lowest priority. Let's add it to the end. */
7494+ list_add_tail(&new->list, &edf->ready_queue);
7495+ out:
7496+ if (!passed)
7497+ edf->check_resched(edf);
7498+ write_unlock_irqrestore(&edf->ready_lock, flags);
7499+}
7500+
7501+static be_server_t* be_take_ready(rt_domain_t* edf)
7502+{
7503+ be_server_t *t = NULL;
7504+
7505+ if (!list_empty(&edf->ready_queue)) {
7506+ t = list_entry(edf->ready_queue.next, be_server_t, list);
7507+ /* kick it out of the ready list */
7508+ list_del(&t->list);
7509+ }
7510+ return t;
7511+}
7512+
7513+/*static be_server_t* get_be_server(rt_domain_t* edf)
7514+{
7515+ be_server_t *t = NULL;
7516+
7517+ spin_lock(&edf->release_lock);
7518+ write_lock(&edf->ready_lock);
7519+ t = be_take_ready(edf);
7520+
7521+ if (!t && !list_empty(&edf->release_queue)) {
7522+ t = list_entry(edf->release_queue.next, be_server_t, list);
7523+
7524+ list_del(&t->list);
7525+ }
7526+
7527+ write_unlock(&edf->ready_lock);
7528+ spin_unlock(&edf->release_lock);
7529+ return t;
7530+}*/
7531+
7532+static void be_add_release(rt_domain_t* edf, be_server_t *srv)
7533+{
7534+ unsigned long flags;
7535+ struct list_head *pos;
7536+ be_server_t *queued;
7537+
7538+ spin_lock_irqsave(&edf->release_lock, flags);
7539+ list_for_each_prev(pos, &edf->release_queue) {
7540+ queued = list_entry(pos, be_server_t, list);
7541+ if ((unlikely(be_earlier_release(queued, srv)))) {
7542+ /* the server at pos has an earlier release */
7543+ /* insert the new server behind it */
7544+ __list_add(&srv->list, pos, pos->next);
7545+ goto out;
7546+ }
7547+ }
7548+
7549+ list_add(&srv->list, &edf->release_queue);
7550+ out:
7551+ spin_unlock_irqrestore(&edf->release_lock, flags);
7552+}
7553+
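+/* be_try_release_pending - move all BE servers whose release time has been
+ * reached from the (release-ordered) release queue to the ready queue. Uses
+ * a trylock so that the caller never spins on the release lock.
+ */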
7554+static void be_try_release_pending(rt_domain_t* edf)
7555+{
7556+ unsigned long flags;
7557+ struct list_head *pos, *save;
7558+ be_server_t *queued;
7559+
7560+ if (spin_trylock_irqsave(&edf->release_lock, flags)) {
7561+ list_for_each_safe(pos, save, &edf->release_queue) {
7562+ queued = list_entry(pos, be_server_t, list);
7563+ if (likely(time_before_eq(
7564+ queued->release,
7565+ jiffies))) {
7566+ list_del(pos);
7567+ be_add_ready(edf, queued);
7568+ sched_trace_server_release(
7569+ queued->pid, queued->budget,
7570+ queued->period, RT_CLASS_BEST_EFFORT);
7571+ } else
7572+ /* the release queue is ordered */
7573+ break;
7574+ }
7575+ spin_unlock_irqrestore(&edf->release_lock, flags);
7576+ }
7577+}
7578+
7579+static void be_prepare_new_release(be_server_t *t, jiffie_t start) {
7580+ t->release = start;
7581+ t->deadline = t->release + t->period;
7582+ t->budget = t->wcet;
7583+}
7584+
7585+static void be_prepare_new_releases(rt_domain_t *edf, jiffie_t start)
7586+{
7587+ unsigned long flags;
7588+ struct list_head tmp_list;
7589+ struct list_head *pos, *n;
7590+ be_server_t *t;
7591+
7592+ INIT_LIST_HEAD(&tmp_list);
7593+
7594+ spin_lock_irqsave(&edf->release_lock, flags);
7595+ write_lock(&edf->ready_lock);
7596+
7597+
7598+ while (!list_empty(&edf->release_queue)) {
7599+ pos = edf->release_queue.next;
7600+ list_del(pos);
7601+ list_add(pos, &tmp_list);
7602+ }
7603+
7604+ while (!list_empty(&edf->ready_queue)) {
7605+ pos = edf->ready_queue.next;
7606+ list_del(pos);
7607+ list_add(pos, &tmp_list);
7608+
7609+ }
7610+
7611+ write_unlock(&edf->ready_lock);
7612+ spin_unlock_irqrestore(&edf->release_lock, flags);
7613+
7614+ list_for_each_safe(pos, n, &tmp_list) {
7615+ t = list_entry(pos, be_server_t, list);
7616+ list_del(pos);
7617+ be_prepare_new_release(t, start);
7618+ be_add_release(edf, t);
7619+ }
7620+
7621+}
7622+
7623+static void be_prepare_for_next_period(be_server_t *t)
7624+{
7625+ BUG_ON(!t);
7626+ /* prepare next release */
7627+ t->release = t->deadline;
7628+ t->deadline += t->period;
7629+ t->budget = t->wcet;
7630+}
7631+
7632+#define be_next_ready(edf) \
7633+ list_entry((edf)->ready_queue.next, be_server_t, list)
7634+
7635+
7636+/* be_preemption_needed - check whether the work currently executing on this
7637+ * CPU needs to be preempted by a pending best-effort server.
7638+ */
7639+static inline int be_preemption_needed(rt_domain_t* edf, cpu_state_t* state)
7640+{
7641+ /* we need the read lock for rt_ready_queue */
7642+ if (!list_empty(&edf->ready_queue))
7643+ {
7644+
7645+ if (state->exec_class == RT_CLASS_SOFT) {
7646+ if (state->cap)
7647+ return time_before(
7648+ be_next_ready(edf)->deadline,
7649+ state->cap->deadline);
7650+ else
7651+ return time_before(
7652+ be_next_ready(edf)->deadline,
7653+ state->cur_deadline);
7654+ } else
7655+ return 1;
7656+ }
7657+ return 0;
7658+}
7659+
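+/* be_enqueue - (re)insert a BE server into the proper queue: if its budget is
+ * exhausted it is first advanced to its next period; a server whose release
+ * time has been reached goes to the ready queue (while in real-time mode),
+ * otherwise it goes to the release queue.
+ */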
7660+static void be_enqueue(rt_domain_t* edf, be_server_t* srv)
7661+{
7662+ int new_release = 0;
7663+ if (!srv->budget) {
7664+ be_prepare_for_next_period(srv);
7665+ new_release = 1;
7666+ }
7667+
7668+ if (time_before_eq(srv->release, jiffies) &&
7669+ get_rt_mode() == MODE_RT_RUN) {
7670+ be_add_ready(edf, srv);
7671+ if (new_release)
7672+ sched_trace_server_release(
7673+ srv->pid, srv->budget,
7674+ srv->period, RT_CLASS_BEST_EFFORT);
7675+ } else
7676+ be_add_release(edf, srv);
7677+}
7678+
7679+static void be_preempt(rt_domain_t *be, cpu_state_t *state)
7680+{
7681+ be_server_t *srv;
7682+
7683+ spin_lock(&state->lock);
7684+ srv = state->be;
7685+ state->be = NULL;
7686+ spin_unlock(&state->lock);
7687+
7688+ /* add outside of lock to avoid deadlock */
7689+ if (srv)
7690+ be_enqueue(be, srv);
7691+}
7692+
7693+
7694+/******************************************************************************/
7695+/* Actual HSB implementation */
7696+/******************************************************************************/
7697+
7698+/* always acquire the cpu lock as the last lock to avoid deadlocks */
7699+static spinlock_t hsb_cpu_lock = SPIN_LOCK_UNLOCKED;
7700+/* the cpus queue themselves according to priority in here */
7701+static LIST_HEAD(hsb_cpu_queue);
7702+
7703+
7704+/* the global soft real-time domain */
7705+static rt_domain_t srt;
7706+/* the global best-effort server domain
7707+ * belongs conceptually to the srt domain, but has
7708+ * be_server_t* queued instead of task_t*
7709+ */
7710+static rt_domain_t be;
7711+
7712+static rt_domain_t hsb_fifo;
7713+
7714+static capacity_queue_t cap_queue;
7715+
7716+
7717+
7718+
7719+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
7720+ * order in the cpu queue.
7721+ *
7722+ */
7723+static void adjust_cpu_queue(task_class_t class, jiffie_t deadline,
7724+ be_server_t *be)
7725+{
7726+ struct list_head *pos;
7727+ cpu_state_t *other;
7728+ cpu_state_t *entry;
7729+
7730+ spin_lock(&hsb_cpu_lock);
7731+
7732+ entry = &__get_cpu_var(hsb_cpu_state);
7733+
7734+ spin_lock(&entry->lock);
7735+ entry->exec_class = class;
7736+ entry->cur_deadline = deadline;
7737+ entry->be = be;
7738+
7739+ spin_unlock(&entry->lock);
7740+
7741+
7742+
7743+ if (be)
7744+ sched_trace_server_scheduled(
7745+ be->pid, RT_CLASS_BEST_EFFORT, be->budget,
7746+ be->deadline);
7747+ else if (class == RT_CLASS_HARD)
7748+ sched_trace_server_scheduled(
7749+ HRT_BASE_PID + smp_processor_id(), RT_CLASS_HARD,
7750+ entry->hrt.budget, entry->hrt.deadline);
7751+
7752+ list_del(&entry->list);
7753+ /* If we do not execute real-time jobs we just move
7754+ * to the end of the queue.
7755+ * If we execute hard real-time jobs we move to the start
7756+ * of the queue.
7757+ */
7758+
7759+ switch (entry->exec_class) {
7760+ case RT_CLASS_HARD:
7761+ list_add(&entry->list, &hsb_cpu_queue);
7762+ break;
7763+
7764+ case RT_CLASS_SOFT:
7765+ list_for_each(pos, &hsb_cpu_queue) {
7766+ other = list_entry(pos, cpu_state_t, list);
7767+ if (other->exec_class > RT_CLASS_SOFT ||
7768+ time_before_eq(entry->cur_deadline,
7769+ other->cur_deadline))
7770+ {
7771+ __list_add(&entry->list, pos->prev, pos);
7772+ goto out;
7773+ }
7774+ }
7775+ /* possible fall through if lowest SRT priority */
7776+
7777+ case RT_CLASS_BEST_EFFORT:
7778+ list_add_tail(&entry->list, &hsb_cpu_queue);
7779+ break;
7780+
7781+ default:
7782+ /* invalid exec_class value */
7783+ BUG();
7784+ }
7785+ out:
7786+ spin_unlock(&hsb_cpu_lock);
7787+}
7788+
7789+
7790+/* hrt_check_resched - check whether the HRT server on the given CPU needs to
7791+ * preempt the running task.
7792+ */
7793+static int hrt_check_resched(rt_domain_t *edf)
7794+{
7795+ hrt_server_t *srv = container_of(edf, hrt_server_t, domain);
7796+ cpu_state_t *state = container_of(srv, cpu_state_t, hrt);
7797+ int ret = 0;
7798+
7799+ spin_lock(&state->lock);
7800+
7801+ if (hrt_client_eligible(srv)) {
7802+ if (state->exec_class > RT_CLASS_HARD ||
7803+ time_before(
7804+ get_deadline(next_ready(edf)),
7805+ state->cur_deadline)
7806+ ) {
7807+ if (state->cpu == smp_processor_id())
7808+ set_tsk_need_resched(current);
7809+ else
7810+ smp_send_reschedule(state->cpu);
7811+ }
7812+ }
7813+
7814+ spin_unlock(&state->lock);
7815+ return ret;
7816+}
7817+
7818+
7819+/* srt_check_resched - Check whether another CPU needs to switch to an SRT task.
7820+ *
7821+ * The function only checks and kicks the last CPU in the queue; that CPU will,
7822+ * after rescheduling, check and kick the next one if necessary, and so on. The
7823+ * caller must ensure that it is not itself the last entry or that no reschedule is necessary.
7824+ *
7825+ * Caller must hold edf->ready_lock!
7826+ */
7827+static int srt_check_resched(rt_domain_t *edf)
7828+{
7829+ cpu_state_t *last;
7830+ int ret = 0;
7831+
7832+ spin_lock(&hsb_cpu_lock);
7833+
7834+ if (!list_empty(&srt.ready_queue)) {
7835+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7836+ /* guard against concurrent updates */
7837+ spin_lock(&last->lock);
7838+ if (last->exec_class == RT_CLASS_BEST_EFFORT || (
7839+ last->exec_class == RT_CLASS_SOFT &&
7840+ time_before(get_deadline(next_ready(&srt)),
7841+ last->cur_deadline)))
7842+ {
7843+ if (smp_processor_id() == last->cpu)
7844+ set_tsk_need_resched(current);
7845+ else
7846+ if (!test_will_schedule(last->cpu))
7847+ smp_send_reschedule(last->cpu);
7848+ ret = 1;
7849+ }
7850+ spin_unlock(&last->lock);
7851+ }
7852+
7853+ spin_unlock(&hsb_cpu_lock);
7854+ return ret;
7855+}
7856+
7857+
7858+/* be_check_resched - Check whether another CPU needs to switch to a BE server.
7859+ *
7860+ * Caller must hold edf->ready_lock!
7861+ */
7862+static int be_check_resched(rt_domain_t *edf)
7863+{
7864+ cpu_state_t *last;
7865+ int soft, bg;
7866+ int ret = 0;
7867+
7868+ spin_lock(&hsb_cpu_lock);
7869+
7870+ if (!list_empty(&be.ready_queue)) {
7871+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7872+ /* guard against concurrent updates */
7873+ spin_lock(&last->lock);
7874+
7875+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
7876+ soft = last->exec_class == RT_CLASS_SOFT;
7877+
7878+ if (bg || (soft && time_before(be_next_ready(&be)->deadline,
7879+ last->cur_deadline)))
7880+ {
7881+ if (smp_processor_id() == last->cpu)
7882+ set_tsk_need_resched(current);
7883+ else
7884+ if (!test_will_schedule(last->cpu))
7885+ smp_send_reschedule(last->cpu);
7886+ ret = 1;
7887+ }
7888+
7889+ spin_unlock(&last->lock);
7890+ }
7891+
7892+ spin_unlock(&hsb_cpu_lock);
7893+ return ret;
7894+}
7895+
7896+
7897+int cap_check_resched(jiffie_t deadline)
7898+{
7899+ unsigned long flags;
7900+ cpu_state_t *last;
7901+ int soft, bg;
7902+ int ret = 0;
7903+
7904+
7905+
7906+ if (get_rt_mode() == MODE_RT_RUN) {
7907+ spin_lock_irqsave(&hsb_cpu_lock, flags);
7908+
7909+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7910+ /* guard against concurrent updates */
7911+ spin_lock(&last->lock);
7912+
7913+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
7914+ soft = last->exec_class == RT_CLASS_SOFT;
7915+
7916+ if (bg || (soft && time_before(deadline,
7917+ last->cur_deadline)))
7918+ {
7919+ if (smp_processor_id() == last->cpu)
7920+ set_tsk_need_resched(current);
7921+ else
7922+ if (!test_will_schedule(last->cpu))
7923+ smp_send_reschedule(last->cpu);
7924+ ret = 1;
7925+ }
7926+
7927+ spin_unlock(&last->lock);
7928+
7929+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
7930+ }
7931+ return ret;
7932+}
7933+
7934+int fifo_check_resched(void)
7935+{
7936+ unsigned long flags;
7937+ cpu_state_t *last;
7938+ int ret = 0;
7939+
7940+ if (get_rt_mode() == MODE_RT_RUN) {
7941+ spin_lock_irqsave(&hsb_cpu_lock, flags);
7942+
7943+
7944+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7945+ /* guard against concurrent updates */
7946+
7947+ spin_lock(&last->lock);
7948+
7949+ if (last->exec_class == RT_CLASS_BEST_EFFORT)
7950+ {
7951+ if (smp_processor_id() == last->cpu)
7952+ set_tsk_need_resched(current);
7953+ else
7954+ if (!test_will_schedule(last->cpu))
7955+ smp_send_reschedule(last->cpu);
7956+ ret = 1;
7957+ }
7958+
7959+ spin_unlock(&last->lock);
7960+
7961+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
7962+ }
7963+ return ret;
7964+}
7965+
7966+
7967+
7968+static inline int hsb_preemption_needed(rt_domain_t* edf, cpu_state_t* state)
7969+{
7970+ /* we need the read lock for rt_ready_queue */
7971+ if (!list_empty(&edf->ready_queue))
7972+ {
7973+ if (state->exec_class == RT_CLASS_SOFT) {
7974+ if (state->cap)
7975+ return time_before(get_deadline(next_ready(edf))
7976+ , state->cap->deadline);
7977+ else
7978+ return time_before(get_deadline(next_ready(edf))
7979+ , state->cur_deadline);
7980+ } else
7981+ return 1;
7982+ }
7983+ return 0;
7984+}
7985+
7986+static inline int cap_preemption_needed(capacity_queue_t* q, cpu_state_t* state)
7987+{
7988+ /* we need the read lock for rt_ready_queue */
7989+ if (!list_empty(&q->queue))
7990+ {
7991+ if (state->exec_class == RT_CLASS_SOFT) {
7992+ if (state->cap)
7993+ return time_before(next_cap(q)->deadline
7994+ , state->cap->deadline);
7995+ else
7996+ return time_before(next_cap(q)->deadline
7997+ , state->cur_deadline);
7998+ } else
7999+ return 1;
8000+ }
8001+ return 0;
8002+}
8003+
8004+/* hsb_scheduler_tick - this function is called for every local timer
8005+ * interrupt.
8006+ *
8007+ * checks whether the current task has expired and checks
8008+ * whether we need to preempt it if it has not expired
8009+ */
8010+static reschedule_check_t hsb_scheduler_tick(void)
8011+{
8012+ unsigned long flags;
8013+ struct task_struct *t = current;
8014+ int resched = 0;
8015+
8016+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
8017+
8018+ /* expire tasks even if not in real-time mode
8019+ * this makes sure that at the end of real-time mode
8020+ * no tasks "run away forever".
8021+ */
8022+
8023+ /* charge BE server only if we are not running on a spare capacity */
8024+ if (state->be && !state->cap && --state->be->budget <= 0) {
8025+ sched_trace_server_completion(state->be->pid, 0,
8026+ state->be->deadline,
8027+ RT_CLASS_BEST_EFFORT);
8028+ be_preempt(&be, state);
8029+ resched = 1;
8030+ }
8031+
8032+ if (state->cap)
8033+ if (--state->cap->budget <= 0 ||
8034+ time_before_eq(state->cap->deadline, jiffies)) {
8035+ kfree(state->cap);
8036+ state->cap = NULL;
8037+ resched = 1;
8038+ }
8039+
8040+ if (is_realtime(t)) {
8041+ if (is_hrt(t) && (--state->hrt.budget <= 0)) {
8042+ sched_trace_server_completion(
8043+ HRT_BASE_PID + smp_processor_id(), 0,
8044+ state->hrt.deadline, RT_CLASS_HARD);
8045+ resched = 1;
8046+ }
8047+
8048+ /* account for received service... */
8049+ t->rt_param.times.exec_time++;
8050+
8051+ /* ...and charge current budget */
8052+ if (!state->cap) {
8053+ --t->time_slice;
8054+ /* a task should always be able to finish its job */
8055+ BUG_ON(!is_be(t) && !t->time_slice && !job_completed(t));
8056+ }
8057+
8058+ if (job_completed(t) || (is_be(t) && !t->time_slice)) {
8059+ sched_trace_job_completion(t);
8060+ set_rt_flags(t, RT_F_SLEEP);
8061+ resched = 1;
8062+ }
8063+ }
8064+
8065+
8066+ if (get_rt_mode() == MODE_RT_RUN)
8067+ {
8068+ try_release_pending(&state->hrt.domain);
8069+ check_for_hrt_release(&state->hrt);
8070+ try_release_pending(&srt);
8071+ be_try_release_pending(&be);
8072+
8073+ if (!resched)
8074+ switch (state->exec_class) {
8075+ case RT_CLASS_HARD:
8076+ read_lock_irqsave(&state->hrt.domain.ready_lock,
8077+ flags);
8078+ resched = edf_preemption_needed(
8079+ &state->hrt.domain,
8080+ t);
8081+ read_unlock_irqrestore(
8082+ &state->hrt.domain.ready_lock, flags);
8083+ break;
8084+
8085+ case RT_CLASS_SOFT:
8086+ case RT_CLASS_BEST_EFFORT:
8087+ local_irq_save(flags);
8088+
8089+ /* check for HRT jobs */
8090+ read_lock(&state->hrt.domain.ready_lock);
8091+ resched = hrt_client_eligible(&state->hrt);
8092+ read_unlock(&state->hrt.domain.ready_lock);
8093+
8094+ /* check for spare capacities */
8095+ if (!resched) {
8096+ spin_lock(&cap_queue.lock);
8097+ resched =
8098+ cap_preemption_needed(&cap_queue,
8099+ state);
8100+ spin_unlock(&cap_queue.lock);
8101+ }
8102+
8103+ /* check for SRT jobs */
8104+ if (!resched) {
8105+ read_lock(&srt.ready_lock);
8106+ resched = hsb_preemption_needed(
8107+ &srt, state);
8108+ read_unlock(&srt.ready_lock);
8109+ }
8110+
8111+ /* check for BE jobs */
8112+ if (!resched) {
8113+ read_lock(&be.ready_lock);
8114+ resched = be_preemption_needed(
8115+ &be, state);
8116+ read_unlock(&be.ready_lock);
8117+ }
8118+
8119+ /* check for background jobs */
8120+ if (!resched && !is_realtime(current))
8121+ resched = jobs_pending(&hsb_fifo);
8122+ local_irq_restore(flags);
8123+ break;
8124+
8125+ default:
8126+ /* invalid exec_class value */
8127+ BUG();
8128+ }
8129+ }
8130+
8131+ if (resched) {
8132+ set_will_schedule();
8133+ return FORCE_RESCHED;
8134+ } else
8135+ return NO_RESCHED;
8136+}
8137+
8138+static int schedule_hrt(struct task_struct * prev,
8139+ struct task_struct ** next, runqueue_t * rq)
8140+{
8141+ unsigned long flags;
8142+ int deactivate = 1;
8143+ cpu_state_t *state;
8144+
8145+
8146+ state = &__get_cpu_var(hsb_cpu_state);
8147+
8148+ write_lock_irqsave(&state->hrt.domain.ready_lock, flags);
8149+
8150+
8151+ if (state->cap) {
8152+ /* schedule_hrt does not hold the cap_queue lock */
8153+ return_capacity(&cap_queue, state->cap);
8154+ state->cap = NULL;
8155+ }
8156+
8157+ if (is_hrt(prev) && is_released(prev) && is_running(prev)
8158+ && !edf_preemption_needed(&state->hrt.domain, prev)) {
8159+ /* This really should only happen if the task has
8160+ * 100% utilization or when we got a bogus/delayed
8161+ * resched IPI.
8162+ */
8163+ TRACE("HRT: prev will be next, already released\n");
8164+ *next = prev;
8165+ deactivate = 0;
8166+ } else {
8167+ /* either not yet released, preempted, or non-rt */
8168+ *next = __take_ready(&state->hrt.domain);
8169+ /* the logic in hsb_schedule makes sure *next must exist
8170+ * if we get here */
8171+ BUG_ON(!*next);
8172+ /* stick the task into the runqueue */
8173+ __activate_task(*next, rq);
8174+ set_task_cpu(*next, smp_processor_id());
8175+ }
8176+
8177+ set_rt_flags(*next, RT_F_RUNNING);
8178+ adjust_cpu_queue(RT_CLASS_HARD, get_deadline(*next), NULL);
8179+ clear_will_schedule();
8180+
8181+ write_unlock_irqrestore(&state->hrt.domain.ready_lock, flags);
8182+ return deactivate;
8183+}
8184+
8185+
8186+static struct task_struct* find_min_slack_task(struct task_struct *prev,
8187+ rt_domain_t* edf)
8188+{
8189+ struct list_head *pos;
8190+ struct task_struct* tsk = NULL;
8191+ struct task_struct* cur;
8192+
8193+ if (is_realtime(prev) && is_running(prev) &&
8194+ get_rt_flags(prev) != RT_F_SLEEP)
8195+ tsk = prev;
8196+ list_for_each(pos, &edf->ready_queue) {
8197+ cur = list_entry(pos, struct task_struct, rt_list);
8198+ if (!tsk || task_slack(tsk) > task_slack(cur))
8199+ tsk = cur;
8200+ }
8201+ return tsk;
8202+}
8203+
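+/* null_heuristic - decide who receives a spare capacity when no SRT task is in
+ * danger of becoming tardy: if best-effort jobs are pending in the FIFO domain
+ * the capacity is left to them (return NULL); otherwise it is handed to the
+ * earliest-deadline SRT job, if any.
+ */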
8204+static struct task_struct* null_heuristic(struct task_struct *prev,
8205+ rt_domain_t* edf,
8206+ rt_domain_t* fifo)
8207+{
8208+ if (jobs_pending(fifo))
8209+ return NULL;
8210+ else if (!list_empty(&edf->ready_queue))
8211+ return list_entry(edf->ready_queue.next,
8212+ struct task_struct, rt_list);
8213+ else
8214+ return NULL;
8215+}
8216+
8217+/* caller holds all locks
8218+ */
8219+
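+/* schedule_capacity - consume a spare capacity on this CPU: pick up a capacity
+ * from the capacity queue (possibly exchanging the one already held), then hand
+ * it to the SRT task with the least slack, i.e. the one most likely to become
+ * tardy. If no task is in danger of being tardy the capacity goes to the
+ * earliest-deadline SRT job or, failing that, to a queued best-effort job.
+ * Returns nonzero when prev is not the task picked to run next.
+ */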
8220+static int schedule_capacity(struct task_struct *prev,
8221+ struct task_struct **next, runqueue_t *rq)
8222+{
8223+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
8224+ capacity_t* old;
8225+
8226+ if (state->cap) {
8227+ old = state->cap;
8228+ state->cap = __take_capacity(&cap_queue, old->deadline, 1);
8229+ if (!state->cap)
8230+ state->cap = old;
8231+ else
8232+ __return_capacity(&cap_queue, old);
8233+ } else
8234+ state->cap = __take_capacity(&cap_queue, 0, 0);
8235+
8236+
8237+ /* pick a task likely to be tardy */
8238+ *next = find_min_slack_task(prev, &srt);
8239+
8240+ /* only give away spare capacities if there is no task that
8241+ * is going to be tardy
8242+ */
8243+ if (*next && task_slack(*next) >= 0)
8244+ *next = null_heuristic(prev, &srt, &hsb_fifo);
8245+ if (*next && *next != prev)
8246+ list_del(&(*next)->rt_list);
8247+
8248+
8249+ /* if there is none pick a BE job */
8250+ if (!*next) {
8251+ if (is_realtime(prev) && is_be(prev) && is_running(prev) &&
8252+ get_rt_flags(prev) != RT_F_SLEEP)
8253+ *next = prev;
8254+ else
8255+ *next = take_ready(&hsb_fifo);
8256+ }
8257+
8258+ if (state->be)
8259+ be_preempt(&be, state);
8260+ BUG_ON(!state->cap);
8261+ if (*next && state->cap->donor) {
8262+ sched_trace_capacity_allocation(
8263+ *next, state->cap->budget, state->cap->deadline,
8264+ state->cap->donor);
8265+ }
8266+
8267+ return *next != prev;
8268+}
8269+
8270+
8271+
8272+#define BG 0
8273+#define SRT 1
8274+#define BE 2
8275+#define CAP 3
8276+
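+/* what_first - determine which source holds the earliest deadline: a spare
+ * capacity (CAP), the SRT ready queue (SRT), or a best-effort server (BE).
+ * Because only a strictly earlier deadline overrides an earlier choice, ties
+ * are resolved in favor of capacities, then SRT, then BE. Returns BG if none
+ * of the three has anything ready.
+ */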
8277+static inline int what_first(rt_domain_t *be, rt_domain_t *srt, capacity_queue_t* q)
8278+{
8279+ jiffie_t sdl = 0, bdl= 0, cdl = 0, cur;
8280+ int _srt = !list_empty(&srt->ready_queue);
8281+ int _be = !list_empty(&be->ready_queue);
8282+ int _cap = __capacity_available(q);
8283+
8284+
8285+ int ret = BG; /* nothing ready => background mode*/
8286+ cur = 0;
8287+
8288+ if (_srt)
8289+ sdl = get_deadline(next_ready(srt));
8290+ if (_be)
8291+ bdl = be_next_ready(be)->deadline;
8292+ if (_cap)
8293+ cdl = next_cap(q)->deadline;
8294+
8295+
8296+
8297+ if (_cap) {
8298+ ret = CAP;
8299+ cur = cdl;
8300+ }
8301+ if (_srt && (time_before(sdl, cur) || !ret)) {
8302+ ret = SRT;
8303+ cur = sdl;
8304+ }
8305+ if (_be && (time_before(bdl, cur) || !ret)) {
8306+ ret = BE;
8307+ cur = bdl;
8308+ }
8309+ return ret;
8310+}
8311+
8312+
8313+
8314+static int schedule_srt_be_cap(struct task_struct *prev,
8315+ struct task_struct **next, runqueue_t *rq)
8316+{
8317+ task_class_t class = RT_CLASS_BEST_EFFORT;
8318+ jiffie_t deadline = 0;
8319+ unsigned long flags;
8320+ int deactivate = 1;
8321+ be_server_t* bes;
8322+ cpu_state_t* state;
8323+ int type = BG;
8324+
8325+reschedule:
8326+ write_lock_irqsave(&srt.ready_lock, flags);
8327+ write_lock(&be.ready_lock);
8328+ spin_lock(&cap_queue.lock);
8329+
8330+
8331+ state = &__get_cpu_var(hsb_cpu_state);
8332+ bes = NULL;
8333+
8334+ clear_will_schedule();
8335+
8336+ if (is_realtime(prev) && (is_released(prev) || is_be(prev)) &&
8337+ is_running(prev) && !hsb_preemption_needed(&srt, state) &&
8338+ !be_preemption_needed(&be, state)
8339+ ) {
8340+ /* Our current task's next job has already been
8341+ * released and has higher priority than the highest
8342+ * priority waiting task; in other words: it is tardy.
8343+ * We just keep it.
8344+ */
8345+ TRACE("prev will be next, already released\n");
8346+ *next = prev;
8347+ class = prev->rt_param.basic_params.class;
8348+ deadline = get_deadline(*next);
8349+ deactivate = 0;
8350+ } else {
8351+ /* either not yet released, preempted, or non-rt */
8352+ type = what_first(&be, &srt, &cap_queue);
8353+ switch (type) {
8354+ case CAP:
8355+ /* capacity */
8356+ deactivate = schedule_capacity(prev, next, rq);
8357+ deadline = state->cap->deadline;
8358+ if (*next)
8359+ class = RT_CLASS_SOFT;
8360+ else
8361+ class = RT_CLASS_BEST_EFFORT;
8362+ break;
8363+ case BE:
8364+ /* be */
8365+ *next = NULL;
8366+ bes = be_take_ready(&be);
8367+ if (bes) {
8368+ class = RT_CLASS_SOFT;
8369+ deadline = bes->deadline;
8370+ *next = take_ready(&hsb_fifo);
8371+ if (!*next) {
8372+ /* deactivate */
8373+ __release_capacity(&cap_queue,
8374+ bes->budget,
8375+ bes->deadline, NULL);
8376+ bes->budget = 0;
8377+ barrier();
8378+ spin_unlock(&cap_queue.lock);
8379+ write_unlock(&be.ready_lock);
8380+ write_unlock_irqrestore(&srt.ready_lock,
8381+ flags);
8382+ be_enqueue(&be, bes);
8383+ goto reschedule;
8384+ }
8385+ }
8386+ break;
8387+ case SRT:
8388+ /* srt */
8389+ *next = __take_ready(&srt);
8390+ if (*next) {
8391+ class = RT_CLASS_SOFT;
8392+ deadline = get_deadline(*next);
8393+ }
8394+ break;
8395+ case BG:
8396+ /* background server mode */
8397+ class = RT_CLASS_BEST_EFFORT;
8398+ deadline = 0;
8399+ *next = take_ready(&hsb_fifo);
8400+ break;
8401+ }
8402+
8403+
8404+ /* give back capacities */
8405+ if (type != CAP && state->cap) {
8406+ __return_capacity(&cap_queue, state->cap);
8407+ state->cap = NULL;
8408+ }
8409+ if (*next && deactivate) {
8410+ /* mark the task as executing on this cpu */
8411+ set_task_cpu(*next, smp_processor_id());
8412+ /* stick the task into the runqueue */
8413+ __activate_task(*next, rq);
8414+ }
8415+ }
8416+
8417+ adjust_cpu_queue(class, deadline, bes);
8418+
8419+ switch (type) {
8420+ case BG:
8421+ break;
8422+ case BE:
8423+ be.check_resched(&be);
8424+ break;
8425+ case SRT:
8426+ srt.check_resched(&srt);
8427+ break;
8428+ case CAP:
8429+ if (!list_empty(&cap_queue.queue))
8430+ cap_check_resched(list_entry(cap_queue.queue.next,
8431+ capacity_t, list)->deadline);
8432+ break;
8433+ }
8434+
8435+
8436+ if(*next)
8437+ set_rt_flags(*next, RT_F_RUNNING);
8438+
8439+ spin_unlock(&cap_queue.lock);
8440+ write_unlock(&be.ready_lock);
8441+ write_unlock_irqrestore(&srt.ready_lock, flags);
8442+ return deactivate;
8443+}
8444+
8445+
8446+static int hsb_schedule(struct task_struct * prev, struct task_struct ** next,
8447+ runqueue_t * rq)
8448+{
8449+ int need_deactivate = 1;
8450+ cpu_state_t *state = NULL;
8451+
8452+ preempt_disable();
8453+
8454+ state = &__get_cpu_var(hsb_cpu_state);
8455+
8456+ be_preempt(&be, state);
8457+
8458+
8459+ if (is_realtime(prev) && !is_be(prev) &&
8460+ get_rt_flags(prev) == RT_F_SLEEP)
8461+ {
8462+ TRACE("preparing %d for next period\n", prev->pid);
8463+ release_capacity(&cap_queue, prev->time_slice,
8464+ prev->rt_param.times.deadline, prev);
8465+ edf_prepare_for_next_period(prev);
8466+ }
8467+
8468+ if (get_rt_mode() == MODE_RT_RUN) {
8469+ /* we need to schedule HRT if an HRT job is pending or when
8470+ * we have a non-expired HRT job on the cpu
8471+ */
8472+
8473+ if (hrt_client_eligible(&state->hrt) ||
8474+ unlikely((is_hrt(prev) && is_running(prev) &&
8475+ get_rt_flags(prev) != RT_F_SLEEP))) {
8476+ if (state->cap) {
8477+ return_capacity(&cap_queue, state->cap);
8478+ state->cap = NULL;
8479+ }
8480+ need_deactivate = schedule_hrt(prev, next, rq);
8481+ } else
8482+ need_deactivate = schedule_srt_be_cap(prev, next, rq);
8483+
8484+ }
8485+
8486+ if (is_realtime(prev) && need_deactivate && prev->array) {
8487+ /* take it out of the run queue */
8488+ deactivate_task(prev, rq);
8489+ }
8490+
8491+ preempt_enable();
8492+
8493+ return 0;
8494+}
8495+
8496+/* put task into correct queue */
8497+static inline void hsb_add_release(struct task_struct *t)
8498+{
8499+ if (is_hrt(t))
8500+ add_release(hrt_dom(get_partition(t)), t);
8501+ else if (is_srt(t))
8502+ add_release(&srt, t);
8503+ else if (is_be(t)) {
8504+ t->time_slice = 0;
8505+ add_ready(&hsb_fifo, t);
8506+ fifo_check_resched();
8507+ } else
8508+ BUG();
8509+
8510+}
8511+
8512+/* put task into correct queue */
8513+static inline void hsb_add_ready(struct task_struct *t)
8514+{
8515+ if (is_hrt(t))
8516+ add_ready(hrt_dom(get_partition(t)), t);
8517+ else if (is_srt(t))
8518+ add_ready(&srt, t);
8519+ else if (is_be(t)) {
8520+ add_ready(&hsb_fifo, t);
8521+ fifo_check_resched();
8522+ }
8523+ else
8524+ BUG();
8525+}
8526+
8527+
8528+/* _finish_switch - we just finished the switch away from prev
8529+ * it is now safe to requeue the task
8530+ */
8531+static void hsb_finish_switch(struct task_struct *prev)
8532+{
8533+ if (!is_realtime(prev) || !is_running(prev))
8534+ return;
8535+
8536+ TRACE("finish switch for %d\n", prev->pid);
8537+
8538+ if (is_be(prev)) {
8539+ add_ready(&hsb_fifo, prev);
8540+ return;
8541+ }
8542+
8543+ if (get_rt_flags(prev) == RT_F_SLEEP ||
8544+ get_rt_mode() != MODE_RT_RUN) {
8545+ /* This task has expired.
8546+ * _schedule has already taken care of updating the
8547+ * release time and the deadline. We just have to check
8548+ * whether it has been released.
8549+ */
8550+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
8551+ sched_trace_job_release(prev);
8552+ hsb_add_ready(prev);
8553+ TRACE("%d goes straight to ready queue\n", prev->pid);
8554+ }
8555+ else
8556+ /* it has got to wait */
8557+ hsb_add_release(prev);
8558+ }
8559+ else {
8560+ /* this is a forced preemption
8561+ * thus the task stays in the ready_queue
8562+ * we only have to make it available to other cpus
8563+ */
8564+ hsb_add_ready(prev);
8565+ }
8566+}
8567+
8568+
8569+/* Prepare a task for running in RT mode.
8570+ * Enqueues the task into the master queue data structure.
8571+ * Returns
8572+ * -EPERM if the task is not TASK_STOPPED.
8573+ */
8574+static long hsb_prepare_task(struct task_struct * t)
8575+{
8576+ TRACE("edf-hsb: prepare task %d\n", t->pid);
8577+
8578+ if (t->state == TASK_STOPPED) {
8579+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
8580+
8581+ if (get_rt_mode() == MODE_RT_RUN && !is_be(t))
8582+ /* Real-time mode is already active;
8583+ * prepare an immediate release.
8584+ */
8585+ edf_release_now(t);
8586+ /* The task must be marked TASK_RUNNING while it sits in the queue;
8587+ * otherwise the signal code will try to wake it up, with fatal consequences.
8588+ */
8589+ t->state = TASK_RUNNING;
8590+ if (is_be(t))
8591+ t->rt_param.times.deadline = 0;
8592+ hsb_add_release(t);
8593+ return 0;
8594+ }
8595+ else
8596+ return -EPERM;
8597+}
8598+
8599+static void hsb_wake_up_task(struct task_struct *task)
8600+{
8601+ /* We must determine whether the task should go into the release
8602+ * queue or into the ready queue. It may enter the ready queue
8603+ * if it has credit left in its time slice and has not yet reached
8604+ * its deadline. If it is now past its deadline we assume this is the
8605+ * arrival of a new sporadic job and thus put it in the ready queue
8606+ * anyway. If it has zero budget and the next release is in the future
8607+ * it has to go to the release queue.
8608+ */
8609+ TRACE("edf-hsb: wake up %d with budget=%d\n",
8610+ task->pid, task->time_slice);
8611+ task->state = TASK_RUNNING;
8612+
8613+ if (is_be(task)) {
8614+ task->rt_param.times.last_release = jiffies;
8615+ hsb_add_release(task);
8616+ }
8617+ else if (is_tardy(task)) {
8618+ /* new sporadic release */
8619+ edf_release_now(task);
8620+ sched_trace_job_release(task);
8621+ hsb_add_ready(task);
8622+ }
8623+ else if (task->time_slice) {
8624+ /* came back in time before deadline
8625+ */
8626+ set_rt_flags(task, RT_F_RUNNING);
8627+ hsb_add_ready(task);
8628+ }
8629+ else {
8630+ hsb_add_release(task);
8631+ }
8632+
8633+}
8634+
8635+static void hsb_task_blocks(struct task_struct *t)
8636+{
8637+ /* not really anything to do since it can only block if
8638+ * it is running, and when it is not running it is not in any
8639+ * queue anyway.
8640+ */
8641+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
8642+ if (is_be(t))
8643+ sched_trace_job_completion(t);
8644+}
8645+
8646+
8647+static int hsb_mode_change(int new_mode)
8648+{
8649+ int cpu;
8650+ cpu_state_t *entry;
8651+ jiffie_t start;
8652+
8653+ TRACE("[%d] edf-hsb: mode changed to %d\n", smp_processor_id(),
8654+ new_mode);
8655+ if (new_mode == MODE_RT_RUN) {
8656+ start = jiffies + 20;
8657+ rerelease_all(&srt, edf_release_at);
8658+ be_prepare_new_releases(&be, start);
8659+
8660+ /* initialize per CPU state
8661+ * we can't do this at boot time because we don't know
8662+ * which CPUs will be online and we can't put non-existing
8663+ * cpus into the queue
8664+ */
8665+ spin_lock(&hsb_cpu_lock);
8666+ /* get old cruft out of the way in case we reenter real-time
8667+ * mode for a second time
8668+ */
8669+ while (!list_empty(&hsb_cpu_queue))
8670+ list_del(hsb_cpu_queue.next);
8671+ /* reinitialize */
8672+ for_each_online_cpu(cpu) {
8673+ entry = &per_cpu(hsb_cpu_state, cpu);
8674+ atomic_set(&entry->will_schedule, 0);
8675+ entry->exec_class = RT_CLASS_BEST_EFFORT;
8676+ entry->cur_deadline = 0;
8677+ list_add(&entry->list, &hsb_cpu_queue);
8678+
8679+ rerelease_all(&entry->hrt.domain, edf_release_at);
8680+ prepare_hrt_release(&entry->hrt, start);
8681+ }
8682+ spin_unlock(&hsb_cpu_lock);
8683+
8684+ }
8685+ TRACE("[%d] edf-hsb: mode change done\n", smp_processor_id());
8686+ return 0;
8687+}
8688+
8689+
8690+typedef enum {
8691+ EDF_HSB_SET_HRT,
8692+ EDF_HSB_GET_HRT,
8693+ EDF_HSB_CREATE_BE
8694+} edf_hsb_setup_cmds_t;
8695+
8696+typedef struct {
8697+ int cpu;
8698+ unsigned int wcet;
8699+ unsigned int period;
8700+} setup_hrt_param_t;
8701+
8702+typedef struct {
8703+ unsigned int wcet;
8704+ unsigned int period;
8705+} create_be_param_t;
8706+
8707+typedef struct {
8708+ union {
8709+ setup_hrt_param_t setup_hrt;
8710+ create_be_param_t create_be;
8711+ };
8712+} param_t;
8713+
8714+static pid_t next_be_server_pid = SRT_BASE_PID;
8715+
8716+static int hsb_scheduler_setup(int cmd, void __user* up)
8717+{
8718+ unsigned long flags;
8719+ int error = -EINVAL;
8720+ cpu_state_t* state;
8721+ be_server_t* srv;
8722+ param_t param;
8723+
8724+ switch (cmd) {
8725+ case EDF_HSB_SET_HRT:
8726+ if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
8727+ error = -EFAULT;
8728+ goto out;
8729+ }
8730+ if (!cpu_online(param.setup_hrt.cpu)) {
8731+ printk(KERN_WARNING "scheduler setup: "
8732+ "CPU %d is not online!\n", param.setup_hrt.cpu);
8733+ error = -EINVAL;
8734+ goto out;
8735+ }
8736+ if (param.setup_hrt.period < param.setup_hrt.wcet) {
8737+ printk(KERN_WARNING "period < wcet!\n");
8738+ error = -EINVAL;
8739+ goto out;
8740+ }
8741+
8742+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
8743+ spin_lock_irqsave(&state->lock, flags);
8744+
8745+ state->hrt.wcet = param.setup_hrt.wcet;
8746+ state->hrt.period = param.setup_hrt.period;
8747+
8748+ spin_unlock_irqrestore(&state->lock, flags);
8749+
8750+ printk(KERN_WARNING "edf-hsb: set HRT #%d to (%d, %d)\n",
8751+ param.setup_hrt.cpu, param.setup_hrt.wcet,
8752+ param.setup_hrt.period);
8753+
8754+ error = 0;
8755+
8756+ break;
8757+
8758+ case EDF_HSB_GET_HRT:
8759+ if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
8760+ error = -EFAULT;
8761+ goto out;
8762+ }
8763+ if (!cpu_online(param.setup_hrt.cpu)) {
8764+ error = -EINVAL;
8765+ goto out;
8766+ }
8767+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
8768+ spin_lock_irqsave(&state->lock, flags);
8769+
8770+ param.setup_hrt.wcet = state->hrt.wcet;
8771+ param.setup_hrt.period = state->hrt.period;
8772+
8773+ spin_unlock_irqrestore(&state->lock, flags);
8774+
8775+ if (copy_to_user(up, &param, sizeof(setup_hrt_param_t))) {
8776+ error = -EFAULT;
8777+ goto out;
8778+ }
8779+ error = 0;
8780+ break;
8781+
8782+ case EDF_HSB_CREATE_BE:
8783+ if (copy_from_user(&param, up, sizeof(create_be_param_t))) {
8784+ error = -EFAULT;
8785+ goto out;
8786+ }
8787+ if (param.create_be.period < param.create_be.wcet ||
8788+ !param.create_be.period || !param.create_be.wcet) {
8789+ error = -EINVAL;
8790+ goto out;
8791+ }
8792+ srv = (be_server_t*) kmalloc(sizeof(be_server_t), GFP_KERNEL);
8793+ if (!srv) {
8794+ error = -ENOMEM;
8795+ goto out;
8796+ }
8797+ srv->wcet = param.create_be.wcet;
8798+ srv->period = param.create_be.period;
8799+ srv->pid = next_be_server_pid++;
8800+ INIT_LIST_HEAD(&srv->list);
8801+ be_prepare_new_release(srv, jiffies);
8802+ be_enqueue(&be, srv);
8803+
8804+ printk(KERN_WARNING "edf-hsb: created a BE with (%d, %d)\n",
8805+ param.create_be.wcet, param.create_be.period);
8806+
8807+ error = 0;
8808+ break;
8809+
8810+ default:
8811+ printk(KERN_WARNING "edf-hsb: unknown command %d\n", cmd);
8812+ }
8813+
8814+out:
8815+ return error;
8816+}
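+/* A hedged usage sketch for the setup commands above: assuming the plugin's
+ * scheduler_setup(int cmd, void __user *param) hook is reachable from user
+ * space through a system call wrapper of the same name (the wrapper name and
+ * its exact prototype are assumptions, not shown in this hunk), a tool could
+ * configure the servers roughly as follows:
+ *
+ *	setup_hrt_param_t hrt = { .cpu = 0, .wcet = 2, .period = 10 };
+ *	create_be_param_t be = { .wcet = 5, .period = 100 };
+ *	scheduler_setup(EDF_HSB_SET_HRT, &hrt);   // a (2, 10) HRT server on CPU 0
+ *	scheduler_setup(EDF_HSB_CREATE_BE, &be);  // one (5, 100) BE server
+ */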
8817+
8818+/* Plugin object */
8819+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
8820+ .ready_to_use = 0
8821+};
8822+
8823+
8824+/*
8825+ * Plugin initialization code.
8826+ */
8827+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
8828+ .plugin_name = "EDF-HSB",\
8829+ .ready_to_use = 1,\
8830+ .scheduler_tick = hsb_scheduler_tick,\
8831+ .prepare_task = hsb_prepare_task,\
8832+ .sleep_next_period = edf_sleep_next_period,\
8833+ .schedule = hsb_schedule,\
8834+ .finish_switch = hsb_finish_switch,\
8835+ .mode_change = hsb_mode_change,\
8836+ .wake_up_task = hsb_wake_up_task,\
8837+ .task_blocks = hsb_task_blocks, \
8838+ .scheduler_setup = hsb_scheduler_setup \
8839+}
8840+
8841+
8842+sched_plugin_t *__init init_edf_hsb_plugin(void)
8843+{
8844+ int i;
8845+
8846+ if (!s_plugin.ready_to_use)
8847+ {
8848+ capacity_queue_init(&cap_queue);
8849+ edf_domain_init(&srt, srt_check_resched);
8850+ edf_domain_init(&be, be_check_resched);
8851+ fifo_domain_init(&hsb_fifo, NULL);
8852+ for (i = 0; i < NR_CPUS; i++)
8853+ {
8854+ hsb_cpu_state_init(&per_cpu(hsb_cpu_state, i),
8855+ hrt_check_resched, i);
8856+ printk("HRT server %d initialized.\n", i);
8857+ }
8858+ s_plugin = INIT_SCHED_PLUGIN;
8859+ }
8860+ return &s_plugin;
8861+}
8862diff --git a/kernel/sched_global_edf.c b/kernel/sched_global_edf.c
8863new file mode 100644
8864index 0000000..bc32373
8865--- /dev/null
8866+++ b/kernel/sched_global_edf.c
8867@@ -0,0 +1,550 @@
8868+/*
8869+ * kernel/sched_global_edf.c
8870+ *
8871+ * Re-Implementation of the Global EDF scheduler.
8872+ *
8873+ * This version works without using the struct queue. It uses the
8874+ * builtin kernel lists.
8875+ */
8876+
8877+#include <linux/percpu.h>
8878+#include <linux/sched.h>
8879+#include <linux/list.h>
8880+
8881+#include <linux/litmus.h>
8882+#include <linux/sched_plugin.h>
8883+
8884+#include <linux/edf_common.h>
8885+#include <linux/sched_trace.h>
8886+
8887+
8888+/* cpu_entry_t - maintain state of the priority of cpu's current task
8889+ * this is needed to check for priority inversions.
8890+ */
8891+typedef struct {
8892+ int cpu;
8893+ int executes_realtime;
8894+ jiffie_t cur_deadline;
8895+ struct list_head list;
8896+ atomic_t will_schedule;
8897+} cpu_entry_t;
8898+DEFINE_PER_CPU(cpu_entry_t, gedf_cpu_entries);
8899+
8900+#define set_will_schedule() \
8901+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 1))
8902+#define clear_will_schedule() \
8903+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 0))
8904+#define test_will_schedule(cpu) \
8905+ (atomic_read(&per_cpu(gedf_cpu_entries, cpu).will_schedule))
8906+
8907+
8908+/* always acquire the cpu lock as the last lock to avoid deadlocks */
8909+static spinlock_t gedf_cpu_lock = SPIN_LOCK_UNLOCKED;
8910+/* the cpus queue themselves according to priority in here */
8911+static LIST_HEAD(gedf_cpu_queue);
8912+
8913+
8914+static rt_domain_t gedf;
8915+
8916+#define DUMP(args...) TRACE(args)
8917+
8918+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
8919+ * order in the cpu queue. Caller must hold ready write lock.
8920+ *
8921+ */
8922+static void adjust_cpu_queue(int exec_rt, jiffie_t deadline)
8923+{
8924+ struct list_head *pos;
8925+ cpu_entry_t *other;
8926+ cpu_entry_t *entry;
8927+
8928+ spin_lock(&gedf_cpu_lock);
8929+
8930+ entry = &__get_cpu_var(gedf_cpu_entries);
8931+ entry->executes_realtime = exec_rt;
8932+ entry->cur_deadline = deadline;
8933+
8934+ list_del(&entry->list);
8935+ /* if we do not execute real-time jobs we just move
8936+ * to the end of the queue
8937+ */
8938+ if (entry->executes_realtime)
8939+ list_for_each(pos, &gedf_cpu_queue) {
8940+ other = list_entry(pos, cpu_entry_t, list);
8941+ if (!other->executes_realtime ||
8942+ time_before_eq(entry->cur_deadline,
8943+ other->cur_deadline))
8944+ {
8945+ __list_add(&entry->list, pos->prev, pos);
8946+ goto out;
8947+ }
8948+ }
8949+ /* if we get this far we have the lowest priority task */
8950+ list_add_tail(&entry->list, &gedf_cpu_queue);
8951+
8952+ out:
8953+ spin_unlock(&gedf_cpu_lock);
8954+}
8955+
8956+
8957+/* gedf_check_resched - Check whether another CPU needs to reschedule.
8958+ *
8959+ * The function only checks and kicks the last CPU in the queue; that CPU will,
8960+ * after rescheduling, check and kick the next one if necessary, and so on. The
8961+ * caller must ensure that it is not itself the last entry or that no reschedule is necessary.
8962+ *
8963+ */
8964+static int gedf_check_resched(rt_domain_t *edf)
8965+{
8966+ cpu_entry_t *last;
8967+ int ret = 0;
8968+
8969+ spin_lock(&gedf_cpu_lock);
8970+
8971+ if (!list_empty(&edf->ready_queue)) {
8972+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
8973+ if (!last->executes_realtime ||
8974+ time_before(next_ready(edf)->rt_param.times.deadline,
8975+ last->cur_deadline))
8976+ {
8977+ if (smp_processor_id() == last->cpu)
8978+ set_tsk_need_resched(current);
8979+ else
8980+ if (!test_will_schedule(last->cpu))
8981+ smp_send_reschedule(last->cpu);
8982+ ret = 1;
8983+ }
8984+ }
8985+
8986+ spin_unlock(&gedf_cpu_lock);
8987+ return ret;
8988+}
8989+
8990+
8991+
8992+/* gedf_scheduler_tick - this function is called for every local timer
8993+ * interrupt.
8994+ *
8995+ * checks whether the current task has expired and checks
8996+ * whether we need to preempt it if it has not expired
8997+ */
8998+static reschedule_check_t gedf_scheduler_tick(void)
8999+{
9000+ unsigned long flags;
9001+ struct task_struct *t = current;
9002+ reschedule_check_t want_resched = NO_RESCHED;
9003+
9004+ /* expire tasks even if not in real-time mode
9005+ * this makes sure that at the end of real-time mode
9006+ * no tasks "run away forever".
9007+ */
9008+ BUG_ON(is_realtime(t) && t->time_slice > 100000);
9009+ if (is_realtime(t) && (!--t->time_slice)) {
9010+ /* this task has exhausted its budget in this period */
9011+ set_rt_flags(t, RT_F_SLEEP);
9012+ want_resched = FORCE_RESCHED;
9013+ set_will_schedule();
9014+ sched_trace_job_completion(t);
9015+ }
9016+ if (get_rt_mode() == MODE_RT_RUN)
9017+ {
9018+ /* check whether anything is waiting to be released
9019+ * this could probably be moved to the global timer
9020+ * interrupt handler since the state will only change
9021+ * once per jiffie
9022+ */
9023+ try_release_pending(&gedf);
9024+ if (want_resched != FORCE_RESCHED)
9025+ {
9026+ read_lock_irqsave(&gedf.ready_lock, flags);
9027+ if (edf_preemption_needed(&gedf, t))
9028+ {
9029+ want_resched = FORCE_RESCHED;
9030+ set_will_schedule();
9031+ }
9032+ read_unlock_irqrestore(&gedf.ready_lock, flags);
9033+ }
9034+ }
9035+ return want_resched;
9036+}
9037+
9038+/* This is the main Global EDF schedule function.
9039+ *
9040+ * Assumes the caller holds the lock for rq and that irqs are disabled.
9041+ * This function only works for indirect switching.
9042+ */
9043+static int gedf_schedule(struct task_struct * prev,
9044+ struct task_struct ** next,
9045+ runqueue_t * rq)
9046+{
9047+ int need_deactivate = 1;
9048+ int rt;
9049+ jiffie_t deadline;
9050+ unsigned long flags;
9051+
9052+
9053+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
9054+ {
9055+ DUMP("preparing %d for next period\n", prev->pid);
9056+ edf_prepare_for_next_period(prev);
9057+ }
9058+
9059+ if (get_rt_mode() == MODE_RT_RUN) {
9060+ write_lock_irqsave(&gedf.ready_lock, flags);
9061+
9062+ clear_will_schedule();
9063+
9064+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
9065+ && !edf_preemption_needed(&gedf, prev)) {
9066+ /* Our current task's next job has already been
9067+ * released and has higher priority than the highest
9068+ * priority waiting task; in other words: it is tardy.
9069+ * We just keep it.
9070+ */
9071+ DUMP("prev will be next, already released\n");
9072+ *next = prev;
9073+ rt = 1;
9074+ deadline = prev->rt_param.times.deadline;
9075+ need_deactivate = 0;
9076+ } else {
9077+ /* either not yet released, preempted, or non-rt */
9078+ *next = __take_ready(&gedf);
9079+ if (*next) {
9080+ /* mark the task as executing on this cpu */
9081+ set_task_cpu(*next, smp_processor_id());
9082+
9083+ /* stick the task into the runqueue */
9084+ __activate_task(*next, rq);
9085+ rt = 1;
9086+ deadline = (*next)->rt_param.times.deadline;
9087+ }
9088+ else
9089+ rt = deadline = 0;
9090+ }
9091+
9092+ adjust_cpu_queue(rt, deadline);
9093+
9094+ if (rt) {
9095+ set_rt_flags(*next, RT_F_RUNNING);
9096+ gedf.check_resched(&gedf);
9097+ }
9098+ write_unlock_irqrestore(&gedf.ready_lock, flags);
9099+ }
9100+
9101+ if (is_realtime(prev) && need_deactivate && prev->array) {
9102+ /* take it out of the run queue */
9103+ deactivate_task(prev, rq);
9104+ }
9105+
9106+ /* don't put prev back into the release queue yet.
9107+ * We first need to actually switch
9108+ * stacks before we can execute it
9109+ * on a different CPU */
9110+
9111+ /* in the current implementation nobody cares about the return value */
9112+ return 0;
9113+}
9114+
9115+
9116+/* _finish_switch - we just finished the switch away from prev
9117+ * it is now safe to requeue the task
9118+ */
9119+static void gedf_finish_switch(struct task_struct *prev)
9120+{
9121+ if (!is_realtime(prev) || !is_running(prev))
9122+ return;
9123+
9124+ /*printk(KERN_INFO "gedf finish switch for %d\n", prev->pid);*/
9125+ if (get_rt_flags(prev) == RT_F_SLEEP ||
9126+ get_rt_mode() != MODE_RT_RUN) {
9127+ /* This task has expired.
9128+ * _schedule has already taken care of updating the
9129+ * release time and the deadline. We just have to check
9130+ * whether it has been released.
9131+ */
9132+ if (time_before_eq(prev->rt_param.times.release, jiffies)
9133+ && get_rt_mode() == MODE_RT_RUN) {
9134+ /* already released */
9135+ add_ready(&gedf, prev);
9136+ DUMP("%d goes straight to ready queue\n", prev->pid);
9137+ }
9138+ else
9139+ /* it has got to wait */
9140+ add_release(&gedf, prev);
9141+ }
9142+ else {
9143+ /* this is a forced preemption
9144+ * thus the task stays in the ready_queue
9145+ * we only have to make it available to others
9146+ */
9147+ add_ready(&gedf, prev);
9148+ }
9149+}
9150+
9151+
9152+/* Prepare a task for running in RT mode.
9153+ * Enqueues the task into the master queue data structure.
9154+ * Returns
9155+ * -EPERM if the task is not TASK_STOPPED.
9156+ */
9157+static long gedf_prepare_task(struct task_struct * t)
9158+{
9159+ TRACE("global edf: prepare task %d\n", t->pid);
9160+
9161+ if (t->state == TASK_STOPPED) {
9162+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
9163+
9164+ if (get_rt_mode() == MODE_RT_RUN)
9165+ /* Real-time mode is already active;
9166+ * prepare an immediate release.
9167+ */
9168+ edf_release_now(t);
9169+ /* The task must be marked TASK_RUNNING while it sits in the queue;
9170+ * otherwise the signal code will try to wake it up, with fatal consequences.
9171+ */
9172+ t->state = TASK_RUNNING;
9173+ add_release(&gedf, t);
9174+ return 0;
9175+ }
9176+ else
9177+ return -EPERM;
9178+}
9179+
9180+static void gedf_wake_up_task(struct task_struct *task)
9181+{
9182+ /* We must determine whether the task should go into the release
9183+ * queue or into the ready queue. It may enter the ready queue
9184+ * if it has credit left in its time slice and has not yet reached
9185+ * its deadline. If it is now past its deadline we assume this is the
9186+ * arrival of a new sporadic job and thus put it in the ready queue
9187+ * anyway. If it has zero budget and the next release is in the future
9188+ * it has to go to the release queue.
9189+ */
9190+ TRACE("global edf: wake up %d with budget=%d\n",
9191+ task->pid, task->time_slice);
9192+ task->state = TASK_RUNNING;
9193+ if (is_tardy(task)) {
9194+ /* new sporadic release */
9195+ edf_release_now(task);
9196+ sched_trace_job_release(task);
9197+ add_ready(&gedf, task);
9198+ }
9199+ else if (task->time_slice) {
9200+ /* came back in time before deadline
9201+ */
9202+ set_rt_flags(task, RT_F_RUNNING);
9203+ add_ready(&gedf, task);
9204+ }
9205+ else {
9206+ add_release(&gedf, task);
9207+ }
9208+
9209+}
9210+
9211+static void gedf_task_blocks(struct task_struct *t)
9212+{
9213+ BUG_ON(!is_realtime(t));
9214+ /* not really anything to do since it can only block if
9215+ * it is running, and when it is not running it is not in any
9216+ * queue anyway.
9217+ *
9218+ */
9219+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
9220+ BUG_ON(t->rt_list.next != LIST_POISON1);
9221+ BUG_ON(t->rt_list.prev != LIST_POISON2);
9222+}
9223+
9224+
9225+/* When _tear_down is called, the task should not be in any queue any more
9226+ * as it must have blocked first. We don't have any internal state for the task,
9227+ * it is all in the task_struct.
9228+ */
9229+static long gedf_tear_down(struct task_struct * t)
9230+{
9231+ BUG_ON(!is_realtime(t));
9232+ TRACE("global edf: tear down called for %d \n", t->pid);
9233+ BUG_ON(t->array);
9234+ BUG_ON(t->rt_list.next != LIST_POISON1);
9235+ BUG_ON(t->rt_list.prev != LIST_POISON2);
9236+ return 0;
9237+}
9238+
9239+
9240+static int gedf_mode_change(int new_mode)
9241+{
9242+ int cpu;
9243+ cpu_entry_t *entry;
9244+
9245+/* printk(KERN_INFO "[%d] global edf: mode changed to %d\n", smp_processor_id(),
9246+ new_mode);*/
9247+ if (new_mode == MODE_RT_RUN) {
9248+ rerelease_all(&gedf, edf_release_at);
9249+
9250+ /* initialize per CPU state
9251+ * we can't do this at boot time because we don't know
9252+ * which CPUs will be online and we can't put non-existing
9253+ * cpus into the queue
9254+ */
9255+ spin_lock(&gedf_cpu_lock);
9256+ /* get old cruft out of the way in case we reenter real-time
9257+ * mode for a second time
9258+ */
9259+ while (!list_empty(&gedf_cpu_queue))
9260+ list_del(gedf_cpu_queue.next);
9261+ /* reinitialize */
9262+ for_each_online_cpu(cpu) {
9263+ entry = &per_cpu(gedf_cpu_entries, cpu);
9264+ atomic_set(&entry->will_schedule, 0);
9265+ entry->executes_realtime = 0;
9266+ entry->cur_deadline = 0;
9267+ entry->cpu = cpu;
9268+ list_add(&entry->list, &gedf_cpu_queue);
9269+ }
9270+ spin_unlock(&gedf_cpu_lock);
9271+ }
9272+ /*printk(KERN_INFO "[%d] global edf: mode change done\n", smp_processor_id()); */
9273+ return 0;
9274+}
9275+
9276+
9277+/* Plugin object */
9278+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
9279+ .ready_to_use = 0
9280+};
9281+
9282+
9283+/*
9284+ * Plugin initialization code.
9285+ */
9286+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
9287+ .plugin_name = "Global EDF",\
9288+ .ready_to_use = 1,\
9289+ .scheduler_tick = gedf_scheduler_tick,\
9290+ .prepare_task = gedf_prepare_task,\
9291+ .sleep_next_period = edf_sleep_next_period,\
9292+ .tear_down = gedf_tear_down,\
9293+ .schedule = gedf_schedule,\
9294+ .finish_switch = gedf_finish_switch,\
9295+ .mode_change = gedf_mode_change,\
9296+ .wake_up_task = gedf_wake_up_task,\
9297+ .task_blocks = gedf_task_blocks \
9298+ }
9299+
9300+
9301+sched_plugin_t *__init init_global_edf_plugin(void)
9302+{
9303+ if (!s_plugin.ready_to_use)
9304+ {
9305+ edf_domain_init(&gedf, gedf_check_resched);
9306+ s_plugin = INIT_SCHED_PLUGIN;
9307+ }
9308+ return &s_plugin;
9309+}
9310+
9311+
9312+
9313+/*****************************************************************************/
9314+/*****************************************************************************/
9315+/*****************************************************************************/
9316+/* NON-PREEMPTIVE GLOBAL EDF */
9317+
9318+
9319+/* gedf_np_scheduler_tick - this function is called for every local timer
9320+ * interrupt.
9321+ *
9322+ * checks whether the current task has expired and checks
9323+ * whether we need to preempt it if it has not expired
9324+ */
9325+static reschedule_check_t gedf_np_scheduler_tick(void)
9326+{
9327+ if (get_rt_mode() == MODE_RT_RUN)
9328+ {
9329+ /* check whether anything is waiting to be released
9330+ * this could probably be moved to the global timer
9331+ * interrupt handler since the state will only change
9332+ * once per jiffie
9333+ */
9334+ try_release_pending(&gedf);
9335+ }
9336+
9337+ /* expire tasks even if not in real-time mode
9338+ * this makes sure that at the end of real-time mode
9339+ * no tasks "run away forever".
9340+ */
9341+ BUG_ON(current->time_slice > 1000);
9342+ if (is_realtime(current) && (!--current->time_slice)) {
9343+ /* this task has exhausted its budget in this period */
9344+ set_rt_flags(current, RT_F_SLEEP);
9345+ return FORCE_RESCHED;
9346+ }
9347+ else
9348+ return NO_RESCHED;
9349+}
9350+
9351+/* gedf_np_check_resched - Check whether another CPU needs to reschedule.
9352+ *
9353+ * The function only checks and kicks the last CPU in the queue; that CPU will,
9354+ * after rescheduling, check and kick the next one if necessary, and so on. The
9355+ * caller must ensure that it is not itself the last entry or that no reschedule is necessary.
9356+ *
9357+ */
9358+static int gedf_np_check_resched(rt_domain_t *edf)
9359+{
9360+ cpu_entry_t *last;
9361+ int ret = 0;
9362+
9363+ spin_lock(&gedf_cpu_lock);
9364+
9365+ if (!list_empty(&edf->ready_queue)) {
9366+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
9367+ /* preemption happens only for non-realtime tasks */
9368+ if (!last->executes_realtime)
9369+ {
9370+ if (smp_processor_id() == last->cpu)
9371+ set_tsk_need_resched(current);
9372+ else
9373+ smp_send_reschedule(last->cpu);
9374+ ret = 1;
9375+ goto out;
9376+ }
9377+ }
9378+
9379+ out:
9380+ spin_unlock(&gedf_cpu_lock);
9381+ return ret;
9382+}
9383+
9384+
9385+/* non-preemptive global EDF
9386+ *
9387+ * Non-preemptive EDF is almost the same as normal EDF. We only have to
9388+ * adjust the scheduler tick and the resched function.
9389+ */
9390+#define INIT_SCHED_PLUGIN_NP (struct sched_plugin){\
9391+ .plugin_name = "Non-Preemptive Global EDF",\
9392+ .ready_to_use = 1,\
9393+ .scheduler_tick = gedf_np_scheduler_tick,\
9394+ .prepare_task = gedf_prepare_task,\
9395+ .sleep_next_period = edf_sleep_next_period,\
9396+ .tear_down = gedf_tear_down,\
9397+ .schedule = gedf_schedule,\
9398+ .finish_switch = gedf_finish_switch,\
9399+ .mode_change = gedf_mode_change,\
9400+ .wake_up_task = gedf_wake_up_task,\
9401+ .task_blocks = gedf_task_blocks \
9402+ }
9403+
9404+
9405+/* as we only set the plugin at boot time,
9406+ * we use the same structure as preemptive EDF. This simplifies a lot
9407+ * of the functions.
9408+ */
9409+sched_plugin_t* __init init_global_edf_np_plugin(void)
9410+{
9411+ if (!s_plugin.ready_to_use)
9412+ {
9413+ edf_domain_init(&gedf, gedf_np_check_resched);
9414+ s_plugin = INIT_SCHED_PLUGIN_NP;
9415+ }
9416+ return &s_plugin;
9417+}
9418diff --git a/kernel/sched_gsn_edf.c b/kernel/sched_gsn_edf.c
9419new file mode 100644
9420index 0000000..f6ba521
9421--- /dev/null
9422+++ b/kernel/sched_gsn_edf.c
9423@@ -0,0 +1,816 @@
9424+/*
9425+ * kernel/sched_gsn_edf.c
9426+ *
9427+ * Implementation of the GSN-EDF scheduling algorithm.
9428+ *
9429+ * This version uses the simple approach and serializes all scheduling
9430+ * decisions by the use of a queue lock. This is probably not the
9431+ * best way to do it, but it should suffice for now. It should not
9432+ * affect the benchmarks since all synchronization primitives will
9433+ * take the same performance hit, if any.
9434+ */
9435+
9436+#include <linux/percpu.h>
9437+#include <linux/sched.h>
9438+#include <linux/list.h>
9439+
9440+#include <linux/queuelock.h>
9441+#include <linux/litmus.h>
9442+#include <linux/sched_plugin.h>
9443+#include <linux/edf_common.h>
9444+#include <linux/sched_trace.h>
9445+
9446+/* Overview of GSN-EDF operations.
9447+ *
9448+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
9449+ * description only covers how the individual operations are implemented in
9450+ * LITMUS.
9451+ *
9452+ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
9453+ * structure (NOT the actually scheduled
9454+ * task). If there is another linked task To
9455+ * already it will set To->linked_on = NO_CPU
9456+ * (thereby removing its association with this
9457+ * CPU). However, it will not requeue the
9458+ * previously linked task (if any). It will set
9459+ * T's state to RT_F_RUNNING and check whether
9460+ * it is already running somewhere else. If T
9461+ * is scheduled somewhere else it will link
9462+ * it to that CPU instead (and pull the linked
9463+ * task to cpu). T may be NULL.
9464+ *
9465+ * unlink(T) - Unlink removes T from all scheduler data
9466+ * structures. If it is linked to some CPU it
9467+ * will link NULL to that CPU. If it is
9468+ * currently queued in the gsnedf queue it will
9469+ * be removed from the T->rt_list. It is safe to
9470+ * call unlink(T) if T is not linked. T may not
9471+ * be NULL.
9472+ *
9473+ * requeue(T) - Requeue will insert T into the appropriate
9474+ * queue. If the system is in real-time mode and
9475+ * T is released already, it will go into the
9476+ * ready queue. If the system is not in
9477+ * real-time mode, then T will go into the
9478+ * release queue. If T's release time is in the
9479+ * future, it will go into the release
9480+ * queue. That means that T's release time/job
9481+ * no/etc. has to be updated before requeue(T) is
9482+ * called. It is not safe to call requeue(T)
9483+ * when T is already queued. T may not be NULL.
9484+ *
9485+ * gsnedf_job_arrival(T) - This is the catch all function when T enters
9486+ * the system after either a suspension or at a
9487+ * job release. It will queue T (which means it
9488+ * is not safe to call gsnedf_job_arrival(T) if
9489+ * T is already queued) and then check whether a
9490+ * preemption is necessary. If a preemption is
9491+ * necessary it will update the linkage
9492+ * accordingly and cause scheduled to be called
9493+ * (either with an IPI or need_resched). It is
9494+ * safe to call gsnedf_job_arrival(T) if T's
9495+ * next job has not been actually released yet
9496+ *                                (release time in the future). T will be put
9497+ * on the release queue in that case.
9498+ *
9499+ * job_completion(T) - Take care of everything that needs to be done
9500+ * to prepare T for its next release and place
9501+ * it in the right queue with
9502+ * gsnedf_job_arrival().
9503+ *
9504+ *
9505+ * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
9506+ * equivalent to unlink(T). Note that if you unlink a task from a CPU, none of
9507+ * these functions will automatically pull a pending task from the ready queue
9508+ * to fill the now-empty link. That is the job of the calling function (by
9509+ * means of __take_ready).
9510+ */
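+/* Illustrative example (two CPUs): suppose T1 is linked to and scheduled on
+ * CPU0, T1 currently being the lowest-priority linked task, when a
+ * higher-priority job T2 arrives. gsnedf_job_arrival(T2) requeues T1, links
+ * T2 to CPU0 and calls preempt(). Until gsnedf_schedule() and
+ * gsnedf_finish_switch() have run on CPU0, linked (T2) and scheduled (T1)
+ * differ -- exactly the condition that gsnedf_schedule() checks for.
+ */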
9511+
9512+
9513+/* cpu_entry_t - maintain the linked and scheduled state
9514+ */
9515+typedef struct {
9516+ int cpu;
9517+ struct task_struct* linked; /* only RT tasks */
9518+ struct task_struct* scheduled; /* only RT tasks */
9519+ struct list_head list;
9520+ atomic_t will_schedule; /* prevent unneeded IPIs */
9521+} cpu_entry_t;
9522+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
9523+
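+/* The will_schedule flag is set by the local CPU once it has decided to
+ * reschedule (see gsnedf_scheduler_tick()) and cleared at the start of
+ * gsnedf_schedule(). preempt() tests it before sending an IPI so that a CPU
+ * that is about to reschedule anyway is not interrupted again.
+ */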
9524+#define set_will_schedule() \
9525+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
9526+#define clear_will_schedule() \
9527+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
9528+#define test_will_schedule(cpu) \
9529+ (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
9530+
9531+
9532+#define NO_CPU 0xffffffff
9533+
9534+/* The gsnedf_lock is used to serialize all scheduling events.
9535+ * It protects the gsnedf rt_domain, the per-CPU linkage state, and the CPU queue.
9536+ */
9537+static queuelock_t gsnedf_lock;
9538+/* the cpus queue themselves according to priority in here */
9539+static LIST_HEAD(gsnedf_cpu_queue);
9540+
9541+static rt_domain_t gsnedf;
9542+
9543+
9544+/* update_cpu_position - Move the cpu entry to the correct place to maintain
9545+ * order in the cpu queue. Caller must hold gsnedf lock.
9546+ */
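+/* The queue is kept sorted by the priority of each CPU's linked task, so the
+ * tail of gsnedf_cpu_queue always holds the CPU with the lowest-priority
+ * linked task (or an idle CPU). gsnedf_job_arrival() only needs to inspect
+ * that tail entry to decide whether a newly arrived job preempts anything.
+ */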
9547+static void update_cpu_position(cpu_entry_t *entry)
9548+{
9549+ cpu_entry_t *other;
9550+ struct list_head *pos;
9551+ list_del(&entry->list);
9552+	/* if this CPU has no linked real-time job, it simply moves
9553+	 * to the end of the queue
9554+ */
9555+ if (entry->linked) {
9556+ list_for_each(pos, &gsnedf_cpu_queue) {
9557+ other = list_entry(pos, cpu_entry_t, list);
9558+ if (edf_higher_prio(entry->linked, other->linked)) {
9559+ __list_add(&entry->list, pos->prev, pos);
9560+ return;
9561+ }
9562+ }
9563+ }
9564+ /* if we get this far we have the lowest priority job */
9565+ list_add_tail(&entry->list, &gsnedf_cpu_queue);
9566+}
9567+
9568+/* link_task_to_cpu - Update the link of a CPU.
9569+ * Handles the case where the to-be-linked task is already
9570+ * scheduled on a different CPU.
9571+ */
9572+static noinline void link_task_to_cpu(struct task_struct* linked,
9573+ cpu_entry_t *entry)
9574+
9575+{
9576+ cpu_entry_t *sched;
9577+ struct task_struct* tmp;
9578+ int on_cpu;
9579+
9580+ BUG_ON(linked && !is_realtime(linked));
9581+
9582+ /* Currently linked task is set to be unlinked. */
9583+ if (entry->linked) {
9584+ entry->linked->rt_param.linked_on = NO_CPU;
9585+ }
9586+
9587+ /* Link new task to CPU. */
9588+ if (linked) {
9589+ set_rt_flags(linked, RT_F_RUNNING);
9590+		/* handle the case that the task is already scheduled somewhere else! */
9591+ on_cpu = linked->rt_param.scheduled_on;
9592+ if (on_cpu != NO_CPU) {
9593+ sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
9594+ /* this should only happen if not linked already */
9595+ BUG_ON(sched->linked == linked);
9596+
9597+ /* If we are already scheduled on the CPU to which we
9598+ * wanted to link, we don't need to do the swap --
9599+ * we just link ourselves to the CPU and depend on
9600+ * the caller to get things right.
9601+ */
9602+ if (entry != sched) {
9603+ tmp = sched->linked;
9604+ linked->rt_param.linked_on = sched->cpu;
9605+ sched->linked = linked;
9606+ update_cpu_position(sched);
9607+ linked = tmp;
9608+ }
9609+ }
9610+ if (linked) /* might be NULL due to swap */
9611+ linked->rt_param.linked_on = entry->cpu;
9612+ }
9613+ entry->linked = linked;
9614+ update_cpu_position(entry);
9615+}
9616+
9617+/* unlink - Make sure a task is not linked any longer to an entry
9618+ * where it was linked before. Must hold gsnedf_lock.
9619+ */
9620+static noinline void unlink(struct task_struct* t)
9621+{
9622+ cpu_entry_t *entry;
9623+
9624+ if (unlikely(!t)) {
9625+ TRACE_BUG_ON(!t);
9626+ return;
9627+ }
9628+
9629+ if (t->rt_param.linked_on != NO_CPU) {
9630+ /* unlink */
9631+ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
9632+ t->rt_param.linked_on = NO_CPU;
9633+ link_task_to_cpu(NULL, entry);
9634+ } else if (in_list(&t->rt_list)) {
9635+ /* This is an interesting situation: t is scheduled,
9636+ * but was just recently unlinked. It cannot be
9637+ * linked anywhere else (because then it would have
9638+ * been relinked to this CPU), thus it must be in some
9639+ * queue. We must remove it from the list in this
9640+ * case.
9641+ */
9642+ list_del(&t->rt_list);
9643+ }
9644+}
9645+
9646+
9647+/* preempt - force a CPU to reschedule
9648+ */
9649+static noinline void preempt(cpu_entry_t *entry)
9650+{
9651+ /* We cannot make the is_np() decision here if it is a remote CPU
9652+ * because requesting exit_np() requires that we currently use the
9653+ * address space of the task. Thus, in the remote case we just send
9654+ * the IPI and let schedule() handle the problem.
9655+ */
9656+
9657+ if (smp_processor_id() == entry->cpu) {
9658+ if (entry->scheduled && is_np(entry->scheduled))
9659+ request_exit_np(entry->scheduled);
9660+ else
9661+ set_tsk_need_resched(current);
9662+ } else
9663+		/* in case that it is a remote CPU we have to defer
9664+		 * the decision to the remote CPU
9665+ * FIXME: We could save a few IPI's here if we leave the flag
9666+ * set when we are waiting for a np_exit().
9667+ */
9668+ if (!test_will_schedule(entry->cpu))
9669+ smp_send_reschedule(entry->cpu);
9670+}
9671+
9672+/* requeue - Put an unlinked task into gsn-edf domain.
9673+ * Caller must hold gsnedf_lock.
9674+ */
9675+static noinline void requeue(struct task_struct* task)
9676+{
9677+ BUG_ON(!task);
9678+ /* sanity check rt_list before insertion */
9679+ BUG_ON(in_list(&task->rt_list));
9680+
9681+ if (get_rt_flags(task) == RT_F_SLEEP ||
9682+ get_rt_mode() != MODE_RT_RUN) {
9683+		/* this task has expired
9684+		 * _schedule has already taken care of updating
9685+		 * the release time and the deadline.
9686+		 * We only need to check whether it has been released.
9687+ */
9688+ if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
9689+ __add_ready(&gsnedf, task);
9690+ else {
9691+ /* it has got to wait */
9692+ __add_release(&gsnedf, task);
9693+ }
9694+
9695+ } else
9696+ /* this is a forced preemption
9697+ * thus the task stays in the ready_queue
9698+		 * we only need to make it available to others
9699+ */
9700+ __add_ready(&gsnedf, task);
9701+}
9702+
9703+/* gsnedf_job_arrival: task is either resumed or released */
9704+static noinline void gsnedf_job_arrival(struct task_struct* task)
9705+{
9706+ cpu_entry_t* last;
9707+
9708+ BUG_ON(list_empty(&gsnedf_cpu_queue));
9709+ BUG_ON(!task);
9710+
9711+ /* first queue arriving job */
9712+ requeue(task);
9713+
9714+ /* then check for any necessary preemptions */
9715+ last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list);
9716+ if (edf_preemption_needed(&gsnedf, last->linked)) {
9717+ /* preemption necessary */
9718+ task = __take_ready(&gsnedf);
9719+ TRACE("job_arrival: task %d linked to %d\n",
9720+ task->pid, last->cpu);
9721+ if (last->linked)
9722+ requeue(last->linked);
9723+
9724+ link_task_to_cpu(task, last);
9725+ preempt(last);
9726+ }
9727+}
9728+
9729+/* check for current job releases */
9730+static noinline void gsnedf_release_jobs(void)
9731+{
9732+ struct list_head *pos, *save;
9733+ struct task_struct *queued;
9734+
9735+ list_for_each_safe(pos, save, &gsnedf.release_queue) {
9736+ queued = list_entry(pos, struct task_struct, rt_list);
9737+ if (likely(is_released(queued))) {
9738+ /* this one is ready to go*/
9739+ list_del(pos);
9740+ set_rt_flags(queued, RT_F_RUNNING);
9741+
9742+ sched_trace_job_release(queued);
9743+ gsnedf_job_arrival(queued);
9744+ }
9745+ else
9746+ /* the release queue is ordered */
9747+ break;
9748+ }
9749+}
9750+
9751+/* gsnedf_scheduler_tick - this function is called for every local timer
9752+ * interrupt.
9753+ *
9754+ * checks whether the current task has expired and checks
9755+ * It charges the current real-time task for the elapsed quantum and forces a
9756+ * reschedule once its budget is exhausted; CPU 0 additionally releases pending
9757+ * jobs (which may in turn preempt lower-priority tasks).
9757+static reschedule_check_t gsnedf_scheduler_tick(void)
9758+{
9759+ unsigned long flags;
9760+ struct task_struct* t = current;
9761+ reschedule_check_t want_resched = NO_RESCHED;
9762+
9763+ /* expire tasks even if not in real-time mode
9764+ * this makes sure that at the end of real-time mode
9765+ * no task "runs away forever".
9766+ */
9767+ if (is_realtime(t)) {
9768+ TRACE_CUR("before dec: time_slice == %u\n", t->time_slice);
9769+ }
9770+
9771+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
9772+ if (!is_np(t)) { /* np tasks will be preempted when they become
9773+ preemptable again */
9774+ want_resched = FORCE_RESCHED;
9775+ set_will_schedule();
9776+ TRACE("gsnedf_scheduler_tick: "
9777+ "%d is preemptable "
9778+ " => FORCE_RESCHED\n", t->pid);
9779+ } else {
9780+ TRACE("gsnedf_scheduler_tick: "
9781+ "%d is non-preemptable, "
9782+ "preemption delayed.\n", t->pid);
9783+ request_exit_np(t);
9784+ }
9785+ }
9786+
9787+ /* only the first CPU needs to release jobs */
9788+ if (get_rt_mode() == MODE_RT_RUN && smp_processor_id() == 0) {
9789+ queue_lock_irqsave(&gsnedf_lock, flags);
9790+
9791+ /* (1) try to release pending jobs */
9792+ gsnedf_release_jobs();
9793+
9794+ /* we don't need to check linked != scheduled since
9795+		 * set_tsk_need_resched() has already been called by preempt() if necessary
9796+ */
9797+
9798+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9799+ }
9800+
9801+ return want_resched;
9802+}
9803+
9804+/* caller holds gsnedf_lock */
9805+static noinline void job_completion(struct task_struct *t)
9806+{
9807+ BUG_ON(!t);
9808+
9809+ sched_trace_job_completion(t);
9810+
9811+ TRACE_TASK(t, "job_completion().\n");
9812+
9813+ /* set flags */
9814+ set_rt_flags(t, RT_F_SLEEP);
9815+ /* prepare for next period */
9816+ edf_prepare_for_next_period(t);
9817+ /* unlink */
9818+ unlink(t);
9819+ /* requeue
9820+ * But don't requeue a blocking task. */
9821+ if (is_running(t))
9822+ gsnedf_job_arrival(t);
9823+}
9824+
9825+
9826+/* Getting schedule() right is a bit tricky. schedule() may not make any
9827+ * assumptions on the state of the current task since it may be called for a
9828+ * number of reasons. The reasons include a scheduler_tick() determined that it
9829+ * number of reasons: because scheduler_tick() determined that it
9830+ * was necessary, because sys_exit_np() was called, because some Linux
9831+ * subsystem determined so, or even (in the worst case) because there is a bug
9832+ * current state is.
9833+ *
9834+ * The CPU could currently be scheduling a task (or not), be linked (or not).
9835+ *
9836+ * The following assertions for the scheduled task could hold:
9837+ *
9838+ * - !is_running(scheduled) // the job blocks
9839+ * - scheduled->time_slice == 0 // the job completed (forcefully)
9840+ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
9841+ * - linked != scheduled // we need to reschedule (for any reason)
9842+ * - is_np(scheduled) // rescheduling must be delayed,
9843+ * sys_exit_np must be requested
9844+ *
9845+ * Any of these can occur together.
9846+ */
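+/* Locking protocol: gsnedf_schedule() acquires gsnedf_lock. If no real-time
+ * task is affected, or if nothing changes and finish_switch() will therefore
+ * not be called, the lock is dropped again at the end of gsnedf_schedule();
+ * otherwise it is held across the context switch and released in
+ * gsnedf_finish_switch().
+ */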
9847+static int gsnedf_schedule(struct task_struct * prev,
9848+ struct task_struct ** next,
9849+ runqueue_t * rq)
9850+{
9851+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
9852+ int out_of_time, sleep, preempt, np, exists,
9853+ rt, blocks;
9854+ struct task_struct* linked;
9855+
9856+ /* Will be released in finish_switch. */
9857+ queue_lock(&gsnedf_lock);
9858+ clear_will_schedule();
9859+
9860+ /* sanity checking */
9861+ BUG_ON(entry->scheduled && entry->scheduled != prev);
9862+ BUG_ON(entry->scheduled && !is_realtime(prev));
9863+ BUG_ON(is_realtime(prev) && !entry->scheduled);
9864+
9865+ /* (0) Determine state */
9866+ exists = entry->scheduled != NULL;
9867+ blocks = exists && !is_running(entry->scheduled);
9868+ out_of_time = exists && !entry->scheduled->time_slice;
9869+ np = exists && is_np(entry->scheduled);
9870+ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
9871+ preempt = entry->scheduled != entry->linked;
9872+ rt = get_rt_mode() == MODE_RT_RUN;
9873+
9874+ /* If a task blocks we have no choice but to reschedule.
9875+ */
9876+ if (blocks)
9877+ unlink(entry->scheduled);
9878+
9879+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
9880+ * We need to make sure to update the link structure anyway in case
9881+ * that we are still linked. Multiple calls to request_exit_np() don't
9882+ * hurt.
9883+ */
9884+ if (np && (out_of_time || preempt || sleep)) {
9885+ unlink(entry->scheduled);
9886+ request_exit_np(entry->scheduled);
9887+ }
9888+
9889+ /* Any task that is preemptable and either exhausts its execution
9890+ * budget or wants to sleep completes. We may have to reschedule after
9891+ * this.
9892+ */
9893+ if (!np && (out_of_time || sleep))
9894+ job_completion(entry->scheduled);
9895+
9896+ /* Stop real-time tasks when we leave real-time mode
9897+ */
9898+ if (!rt && entry->linked) {
9899+ /* task will be preempted once it is preemptable
9900+ * (which it may be already)
9901+ */
9902+ linked = entry->linked;
9903+ unlink(linked);
9904+ requeue(linked);
9905+ }
9906+
9907+ /* Link pending task if we became unlinked.
9908+ */
9909+ if (rt && !entry->linked)
9910+ link_task_to_cpu(__take_ready(&gsnedf), entry);
9911+
9912+ /* The final scheduling decision. Do we need to switch for some reason?
9913+	 * If linked is different from scheduled, select linked as next.
9914+ */
9915+ if ((!np || blocks) &&
9916+ entry->linked != entry->scheduled) {
9917+ /* Take care of a previously scheduled
9918+ * job by taking it out of the Linux runqueue.
9919+ */
9920+ if (entry->scheduled) {
9921+ if (prev->array)
9922+ /* take it out of the run queue */
9923+ deactivate_task(prev, rq);
9924+ }
9925+
9926+ /* Schedule a linked job? */
9927+ if (entry->linked) {
9928+ *next = entry->linked;
9929+ /* mark the task as executing on this cpu */
9930+ set_task_cpu(*next, smp_processor_id());
9931+ /* stick the task into the runqueue */
9932+ __activate_task(*next, rq);
9933+ }
9934+ } else
9935+		/* Only override the Linux scheduler if a real-time task
9936+		 * is scheduled that needs to continue.
9937+ */
9938+ if (exists)
9939+ *next = prev;
9940+
9941+ /* Unlock in case that we don't affect real-time tasks or
9942+ * if nothing changed and finish_switch won't be called.
9943+ */
9944+ if (prev == *next || (!is_realtime(prev) && !*next))
9945+ queue_unlock(&gsnedf_lock);
9946+
9947+ return 0;
9948+}
9949+
9950+
9951+/* _finish_switch - we just finished the switch away from prev
9952+ */
9953+static void gsnedf_finish_switch(struct task_struct *prev)
9954+{
9955+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
9956+
9957+ if (is_realtime(current))
9958+ entry->scheduled = current;
9959+ else
9960+ entry->scheduled = NULL;
9961+
9962+ prev->rt_param.scheduled_on = NO_CPU;
9963+ current->rt_param.scheduled_on = smp_processor_id();
9964+
9965+ /* unlock in case schedule() left it locked */
9966+ if (is_realtime(current) || is_realtime(prev))
9967+ queue_unlock(&gsnedf_lock);
9968+}
9969+
9970+
9971+/* Prepare a task for running in RT mode
9972+ * Enqueues the task into master queue data structure
9973+ * returns
9974+ * -EPERM if task is not TASK_STOPPED
9975+ */
9976+static long gsnedf_prepare_task(struct task_struct * t)
9977+{
9978+ unsigned long flags;
9979+ TRACE("gsn edf: prepare task %d\n", t->pid);
9980+
9981+ if (t->state == TASK_STOPPED) {
9982+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
9983+
9984+ t->rt_param.scheduled_on = NO_CPU;
9985+ t->rt_param.linked_on = NO_CPU;
9986+ if (get_rt_mode() == MODE_RT_RUN)
9987+			/* Real-time mode is already active.
9988+			 * Prepare an immediate release.
9989+ */
9990+ edf_release_now(t);
9991+		/* The task must be TASK_RUNNING while it is in the queue; otherwise
9992+		 * the signal code will try to wake it up, with fatal consequences.
9993+ */
9994+ t->state = TASK_RUNNING;
9995+
9996+ queue_lock_irqsave(&gsnedf_lock, flags);
9997+ requeue(t);
9998+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9999+ return 0;
10000+ }
10001+ else
10002+ return -EPERM;
10003+}
10004+
10005+static void gsnedf_wake_up_task(struct task_struct *task)
10006+{
10007+ unsigned long flags;
10008+ /* We must determine whether task should go into the release
10009+ * queue or into the ready queue. It may enter the ready queue
10010+ * if it has credit left in its time slice and has not yet reached
10011+	 * its deadline. If it is now past its deadline, we assume this is the
10012+	 * arrival of a new sporadic job and thus put it in the ready queue
10013+	 * anyway. If it has zero budget and the next release is in the future,
10014+	 * it has to go to the release queue.
10015+ */
10016+ TRACE("gsnedf: %d unsuspends with budget=%d\n",
10017+ task->pid, task->time_slice);
10018+ task->state = TASK_RUNNING;
10019+
10020+ /* We need to take suspensions because of semaphores into
10021+ * account! If a job resumes after being suspended due to acquiring
10022+ * a semaphore, it should never be treated as a new job release.
10023+ */
10024+ if (get_rt_flags(task) == RT_F_EXIT_SEM) {
10025+ set_rt_flags(task, RT_F_RUNNING);
10026+ } else {
10027+ if (is_tardy(task)) {
10028+ /* new sporadic release */
10029+ edf_release_now(task);
10030+ sched_trace_job_release(task);
10031+ }
10032+ else if (task->time_slice)
10033+ /* came back in time before deadline
10034+ */
10035+ set_rt_flags(task, RT_F_RUNNING);
10036+ }
10037+
10038+ queue_lock_irqsave(&gsnedf_lock, flags);
10039+ gsnedf_job_arrival(task);
10040+ queue_unlock_irqrestore(&gsnedf_lock, flags);
10041+}
10042+
10043+static void gsnedf_task_blocks(struct task_struct *t)
10044+{
10045+ unsigned long flags;
10046+
10047+ /* unlink if necessary */
10048+ queue_lock_irqsave(&gsnedf_lock, flags);
10049+ unlink(t);
10050+ queue_unlock_irqrestore(&gsnedf_lock, flags);
10051+
10052+ BUG_ON(!is_realtime(t));
10053+ TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice);
10054+ BUG_ON(t->rt_list.next != LIST_POISON1);
10055+ BUG_ON(t->rt_list.prev != LIST_POISON2);
10056+}
10057+
10058+
10059+/* When _tear_down is called, the task should not be in any queue any more
10060+ * as it must have blocked first. We don't have any internal state for the task,
10061+ * it is all in the task_struct.
10062+ */
10063+static long gsnedf_tear_down(struct task_struct * t)
10064+{
10065+ BUG_ON(!is_realtime(t));
10066+ TRACE_TASK(t, "RIP\n");
10067+ BUG_ON(t->array);
10068+ BUG_ON(t->rt_list.next != LIST_POISON1);
10069+ BUG_ON(t->rt_list.prev != LIST_POISON2);
10070+ return 0;
10071+}
10072+
10073+static long gsnedf_pi_block(struct pi_semaphore *sem,
10074+ struct task_struct *new_waiter)
10075+{
10076+ /* This callback has to handle the situation where a new waiter is
10077+ * added to the wait queue of the semaphore.
10078+ *
10079+	 * We must check if it has a higher priority than the currently
10080+ * highest-priority task, and then potentially reschedule.
10081+ */
10082+
10083+ BUG_ON(!new_waiter);
10084+
10085+ if (edf_higher_prio(new_waiter, sem->hp.task)) {
10086+ TRACE_TASK(new_waiter, " boosts priority\n");
10087+ /* called with IRQs disabled */
10088+ queue_lock(&gsnedf_lock);
10089+ /* store new highest-priority task */
10090+ sem->hp.task = new_waiter;
10091+ if (sem->holder) {
10092+ /* let holder inherit */
10093+ sem->holder->rt_param.inh_task = new_waiter;
10094+ unlink(sem->holder);
10095+ gsnedf_job_arrival(sem->holder);
10096+ }
10097+ queue_unlock(&gsnedf_lock);
10098+ }
10099+
10100+ return 0;
10101+}
10102+
10103+static long gsnedf_inherit_priority(struct pi_semaphore *sem,
10104+ struct task_struct *new_owner)
10105+{
10106+ /* We don't need to acquire the gsnedf_lock since at the time of this
10107+ * call new_owner isn't actually scheduled yet (it's still sleeping)
10108+ * and since the calling function already holds sem->wait.lock, which
10109+ * prevents concurrent sem->hp.task changes.
10110+ */
10111+
10112+ if (sem->hp.task && sem->hp.task != new_owner) {
10113+ new_owner->rt_param.inh_task = sem->hp.task;
10114+ TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
10115+ sem->hp.task->comm, sem->hp.task->pid);
10116+ } else
10117+ TRACE_TASK(new_owner,
10118+ "cannot inherit priority, "
10119+ "no higher priority job waits.\n");
10120+ return 0;
10121+}
10122+
10123+/* This function is called on a semaphore release, and assumes that
10124+ * the current task is also the semaphore holder.
10125+ */
10126+static long gsnedf_return_priority(struct pi_semaphore *sem)
10127+{
10128+ struct task_struct* t = current;
10129+ int ret = 0;
10130+
10131+ /* Find new highest-priority semaphore task
10132+ * if holder task is the current hp.task.
10133+ *
10134+ * Calling function holds sem->wait.lock.
10135+ */
10136+ if (t == sem->hp.task)
10137+ edf_set_hp_task(sem);
10138+
10139+ TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
10140+
10141+ if (t->rt_param.inh_task) {
10142+ /* interrupts already disabled by PI code */
10143+ queue_lock(&gsnedf_lock);
10144+
10145+ /* Reset inh_task to NULL. */
10146+ t->rt_param.inh_task = NULL;
10147+
10148+ /* Check if rescheduling is necessary */
10149+ unlink(t);
10150+ gsnedf_job_arrival(t);
10151+ queue_unlock(&gsnedf_lock);
10152+ }
10153+
10154+ return ret;
10155+}
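+
+/* Taken together, gsnedf_pi_block(), gsnedf_inherit_priority() and
+ * gsnedf_return_priority() implement priority inheritance for the PI
+ * semaphores: the highest-priority waiter is recorded in sem->hp.task and
+ * boosts the holder via ->rt_param.inh_task; on release the boost is removed
+ * and, if necessary, a reschedule is triggered by re-running the job-arrival
+ * logic for the affected task.
+ */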
10156+
10157+static int gsnedf_mode_change(int new_mode)
10158+{
10159+ unsigned long flags;
10160+ int cpu;
10161+ cpu_entry_t *entry;
10162+
10163+ if (new_mode == MODE_RT_RUN) {
10164+ queue_lock_irqsave(&gsnedf_lock, flags);
10165+
10166+ __rerelease_all(&gsnedf, edf_release_at);
10167+
10168+ /* get old cruft out of the way in case we reenter real-time
10169+ * mode for a second time
10170+ */
10171+ while (!list_empty(&gsnedf_cpu_queue))
10172+ list_del(gsnedf_cpu_queue.next);
10173+ /* reinitialize */
10174+ for_each_online_cpu(cpu) {
10175+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
10176+ atomic_set(&entry->will_schedule, 0);
10177+ entry->linked = NULL;
10178+ entry->scheduled = NULL;
10179+ list_add(&entry->list, &gsnedf_cpu_queue);
10180+ }
10181+
10182+ queue_unlock_irqrestore(&gsnedf_lock, flags);
10183+
10184+ }
10185+ return 0;
10186+}
10187+
10188+
10189+/* Plugin object */
10190+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
10191+ .ready_to_use = 0
10192+};
10193+
10194+
10195+/*
10196+ * Plugin initialization code.
10197+ */
10198+#define INIT_SCHED_PLUGIN (struct sched_plugin){ \
10199+ .plugin_name = "GSN-EDF", \
10200+ .ready_to_use = 1, \
10201+ .scheduler_tick = gsnedf_scheduler_tick, \
10202+ .prepare_task = gsnedf_prepare_task, \
10203+ .sleep_next_period = edf_sleep_next_period, \
10204+ .tear_down = gsnedf_tear_down, \
10205+ .schedule = gsnedf_schedule, \
10206+ .finish_switch = gsnedf_finish_switch, \
10207+ .mode_change = gsnedf_mode_change, \
10208+ .wake_up_task = gsnedf_wake_up_task, \
10209+ .task_blocks = gsnedf_task_blocks, \
10210+ .inherit_priority = gsnedf_inherit_priority, \
10211+ .return_priority = gsnedf_return_priority, \
10212+ .pi_block = gsnedf_pi_block \
10213+}
10214+
10215+
10216+sched_plugin_t *__init init_gsn_edf_plugin(void)
10217+{
10218+ int cpu;
10219+ cpu_entry_t *entry;
10220+
10221+ if (!s_plugin.ready_to_use)
10222+ {
10223+ /* initialize CPU state */
10224+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
10225+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
10226+ atomic_set(&entry->will_schedule, 0);
10227+ entry->linked = NULL;
10228+ entry->scheduled = NULL;
10229+ entry->cpu = cpu;
10230+ }
10231+
10232+ queue_lock_init(&gsnedf_lock);
10233+ edf_domain_init(&gsnedf, NULL);
10234+ s_plugin = INIT_SCHED_PLUGIN;
10235+ }
10236+ return &s_plugin;
10237+}
10238+
10239+
10240diff --git a/kernel/sched_part_edf.c b/kernel/sched_part_edf.c
10241new file mode 100644
10242index 0000000..df741f5
10243--- /dev/null
10244+++ b/kernel/sched_part_edf.c
10245@@ -0,0 +1,340 @@
10246+/*
10247+ * kernel/sched_part_edf.c
10248+ *
10249+ * Implementation of the partitioned EDF scheduler plugin.
10250+ */
10251+
10252+#include <linux/percpu.h>
10253+#include <linux/sched.h>
10254+#include <linux/list.h>
10255+#include <linux/spinlock.h>
10256+
10257+#include <linux/litmus.h>
10258+#include <linux/sched_plugin.h>
10259+#include <linux/edf_common.h>
10260+
10261+
10262+typedef struct {
10263+ rt_domain_t domain;
10264+ int cpu;
10265+ struct task_struct* scheduled; /* only RT tasks */
10266+ spinlock_t lock;
10267+} part_edf_domain_t;
10268+
10269+
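+/* Convenience accessors: local_edf/local_pedf refer to the partition of the
+ * CPU we are currently executing on, remote_edf()/remote_pedf() to an
+ * explicitly given CPU, and task_edf() to the partition that the task has
+ * been assigned to via get_partition().
+ */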
10270+#define local_edf (&__get_cpu_var(part_edf_domains).domain)
10271+#define local_pedf (&__get_cpu_var(part_edf_domains))
10272+#define remote_edf(cpu) (&per_cpu(part_edf_domains, cpu).domain)
10273+#define remote_pedf(cpu) (&per_cpu(part_edf_domains, cpu))
10274+#define task_edf(task) remote_edf(get_partition(task))
10275+
10276+static void part_edf_domain_init(part_edf_domain_t* pedf,
10277+ check_resched_needed_t check,
10278+ int cpu)
10279+{
10280+ edf_domain_init(&pedf->domain, check);
10281+ pedf->cpu = cpu;
10282+ pedf->lock = SPIN_LOCK_UNLOCKED;
10283+ pedf->scheduled = NULL;
10284+}
10285+
10286+DEFINE_PER_CPU(part_edf_domain_t, part_edf_domains);
10287+
10288+/* This check is trivial in partitioned systems as we only have to consider
10289+ * the CPU of the partition.
10290+ *
10291+ */
10292+static int part_edf_check_resched(rt_domain_t *edf)
10293+{
10294+ part_edf_domain_t *pedf = container_of(edf, part_edf_domain_t, domain);
10295+ int ret = 0;
10296+
10297+ spin_lock(&pedf->lock);
10298+
10299+ /* because this is a callback from rt_domain_t we already hold
10300+ * the necessary lock for the ready queue
10301+ */
10302+ if (edf_preemption_needed(edf, pedf->scheduled)) {
10303+ if (pedf->cpu == smp_processor_id())
10304+ set_tsk_need_resched(current);
10305+ else
10306+ smp_send_reschedule(pedf->cpu);
10307+ ret = 1;
10308+ }
10309+ spin_unlock(&pedf->lock);
10310+ return ret;
10311+}
10312+
10313+
10314+static reschedule_check_t part_edf_scheduler_tick(void)
10315+{
10316+ unsigned long flags;
10317+ struct task_struct *t = current;
10318+ reschedule_check_t want_resched = NO_RESCHED;
10319+ rt_domain_t *edf = local_edf;
10320+ part_edf_domain_t *pedf = local_pedf;
10321+
10322+ /* Check for inconsistency. We don't need the lock for this since
10323+ * ->scheduled is only changed in schedule, which obviously is not
10324+ * executing in parallel on this CPU
10325+ */
10326+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
10327+
10328+ /* expire tasks even if not in real-time mode
10329+ * this makes sure that at the end of real-time mode
10330+ * no tasks "run away forever".
10331+ */
10332+ if (is_realtime(t) && (!--t->time_slice)) {
10333+ /* this task has exhausted its budget in this period */
10334+ set_rt_flags(t, RT_F_SLEEP);
10335+ want_resched = FORCE_RESCHED;
10336+ }
10337+ if (get_rt_mode() == MODE_RT_RUN)
10338+ {
10339+		/* Check whether anything is waiting to be released.
10340+		 * This could probably be moved to the global timer
10341+		 * interrupt handler since the state only changes
10342+		 * once per jiffy.
10343+ */
10344+ try_release_pending(edf);
10345+ if (want_resched != FORCE_RESCHED)
10346+ {
10347+ read_lock_irqsave(&edf->ready_lock, flags);
10348+ if (edf_preemption_needed(edf, t))
10349+ want_resched = FORCE_RESCHED;
10350+ read_unlock_irqrestore(&edf->ready_lock, flags);
10351+ }
10352+ }
10353+ return want_resched;
10354+}
10355+
10356+static int part_edf_schedule(struct task_struct * prev,
10357+ struct task_struct ** next,
10358+ runqueue_t * rq)
10359+{
10360+ int need_deactivate = 1;
10361+ part_edf_domain_t* pedf = local_pedf;
10362+ rt_domain_t* edf = &pedf->domain;
10363+
10364+
10365+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
10366+ edf_prepare_for_next_period(prev);
10367+
10368+ if (get_rt_mode() == MODE_RT_RUN) {
10369+ write_lock(&edf->ready_lock);
10370+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
10371+ && !edf_preemption_needed(edf, prev)) {
10372+ /* this really should only happen if the task has
10373+ * 100% utilization...
10374+ */
10375+ TRACE("prev will be next, already released\n");
10376+ *next = prev;
10377+ need_deactivate = 0;
10378+ } else {
10379+ /* either not yet released, preempted, or non-rt */
10380+ *next = __take_ready(edf);
10381+ if (*next) {
10382+ /* stick the task into the runqueue */
10383+ __activate_task(*next, rq);
10384+ set_task_cpu(*next, smp_processor_id());
10385+ }
10386+ }
10387+ spin_lock(&pedf->lock);
10388+ pedf->scheduled = *next;
10389+ spin_unlock(&pedf->lock);
10390+ if (*next)
10391+ set_rt_flags(*next, RT_F_RUNNING);
10392+
10393+ write_unlock(&edf->ready_lock);
10394+ }
10395+
10396+ if (is_realtime(prev) && need_deactivate && prev->array) {
10397+ /* take it out of the run queue */
10398+ deactivate_task(prev, rq);
10399+ }
10400+
10401+ return 0;
10402+}
10403+
10404+
10405+static void part_edf_finish_switch(struct task_struct *prev)
10406+{
10407+ rt_domain_t* edf = local_edf;
10408+
10409+ if (!is_realtime(prev) || !is_running(prev))
10410+ return;
10411+
10412+ if (get_rt_flags(prev) == RT_F_SLEEP ||
10413+ get_rt_mode() != MODE_RT_RUN) {
10414+		/* this task has expired
10415+		 * _schedule has already taken care of updating
10416+		 * the release time and the deadline.
10417+		 * We only need to check whether it has been released.
10418+ */
10419+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
10420+ /* already released */
10421+ add_ready(edf, prev);
10422+ TRACE("%d goes straight to ready queue\n", prev->pid);
10423+ } else
10424+ /* it has got to wait */
10425+ add_release(edf, prev);
10426+ } else {
10427+ /* this is a forced preemption
10428+ * thus the task stays in the ready_queue
10429+		 * we only need to make it available to others
10430+ */
10431+ add_ready(edf, prev);
10432+ }
10433+}
10434+
10435+
10436+/* Prepare a task for running in RT mode
10437+ * Enqueues the task into master queue data structure
10438+ * returns
10439+ * -EPERM if task is not TASK_STOPPED
10440+ */
10441+static long part_edf_prepare_task(struct task_struct * t)
10442+{
10443+ rt_domain_t* edf = task_edf(t);
10444+
10445+
10446+ TRACE("[%d] part edf: prepare task %d on CPU %d\n",
10447+ smp_processor_id(), t->pid, get_partition(t));
10448+ if (t->state == TASK_STOPPED) {
10449+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
10450+
10451+ if (get_rt_mode() == MODE_RT_RUN)
10452+			/* Real-time mode is already active.
10453+			 * Prepare an immediate release.
10454+ */
10455+ edf_release_now(t);
10456+		/* The task must be TASK_RUNNING while it is in the queue; otherwise
10457+		 * the signal code will try to wake it up, with fatal consequences.
10458+ */
10459+ t->state = TASK_RUNNING;
10460+ add_release(edf, t);
10461+ return 0;
10462+ } else
10463+ return -EPERM;
10464+}
10465+
10466+static void part_edf_wake_up_task(struct task_struct *task)
10467+{
10468+ rt_domain_t* edf;
10469+
10470+ edf = task_edf(task);
10471+
10472+ /* We must determine whether task should go into the release
10473+ * queue or into the ready queue. It may enter the ready queue
10474+ * if it has credit left in its time slice and has not yet reached
10475+	 * its deadline. If it is now past its deadline, we assume this is the
10476+	 * arrival of a new sporadic job and thus put it in the ready queue
10477+	 * anyway. If it has zero budget and the next release is in the future,
10478+	 * it has to go to the release queue.
10479+ */
10480+ TRACE("part edf: wake up %d with budget=%d for cpu %d\n",
10481+ task->pid, task->time_slice, get_partition(task));
10482+ task->state = TASK_RUNNING;
10483+ if (is_tardy(task)) {
10484+ /* new sporadic release */
10485+ edf_release_now(task);
10486+ add_ready(edf, task);
10487+
10488+ } else if (task->time_slice) {
10489+ /* Came back in time before deadline. This may cause
10490+ * deadline overruns, but since we don't handle suspensions
10491+ * in the analytical model, we don't care since we can't
10492+ * guarantee anything at all if tasks block.
10493+ */
10494+ set_rt_flags(task, RT_F_RUNNING);
10495+ add_ready(edf, task);
10496+
10497+ } else {
10498+ add_release(edf, task);
10499+ }
10500+
10501+}
10502+
10503+static void part_edf_task_blocks(struct task_struct *t)
10504+{
10505+ BUG_ON(!is_realtime(t));
10506+ /* not really anything to do since it can only block if
10507+ * it is running, and when it is not running it is not in any
10508+ * queue anyway.
10509+ *
10510+ */
10511+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
10512+ BUG_ON(in_list(&t->rt_list));
10513+}
10514+
10515+
10516+/* When _tear_down is called, the task should not be in any queue any more
10517+ * as it must have blocked first. We don't have any internal state for the task,
10518+ * it is all in the task_struct.
10519+ */
10520+static long part_edf_tear_down(struct task_struct * t)
10521+{
10522+ BUG_ON(!is_realtime(t));
10523+ TRACE("part edf: tear down called for %d \n", t->pid);
10524+ BUG_ON(t->array);
10525+ BUG_ON(in_list(&t->rt_list));
10526+ return 0;
10527+}
10528+
10529+
10530+static int part_edf_mode_change(int new_mode)
10531+{
10532+ int cpu;
10533+
10534+ if (new_mode == MODE_RT_RUN)
10535+ for_each_online_cpu(cpu)
10536+ rerelease_all(remote_edf(cpu), edf_release_at);
10537+ TRACE("[%d] part edf: mode changed to %d\n",
10538+ smp_processor_id(), new_mode);
10539+ return 0;
10540+}
10541+
10542+
10543+/* Plugin object */
10544+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
10545+ .ready_to_use = 0
10546+};
10547+
10548+
10549+/*
10550+ * Plugin initialization code.
10551+ */
10552+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
10553+ .plugin_name = "Partitioned EDF",\
10554+ .ready_to_use = 1,\
10555+ .scheduler_tick = part_edf_scheduler_tick,\
10556+ .prepare_task = part_edf_prepare_task,\
10557+ .sleep_next_period = edf_sleep_next_period,\
10558+ .tear_down = part_edf_tear_down,\
10559+ .schedule = part_edf_schedule,\
10560+ .finish_switch = part_edf_finish_switch,\
10561+ .mode_change = part_edf_mode_change,\
10562+ .wake_up_task = part_edf_wake_up_task,\
10563+ .task_blocks = part_edf_task_blocks \
10564+}
10565+
10566+
10567+sched_plugin_t *__init init_part_edf_plugin(void)
10568+{
10569+ int i;
10570+
10571+ if (!s_plugin.ready_to_use)
10572+ {
10573+ for (i = 0; i < NR_CPUS; i++)
10574+ {
10575+ part_edf_domain_init(remote_pedf(i),
10576+ part_edf_check_resched, i);
10577+			printk(KERN_INFO "CPU partition %d initialized.\n", i);
10578+ }
10579+ s_plugin = INIT_SCHED_PLUGIN;
10580+ }
10581+ return &s_plugin;
10582+}
10583+
10584+
10585+
10586diff --git a/kernel/sched_pfair.c b/kernel/sched_pfair.c
10587new file mode 100644
10588index 0000000..dbb7e5c
10589--- /dev/null
10590+++ b/kernel/sched_pfair.c
10591@@ -0,0 +1,503 @@
10592+/*
10593+ *
10594+ * Implementation of synchronized PFAIR PD2 scheduler
10595+ *
10596+ */
10597+
10598+#include <linux/percpu.h>
10599+#include <linux/sched.h>
10600+#include <linux/list.h>
10601+
10602+#include <linux/litmus.h>
10603+#include <linux/sched_plugin.h>
10604+#include <linux/pfair_common.h>
10605+#include <linux/sched_trace.h>
10606+#include <linux/queuelock.h>
10607+
10608+struct cpu_state {
10609+ struct task_struct * t;
10610+ volatile jiffie_t jiffie_marker;
10611+};
10612+/* PFAIR scheduling domain, release and ready queues */
10613+static pfair_domain_t pfair __cacheline_aligned_in_smp;
10614+
10615+/* An indicator that a quantum boundary was crossed
10616+ * and a scheduling decision has to be made
10617+ */
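+/* sync_go[cpu] is set to 1 by pfair_scheduler_tick() when the tick fires in
+ * real-time mode and is consumed (post-decremented) by pfair_schedule(), so
+ * that each CPU makes at most one PFAIR decision per quantum boundary.
+ */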
10618+static int sync_go[NR_CPUS];
10619+
10620+
10621+/* A collection of CPU states protected by pfair lock */
10622+DEFINE_PER_CPU(struct cpu_state, states);
10623+
10624+/*
10625+ * This function gets called by the timer code, at HZ frequency and
10626+ * with interrupts disabled.
10627+ *
10628+ * The function merges the release queue with the ready queue
10629+ * and indicates that a quantum boundary was crossed.
10630+ *
10631+ * It also suggests to schedule off currently running
10632+ * real-time task if the mode is non-real-time.
10633+ */
10634+static reschedule_check_t pfair_scheduler_tick(void)
10635+{
10636+ int want_resched = NO_RESCHED;
10637+ sync_go[smp_processor_id()] = 0;
10638+ if (!cpu_isset(smp_processor_id(), pfair.domain_cpus))
10639+ goto out;
10640+ /* Now determine if we want current task to be preempted */
10641+ if (get_rt_mode() == MODE_RT_RUN) {
10642+ pfair_try_release_pending(&pfair);
10643+ want_resched = FORCE_RESCHED;
10644+ /* indicate that the interrupt fired */
10645+ sync_go[smp_processor_id()] = 1;
10646+ barrier();
10647+ } else if (is_realtime(current) && is_running(current)) {
10648+ /* In non real-time mode we want to
10649+ * schedule off real-time tasks */
10650+ want_resched = FORCE_RESCHED;
10651+ } else if (is_realtime(current) && !is_running(current)) {
10652+		TRACE("[%d] %d Timer interrupt on not running %d\n",
10653+ smp_processor_id(),
10654+ jiffies-rt_start_time, current->pid);
10655+ }
10656+out:
10657+ return want_resched;
10658+}
10659+
10660+/**
10661+ * This function is called by the processor
10662+ * that performs rescheduling. It saves the timing
10663+ * parameters of currently running jobs that were not rescheduled yet
10664+ * and releases the next subtask for these jobs, placing them into
10665+ * the release or ready queue.
10666+ */
10667+static void pretend_release(cpumask_t p)
10668+{
10669+ int i = 0;
10670+ struct task_struct * t = NULL;
10671+ /* for all the tasks increment the number of used quanta
10672+ * and release next subtask or job depending on the number
10673+ * of used quanta
10674+ */
10675+ for_each_cpu_mask(i, p) {
10676+ t = per_cpu(states, i).t;
10677+ if (t != NULL) {
10678+ backup_times(t);
10679+ inc_passed_quanta(t);
10680+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
10681+ pfair_prepare_next_job(t);
10682+ } else {
10683+ pfair_prepare_next_subtask(t);
10684+ }
10685+ /*
10686+ TRACE("[%d] %d pretending release %d with (%d, %d)\n",
10687+ smp_processor_id(),
10688+ jiffies-rt_start_time,t->pid,
10689+ get_release(t)-rt_start_time,
10690+ get_deadline(t)-rt_start_time);*/
10691+ /* detect if the job or subtask has to be released now*/
10692+ if (time_before_eq(get_release(t), jiffies))
10693+ pfair_add_ready(&pfair, t);
10694+ else
10695+ pfair_add_release(&pfair, t);
10696+ }
10697+ }
10698+}
10699+/*
10700+ * Roll back the pretended release of tasks.
10701+ * Timing parameters are restored and tasks are removed
10702+ * from the queues, as they were before schedule() was called.
10703+ *
10704+ */
10705+static void rollback_release(cpumask_t p)
10706+{
10707+ int i = -1;
10708+ struct task_struct * t = NULL;
10709+ /*
10710+ * Rollback the pretended changes
10711+ */
10712+ for_each_cpu_mask(i, p) {
10713+ t = per_cpu(states, i).t;
10714+ if (t != NULL) {
10715+ restore_times(t);
10716+ if(t->rt_list.prev != LIST_POISON1 ||
10717+ t->rt_list.next != LIST_POISON2) {
10718+ /* Delete the task from a queue */
10719+ list_del(&t->rt_list);
10720+ }
10721+ }
10722+ }
10723+}
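+
+/* pretend_release() and rollback_release() above form a speculation pair:
+ * the scheduler temporarily advances the timing parameters of jobs still
+ * running on CPUs that have not rescheduled in this quantum, makes its
+ * selection from the ready queue as if those subtasks had already been
+ * released, and then undoes all changes. The real release happens later, in
+ * pfair_finish_task_switch(), for tasks that are actually switched away from.
+ */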
10724+
10725+/*
10726+ * The procedure creates a list of CPUs whose tasks have not been
10727+ * rescheduled yet. These are CPUs whose jiffie_marker differs from
10728+ * the current value of jiffies.
10729+ */
10730+static void find_participants(cpumask_t * target)
10731+{
10732+	cpumask_t res; int i;
10733+ cpus_clear(res);
10734+ for_each_online_cpu(i) {
10735+ if(per_cpu(states, i).jiffie_marker != jiffies)
10736+ cpu_set(i, res);
10737+ }
10738+ /* Examine only cpus in the domain */
10739+ cpus_and(res, pfair.domain_cpus, res);
10740+ (*target) = res;
10741+}
10742+
10743+/*
10744+ * This is the main PFAIR schedule function:
10745+ * each processor pretends that some currently running tasks are
10746+ * released in the next quantum and determines whether it should
10747+ * keep the task that is currently running (this is usually the case
10748+ * for heavy tasks).
10749+*/
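+/* Outline of the decision made below at a quantum boundary:
+ *   1. skip CPUs outside the PFAIR domain and invocations for which the
+ *      tick has not fired yet (sync_go),
+ *   2. find the CPUs that have not rescheduled yet and pretend-release the
+ *      jobs they are currently running,
+ *   3. take up to one ready task per such CPU; if our own current task is
+ *      among them, keep it,
+ *   4. put the taken tasks back and roll the pretended release back,
+ *   5. if the current task was not kept, pick the next ready task as *next.
+ */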
10750+static int pfair_schedule(struct task_struct *prev,
10751+ struct task_struct **next,
10752+ runqueue_t * rq)
10753+{
10754+	int cpu = -1;
10755+	int k = -1;
10756+	int need_deactivate = 1;
10757+	int keep = 0;
10758+ unsigned long flags;
10759+ cpumask_t participants;
10760+ /* A temporary array */
10761+ struct task_struct * rs_old_ptr[NR_CPUS];
10762+
10763+ *next = NULL;
10764+ cpu = smp_processor_id();
10765+ /* CPU's not in the domain just bypass */
10766+ if (!cpu_isset(cpu, pfair.domain_cpus)) {
10767+ goto out;
10768+ }
10769+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10770+
10771+	/* If we happen to run in non-real-time mode,
10772+	 * then we have to schedule off currently running tasks.
10773+	 */
10774+ if (get_rt_mode() != MODE_RT_RUN) {
10775+ if (is_realtime(prev)) {
10776+ per_cpu(states, cpu).t = NULL;
10777+ TRACE("[%d] %d Suspending %d\n",
10778+ cpu, jiffies - rt_start_time,
10779+ prev->pid);
10780+ /* Move the task to the
10781+ * release queue for future runs
10782+ * FIXME: Do something smarter.
10783+ * For example create a set where
10784+ * prepared or inactive tasks are placed
10785+ * and then released.
10786+ * */
10787+ set_release(prev, get_release(prev) + 1000);
10788+ pfair_add_release(&pfair, prev);
10789+ }
10790+ goto out_deactivate;
10791+ }
10792+ /* If the current task stops or dies */
10793+ if (is_realtime(prev) && !is_running(prev)) {
10794+ /* remove it from the running set */
10795+ per_cpu(states, cpu).t = NULL;
10796+ }
10797+ /* Make pfair decisions at quantum boundaries only,
10798+ * but schedule off stopped or dead tasks */
10799+
10800+ if ((sync_go[cpu]--) != 1)
10801+ goto out_deactivate;
10802+
10803+ /*TRACE("[%d] %d Scheduler activation", cpu, jiffies-rt_start_time);
10804+ cpus_and(res, pfair.domain_cpus, cpu_online_map);
10805+ for_each_cpu_mask(k, res) {
10806+ TRACE("%d" ,(per_cpu(states, k).jiffie_marker!=jiffies));
10807+ }
10808+ TRACE("\n");*/
10809+
10810+ /* Find processors that have not rescheduled yet */
10811+ find_participants(&participants);
10812+ /* For each task on remote cpu's pretend release */
10813+ pretend_release(participants);
10814+ /* Clear temporary array */
10815+ for_each_possible_cpu(k) { rs_old_ptr[k] = NULL; }
10816+ /* Select a new subset of eligible tasks */
10817+ for_each_cpu_mask(k, participants) {
10818+ rs_old_ptr[k] = __pfair_take_ready (&pfair);
10819+ /* Check if our current task must be scheduled in the next quantum */
10820+ if (rs_old_ptr[k] == per_cpu(states, cpu).t) {
10821+ /* this is our current task, keep it */
10822+ *next = per_cpu(states, cpu).t;
10823+ need_deactivate = 0;
10824+ keep = 1;
10825+ break;
10826+ }
10827+ }
10828+ /* Put all the extracted tasks back into the ready queue */
10829+ for_each_cpu_mask(k, participants) {
10830+ if (rs_old_ptr[k] != NULL){
10831+ pfair_add_ready(&pfair, rs_old_ptr[k]);
10832+ rs_old_ptr[k] = NULL;
10833+ }
10834+ }
10835+ /* Rollback the pretended release,
10836+ * task parameters are restored and running tasks are removed
10837+ * from queues */
10838+ rollback_release(participants);
10839+ /*
10840+ * If the current task is not scheduled in the next quantum
10841+ * then select a new pfair task
10842+ */
10843+ if(!keep) {
10844+ *next = per_cpu(states, cpu).t = __pfair_take_ready(&pfair);
10845+ if (*next != NULL) {
10846+ /*TRACE("[%d] %d Scheduling %d with (%d, %d)\n",
10847+ cpu, jiffies-rt_start_time,
10848+ get_release(*next),
10849+ get_deadline(*next));
10850+ */
10851+ set_task_cpu(*next, cpu);
10852+ __activate_task(*next, rq);
10853+ }
10854+ } else {
10855+ if (is_realtime(prev)) {
10856+ /*TRACE("[%d] %d prev==next %d\n",
10857+ cpu,jiffies-rt_start_time,
10858+ (prev)->pid);*/
10859+
10860+ /* The task will not be switched off but we
10861+ * need to track the execution time
10862+ */
10863+ inc_passed_quanta(prev);
10864+ }
10865+ }
10866+
10867+	/* Show that our task does not participate in subsequent selections */
10868+ __get_cpu_var(states).jiffie_marker = jiffies;
10869+
10870+out_deactivate:
10871+ if ( is_realtime(prev) && need_deactivate && prev->array) {
10872+ /* take prev out of the linux run queue */
10873+ deactivate_task(prev, rq);
10874+ }
10875+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10876+out:
10877+ return 0;
10878+}
10879+
10880+static void pfair_finish_task_switch(struct task_struct *t)
10881+{
10882+ if (!is_realtime(t) || !is_running(t))
10883+ return;
10884+
10885+ queue_lock(&pfair.pfair_lock);
10886+	/* Release in real-time mode only;
10887+	 * if the mode is non-real-time, then
10888+	 * the task is already in the release queue
10889+	 * with a release time far in the future.
10890+ */
10891+ if (get_rt_mode() == MODE_RT_RUN) {
10892+ inc_passed_quanta(t);
10893+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
10894+ sched_trace_job_completion(t);
10895+ pfair_prepare_next_job(t);
10896+ } else {
10897+ pfair_prepare_next_subtask(t);
10898+ }
10899+ /*TRACE("[%d] %d releasing %d with (%d, %d)\n",
10900+ smp_processor_id(),
10901+ jiffies-rt_start_time,
10902+ t->pid,
10903+ get_release(t)-rt_start_time,
10904+ get_deadline(t)-rt_start_time);*/
10905+ if (time_before_eq(get_release(t), jiffies))
10906+ pfair_add_ready(&pfair, t);
10907+ else
10908+ pfair_add_release(&pfair, t);
10909+ }
10910+ queue_unlock(&pfair.pfair_lock);
10911+}
10912+
10913+/* Prepare a task for running in RT mode
10914+ * Enqueues the task into master queue data structure
10915+ * returns
10916+ * -EPERM if task is not TASK_STOPPED
10917+ */
10918+static long pfair_prepare_task(struct task_struct * t)
10919+{
10920+ unsigned long flags;
10921+ TRACE("pfair: prepare task %d\n", t->pid);
10922+ if (t->state == TASK_STOPPED) {
10923+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
10924+
10925+ if (get_rt_mode() == MODE_RT_RUN)
10926+			/* Real-time mode is already active.
10927+			 * Prepare an immediate release.
10928+ */
10929+ __pfair_prepare_new_release(t, jiffies);
10930+		/* The task must be TASK_RUNNING while it is in the queue; otherwise
10931+		 * the signal code will try to wake it up, with fatal consequences.
10932+ */
10933+ t->state = TASK_RUNNING;
10934+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10935+ pfair_add_release(&pfair, t);
10936+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10937+ return 0;
10938+ } else
10939+ return -EPERM;
10940+}
10941+
10942+
10943+
10944+static void pfair_wake_up_task(struct task_struct *task)
10945+{
10946+
10947+ unsigned long flags;
10948+
10949+ /* We must determine whether task should go into the release
10950+ * queue or into the ready queue.
10951+ * The task enters the ready queue if the previous deadline was missed,
10952+ * so we treat the invoked job as a new sporadic release.
10953+ *
10954+ * The job can also enter the ready queue if it was invoked before its
10955+	 * global deadline, but its budget must be clipped down to one quantum.
10956+ */
10957+ task->state = TASK_RUNNING;
10958+ if (time_after_eq(jiffies, task->rt_param.times.last_release
10959+ + get_rt_period(task))) {
10960+ /* new sporadic release */
10961+ TRACE("[%d] Sporadic release of %d at %d\n",
10962+ smp_processor_id(),
10963+ jiffies-rt_start_time,
10964+ task->pid);
10965+ __pfair_prepare_new_release(task, jiffies);
10966+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10967+ sched_trace_job_release(task);
10968+ pfair_add_ready(&pfair, task);
10969+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10970+ } else if (task->time_slice) {
10971+ /* came back in time before deadline
10972+ * clip the budget to be the last subtask of a job or
10973+ * the new job.
10974+ */
10975+ task->rt_param.times.exec_time = get_exec_cost(task) - 1;
10976+ if (task->rt_param.times.exec_time == 0) {
10977+ pfair_prepare_next_job(task);
10978+ } else {
10979+ pfair_prepare_next_subtask(task);
10980+ }
10981+ TRACE("[%d] %d Resume of %d with %d, %d, %d\n",
10982+ smp_processor_id(), jiffies-rt_start_time,
10983+ task->pid, get_release(task)-rt_start_time,
10984+ get_deadline(task)-rt_start_time,
10985+ get_passed_quanta(task));
10986+
10987+ set_rt_flags(task, RT_F_RUNNING);
10988+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10989+ sched_trace_job_release(task);
10990+ if (time_after_eq(jiffies, get_release(task))) {
10991+ pfair_add_ready(&pfair, task);
10992+ } else {
10993+ pfair_add_release(&pfair, task);
10994+ }
10995+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10996+
10997+ } else {
10998+ TRACE("[%d] %d Strange release of %d with %d, %d, %d\n",
10999+ smp_processor_id(), jiffies-rt_start_time,
11000+ task->pid,
11001+ get_release(task), get_deadline(task),
11002+ get_passed_quanta(task));
11003+
11004+ queue_lock_irqsave(&pfair.pfair_lock, flags);
11005+ pfair_add_release(&pfair, task);
11006+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
11007+ }
11008+}
11009+
11010+
11011+static void pfair_task_blocks(struct task_struct *t)
11012+{
11013+ unsigned long flags;
11014+ int i;
11015+ cpumask_t res;
11016+ BUG_ON(!is_realtime(t));
11017+ /* If the task blocks, then it must be removed from the running set */
11018+ queue_lock_irqsave(&pfair.pfair_lock, flags);
11019+ cpus_and(res,pfair.domain_cpus, cpu_online_map);
11020+ for_each_cpu_mask(i, res) {
11021+ if (per_cpu(states, i).t == t)
11022+ per_cpu(states, i).t = NULL;
11023+ }
11024+ /* If the task is running and in some
11025+ * list it might have been released by another
11026+ * processor
11027+ */
11028+ if((t->rt_list.next != LIST_POISON1 ||
11029+ t->rt_list.prev != LIST_POISON2)) {
11030+ TRACE("[%d] %d task %d is deleted from the list\n",
11031+ smp_processor_id(),
11032+ jiffies-rt_start_time, t->pid);
11033+ list_del(&t->rt_list);
11034+ }
11035+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
11036+ TRACE("[%d] %d task %d blocks with budget=%d state=%d\n",
11037+ smp_processor_id(), jiffies-rt_start_time,
11038+ t->pid, t->time_slice, t->state);
11039+}
11040+
11041+static long pfair_tear_down(struct task_struct * t)
11042+{
11043+ BUG_ON(!is_realtime(t));
11044+ TRACE("pfair: tear down called for %d \n", t->pid);
11045+ BUG_ON(t->array);
11046+ BUG_ON(t->rt_list.next != LIST_POISON1);
11047+ BUG_ON(t->rt_list.prev != LIST_POISON2);
11048+ return 0;
11049+}
11050+
11051+static int pfair_mode_change(int new_mode)
11052+{
11053+ printk(KERN_INFO "[%d] pfair mode change %d\n",
11054+ smp_processor_id(), new_mode);
11055+ if (new_mode == MODE_RT_RUN) {
11056+ pfair_prepare_new_releases(&pfair, jiffies + 10);
11057+ }
11058+ printk(KERN_INFO "[%d] pfair: mode change done\n", smp_processor_id());
11059+ return 0;
11060+}
11061+
11062+/* Plugin object */
11063+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
11064+ .ready_to_use = 0
11065+};
11066+/*
11067+ * PFAIR plugin initialization macro.
11068+ */
11069+#define INIT_PFAIR_PLUGIN (struct sched_plugin){\
11070+ .plugin_name = "PFAIR",\
11071+ .ready_to_use = 1,\
11072+ .scheduler_tick = pfair_scheduler_tick,\
11073+ .prepare_task = pfair_prepare_task,\
11074+ .tear_down = pfair_tear_down,\
11075+ .schedule = pfair_schedule,\
11076+ .finish_switch = pfair_finish_task_switch,\
11077+ .mode_change = pfair_mode_change,\
11078+ .wake_up_task = pfair_wake_up_task,\
11079+ .task_blocks = pfair_task_blocks \
11080+ }
11081+
11082+sched_plugin_t* __init init_pfair_plugin(void)
11083+{
11084+ int i=0;
11085+ if (!s_plugin.ready_to_use) {
11086+ pfair_domain_init(&pfair);
11087+ for (i=0; i<NR_CPUS; i++) {
11088+ sync_go[i] = 0;
11089+ per_cpu(states, i).t = NULL;
11090+ }
11091+ s_plugin = INIT_PFAIR_PLUGIN;
11092+ }
11093+ return &s_plugin;
11094+}
11095diff --git a/kernel/sched_plugin.c b/kernel/sched_plugin.c
11096new file mode 100644
11097index 0000000..9fb10af
11098--- /dev/null
11099+++ b/kernel/sched_plugin.c
11100@@ -0,0 +1,108 @@
11101+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
11102+ *
11103+ * This file includes the initialization of the plugin system, the no-op Linux
11104+ * scheduler plugin and some dummy functions.
11105+ */
11106+
11107+
11108+#include <linux/litmus.h>
11109+#include <linux/sched_plugin.h>
11110+
11111+
11112+/*************************************************************
11113+ * Dummy plugin functions *
11114+ *************************************************************/
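+
+/* These dummies serve as safe defaults: operations that make no sense
+ * without a real-time scheduling policy either do nothing, return 0, or
+ * return -EPERM. The no-op Linux plugin below is built entirely from them.
+ */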
11115+
11116+void litmus_dummy_finish_switch(struct task_struct * prev)
11117+{
11118+}
11119+
11120+int litmus_dummy_schedule(struct task_struct * prev,
11121+ struct task_struct** next,
11122+ runqueue_t* q)
11123+{
11124+ return 0;
11125+}
11126+
11127+reschedule_check_t litmus_dummy_scheduler_tick(void)
11128+{
11129+ return NO_RESCHED;
11130+}
11131+
11132+
11133+long litmus_dummy_prepare_task(struct task_struct *t)
11134+{
11135+ return 0;
11136+}
11137+
11138+void litmus_dummy_wake_up_task(struct task_struct *task)
11139+{
11140+ printk(KERN_WARNING "task %d: unhandled real-time wake up!\n",
11141+ task->pid);
11142+}
11143+
11144+void litmus_dummy_task_blocks(struct task_struct *task)
11145+{
11146+}
11147+
11148+long litmus_dummy_tear_down(struct task_struct *task)
11149+{
11150+ return 0;
11151+}
11152+
11153+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter)
11154+{
11155+ return -EPERM;
11156+}
11157+
11158+long litmus_dummy_sleep_next_period(void)
11159+{
11160+ return -EPERM;
11161+}
11162+
11163+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
11164+ struct task_struct *new_owner)
11165+{
11166+ return -EPERM;
11167+}
11168+
11169+long litmus_dummy_return_priority(struct pi_semaphore *sem)
11170+{
11171+ return -EPERM;
11172+}
11173+
11174+long litmus_dummy_pi_block(struct pi_semaphore *sem,
11175+ struct task_struct *new_waiter)
11176+{
11177+ return -EPERM;
11178+}
11179+
11180+
11181+/* The default scheduler plugin. It doesn't do anything and lets Linux do its
11182+ * job.
11183+ */
11184+
11185+sched_plugin_t linux_sched_plugin = {
11186+ .plugin_name = "Linux",
11187+ .ready_to_use = 1,
11188+ .scheduler_tick = litmus_dummy_scheduler_tick,
11189+ .prepare_task = litmus_dummy_prepare_task,
11190+ .tear_down = litmus_dummy_tear_down,
11191+ .wake_up_task = litmus_dummy_wake_up_task,
11192+ .task_blocks = litmus_dummy_task_blocks,
11193+ .sleep_next_period = litmus_dummy_sleep_next_period,
11194+ .schedule = litmus_dummy_schedule,
11195+ .finish_switch = litmus_dummy_finish_switch,
11196+ .scheduler_setup = litmus_dummy_scheduler_setup,
11197+ .inherit_priority = litmus_dummy_inherit_priority,
11198+ .return_priority = litmus_dummy_return_priority,
11199+ .pi_block = litmus_dummy_pi_block
11200+};
11201+
11202+/*
11203+ * The reference to the current plugin that is used to schedule tasks within
11204+ * the system. It stores references to the actual function implementations.
11205+ * It should be initialized by calling one of the init_*_plugin() functions.
11206+ */
11207+sched_plugin_t *curr_sched_plugin = &linux_sched_plugin;
11208+
11209diff --git a/kernel/sched_psn_edf.c b/kernel/sched_psn_edf.c
11210new file mode 100644
11211index 0000000..a1e12e0
11212--- /dev/null
11213+++ b/kernel/sched_psn_edf.c
11214@@ -0,0 +1,523 @@
11215+
11216+/*
11217+ * kernel/sched_psn_edf.c
11218+ *
11219+ * Implementation of the PSN-EDF scheduler plugin.
11220+ * Based on kernel/sched_part_edf.c and kernel/sched_gsn_edf.c.
11221+ *
11222+ * Suspensions and non-preemptable sections are supported.
11223+ * Priority inheritance is not supported.
11224+ */
11225+
11226+#include <linux/percpu.h>
11227+#include <linux/sched.h>
11228+#include <linux/list.h>
11229+#include <linux/spinlock.h>
11230+
11231+#include <linux/litmus.h>
11232+#include <linux/sched_plugin.h>
11233+#include <linux/edf_common.h>
11234+
11235+
11236+typedef struct {
11237+ rt_domain_t domain;
11238+ int cpu;
11239+ struct task_struct* scheduled; /* only RT tasks */
11240+ spinlock_t lock; /* protects the domain and
11241+ * serializes scheduling decisions
11242+ */
11243+} psnedf_domain_t;
11244+
11245+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
11246+
11247+#define local_edf (&__get_cpu_var(psnedf_domains).domain)
11248+#define local_pedf (&__get_cpu_var(psnedf_domains))
11249+#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
11250+#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
11251+#define task_edf(task) remote_edf(get_partition(task))
11252+#define task_pedf(task) remote_pedf(get_partition(task))
11253+
11254+
11255+static void psnedf_domain_init(psnedf_domain_t* pedf,
11256+ check_resched_needed_t check,
11257+ int cpu)
11258+{
11259+ edf_domain_init(&pedf->domain, check);
11260+ pedf->cpu = cpu;
11261+ pedf->lock = SPIN_LOCK_UNLOCKED;
11262+ pedf->scheduled = NULL;
11263+}
11264+
11265+static void requeue(struct task_struct* t, rt_domain_t *edf)
11266+{
11267+ /* only requeue if t is actually running */
11268+ BUG_ON(!is_running(t));
11269+
11270+ if (t->state != TASK_RUNNING)
11271+ TRACE_TASK(t, "requeue: !TASK_RUNNING");
11272+
11273+ set_rt_flags(t, RT_F_RUNNING);
11274+ if (!is_released(t) ||
11275+ get_rt_mode() != MODE_RT_RUN)
11276+ __add_release(edf, t); /* it has got to wait */
11277+ else
11278+ __add_ready(edf, t);
11279+}
11280+
11281+/* we assume the lock is being held */
11282+static void preempt(psnedf_domain_t *pedf)
11283+{
11284+ if (smp_processor_id() == pedf->cpu) {
11285+ if (pedf->scheduled && is_np(pedf->scheduled))
11286+ request_exit_np(pedf->scheduled);
11287+ else
11288+ set_tsk_need_resched(current);
11289+ } else
11290+		/* in case it is a remote CPU we have to defer the
11291+		 * decision to the remote CPU
11292+		 */
11293+ smp_send_reschedule(pedf->cpu);
11294+}
11295+
11296+/* This check is trivial in partitioned systems as we only have to consider
11297+ * the CPU of the partition.
11298+ */
11299+static int psnedf_check_resched(rt_domain_t *edf)
11300+{
11301+ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
11302+ int ret = 0;
11303+
11304+ /* because this is a callback from rt_domain_t we already hold
11305+ * the necessary lock for the ready queue
11306+ */
11307+ if (edf_preemption_needed(edf, pedf->scheduled)) {
11308+ preempt(pedf);
11309+ ret = 1;
11310+ }
11311+ return ret;
11312+}
11313+
11314+
11315+static reschedule_check_t psnedf_scheduler_tick(void)
11316+{
11317+ unsigned long flags;
11318+ struct task_struct *t = current;
11319+ reschedule_check_t want_resched = NO_RESCHED;
11320+ rt_domain_t *edf = local_edf;
11321+ psnedf_domain_t *pedf = local_pedf;
11322+
11323+ /* Check for inconsistency. We don't need the lock for this since
11324+ * ->scheduled is only changed in schedule, which obviously is not
11325+ * executing in parallel on this CPU
11326+ */
11327+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
11328+
11329+ if (is_realtime(t))
11330+ TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid);
11331+
11332+ /* expire tasks even if not in real-time mode
11333+ * this makes sure that at the end of real-time mode
11334+ * no tasks "run away forever".
11335+ */
11336+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
11337+ if (!is_np(t)) {
11338+ want_resched = FORCE_RESCHED;
11339+ } else {
11340+ TRACE("psnedf_scheduler_tick: "
11341+ "%d is non-preemptable, "
11342+ "preemption delayed.\n", t->pid);
11343+ request_exit_np(t);
11344+ }
11345+ }
11346+
11347+ if (get_rt_mode() == MODE_RT_RUN)
11348+ {
11349+ /* check whether anything is waiting to be released
11350+ * this could probably be moved to the global timer
11351+ * interrupt handler since the state will only change
11352+	 * once per jiffy
11353+ */
11354+ spin_lock_irqsave(&pedf->lock, flags);
11355+ __release_pending(edf);
11356+ if (want_resched != FORCE_RESCHED &&
11357+ edf_preemption_needed(edf, t))
11358+ want_resched = FORCE_RESCHED;
11359+
11360+ spin_unlock_irqrestore(&pedf->lock, flags);
11361+
11362+ }
11363+ return want_resched;
11364+}
11365+
11366+static void job_completion(struct task_struct* t)
11367+{
11368+ TRACE_TASK(t, "job_completion().\n");
11369+ set_rt_flags(t, RT_F_SLEEP);
11370+ edf_prepare_for_next_period(t);
11371+}
11372+
11373+static int psnedf_schedule(struct task_struct * prev,
11374+ struct task_struct ** next,
11375+ runqueue_t * rq)
11376+{
11377+ psnedf_domain_t* pedf = local_pedf;
11378+ rt_domain_t* edf = &pedf->domain;
11379+
11380+ int out_of_time, sleep, preempt,
11381+ np, exists, rt, blocks, resched;
11382+
11383+ spin_lock(&pedf->lock);
11384+
11385+ /* sanity checking */
11386+ BUG_ON(pedf->scheduled && pedf->scheduled != prev);
11387+ BUG_ON(pedf->scheduled && !is_realtime(prev));
11388+
11389+ /* (0) Determine state */
11390+ exists = pedf->scheduled != NULL;
11391+ blocks = exists && !is_running(pedf->scheduled);
11392+ out_of_time = exists && !pedf->scheduled->time_slice;
11393+ np = exists && is_np(pedf->scheduled);
11394+ sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
11395+ preempt = edf_preemption_needed(edf, prev);
11396+ rt = get_rt_mode() == MODE_RT_RUN;
11397+
11398+
11399+	/* If we need to preempt, do so.
11400+ * The following checks set resched to 1 in case of special
11401+ * circumstances.
11402+ */
11403+ resched = preempt;
11404+
11405+ /* If a task blocks we have no choice but to reschedule.
11406+ */
11407+ if (blocks)
11408+ resched = 1;
11409+
11410+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
11411+ * Multiple calls to request_exit_np() don't hurt.
11412+ */
11413+ if (np && (out_of_time || preempt || sleep))
11414+ request_exit_np(pedf->scheduled);
11415+
11416+ /* Any task that is preemptable and either exhausts its execution
11417+ * budget or wants to sleep completes. We may have to reschedule after
11418+ * this.
11419+ */
11420+ if (!np && (out_of_time || sleep)) {
11421+ job_completion(pedf->scheduled);
11422+ resched = 1;
11423+ }
11424+
11425+ /* Stop real-time tasks when we leave real-time mode
11426+ */
11427+ if (!rt && exists)
11428+ resched = 1;
11429+
11430+ /* The final scheduling decision. Do we need to switch for some reason?
11431+ * Switch if we are in RT mode and have no task or if we need to
11432+ * resched.
11433+ */
11434+ *next = NULL;
11435+ if ((!np || blocks) && (resched || (!exists && rt))) {
11436+ /* Take care of a previously scheduled
11437+ * job by taking it out of the Linux runqueue.
11438+ */
11439+ if (pedf->scheduled) {
11440+			/* Unlike global schedulers, which switch without
11441+			 * holding a lock, we can requeue right here since
11442+			 * no other CPU will schedule from this domain.
11443+			 */
11444+ if (!blocks)
11445+ requeue(pedf->scheduled, edf);
11446+ if (prev->array)
11447+ /* take it out of the run queue */
11448+ deactivate_task(prev, rq);
11449+ }
11450+
11451+ /* only pick tasks if we are actually in RT mode */
11452+ if (rt)
11453+ *next = __take_ready(edf);
11454+ if (*next) {
11455+ /* stick the task into the runqueue */
11456+ __activate_task(*next, rq);
11457+ set_task_cpu(*next, smp_processor_id());
11458+ }
11459+
11460+ } else
11461+ /* Only override Linux scheduler if we have a real-time task
11462+ * scheduled that needs to continue.
11463+ */
11464+ if (exists)
11465+ *next = prev;
11466+
11467+ if (*next)
11468+ set_rt_flags(*next, RT_F_RUNNING);
11469+
11470+ pedf->scheduled = *next;
11471+ spin_unlock(&pedf->lock);
11472+ return 0;
11473+}
11474+
11475+
11476+/* Prepare a task for running in RT mode.
11477+ * Enqueues the task into the master queue data structure.
11478+ * Returns:
11479+ *   -EPERM if the task is not TASK_STOPPED.
11480+ */
11481+static long psnedf_prepare_task(struct task_struct * t)
11482+{
11483+ rt_domain_t* edf = task_edf(t);
11484+ psnedf_domain_t* pedf = task_pedf(t);
11485+ unsigned long flags;
11486+
11487+ TRACE("[%d] psn edf: prepare task %d on CPU %d\n",
11488+ smp_processor_id(), t->pid, get_partition(t));
11489+ if (t->state == TASK_STOPPED) {
11490+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
11491+
11492+ if (get_rt_mode() == MODE_RT_RUN)
11493+ /* The action is already on.
11494+ * Prepare immediate release.
11495+ */
11496+ edf_release_now(t);
11497+ /* The task should be running in the queue, otherwise signal
11498+ * code will try to wake it up with fatal consequences.
11499+ */
11500+ t->state = TASK_RUNNING;
11501+ spin_lock_irqsave(&pedf->lock, flags);
11502+ __add_release(edf, t);
11503+ spin_unlock_irqrestore(&pedf->lock, flags);
11504+ return 0;
11505+ } else
11506+ return -EPERM;
11507+}
11508+
11509+static void psnedf_wake_up_task(struct task_struct *task)
11510+{
11511+ unsigned long flags;
11512+ psnedf_domain_t* pedf = task_pedf(task);
11513+ rt_domain_t* edf = task_edf(task);
11514+
11515+ TRACE("psnedf: %d unsuspends with budget=%d\n",
11516+ task->pid, task->time_slice);
11517+
11518+ /* After fixing the litmus_controlled bug,
11519+ * this should hold again.
11520+ */
11521+ BUG_ON(in_list(&task->rt_list));
11522+
11523+ task->state = TASK_RUNNING;
11524+
11525+ /* We need to take suspensions because of semaphores into
11526+ * account! If a job resumes after being suspended due to acquiring
11527+ * a semaphore, it should never be treated as a new job release.
11528+ */
11529+ if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) {
11530+ /* new sporadic release */
11531+ edf_release_now(task);
11532+ sched_trace_job_release(task);
11533+ }
11534+
11535+ spin_lock_irqsave(&pedf->lock, flags);
11536+ requeue(task, edf);
11537+ spin_unlock_irqrestore(&pedf->lock, flags);
11538+}
11539+
11540+static void psnedf_task_blocks(struct task_struct *t)
11541+{
11542+ BUG_ON(!is_realtime(t));
11543+ /* not really anything to do since it can only block if
11544+ * it is running, and when it is not running it is not in any
11545+ * queue anyway.
11546+ */
11547+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
11548+ BUG_ON(in_list(&t->rt_list));
11549+}
11550+
11551+
11552+/* When _tear_down is called, the task should not be in any queue any more
11553+ * as it must have blocked first. We don't have any internal state for the task,
11554+ * it is all in the task_struct.
11555+ */
11556+static long psnedf_tear_down(struct task_struct * t)
11557+{
11558+ BUG_ON(!is_realtime(t));
11559+ TRACE_TASK(t, "tear down called");
11560+ BUG_ON(t->array);
11561+ BUG_ON(in_list(&t->rt_list));
11562+ return 0;
11563+}
11564+
11565+static long psnedf_pi_block(struct pi_semaphore *sem,
11566+ struct task_struct *new_waiter)
11567+{
11568+ psnedf_domain_t* pedf;
11569+ rt_domain_t* edf;
11570+ struct task_struct* t;
11571+ int cpu = get_partition(new_waiter);
11572+
11573+ BUG_ON(!new_waiter);
11574+
11575+ if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
11576+ TRACE_TASK(new_waiter, " boosts priority\n");
11577+ pedf = task_pedf(new_waiter);
11578+ edf = task_edf(new_waiter);
11579+
11580+ /* interrupts already disabled */
11581+ spin_lock(&pedf->lock);
11582+
11583+ /* store new highest-priority task */
11584+ sem->hp.cpu_task[cpu] = new_waiter;
11585+ if (sem->holder &&
11586+ get_partition(sem->holder) == get_partition(new_waiter)) {
11587+ /* let holder inherit */
11588+ sem->holder->rt_param.inh_task = new_waiter;
11589+ t = sem->holder;
11590+ if (in_list(&t->rt_list)) {
11591+ /* queued in domain*/
11592+ list_del(&t->rt_list);
11593+ /* readd to make priority change take place */
11594+ if (is_released(t))
11595+ __add_ready(edf, t);
11596+ else
11597+ __add_release(edf, t);
11598+ }
11599+ }
11600+
11601+ /* check if we need to reschedule */
11602+ if (edf_preemption_needed(edf, current))
11603+ preempt(pedf);
11604+
11605+ spin_unlock(&pedf->lock);
11606+ }
11607+
11608+ return 0;
11609+}
11610+
11611+static long psnedf_inherit_priority(struct pi_semaphore *sem,
11612+ struct task_struct *new_owner)
11613+{
11614+ int cpu = get_partition(new_owner);
11615+
11616+ new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
11617+ if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
11618+ TRACE_TASK(new_owner,
11619+ "inherited priority from %s/%d\n",
11620+ sem->hp.cpu_task[cpu]->comm,
11621+ sem->hp.cpu_task[cpu]->pid);
11622+ } else
11623+ TRACE_TASK(new_owner,
11624+ "cannot inherit priority: "
11625+ "no higher priority job waits on this CPU!\n");
11626+ /* make new owner non-preemptable as required by FMLP under
11627+ * PSN-EDF.
11628+ */
11629+ make_np(new_owner);
11630+ return 0;
11631+}
11632+
11633+
11634+/* This function is called on a semaphore release, and assumes that
11635+ * the current task is also the semaphore holder.
11636+ */
11637+static long psnedf_return_priority(struct pi_semaphore *sem)
11638+{
11639+ struct task_struct* t = current;
11640+ psnedf_domain_t* pedf = task_pedf(t);
11641+ rt_domain_t* edf = task_edf(t);
11642+ int ret = 0;
11643+ int cpu = get_partition(current);
11644+
11645+
11646+ /* Find new highest-priority semaphore task
11647+ * if holder task is the current hp.cpu_task[cpu].
11648+ *
11649+ * Calling function holds sem->wait.lock.
11650+ */
11651+ if (t == sem->hp.cpu_task[cpu])
11652+ edf_set_hp_cpu_task(sem, cpu);
11653+
11654+ take_np(t);
11655+ if (current->rt_param.inh_task) {
11656+ TRACE_CUR("return priority of %s/%d\n",
11657+ current->rt_param.inh_task->comm,
11658+ current->rt_param.inh_task->pid);
11659+ spin_lock(&pedf->lock);
11660+
11661+ /* Reset inh_task to NULL. */
11662+ current->rt_param.inh_task = NULL;
11663+
11664+ /* check if we need to reschedule */
11665+ if (edf_preemption_needed(edf, current))
11666+ preempt(pedf);
11667+
11668+ spin_unlock(&pedf->lock);
11669+ } else
11670+ TRACE_CUR(" no priority to return %p\n", sem);
11671+
11672+ return ret;
11673+}
11674+
11675+
11676+static int psnedf_mode_change(int new_mode)
11677+{
11678+ int cpu;
11679+
11680+ if (new_mode == MODE_RT_RUN)
11681+ for_each_online_cpu(cpu) {
11682+ spin_lock(&remote_pedf(cpu)->lock);
11683+ __rerelease_all(remote_edf(cpu), edf_release_at);
11684+ spin_unlock(&remote_pedf(cpu)->lock);
11685+ }
11686+
11687+ TRACE("[%d] psn edf: mode changed to %d\n",
11688+ smp_processor_id(), new_mode);
11689+ return 0;
11690+}
11691+
11692+
11693+/* Plugin object */
11694+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
11695+ .ready_to_use = 0
11696+};
11697+
11698+
11699+/*
11700+ * Plugin initialization code.
11701+ */
11702+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
11703+ .plugin_name = "PSN-EDF",\
11704+ .ready_to_use = 1,\
11705+ .scheduler_tick = psnedf_scheduler_tick,\
11706+ .prepare_task = psnedf_prepare_task,\
11707+ .sleep_next_period = edf_sleep_next_period,\
11708+ .tear_down = psnedf_tear_down,\
11709+ .schedule = psnedf_schedule,\
11710+ .mode_change = psnedf_mode_change,\
11711+ .wake_up_task = psnedf_wake_up_task,\
11712+ .task_blocks = psnedf_task_blocks, \
11713+ .pi_block = psnedf_pi_block, \
11714+ .inherit_priority = psnedf_inherit_priority, \
11715+ .return_priority = psnedf_return_priority \
11716+}
11717+
11718+
11719+sched_plugin_t *__init init_psn_edf_plugin(void)
11720+{
11721+ int i;
11722+
11723+ if (!s_plugin.ready_to_use)
11724+ {
11725+ for (i = 0; i < NR_CPUS; i++)
11726+ {
11727+ psnedf_domain_init(remote_pedf(i),
11728+ psnedf_check_resched, i);
11729+ printk("PSN-EDF: CPU partition %d initialized.\n", i);
11730+ }
11731+ s_plugin = INIT_SCHED_PLUGIN;
11732+ }
11733+ return &s_plugin;
11734+}
11735+
11736+
11737+
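The preemption decisions in this plugin ultimately reduce to the EDF priority test supplied by edf_common.c. A stripped-down sketch of that rule, for illustration only -- the real edf_higher_prio() also handles priority inheritance, tie-breaking, and wrap-safe time comparisons:

/* Illustrative sketch: earlier absolute deadline wins, and a CPU with no
 * scheduled real-time task can always be "preempted".
 */
static int sketch_edf_higher_prio(struct task_struct *a,
				  struct task_struct *b)
{
	if (!a)
		return 0;
	if (!b)
		return 1;
	return a->rt_param.times.deadline < b->rt_param.times.deadline;
}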
11738diff --git a/kernel/sched_trace.c b/kernel/sched_trace.c
11739new file mode 100644
11740index 0000000..0213ca7
11741--- /dev/null
11742+++ b/kernel/sched_trace.c
11743@@ -0,0 +1,755 @@
11744+/* sched_trace.c -- record scheduling events to a byte stream.
11745+ *
11746+ * TODO: Move ring buffer to a lockfree implementation.
11747+ */
11748+
11749+#include <linux/spinlock.h>
11750+#include <linux/fs.h>
11751+#include <linux/cdev.h>
11752+#include <asm/semaphore.h>
11753+#include <asm/uaccess.h>
11754+#include <linux/module.h>
11755+
11756+#include <linux/queuelock.h>
11757+#include <linux/sched_trace.h>
11758+#include <linux/litmus.h>
11759+
11760+
11761+typedef struct {
11762+ /* guard read and write pointers */
11763+ spinlock_t lock;
11764+ /* guard against concurrent freeing of buffer */
11765+ rwlock_t del_lock;
11766+
11767+ /* memory allocated for ring buffer */
11768+ unsigned long order;
11769+ char* buf;
11770+ char* end;
11771+
11772+	/* Read/write pointers. They may not cross.
11773+	 * They point to the position of the next write and
11774+	 * the last read.
11775+	 */
11776+ char* writep;
11777+ char* readp;
11778+
11779+} ring_buffer_t;
11780+
11781+#define EMPTY_RING_BUFFER { \
11782+ .lock = SPIN_LOCK_UNLOCKED, \
11783+ .del_lock = RW_LOCK_UNLOCKED, \
11784+ .buf = NULL, \
11785+ .end = NULL, \
11786+ .writep = NULL, \
11787+ .readp = NULL \
11788+}
11789+
11790+void rb_init(ring_buffer_t* buf)
11791+{
11792+ *buf = (ring_buffer_t) EMPTY_RING_BUFFER;
11793+}
11794+
11795+int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
11796+{
11797+ unsigned long flags;
11798+ int error = 0;
11799+ char *mem;
11800+
11801+ /* do memory allocation while not atomic */
11802+ mem = (char *) __get_free_pages(GFP_KERNEL, order);
11803+ if (!mem)
11804+ return -ENOMEM;
11805+ write_lock_irqsave(&buf->del_lock, flags);
11806+ BUG_ON(buf->buf);
11807+ buf->buf = mem;
11808+ buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
11809+ memset(buf->buf, 0xff, buf->end - buf->buf);
11810+ buf->order = order;
11811+ buf->writep = buf->buf + 1;
11812+ buf->readp = buf->buf;
11813+ write_unlock_irqrestore(&buf->del_lock, flags);
11814+ return error;
11815+}
11816+
11817+int rb_free_buf(ring_buffer_t* buf)
11818+{
11819+ unsigned long flags;
11820+ int error = 0;
11821+ write_lock_irqsave(&buf->del_lock, flags);
11822+ BUG_ON(!buf->buf);
11823+ free_pages((unsigned long) buf->buf, buf->order);
11824+ buf->buf = NULL;
11825+ buf->end = NULL;
11826+ buf->writep = NULL;
11827+ buf->readp = NULL;
11828+ write_unlock_irqrestore(&buf->del_lock, flags);
11829+ return error;
11830+}
11831+
11832+/* Assumption: concurrent writes are serialized externally
11833+ *
11834+ * Will only succeed if there is enough space for all len bytes.
11835+ */
11836+int rb_put(ring_buffer_t* buf, char* mem, size_t len)
11837+{
11838+ unsigned long flags;
11839+ char* r , *w;
11840+ int error = 0;
11841+ read_lock_irqsave(&buf->del_lock, flags);
11842+ if (!buf->buf) {
11843+ error = -ENODEV;
11844+ goto out;
11845+ }
11846+ spin_lock(&buf->lock);
11847+ r = buf->readp;
11848+ w = buf->writep;
11849+ spin_unlock(&buf->lock);
11850+ if (r < w && buf->end - w >= len - 1) {
11851+ /* easy case: there is enough space in the buffer
11852+		 * to write it in one continuous chunk */
11853+ memcpy(w, mem, len);
11854+ w += len;
11855+ if (w > buf->end)
11856+ /* special case: fit exactly into buffer
11857+ * w is now buf->end + 1
11858+ */
11859+ w = buf->buf;
11860+ } else if (w < r && r - w >= len) { /* >= len because may not cross */
11861+		/* we are constrained by the read pointer but there
11862+		 * is enough space
11863+ */
11864+ memcpy(w, mem, len);
11865+ w += len;
11866+ } else if (r <= w && buf->end - w < len - 1) {
11867+ /* the wrap around case: there may or may not be space */
11868+ if ((buf->end - w) + (r - buf->buf) >= len - 1) {
11869+ /* copy chunk that fits at the end */
11870+ memcpy(w, mem, buf->end - w + 1);
11871+ mem += buf->end - w + 1;
11872+ len -= (buf->end - w + 1);
11873+ w = buf->buf;
11874+ /* copy the rest */
11875+ memcpy(w, mem, len);
11876+ w += len;
11877+ }
11878+ else
11879+ error = -ENOMEM;
11880+ } else {
11881+ error = -ENOMEM;
11882+ }
11883+ if (!error) {
11884+ spin_lock(&buf->lock);
11885+ buf->writep = w;
11886+ spin_unlock(&buf->lock);
11887+ }
11888+ out:
11889+ read_unlock_irqrestore(&buf->del_lock, flags);
11890+ return error;
11891+}
11892+
11893+/* Assumption: concurrent reads are serialized externally */
11894+int rb_get(ring_buffer_t* buf, char* mem, size_t len)
11895+{
11896+ unsigned long flags;
11897+ char* r , *w;
11898+ int error = 0;
11899+ read_lock_irqsave(&buf->del_lock, flags);
11900+ if (!buf->buf) {
11901+ error = -ENODEV;
11902+ goto out;
11903+ }
11904+ spin_lock(&buf->lock);
11905+ r = buf->readp;
11906+ w = buf->writep;
11907+ spin_unlock(&buf->lock);
11908+
11909+ if (w <= r && buf->end - r >= len) {
11910+ /* easy case: there is enough data in the buffer
11911+ * to get it in one chunk*/
11912+ memcpy(mem, r + 1, len);
11913+ r += len;
11914+ error = len;
11915+
11916+ } else if (r + 1 < w && w - r - 1 >= len) {
11917+ /* we are constrained by the write pointer but
11918+ * there is enough data
11919+ */
11920+ memcpy(mem, r + 1, len);
11921+ r += len;
11922+ error = len;
11923+
11924+ } else if (r + 1 < w && w - r - 1 < len) {
11925+		/* we are constrained by the write pointer and
11926+		 * there is not enough data
11927+ */
11928+ memcpy(mem, r + 1, w - r - 1);
11929+ error = w - r - 1;
11930+ r += w - r - 1;
11931+
11932+ } else if (w <= r && buf->end - r < len) {
11933+ /* the wrap around case: there may or may not be enough data
11934+ * first let's get what is available
11935+ */
11936+ memcpy(mem, r + 1, buf->end - r);
11937+ error += (buf->end - r);
11938+ mem += (buf->end - r);
11939+ len -= (buf->end - r);
11940+ r += (buf->end - r);
11941+
11942+ if (w > buf->buf) {
11943+ /* there is more to get */
11944+ r = buf->buf - 1;
11945+ if (w - r >= len) {
11946+ /* plenty */
11947+ memcpy(mem, r + 1, len);
11948+ error += len;
11949+ r += len;
11950+ } else {
11951+ memcpy(mem, r + 1, w - r - 1);
11952+ error += w - r - 1;
11953+ r += w - r - 1;
11954+ }
11955+ }
11956+ } /* nothing available */
11957+
11958+ if (error > 0) {
11959+ spin_lock(&buf->lock);
11960+ buf->readp = r;
11961+ spin_unlock(&buf->lock);
11962+ }
11963+ out:
11964+ read_unlock_irqrestore(&buf->del_lock, flags);
11965+ return error;
11966+}
11967+
11968+
11969+
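To make the pointer arithmetic above concrete, a small worked example with a hypothetical 8-byte buffer (not code from this patch):

/* Assume an 8-byte buffer buf[0..7], so end = &buf[7].
 *
 *   rb_alloc_buf():    readp = &buf[0], writep = &buf[1]
 *   rb_put(4 bytes):   data lands in buf[1..4],             writep -> &buf[5]
 *   rb_get(3 bytes):   copies buf[1..3] (readp + 1 onward),  readp -> &buf[3]
 *   rb_put(5 bytes):   buf[5..7] take 3 bytes, the write wraps, and
 *                      buf[0..1] take the remaining 2,      writep -> &buf[2]
 *
 * At every step the two pointers stay on the same side of each other,
 * as the "may not cross" rule requires.
 */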
11970+/******************************************************************************/
11971+/* DEVICE FILE DRIVER */
11972+/******************************************************************************/
11973+
11974+
11975+
11976+/* Allocate a buffer of about 1 MB per CPU
11977+ * (BUFFER_ORDER 8 = 256 pages, i.e. 1 MB with 4 KB pages).
11978+ */
11979+#define BUFFER_ORDER 8
11980+
11981+typedef struct {
11982+ ring_buffer_t buf;
11983+ atomic_t reader_cnt;
11984+ struct semaphore reader_mutex;
11985+} trace_buffer_t;
11986+
11987+
11988+/* This does not initialize the semaphore!! */
11989+
11990+#define EMPTY_TRACE_BUFFER \
11991+ { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
11992+
11993+static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
11994+
11995+#ifdef CONFIG_SCHED_DEBUG_TRACE
11996+static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED;
11997+#endif
11998+static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER;
11999+
12000+static void init_buffers(void)
12001+{
12002+ int i;
12003+
12004+ for (i = 0; i < NR_CPUS; i++) {
12005+ rb_init(&per_cpu(trace_buffer, i).buf);
12006+ init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
12007+ atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
12008+ }
12009+ /* only initialize the mutex, the rest was initialized as part
12010+ * of the static initialization macro
12011+ */
12012+ init_MUTEX(&log_buffer.reader_mutex);
12013+}
12014+
12015+static int trace_release(struct inode *in, struct file *filp)
12016+{
12017+ int error = -EINVAL;
12018+ trace_buffer_t* buf = filp->private_data;
12019+
12020+ BUG_ON(!filp->private_data);
12021+
12022+ if (down_interruptible(&buf->reader_mutex)) {
12023+ error = -ERESTARTSYS;
12024+ goto out;
12025+ }
12026+
12027+ /* last release must deallocate buffers */
12028+ if (atomic_dec_return(&buf->reader_cnt) == 0) {
12029+ error = rb_free_buf(&buf->buf);
12030+ }
12031+
12032+ up(&buf->reader_mutex);
12033+ out:
12034+ return error;
12035+}
12036+
12037+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
12038+ loff_t *f_pos)
12039+{
12040+ /* we ignore f_pos, this is strictly sequential */
12041+
12042+ ssize_t error = -EINVAL;
12043+ char* mem;
12044+ trace_buffer_t *buf = filp->private_data;
12045+
12046+ if (down_interruptible(&buf->reader_mutex)) {
12047+ error = -ERESTARTSYS;
12048+ goto out;
12049+ }
12050+
12051+ if (len > 64 * 1024)
12052+ len = 64 * 1024;
12053+ mem = kmalloc(len, GFP_KERNEL);
12054+ if (!mem) {
12055+ error = -ENOMEM;
12056+ goto out_unlock;
12057+ }
12058+
12059+ error = rb_get(&buf->buf, mem, len);
12060+ while (!error) {
12061+ set_current_state(TASK_INTERRUPTIBLE);
12062+ schedule_timeout(110);
12063+ if (signal_pending(current))
12064+ error = -ERESTARTSYS;
12065+ else
12066+ error = rb_get(&buf->buf, mem, len);
12067+ }
12068+
12069+ if (error > 0 && copy_to_user(to, mem, error))
12070+ error = -EFAULT;
12071+
12072+ kfree(mem);
12073+ out_unlock:
12074+ up(&buf->reader_mutex);
12075+ out:
12076+ return error;
12077+}
12078+
12079+
12080+/* trace_open - Open one of the per-CPU sched_trace buffers.
12081+ */
12082+static int trace_open(struct inode *in, struct file *filp)
12083+{
12084+ int error = -EINVAL;
12085+ int cpu = MINOR(in->i_rdev);
12086+ trace_buffer_t* buf;
12087+
12088+ if (!cpu_online(cpu)) {
12089+ printk(KERN_WARNING "sched trace: "
12090+ "CPU #%d is not online. (open failed)\n", cpu);
12091+ error = -ENODEV;
12092+ goto out;
12093+ }
12094+
12095+ buf = &per_cpu(trace_buffer, cpu);
12096+
12097+ if (down_interruptible(&buf->reader_mutex)) {
12098+ error = -ERESTARTSYS;
12099+ goto out;
12100+ }
12101+
12102+ /* first open must allocate buffers */
12103+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
12104+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
12105+ {
12106+ atomic_dec(&buf->reader_cnt);
12107+ goto out_unlock;
12108+ }
12109+ }
12110+
12111+ error = 0;
12112+ filp->private_data = buf;
12113+
12114+ out_unlock:
12115+ up(&buf->reader_mutex);
12116+ out:
12117+ return error;
12118+}
12119+
12120+/* log_open - open the global log message ring buffer.
12121+ */
12122+static int log_open(struct inode *in, struct file *filp)
12123+{
12124+ int error = -EINVAL;
12125+ trace_buffer_t* buf;
12126+
12127+ buf = &log_buffer;
12128+
12129+ if (down_interruptible(&buf->reader_mutex)) {
12130+ error = -ERESTARTSYS;
12131+ goto out;
12132+ }
12133+
12134+ /* first open must allocate buffers */
12135+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
12136+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
12137+ {
12138+ atomic_dec(&buf->reader_cnt);
12139+ goto out_unlock;
12140+ }
12141+ }
12142+
12143+ error = 0;
12144+ filp->private_data = buf;
12145+
12146+ out_unlock:
12147+ up(&buf->reader_mutex);
12148+ out:
12149+ return error;
12150+}
12151+
12152+/******************************************************************************/
12153+/* Device Registration */
12154+/******************************************************************************/
12155+
12156+/* the major numbers are from the unassigned/local use block
12157+ *
12158+ * This should be converted to dynamic allocation at some point...
12159+ */
12160+#define TRACE_MAJOR 250
12161+#define LOG_MAJOR 251
12162+
12163+/* trace_fops - The file operations for accessing the per-CPU scheduling event
12164+ * trace buffers.
12165+ */
12166+struct file_operations trace_fops = {
12167+ .owner = THIS_MODULE,
12168+ .open = trace_open,
12169+ .release = trace_release,
12170+ .read = trace_read,
12171+};
12172+
12173+/* log_fops - The file operations for accessing the global LITMUS log message
12174+ * buffer.
12175+ *
12176+ * Except for opening the device file it uses the same operations as trace_fops.
12177+ */
12178+struct file_operations log_fops = {
12179+ .owner = THIS_MODULE,
12180+ .open = log_open,
12181+ .release = trace_release,
12182+ .read = trace_read,
12183+};
12184+
12185+static int __init register_buffer_dev(const char* name,
12186+ struct file_operations* fops,
12187+ int major, int count)
12188+{
12189+ dev_t trace_dev;
12190+ struct cdev *cdev;
12191+ int error = 0;
12192+
12193+ trace_dev = MKDEV(major, 0);
12194+ error = register_chrdev_region(trace_dev, count, name);
12195+ if (error)
12196+ {
12197+ printk(KERN_WARNING "sched trace: "
12198+ "Could not register major/minor number %d\n", major);
12199+ return error;
12200+ }
12201+ cdev = cdev_alloc();
12202+ if (!cdev) {
12203+ printk(KERN_WARNING "sched trace: "
12204+ "Could not get a cdev for %s.\n", name);
12205+ return -ENOMEM;
12206+ }
12207+ cdev->owner = THIS_MODULE;
12208+ cdev->ops = fops;
12209+ error = cdev_add(cdev, trace_dev, count);
12210+ if (error) {
12211+ printk(KERN_WARNING "sched trace: "
12212+ "add_cdev failed for %s.\n", name);
12213+ return -ENOMEM;
12214+ }
12215+ return error;
12216+
12217+}
12218+
12219+static int __init init_sched_trace(void)
12220+{
12221+ int error1 = 0, error2 = 0;
12222+
12223+ printk("Initializing scheduler trace device\n");
12224+ init_buffers();
12225+
12226+ error1 = register_buffer_dev("schedtrace", &trace_fops,
12227+ TRACE_MAJOR, NR_CPUS);
12228+
12229+ error2 = register_buffer_dev("litmus_log", &log_fops,
12230+ LOG_MAJOR, 1);
12231+ if (error1 || error2)
12232+ return min(error1, error2);
12233+ else
12234+ return 0;
12235+}
12236+
12237+module_init(init_sched_trace);
12238+
12239+/******************************************************************************/
12240+/* KERNEL API */
12241+/******************************************************************************/
12242+
12243+/* The per-CPU LITMUS log buffer. Don't put it on the stack, it is too big for
12244+ * that and the kernel gets very picky with nested interrupts and small stacks.
12245+ */
12246+
12247+#ifdef CONFIG_SCHED_DEBUG_TRACE
12248+
12249+#define MSG_SIZE 255
12250+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
12251+
12252+/* sched_trace_log_message - This is the only function that accesses the
12253+ * log buffer inside the kernel for writing.
12254+ * Concurrent access to it is serialized via the
12255+ * log_buffer_lock.
12256+ *
12257+ * The maximum length of a formatted message is 255.
12258+ */
12259+void sched_trace_log_message(const char* fmt, ...)
12260+{
12261+ unsigned long flags;
12262+ va_list args;
12263+ size_t len;
12264+ char* buf;
12265+
12266+ va_start(args, fmt);
12267+ local_irq_save(flags);
12268+
12269+ /* format message */
12270+ buf = __get_cpu_var(fmt_buffer);
12271+ len = vscnprintf(buf, MSG_SIZE, fmt, args);
12272+
12273+ spin_lock(&log_buffer_lock);
12274+ /* Don't copy the trailing null byte, we don't want null bytes
12275+ * in a text file.
12276+ */
12277+ rb_put(&log_buffer.buf, buf, len);
12278+ spin_unlock(&log_buffer_lock);
12279+
12280+ local_irq_restore(flags);
12281+ va_end(args);
12282+}
12283+
12284+#endif
12285+
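In-kernel callers would use this function directly, or more likely via the TRACE()/TRACE_TASK() macros seen throughout this patch, which presumably expand to calls of it when CONFIG_SCHED_DEBUG_TRACE is set. An illustrative call (format string and arguments are arbitrary):

	/* Illustrative only; any printf-style message works. */
	sched_trace_log_message("cpu %d: rt mode is now %d\n",
	                        smp_processor_id(), get_rt_mode());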
12286+#ifdef CONFIG_SCHED_TASK_TRACE
12287+
12288+static inline void __put_trace(char* mem, size_t size)
12289+{
12290+ trace_buffer_t* buf = &__get_cpu_var(trace_buffer);
12291+ rb_put(&buf->buf, mem, size);
12292+}
12293+
12294+#define put_trace(obj) \
12295+ if (get_rt_mode() == MODE_RT_RUN) \
12296+ __put_trace((char *) &obj, sizeof(obj))
12297+
12298+#define header(rec, type) \
12299+{ \
12300+ rec.header.trace = type; \
12301+ rec.header.timestamp = sched_clock(); \
12302+ rec.header.size = sizeof(rec); \
12303+}
12304+
12305+#define tinfo(info, t) \
12306+{ \
12307+ info.is_rt = is_realtime(t); \
12308+ info.is_server = 0; \
12309+ info.class = get_class(t); \
12310+ info.budget = (t)->time_slice; \
12311+ info.pid = (t)->pid; \
12312+ info.deadline = (t)->rt_param.times.deadline; \
12313+}
12314+
12315+#define rtinfo(info, t) \
12316+{ \
12317+ info.wcet = get_exec_cost(t); \
12318+ info.period = get_rt_period(t); \
12319+}
12320+
12321+void sched_trace_scheduler_invocation(void)
12322+{
12323+ invocation_record_t rec;
12324+ header(rec, ST_INVOCATION);
12325+ rec.flags = current->flags;
12326+ put_trace(rec);
12327+}
12328+
12329+void sched_trace_task_arrival(struct task_struct *t)
12330+{
12331+ arrival_record_t rec;
12332+ header(rec, ST_ARRIVAL);
12333+ tinfo(rec.task, t);
12334+ put_trace(rec);
12335+}
12336+
12337+
12338+void sched_trace_task_departure(struct task_struct *t)
12339+{
12340+ departure_record_t rec;
12341+ header(rec, ST_DEPARTURE);
12342+ tinfo(rec.task, t);
12343+ put_trace(rec);
12344+}
12345+
12346+void sched_trace_task_preemption(struct task_struct *t, struct task_struct* by)
12347+{
12348+ preemption_record_t rec;
12349+ header(rec, ST_PREEMPTION);
12350+ tinfo(rec.task, t);
12351+ tinfo(rec.by, by);
12352+ put_trace(rec);
12353+}
12354+
12355+
12356+void sched_trace_task_scheduled(struct task_struct *t)
12357+{
12358+ scheduled_record_t rec;
12359+ header(rec, ST_SCHEDULED);
12360+ tinfo(rec.task, t);
12361+ put_trace(rec);
12362+}
12363+
12364+
12365+void sched_trace_job_release(struct task_struct *t)
12366+{
12367+ release_record_t rec;
12368+ header(rec, ST_JOB_RELEASE);
12369+ tinfo(rec.task, t);
12370+ rtinfo(rec, t);
12371+ put_trace(rec);
12372+}
12373+
12374+void sched_trace_job_completion(struct task_struct *t)
12375+{
12376+ completion_record_t rec;
12377+ header(rec, ST_JOB_COMPLETION);
12378+ tinfo(rec.task, t);
12379+ rtinfo(rec, t);
12380+ rec.tardiness = jiffies - t->rt_param.times.deadline;
12381+ rec.job_no = t->rt_param.times.job_no;
12382+ TRACE_TASK(t, "AAATardiness : %d\n", rec.tardiness);
12383+ put_trace(rec);
12384+}
12385+
12386+
12387+void sched_trace_server_scheduled(int id, task_class_t class,
12388+ unsigned int budget, jiffie_t deadline)
12389+{
12390+ scheduled_record_t rec;
12391+ header(rec, ST_SCHEDULED);
12392+ rec.task.pid = id;
12393+ rec.task.is_rt = 1;
12394+ rec.task.is_server = 1;
12395+ rec.task.class = class;
12396+ rec.task.budget = budget;
12397+ rec.task.deadline = deadline;
12398+ put_trace(rec);
12399+}
12400+
12401+void sched_trace_server_release(int id, unsigned int wcet,
12402+ unsigned int period, task_class_t class)
12403+{
12404+ release_record_t rec;
12405+ header(rec, ST_JOB_RELEASE);
12406+ rec.task.pid = id;
12407+ rec.task.is_rt = 1;
12408+ rec.task.is_server = 1;
12409+ rec.task.class = class;
12410+ rec.task.budget = wcet;
12411+ rec.period = period;
12412+ rec.wcet = wcet;
12413+ put_trace(rec);
12414+}
12415+
12416+void sched_trace_server_completion(int id, unsigned int budget,
12417+ jiffie_t deadline, task_class_t class)
12418+{
12419+ completion_record_t rec;
12420+ header(rec, ST_JOB_COMPLETION);
12421+ rec.task.pid = id;
12422+ rec.task.is_rt = 1;
12423+ rec.task.is_server = 1;
12424+ rec.task.class = class;
12425+ rec.task.budget = budget;
12426+ rec.task.deadline = deadline;
12427+ rec.period = 0;
12428+ rec.tardiness = jiffies - deadline;
12429+ put_trace(rec);
12430+
12431+}
12432+
12433+void sched_trace_capacity_release(struct task_struct *t)
12434+{
12435+ cap_release_record_t rec;
12436+ header(rec, ST_CAPACITY_RELEASE);
12437+ tinfo(rec.task, t);
12438+ put_trace(rec);
12439+}
12440+
12441+void sched_trace_capacity_allocation(struct task_struct *t, u16 budget, u32 deadline,
12442+ pid_t donor)
12443+{
12444+ cap_allocation_record_t rec;
12445+ header(rec, ST_CAPACITY_ALLOCATION);
12446+ tinfo(rec.task, t);
12447+ rec.donor = donor;
12448+ rec.budget = budget;
12449+ rec.deadline = deadline;
12450+ put_trace(rec);
12451+}
12452+
12453+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
12454+ u16 srv_budget,
12455+ u16 budget, u32 deadline, pid_t donor)
12456+{
12457+ cap_allocation_record_t rec;
12458+ header(rec, ST_CAPACITY_ALLOCATION);
12459+ rec.task.pid = srv;
12460+ rec.task.is_rt = 1;
12461+ rec.task.is_server = 1;
12462+ rec.task.class = cls;
12463+ rec.task.budget = srv_budget;
12464+ rec.task.deadline = srv_dl;
12465+ rec.donor = donor;
12466+ rec.budget = budget;
12467+ rec.deadline = deadline;
12468+ put_trace(rec);
12469+}
12470+
12471+void sched_trace_service_level_change(struct task_struct *t,
12472+ unsigned int from,
12473+ unsigned int to)
12474+{
12475+ service_level_change_record_t rec;
12476+ header(rec, ST_SERVICE_LEVEL_CHANGE);
12477+ tinfo(rec.task, t);
12478+ rec.to = to;
12479+ rec.from = from;
12480+ rec.new_level =
12481+ t->rt_param.service_level[to];
12482+ rec.old_level =
12483+ t->rt_param.service_level[from];
12484+ put_trace(rec);
12485+}
12486+
12487+void sched_trace_weight_error(struct task_struct* t, fp_t actual)
12488+{
12489+ weight_error_record_t rec;
12490+ header(rec, ST_WEIGHT_ERROR);
12491+ rec.task = t->pid;
12492+ rec.actual = actual;
12493+ rec.estimate = get_est_weight(t);
12494+ put_trace(rec);
12495+}
12496+
12497+
12498+#endif
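A plausible user-space consumer of the global log device registered above (LOG_MAJOR 251, minor 0). The /dev/litmus_log path and the mknod step are assumptions; only the major and minor numbers come from the code:

/* Sketch: drain the LITMUS log to stdout.
 * Assumes a node created beforehand with:  mknod /dev/litmus_log c 251 0
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/dev/litmus_log", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/litmus_log");
		return 1;
	}
	/* trace_read() above blocks (re-polling every 110 jiffies) until
	 * data is available, so this loop simply streams messages. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}

The per-CPU schedtrace devices (major 250, one minor per CPU) could be read the same way.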
12499diff --git a/kernel/timer.c b/kernel/timer.c
12500index c2a8ccf..77a1b6b 100644
12501--- a/kernel/timer.c
12502+++ b/kernel/timer.c
12503@@ -737,6 +737,27 @@ static inline s64 __get_nsec_offset(void)
12504 return ns_offset;
12505 }
12506
12507+/* Non-static, non-inline, public version of the function above.
12508+ * It's up to the caller to decide how to use it; no locking or
12509+ * consistency guarantees are made here.
12510+ */
12511+s64 get_nsec_offset(void)
12512+{
12513+ cycle_t cycle_now, cycle_delta;
12514+ s64 ns_offset;
12515+
12516+ /* read clocksource: */
12517+ cycle_now = clocksource_read(clock);
12518+
12519+ /* calculate the delta since the last update_wall_time: */
12520+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
12521+
12522+ /* convert to nanoseconds: */
12523+ ns_offset = cyc2ns(clock, cycle_delta);
12524+
12525+ return ns_offset;
12526+}
12527+
12528 /**
12529 * __get_realtime_clock_ts - Returns the time of day in a timespec
12530 * @ts: pointer to the timespec to be set
12531@@ -789,6 +810,7 @@ void do_gettimeofday(struct timeval *tv)
12532 }
12533
12534 EXPORT_SYMBOL(do_gettimeofday);
12535+
12536 /**
12537 * do_settimeofday - Sets the time of day
12538 * @tv: pointer to the timespec variable containing the new time
12539diff --git a/kernel/trace.c b/kernel/trace.c
12540new file mode 100644
12541index 0000000..6119574
12542--- /dev/null
12543+++ b/kernel/trace.c
12544@@ -0,0 +1,302 @@
12545+#include <linux/fs.h>
12546+#include <linux/cdev.h>
12547+#include <asm/semaphore.h>
12548+#include <asm/uaccess.h>
12549+#include <linux/module.h>
12550+
12551+#include <linux/trace.h>
12552+
12553+/******************************************************************************/
12554+/* Allocation */
12555+/******************************************************************************/
12556+
12557+struct ft_buffer* trace_ts_buf = NULL;
12558+
12559+static unsigned int ts_seq_no = 0;
12560+
12561+feather_callback void save_timestamp(unsigned long event)
12562+{
12563+ unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no);
12564+ struct timestamp *ts;
12565+ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
12566+ ts->event = event;
12567+ ts->timestamp = ft_read_tsc();
12568+ ts->seq_no = seq_no;
12569+ ts->cpu = raw_smp_processor_id();
12570+ ft_buffer_finish_write(trace_ts_buf, ts);
12571+ }
12572+}
12573+
12574+static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
12575+{
12576+ struct ft_buffer* buf;
12577+ size_t total = (size + 1) * count;
12578+ char* mem;
12579+ int order = 0, pages = 1;
12580+
12581+ buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL);
12582+ if (!buf)
12583+ return NULL;
12584+
12585+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
12586+ while (pages < total) {
12587+ order++;
12588+ pages *= 2;
12589+ }
12590+
12591+ mem = (char*) __get_free_pages(GFP_KERNEL, order);
12592+ if (!mem) {
12593+ kfree(buf);
12594+ return NULL;
12595+ }
12596+
12597+ if (!init_ft_buffer(buf, count, size,
12598+ mem + (count * size), /* markers at the end */
12599+ mem)) { /* buffer objects */
12600+ free_pages((unsigned long) mem, order);
12601+ kfree(buf);
12602+ return NULL;
12603+ }
12604+ return buf;
12605+}
12606+
12607+static void free_ft_buffer(struct ft_buffer* buf)
12608+{
12609+ int order = 0, pages = 1;
12610+ size_t total;
12611+
12612+ if (buf) {
12613+ total = (buf->slot_size + 1) * buf->slot_count;
12614+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
12615+ while (pages < total) {
12616+ order++;
12617+ pages *= 2;
12618+ }
12619+ free_pages((unsigned long) buf->buffer_mem, order);
12620+ kfree(buf);
12621+ }
12622+}
12623+
12624+
12625+/******************************************************************************/
12626+/* DEVICE FILE DRIVER */
12627+/******************************************************************************/
12628+
12629+#define NO_TIMESTAMPS 262144
12630+
12631+static DECLARE_MUTEX(feather_lock);
12632+static int use_count = 0;
12633+
12634+static int trace_release(struct inode *in, struct file *filp)
12635+{
12636+ int err = -EINVAL;
12637+
12638+ if (down_interruptible(&feather_lock)) {
12639+ err = -ERESTARTSYS;
12640+ goto out;
12641+ }
12642+
12643+ printk(KERN_ALERT "%s/%d disconnects from feather trace device. "
12644+ "use_count=%d\n",
12645+ current->comm, current->pid, use_count);
12646+
12647+ if (use_count == 1) {
12648+ /* disable events */
12649+ ft_disable_all_events();
12650+
12651+ /* wait for any pending events to complete */
12652+ set_current_state(TASK_UNINTERRUPTIBLE);
12653+ schedule_timeout(HZ);
12654+
12655+ printk(KERN_ALERT "Failed trace writes: %u\n",
12656+ trace_ts_buf->failed_writes);
12657+
12658+ free_ft_buffer(trace_ts_buf);
12659+ trace_ts_buf = NULL;
12660+ }
12661+
12662+ use_count--;
12663+ up(&feather_lock);
12664+out:
12665+ return err;
12666+}
12667+
12668+
12669+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
12670+ loff_t *f_pos)
12671+{
12672+ /* we ignore f_pos, this is strictly sequential */
12673+ ssize_t error = 0;
12674+ struct timestamp ts;
12675+
12676+ if (down_interruptible(&feather_lock)) {
12677+ error = -ERESTARTSYS;
12678+ goto out;
12679+ }
12680+
12681+
12682+ while (len >= sizeof(struct timestamp)) {
12683+ if (ft_buffer_read(trace_ts_buf, &ts)) {
12684+ if (copy_to_user(to, &ts, sizeof(struct timestamp))) {
12685+ error = -EFAULT;
12686+ break;
12687+ } else {
12688+ len -= sizeof(struct timestamp);
12689+ to += sizeof(struct timestamp);
12690+ error += sizeof(struct timestamp);
12691+ }
12692+ } else {
12693+ set_current_state(TASK_INTERRUPTIBLE);
12694+ schedule_timeout(50);
12695+ if (signal_pending(current)) {
12696+ error = -ERESTARTSYS;
12697+ break;
12698+ }
12699+ }
12700+ }
12701+ up(&feather_lock);
12702+out:
12703+ return error;
12704+}
12705+
12706+#define ENABLE_CMD 0
12707+#define DISABLE_CMD 1
12708+
12709+static ssize_t trace_write(struct file *filp, const char __user *from,
12710+ size_t len, loff_t *f_pos)
12711+{
12712+ ssize_t error = -EINVAL;
12713+ unsigned long cmd;
12714+ unsigned long id;
12715+
12716+ if (len % sizeof(long) || len < 2 * sizeof(long))
12717+ goto out;
12718+
12719+ if (copy_from_user(&cmd, from, sizeof(long))) {
12720+ error = -EFAULT;
12721+ goto out;
12722+ }
12723+ len -= sizeof(long);
12724+ from += sizeof(long);
12725+
12726+ if (cmd != ENABLE_CMD && cmd != DISABLE_CMD)
12727+ goto out;
12728+
12729+ if (down_interruptible(&feather_lock)) {
12730+ error = -ERESTARTSYS;
12731+ goto out;
12732+ }
12733+
12734+ error = sizeof(long);
12735+ while (len) {
12736+ if (copy_from_user(&id, from, sizeof(long))) {
12737+ error = -EFAULT;
12738+ goto out;
12739+ }
12740+ len -= sizeof(long);
12741+ from += sizeof(long);
12742+ if (cmd) {
12743+ printk(KERN_INFO
12744+ "Disabling feather-trace event %lu.\n", id);
12745+ ft_disable_event(id);
12746+ } else {
12747+ printk(KERN_INFO
12748+ "Enabling feather-trace event %lu.\n", id);
12749+ ft_enable_event(id);
12750+ }
12751+ error += sizeof(long);
12752+ }
12753+
12754+ up(&feather_lock);
12755+ out:
12756+ return error;
12757+}
12758+
12759+static int trace_open(struct inode *in, struct file *filp)
12760+{
12761+ int err = 0;
12762+ unsigned int count = NO_TIMESTAMPS;
12763+
12764+ if (down_interruptible(&feather_lock)) {
12765+ err = -ERESTARTSYS;
12766+ goto out;
12767+ }
12768+
12769+ while (count && !trace_ts_buf) {
12770+ printk("trace: trying to allocate %u time stamps.\n", count);
12771+ trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp));
12772+ count /= 2;
12773+ }
12774+ if (!trace_ts_buf)
12775+ err = -ENOMEM;
12776+ else
12777+ use_count++;
12778+
12779+ up(&feather_lock);
12780+out:
12781+ return err;
12782+}
12783+
12784+/******************************************************************************/
12785+/* Device Registration */
12786+/******************************************************************************/
12787+
12788+#define FT_TRACE_MAJOR 252
12789+
12790+struct file_operations ft_trace_fops = {
12791+ .owner = THIS_MODULE,
12792+ .open = trace_open,
12793+ .release = trace_release,
12794+ .write = trace_write,
12795+ .read = trace_read,
12796+};
12797+
12798+
12799+static int __init register_buffer_dev(const char* name,
12800+ struct file_operations* fops,
12801+ int major, int count)
12802+{
12803+ dev_t trace_dev;
12804+ struct cdev *cdev;
12805+ int error = 0;
12806+
12807+ trace_dev = MKDEV(major, 0);
12808+ error = register_chrdev_region(trace_dev, count, name);
12809+ if (error)
12810+ {
12811+ printk(KERN_WARNING "trace: "
12812+ "Could not register major/minor number %d\n", major);
12813+ return error;
12814+ }
12815+ cdev = cdev_alloc();
12816+ if (!cdev) {
12817+ printk(KERN_WARNING "trace: "
12818+ "Could not get a cdev for %s.\n", name);
12819+ return -ENOMEM;
12820+ }
12821+ cdev->owner = THIS_MODULE;
12822+ cdev->ops = fops;
12823+ error = cdev_add(cdev, trace_dev, count);
12824+ if (error) {
12825+ printk(KERN_WARNING "trace: "
12826+ "add_cdev failed for %s.\n", name);
12827+ return -ENOMEM;
12828+ }
12829+ return error;
12830+
12831+}
12832+
12833+static int __init init_sched_trace(void)
12834+{
12835+ int error = 0;
12836+
12837+ printk("Initializing Feather-Trace device\n");
12838+ /* dummy entry to make linker happy */
12839+ ft_event0(666, save_timestamp);
12840+
12841+ error = register_buffer_dev("ft_trace", &ft_trace_fops,
12842+ FT_TRACE_MAJOR, 1);
12843+ return error;
12844+}
12845+
12846+module_init(init_sched_trace);
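Given the command format parsed by trace_write() above -- one command word followed by event IDs, all as native longs -- enabling an event from user space might look like the sketch below. The /dev/ft_trace node name is an assumption (only major 252 is fixed by the code), and a real tool would keep the descriptor open to read() timestamp records, since the last release disables all events and frees the buffer:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* ENABLE_CMD is 0 in kernel/trace.c; 666 is the dummy event
	 * registered by init_sched_trace() above. */
	long cmd[2] = { 0, 666 };
	int fd = open("/dev/ft_trace", O_RDWR);

	if (fd < 0) {
		perror("open /dev/ft_trace");
		return 1;
	}
	if (write(fd, cmd, sizeof(cmd)) != sizeof(cmd))
		perror("write");
	/* ... read struct timestamp records from fd here ... */
	close(fd);
	return 0;
}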
12847diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c
12848index 1281805..3f4d543 100644
12849--- a/lib/semaphore-sleepers.c
12850+++ b/lib/semaphore-sleepers.c
12851@@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem)
12852 /*
12853 * With signals pending, this turns into
12854 * the trylock failure case - we won't be
12855- * sleeping, and we* can't get the lock as
12856+ * sleeping, and we can't get the lock as
12857 * it has contention. Just correct the count
12858 * and exit.
12859 */
diff --git a/index.html b/index.html
index 3070daf..de7c876 100644
--- a/index.html
+++ b/index.html
@@ -40,7 +40,7 @@
40 kernel with focus on multiprocessor real-time scheduling and 40 kernel with focus on multiprocessor real-time scheduling and
41 synchronization. The Linux kernel is modified to support the sporadic task 41 synchronization. The Linux kernel is modified to support the sporadic task
42 model and modular scheduler plugins. Both partitioned and global scheduling 42 model and modular scheduler plugins. Both partitioned and global scheduling
43 is supported. In the current version (2007.2), plugins for the following 43 is supported. In the current version (2007.3), plugins for the following
44 scheduling policies are included: 44 scheduling policies are included:
45 </p> 45 </p>
46 <ul> 46 <ul>
@@ -54,7 +54,7 @@
54 <li> PFAIR (both staggered and aligned quanta are supported)</li> 54 <li> PFAIR (both staggered and aligned quanta are supported)</li>
55 </ul> 55 </ul>
56 <p> 56 <p>
57 The latest public release of LITMUS<sup>RT</sup> occurred on 10/29/2007. 57 The latest public release of LITMUS<sup>RT</sup> occurred on 01/28/2008.
58 </p> 58 </p>
59 </div> 59 </div>
60 60
@@ -175,7 +175,7 @@
175 General Public License (GPL)</a>. 175 General Public License (GPL)</a>.
176 </p> 176 </p>
177 <p> 177 <p>
178 The latest version of LITMUS<sup>RT</sup> is 2007.2 and was released on 10/29/2007. 178 The latest version of LITMUS<sup>RT</sup> is 2007.3 and was released on 01/28/2008.
179 It consists of 179 It consists of
180 our Linux kernel modifications in the form of 180 our Linux kernel modifications in the form of
181 a patch against Linux 2.6.20, 181 a patch against Linux 2.6.20,
@@ -184,21 +184,48 @@
184 provides synchronization primitives suitable for real-time tasks. 184 provides synchronization primitives suitable for real-time tasks.
185 </p> 185 </p>
186 186
187 187 <ul>
188 <ul > 188 <li>
189 <li><a href="download/litmus-rt-2007.2.patch">litmus-rt-2007.2.patch</a> 189 2007.3 (January 2008)<br/>
190 (328 KB)<br/> 190 Based on Linux 2.6.20. (see <a href="#install">Section Install</a>
191 Applies 191 below) <br/>
192 against Linux 2.6.20 (see <a href="#install">Section Install</a> below).</li> 192 Files:
193 193 <ul>
194 <li><a href="download/liblitmus-2007.2.tgz">liblitmus-2007.2.tgz</a> 194 <li><a href="download/2007.3/litmus-rt-2007.3.patch">litmus-rt-2007.3.patch</a> (344 KB)</li>
195 (11 KB) 195 <li><a href="download/2007.3/liblitmus-2007.3.tgz">liblitmus-2007.3.tgz</a>
196 </li> 196 (14 KB)</li>
197 197 <li><a href="download/2007.3/libso-2007.3.tgz">libso-2007.3.tgz</a>
198 <li><a href="download/libso-2007.2.tgz">libso-2007.2.tgz</a> 198 (15 KB)</li>
199 (16 KB) 199 <li><a href="download/2007.3/SHA256SUMS">SHA256 check sums.</a>
200 </li> 200 </li>
201 </ul>
202 </li>
203 <li>
204 Major changes:
205 <ul>
206 <li>
207 Support for multi-threaded real-time applications added. The
208 use of <span class="src">libso</span> is no longer required.
209 </li>
210 <li>
211 All allocations (semaphores, etc.) are now dynamic. No more
212 running out of resources.
213 </li>
214 <li>
215 Real-Time tasks do not have to be launched with
216 <span class="src">rt_launch</span> anymore. Instead, a new
217 <span class="src">task_mode()</span> API was introduced that
218 allows tasks to transition between background mode
219 (i.e., a standard Linux task) and LITMUS<sup>RT</sup>
220 real-time mode.
221 </li>
222 <li>
223 Many bug fixes.
224 </li>
225 </ul>
226 </li>
201 </ul> 227 </ul>
228
202 <p class="nobottommargin"> 229 <p class="nobottommargin">
203 Please note that the current implementation is a <em>prototype</em> with 230 Please note that the current implementation is a <em>prototype</em> with
204 certain limitations. Most notably, it is not secure in a multiuser context, 231 certain limitations. Most notably, it is not secure in a multiuser context,
@@ -210,7 +237,18 @@
210 <p class="nobottommargin"> 237 <p class="nobottommargin">
211 Old releases: 238 Old releases:
212 </p> 239 </p>
240
213 <ul> 241 <ul>
242 <li> 2007.2 (November 2007)<br/>
243 Based on Linux 2.6.20. <br/>
244 <a href="download/litmus-rt-2007.2.patch">litmus-rt-2007.2.patch</a>
245 (328 KB)<br/>
246 <a href="download/liblitmus-2007.2.tgz">liblitmus-2007.2.tgz</a>
247 (11 KB) <br/>
248 <a href="download/libso-2007.2.tgz">libso-2007.2.tgz</a>
249 (16 KB) <br/><br/>
250 </li>
251
214 <li> 2007.1 (May 2007)<br/> 252 <li> 2007.1 (May 2007)<br/>
215 Based on Linux 2.6.20. <br/> 253 Based on Linux 2.6.20. <br/>
216 <a href="download/litmus-rt-2007.1.patch">litmus-rt-2007.1.patch</a> 254 <a href="download/litmus-rt-2007.1.patch">litmus-rt-2007.1.patch</a>
@@ -253,11 +291,11 @@ cd $DIR
253# get Linux 2.6.20 291# get Linux 2.6.20
254wget http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.20.tar.bz2 292wget http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.20.tar.bz2
255tar xjf linux-2.6.20.tar.bz2 293tar xjf linux-2.6.20.tar.bz2
256wget http://www.cs.unc.edu/~anderson/litmus-rt/download/litmus-rt-2007.2.patch 294wget http://www.cs.unc.edu/~anderson/litmus-rt/download/2007.3/litmus-rt-2007.3.patch
257mv linux-2.6.20 litmus-rt 295mv linux-2.6.20 litmus-rt
258# apply the LITMUS RT patch 296# apply the LITMUS RT patch
259cd litmus-rt 297cd litmus-rt
260patch -p1 &lt; ../litmus-rt-2007.2.patch 298patch -p1 &lt; ../litmus-rt-2007.3.patch
261# create a working kernel configuration with HZ=1000 299# create a working kernel configuration with HZ=1000
262make gconfig 300make gconfig
263# compile the kernel 301# compile the kernel
@@ -298,8 +336,8 @@ initrd /boot/kernel-2.6.20-LITMUSRT.img
298 </p> 336 </p>
299<pre class="shell"> 337<pre class="shell">
300cd $DIR 338cd $DIR
301wget http://www.cs.unc.edu/~anderson/litmus-rt/download/liblitmus-2007.2.tgz 339wget http://www.cs.unc.edu/~anderson/litmus-rt/download/2007.3/liblitmus-2007.3.tgz
302tar xzf liblitmus-2007.2.tgz 340tar xzf liblitmus-2007.3.tgz
303cd liblitmus 341cd liblitmus
304make 342make
305</pre> 343</pre>
@@ -312,8 +350,8 @@ make
312 </p> 350 </p>
313<pre class="shell"> 351<pre class="shell">
314cd $DIR 352cd $DIR
315wget http://www.cs.unc.edu/~anderson/litmus-rt/download/libso-2007.2.tgz 353wget http://www.cs.unc.edu/~anderson/litmus-rt/download/2007.3/libso-2007.3.tgz
316tar xzf libso-2007.2.tgz 354tar xzf libso-2007.3.tgz
317cd libso 355cd libso
318make 356make
319make tests 357make tests
@@ -330,26 +368,27 @@ make tests
330 <h2 id="doc">Documentation</h2> 368 <h2 id="doc">Documentation</h2>
331 <div class="box"> 369 <div class="box">
332 370
333 <p class="nomargin"> 371 <p class="notopmargin">
334 Most of the documentation has yet to be written. To get an overview of 372 Most of the documentation has yet to be written. To get an overview of
335 the architecture of the kernel extension, we recommend to read the paper 373 the architecture of the kernel extension, we recommend to read the paper
336 <a href="http://www.cs.unc.edu/~anderson/papers/rtlws07.pdf">&ldquo;LITMUS<sup>RT</sup>: 374 <a href="http://www.cs.unc.edu/~anderson/papers/rtlws07.pdf">&ldquo;LITMUS<sup>RT</sup>:
337 A Status Report&rdquo;</a>. 375 A Status Report&rdquo;</a>.
338 <br/> 376 </p>
339 <br/> 377 <p>
378 The user space library that provides the LITMUS<sup>RT</sup> API,
379 <span class="src">liblitmus</span>, contains two example real-time tasks
380 (<span class="src">base_task.c</span> and
381 <span class="src">base_mt_task.c</span>)
382 that both illustrate how to use the API and provide a skeleton for real-time
383 task development. To get started, take a look at these example
384 programs.
385 </p>
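To give a flavour of what these examples contain, a minimal job-loop sketch follows. Only task_mode() and sleep_next_period() are named on this page; the litmus.h header name, the LITMUS_RT_TASK/BACKGROUND_TASK constants, and the omitted parameter-setup step are assumptions, so treat base_task.c from the liblitmus tarball as the authoritative version.

/* Hedged sketch of a periodic LITMUS^RT task using liblitmus. */
#include <litmus.h>	/* header name assumed */

int main(void)
{
	int i;

	/* ... set WCET/period parameters here (see base_task.c) ... */

	task_mode(LITMUS_RT_TASK);	/* become a real-time task */
	for (i = 0; i < 1000; i++) {
		/* one job's worth of work goes here */
		sleep_next_period();	/* wait for the next release */
	}
	task_mode(BACKGROUND_TASK);	/* back to a normal Linux task */
	return 0;
}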
386 <p class="nobottommargin">
340 Please contact <span class="src">bbb[AT]cs.unc.edu</span> if you have any 387 Please contact <span class="src">bbb[AT]cs.unc.edu</span> if you have any
341 questions. 388 questions.
342 </p> 389 </p>
343 390
344<!-- <p class="nomargin">
345 <em>To be written...</em>
346 <ul class="nomargin">
347 <li>How to use LITMUS<sup>RT</sup></li>
348 <li>A real-time &quot;Hello World!&quot;</li>
349 </ul>
350 </p>
351 391
352-->
353 </div> 392 </div>
354 393
355 <h2 id="credits">Credits</h2> 394 <h2 id="credits">Credits</h2>