From 1acaf95fd9ff52512bfd377a87f0c28050e01bc5 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@Serenity.local>
Date: Fri, 22 Aug 2008 22:43:23 -0400
Subject: publish PCP implementation

---
 download/RTCSA08/SHA256SUMS              |    2 +
 download/RTCSA08/liblitmus-RTCSA08.tgz   |  Bin 0 -> 10277 bytes
 download/RTCSA08/litmus-rt-RTCSA08.patch | 7768 ++++++++++++++++++++++++++++++
 index.html                               |   17 +
 4 files changed, 7787 insertions(+)
 create mode 100644 download/RTCSA08/SHA256SUMS
 create mode 100644 download/RTCSA08/liblitmus-RTCSA08.tgz
 create mode 100644 download/RTCSA08/litmus-rt-RTCSA08.patch

diff --git a/download/RTCSA08/SHA256SUMS b/download/RTCSA08/SHA256SUMS
new file mode 100644
index 0000000..4bc8472
--- /dev/null
+++ b/download/RTCSA08/SHA256SUMS
@@ -0,0 +1,2 @@
+f9176d0d1dfd7e1c4ab3ba5f4dc62efa3dd1ab8c50e2e63628fe2d2376cb344b  liblitmus-RTCSA08.tgz
+24c6b22ba13b096b3dc4356ed98f484548c68c77a59296952d72458154dd6bac  litmus-rt-RTCSA08.patch
diff --git a/download/RTCSA08/liblitmus-RTCSA08.tgz b/download/RTCSA08/liblitmus-RTCSA08.tgz
new file mode 100644
index 0000000..9947121
Binary files /dev/null and b/download/RTCSA08/liblitmus-RTCSA08.tgz differ
diff --git a/download/RTCSA08/litmus-rt-RTCSA08.patch b/download/RTCSA08/litmus-rt-RTCSA08.patch
new file mode 100644
index 0000000..e4863a6
--- /dev/null
+++ b/download/RTCSA08/litmus-rt-RTCSA08.patch
@@ -0,0 +1,7768 @@
+ Makefile                         |    2 +-
+ arch/i386/Kconfig                |   28 ++
+ arch/i386/kernel/apic.c          |   92 +++++
+ arch/i386/kernel/i386_ksyms.c    |    1 +
+ arch/i386/kernel/signal.c        |    3 +-
+ arch/i386/kernel/smp.c           |    1 +
+ arch/i386/kernel/syscall_table.S |   22 +
+ fs/exec.c                        |    5 +-
+ fs/inode.c                       |    2 +
+ include/asm-i386/unistd.h        |   25 ++-
+ include/linux/completion.h       |    2 +
+ include/linux/fs.h               |    5 +
+ include/linux/sched.h            |   14 +
+ include/linux/uaccess.h          |   16 +
+ include/litmus/edf_common.h      |   27 ++
+ include/litmus/fdso.h            |   78 ++++
+ include/litmus/feather_buffer.h  |  108 +++++
+ include/litmus/feather_trace.h   |   93 +++++
+ include/litmus/jobs.h            |    9 +
+ include/litmus/litmus.h          |  200 +++++++++
+ include/litmus/rm_common.h       |   44 ++
+ include/litmus/rt_domain.h       |   94 +++++
+ include/litmus/rt_param.h        |  177 ++++++++
+ include/litmus/sched_plugin.h    |  120 ++++++
+ include/litmus/sched_trace.h     |   31 ++
+ include/litmus/trace.h           |  106 +++++
+ kernel/exit.c                    |    4 +
+ kernel/fork.c                    |    5 +
+ kernel/sched.c                   |  177 ++++++++-
+ lib/semaphore-sleepers.c         |    2 +-
+ litmus/Makefile                  |    9 +
+ litmus/edf_common.c              |   95 +++++
+ litmus/fdso.c                    |  289 +++++++++++++
+ litmus/ft_event.c                |  104 +++++
+ litmus/jobs.c                    |   43 ++
+ litmus/litmus.c                  |  830 ++++++++++++++++++++++++++++++++++++++
+ litmus/litmus_sem.c              |  551 +++++++++++++++++++++++++
+ litmus/pcp.c                     |  764 +++++++++++++++++++++++++++++++++++
+ litmus/rm_common.c               |   76 ++++
+ litmus/rt_domain.c               |  130 ++++++
+ litmus/sched_gsn_edf.c           |  733 +++++++++++++++++++++++++++++++++
+ litmus/sched_plugin.c            |  169 ++++++++
+ litmus/sched_psn_edf.c           |  458 +++++++++++++++++++++
+ litmus/sched_rm.c                |  397 ++++++++++++++++++
+ litmus/sched_trace.c             |  541 +++++++++++++++++++++++++
+ litmus/sync.c                    |   84 ++++
+ litmus/trace.c                   |  302 ++++++++++++++
+ 47 files changed, 7052 insertions(+), 16 deletions(-)
+
+diff --git a/Makefile b/Makefile
+index 7e2750f..79cf62b 100644
+--- a/Makefile
++++ b/Makefile
+@@ -553,7 +553,7 @@ export mod_strip_cmd
+ 
+ 
+ ifeq ($(KBUILD_EXTMOD),)
+-core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
++core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
+ 
+ vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
+ 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
+diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
+index 0dfee81..da6f1e9 100644
+--- a/arch/i386/Kconfig
++++ b/arch/i386/Kconfig
+@@ -1210,6 +1210,7 @@ config KPROBES
+ 	  a probepoint and specifies the callback.  Kprobes is useful
+ 	  for kernel debugging, non-intrusive instrumentation and testing.
+ 	  If in doubt, say "N".
++
+ endmenu
+ 
+ source "arch/i386/Kconfig.debug"
+@@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE
+ config KTIME_SCALAR
+ 	bool
+ 	default y
++
++
++menu "LITMUS^RT"
++
++
++config SCHED_TASK_TRACE
++	bool "Trace real-time tasks"
++	default y	
++	help
++	  Include support for the sched_trace_XXX() tracing functions. This
++	  allows the collection of real-time task events such as job
++	  completions, job releases, early completions, etc. This results in a
++	  small overhead in the scheduling code. Disable if the overhead is not
++	  acceptable (e.g., benchmarking).
++
++config SCHED_DEBUG_TRACE
++	bool "TRACE() debugging"
++	default y	
++	help
++	  Include support for sched_trace_log_message(), which is used to
++	  implement TRACE(). If disabled, no TRACE() messages will be included
++	  in the kernel, and no overheads due to debugging statements will be
++	  incurred by the scheduler. Disable if the overhead is not acceptable
++	  (e.g. benchmarking).
++
++
++endmenu
+diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
+index 776d9be..36b0159 100644
+--- a/arch/i386/kernel/apic.c
++++ b/arch/i386/kernel/apic.c
+@@ -26,6 +26,7 @@
+ #include <linux/sysdev.h>
+ #include <linux/cpu.h>
+ #include <linux/module.h>
++#include <litmus/litmus.h>
+ 
+ #include <asm/atomic.h>
+ #include <asm/smp.h>
+@@ -43,6 +44,8 @@
+ 
+ #include "io_ports.h"
+ 
++#include <litmus/trace.h>
++
+ /*
+  * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
+  * IPIs in place of local APIC timers
+@@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi;
+  */
+ static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+ 
++/*
++ * Definitions and variables related to quantum synchronization.
++ */
++#define WAIT_TO_SYNC 30000 /* time after boot until sync */
++static int stagger = 0; /* are we using staggered quanta? */
++static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES);
++static atomic_t quantum_sync_barrier = ATOMIC_INIT(0);
++static atomic_t sync_done = ATOMIC_INIT(0);
++
+ static inline void lapic_disable(void)
+ {
+ 	enable_local_apic = -1;
+@@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str)
+ 
+ __setup("apic=", apic_set_verbosity);
+ 
++/*
++ * Determine whether to use aligned or staggered quanta.
++ */
++
++static int __init apic_synch_type(char *str)
++{
++	if (strcmp("aligned", str) == 0)
++		stagger = 0;
++	else if (strcmp("staggered", str) == 0)
++		stagger = 1;
++	else
++		stagger = 0; /* aligned quanta by default */
++	return 1;
++}
++
++__setup("quanta=", apic_synch_type);
++
+ static int __init detect_init_APIC (void)
+ {
+ 	u32 h, l, features;
+@@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
+ #undef APIC_DIVISOR
+ 
+ /*
++ * This function is called to align all quanta, and to stagger quanta if
++ * necessary. It relies on a barrier to synchronize all processors, so
++ * that they all reset their APIC timers at the same time. If quanta
++ * should be staggered, the appropriate stagger delay is then added at
++ * each processor.
++ */
++
++void synchronize_quanta(void)
++{
++	int cpu = smp_processor_id();
++	int total_cpus = num_online_cpus();
++	int stagger_interval = jiffies_to_usecs(1) / total_cpus;
++
++	/*
++	 * Disable APIC timer, wait for all other processors to reach barrier,
++	 * and re-enable all timers concurrently.
++	 */
++	disable_APIC_timer();
++	atomic_inc(&quantum_sync_barrier);
++	while (atomic_read(&quantum_sync_barrier) < total_cpus) {
++		/* Delay, otherwise atomic_inc's cannot occur. */
++		udelay(1);
++	}
++
++	/* Add necessary stagger for this CPU, if required. */
++	if (stagger) {
++		int stagger_us = cpu * stagger_interval;
++		udelay(stagger_us);
++	}
++
++	/* Re-enable all timers. */
++	__setup_APIC_LVTT(calibration_result);
++	enable_APIC_timer();
++	
++	/* The first CPU signals that quantum sync is complete. */
++	if (cpu == 0)
++		atomic_inc(&sync_done);
++}
++
++
++/*
+  * Local timer interrupt handler. It does both profiling and
+  * process statistics/rescheduling.
+  *
+@@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
+ 
+ inline void smp_local_timer_interrupt(void)
+ {
++/*	s64 offset; */
++
++	TS_TICK_START;
++
+ 	profile_tick(CPU_PROFILING);
+ #ifdef CONFIG_SMP
+ 	update_process_times(user_mode_vm(get_irq_regs()));
+ #endif
+ 
++	/* Print out timing data - can be commented out if necessary. */
++/*	offset = get_nsec_offset(); */
++/*	TRACE("%d\n", offset);      */
++
++	/*
++	 * Synchronize quanta if we have reached qsync_time plus wait
++	 * interval. The synchronization code itself is placed in its own
++	 * (non-inline) function, to avoid issues with creating an inline
++	 * function that is too large.
++	 */
++	if (unlikely(!atomic_read(&sync_done) &&
++	             time_after(jiffies,
++			        (unsigned long)(atomic_read(&qsync_time) + 
++		                msecs_to_jiffies(WAIT_TO_SYNC))))) {
++		synchronize_quanta();
++	}
++
+ 	/*
+ 	 * We take the 'long' return path, and there every subsystem
+ 	 * grabs the apropriate locks (kernel lock/ irq lock).
+@@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void)
+ 	 * Currently this isn't too much of an issue (performance wise),
+ 	 * we can take more than 100K local irqs per second on a 100 MHz P5.
+ 	 */
++	TS_TICK_END;
+ }
+ 
+ /*
+diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
+index e3d4b73..9670f77 100644
+--- a/arch/i386/kernel/i386_ksyms.c
++++ b/arch/i386/kernel/i386_ksyms.c
+@@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed);
+ EXPORT_SYMBOL(__down_failed_interruptible);
+ EXPORT_SYMBOL(__down_failed_trylock);
+ EXPORT_SYMBOL(__up_wakeup);
++
+ /* Networking helper routines. */
+ EXPORT_SYMBOL(csum_partial_copy_generic);
+ 
+diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
+index 65d7620..e95d732 100644
+--- a/arch/i386/kernel/signal.c
++++ b/arch/i386/kernel/signal.c
+@@ -651,7 +651,6 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
+ 
+ 	/* deal with pending signal delivery */
+ 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
+-		do_signal(regs);
+-	
++		do_signal(regs);	
+ 	clear_thread_flag(TIF_IRET);
+ }
+diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
+index 5285aff..91921a3 100644
+--- a/arch/i386/kernel/smp.c
++++ b/arch/i386/kernel/smp.c
+@@ -605,6 +605,7 @@ void smp_send_stop(void)
+  */
+ fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+ {
++	set_tsk_need_resched(current);
+ 	ack_APIC_irq();
+ }
+ 
+diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
+index 2697e92..48e5e8e 100644
+--- a/arch/i386/kernel/syscall_table.S
++++ b/arch/i386/kernel/syscall_table.S
+@@ -319,3 +319,25 @@ ENTRY(sys_call_table)
+ 	.long sys_move_pages
+ 	.long sys_getcpu
+ 	.long sys_epoll_pwait
++	/* LITMUS syscalls */ 
++	.long sys_set_rt_task_param	/* 320 */
++	.long sys_get_rt_task_param
++	.long sys_task_mode_transition 
++	.long sys_sleep_next_period
++	.long sys_register_np_flag
++	.long sys_exit_np   		/* 325 */
++	.long sys_od_open
++	.long sys_od_close
++	.long sys_pi_down
++	.long sys_pi_up			
++	.long sys_srp_down		/* 330 */		
++	.long sys_srp_up
++	.long sys_reg_task_srp_sem
++	.long sys_query_job_no
++	.long sys_wait_for_job_release  
++	.long sys_wait_for_ts_release   /* 335 */
++	.long sys_release_ts
++	.long sys_pcp_down
++	.long sys_pcp_up
++	.long sys_dpcp_invoke
++	.long sys_dpcp_agent		/* 340 */
+diff --git a/fs/exec.c b/fs/exec.c
+index 11fe93f..353d6e3 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -54,6 +54,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/mmu_context.h>
+ 
++#include <litmus/litmus.h>
++
+ #ifdef CONFIG_KMOD
+ #include <linux/kmod.h>
+ #endif
+@@ -1140,7 +1142,8 @@ int do_execve(char * filename,
+ 	if (IS_ERR(file))
+ 		goto out_kfree;
+ 
+-	sched_exec();
++	sched_exec();	
++	litmus_exec();
+ 
+ 	bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
+ 
+diff --git a/fs/inode.c b/fs/inode.c
+index bf21dc6..fcf8ce3 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -205,6 +205,8 @@ void inode_init_once(struct inode *inode)
+ 	INIT_LIST_HEAD(&inode->inotify_watches);
+ 	mutex_init(&inode->inotify_mutex);
+ #endif
++	INIT_LIST_HEAD(&inode->i_obj_list);
++	mutex_init(&inode->i_obj_mutex);
+ }
+ 
+ EXPORT_SYMBOL(inode_init_once);
+diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
+index 833fa17..d0ba5c3 100644
+--- a/include/asm-i386/unistd.h
++++ b/include/asm-i386/unistd.h
+@@ -325,10 +325,33 @@
+ #define __NR_move_pages		317
+ #define __NR_getcpu		318
+ #define __NR_epoll_pwait	319
++/* LITMUS */
++#define __NR_set_rt_task_param	320
++#define __NR_get_rt_task_param	321
++#define __NR_task_mode		322
++#define __NR_sleep_next_period  323
++#define __NR_register_np_flag   324
++#define __NR_exit_np            325
++#define __NR_od_open		326
++#define __NR_od_close		327
++#define __NR_pi_down		328
++#define __NR_pi_up		329
++#define __NR_srp_down		330
++#define __NR_srp_up		331
++#define __NR_reg_task_srp_sem	332
++#define __NR_query_job_no	333
++#define __NR_wait_for_job_release 334
++#define __NR_wait_for_ts_release 335
++#define __NR_release_ts		336
++#define __NR_pcp_down		337
++#define __NR_pcp_up		338
++#define __NR_dpcp_invoke	339
++#define __NR_dpcp_agent		340
++
+ 
+ #ifdef __KERNEL__
+ 
+-#define NR_syscalls 320
++#define NR_syscalls 341
+ 
+ #define __ARCH_WANT_IPC_PARSE_VERSION
+ #define __ARCH_WANT_OLD_READDIR
+diff --git a/include/linux/completion.h b/include/linux/completion.h
+index 268c5a4..dc633ed 100644
+--- a/include/linux/completion.h
++++ b/include/linux/completion.h
+@@ -51,6 +51,8 @@ extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout(
+ 
+ extern void FASTCALL(complete(struct completion *));
+ extern void FASTCALL(complete_all(struct completion *));
++extern void FASTCALL(complete_n(struct completion *, int n));
++
+ 
+ #define INIT_COMPLETION(x)	((x).done = 0)
+ 
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index 1410e53..4e1117c 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -524,6 +524,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
+ #define i_size_ordered_init(inode) do { } while (0)
+ #endif
+ 
++struct inode_obj_id_table;
++
+ struct inode {
+ 	struct hlist_node	i_hash;
+ 	struct list_head	i_list;
+@@ -589,6 +591,9 @@ struct inode {
+ 	void			*i_security;
+ #endif
+ 	void			*i_private; /* fs or device private pointer */
++
++	struct list_head	i_obj_list;
++	struct mutex		i_obj_mutex;
+ };
+ 
+ /*
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 4463735..c7929d6 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -3,6 +3,8 @@
+ 
+ #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
+ 
++#include <litmus/rt_param.h>
++
+ /*
+  * cloning flags:
+  */
+@@ -796,6 +798,8 @@ enum sleep_type {
+ 	SLEEP_INTERRUPTED,
+ };
+ 
++struct od_table_entry;
++
+ struct prio_array;
+ 
+ struct task_struct {
+@@ -1051,6 +1055,16 @@ struct task_struct {
+ #ifdef CONFIG_FAULT_INJECTION
+ 	int make_it_fail;
+ #endif
++	/* litmus parameters and state */
++	struct rt_param rt_param;
++
++	/* allow scheduler plugins to queue in release lists, etc. 
++	 * Cleanup: Move this into the rt_param struct.
++	 */
++	struct list_head rt_list;
++
++	/* references to PI semaphores, etc. */
++	struct od_table_entry* od_table;
+ };
+ 
+ static inline pid_t process_group(struct task_struct *tsk)
+diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
+index 975c963..6ae0ff9 100644
+--- a/include/linux/uaccess.h
++++ b/include/linux/uaccess.h
+@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to,
+ 		ret;					\
+ 	})
+ 
++/* This is a naive attempt at a write version of the above native Linux macro.
++ */
++#define poke_kernel_address(val, addr)			\
++	({						\
++		long ret;				\
++		mm_segment_t old_fs = get_fs();		\
++							\
++		set_fs(KERNEL_DS);			\
++		pagefault_disable();			\
++		ret = __put_user(val, (__force typeof(val) __user *)(addr)); \
++		pagefault_enable();			\
++		set_fs(old_fs);				\
++		ret;					\
++	})
++
++
+ #endif		/* __LINUX_UACCESS_H__ */
+diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
+new file mode 100644
+index 0000000..df711f5
+--- /dev/null
++++ b/include/litmus/edf_common.h
+@@ -0,0 +1,27 @@
++/* EDF common data structures and utility functions shared by all EDF
++ * based scheduler plugins
++ */
++
++/* CLEANUP: Add comments and make it less messy.
++ *
++ */
++
++#ifndef __UNC_EDF_COMMON_H__
++#define __UNC_EDF_COMMON_H__
++
++#include <litmus/rt_domain.h>
++
++
++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched);
++
++int edf_higher_prio(struct task_struct* first,
++		    struct task_struct* second);
++
++int edf_ready_order(struct list_head* a, struct list_head* b);
++
++int  edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
++
++#define job_completed(t) (!is_be(t) && \
++	(t)->rt_param.job_params.exec_time == (t)->rt_param.task_params.exec_cost)
++
++#endif
+diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
+new file mode 100644
+index 0000000..5544c1b
+--- /dev/null
++++ b/include/litmus/fdso.h
+@@ -0,0 +1,78 @@
++/* fdso.h - file descriptor attached shared objects
++ *
++ * (c) 2007 B. Brandenburg, LITMUS^RT project
++ */
++
++#ifndef _LINUX_FDSO_H_
++#define _LINUX_FDSO_H_
++
++#include <linux/list.h>
++#include <asm/atomic.h>
++
++#include <linux/fs.h>
++
++#define MAX_OBJECT_DESCRIPTORS 32
++
++typedef enum  {
++	MIN_OBJ_TYPE 	= 0,
++
++	PI_SEM 		= 0,
++	SRP_SEM		= 1,
++	PCP_SEM		= 2,
++	MPCP_SEM	= 3,
++
++	MAX_OBJ_TYPE	= 3
++} obj_type_t;
++
++struct inode_obj_id {
++	struct list_head	list;
++	atomic_t		count;
++	struct inode*		inode;
++
++	obj_type_t 		type;
++	void*			obj;
++	unsigned int		id;
++};
++
++
++struct od_table_entry {
++	unsigned int		used;
++
++	struct inode_obj_id*	obj;
++	void*			extra;
++};
++
++struct fdso_ops {
++	void* (*create)	(void);
++	void  (*destroy)(void*);
++	int   (*open)	(struct od_table_entry*, void* __user);
++	int   (*close)	(struct od_table_entry*);
++};
++
++/* translate a userspace supplied od into the raw table entry
++ * returns NULL if od is invalid
++ */
++struct od_table_entry* __od_lookup(int od);
++
++/* translate a userspace supplied od into the associated object
++ * returns NULL if od is invalid
++ */
++static inline void* od_lookup(int od, obj_type_t type)
++{
++	struct od_table_entry* e = __od_lookup(od);
++	return e && e->obj->type == type ? e->obj->obj : NULL;
++}
++
++static inline void* od_lookup2(int od, obj_type_t type, obj_type_t type2)
++{
++	struct od_table_entry* e = __od_lookup(od);
++	return e && (e->obj->type == type || e->obj->type == type2) ?
++	       e->obj->obj : NULL;
++}
++
++#define lookup_pi_sem(od)  ((struct pi_semaphore*)  od_lookup(od, PI_SEM))
++#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
++#define lookup_pcp_sem(od)  ((struct pcp_semaphore*)  \
++	od_lookup2(od, PCP_SEM, MPCP_SEM))
++
++#endif
+diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
+new file mode 100644
+index 0000000..c788227
+--- /dev/null
++++ b/include/litmus/feather_buffer.h
+@@ -0,0 +1,108 @@
++#ifndef _FEATHER_BUFFER_H_
++#define _FEATHER_BUFFER_H_
++
++/* requires UINT_MAX and memcpy */
++
++static inline int  fetch_and_inc(int *val)
++{
++	int ret = 1;
++	__asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
++	return ret;
++}
++
++static inline int  fetch_and_dec(int *val)
++{
++	int ret = -1;
++	__asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
++	return ret;
++}
++
++#define SLOT_FREE	0
++#define	SLOT_BUSY 	1
++#define	SLOT_READY	2
++
++struct ft_buffer {
++	unsigned int	slot_count;
++	unsigned int	slot_size;
++
++	int 		free_count;
++	unsigned int 	write_idx;
++	unsigned int 	read_idx;
++
++	char*		slots;
++	void*		buffer_mem;
++	unsigned int	failed_writes;
++};
++
++static inline int init_ft_buffer(struct ft_buffer*	buf,
++				 unsigned int 		slot_count,
++				 unsigned int 		slot_size,
++				 char*			slots,
++				 void* 			buffer_mem)
++{
++	int i = 0;
++	if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
++		/* The slot count must divide UINT_MAX + 1 so that when it
++		 * wraps around, the index correctly points to 0.
++		 */
++		return 0;
++	} else {
++		buf->slot_count    = slot_count;
++		buf->slot_size     = slot_size;
++		buf->slots         = slots;
++		buf->buffer_mem    = buffer_mem;
++		buf->free_count    = slot_count;
++		buf->write_idx     = 0;
++		buf->read_idx      = 0;
++		buf->failed_writes = 0;
++		for (i = 0; i < slot_count; i++)
++			buf->slots[i] = SLOT_FREE;
++		return 1;
++	}
++}
++
++static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
++{
++	int free = fetch_and_dec(&buf->free_count);
++	unsigned int idx;
++	if (free <= 0) {
++		fetch_and_inc(&buf->free_count);
++		*ptr = 0;
++		fetch_and_inc(&buf->failed_writes);
++		return 0;
++	} else {
++		idx  = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
++		buf->slots[idx] = SLOT_BUSY;
++		*ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
++		return 1;
++	}
++}
++
++static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
++{
++	unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
++	buf->slots[idx]  = SLOT_READY;
++}
++
++
++/* exclusive reader access is assumed */
++static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
++{
++	unsigned int idx;
++	if (buf->free_count == buf->slot_count)
++		/* nothing available */
++		return 0;
++	idx = buf->read_idx % buf->slot_count;
++	if (buf->slots[idx] == SLOT_READY) {
++		memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
++		       buf->slot_size);
++		buf->slots[idx] = SLOT_FREE;
++		buf->read_idx++;
++		fetch_and_inc(&buf->free_count);
++		return 1;
++	} else
++		return 0;
++}
++
++
++#endif
+diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
+new file mode 100644
+index 0000000..5c37ea7
+--- /dev/null
++++ b/include/litmus/feather_trace.h
+@@ -0,0 +1,93 @@
++#ifndef _FEATHER_TRACE_H_
++#define _FEATHER_TRACE_H_
++
++#define feather_callback __attribute__((regparm(0)))
++
++/* make the compiler reload any register that is not saved in
++ * a cdecl function call
++ */
++#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
++
++#define ft_event(id, callback)                                  \
++        __asm__ __volatile__(                                   \
++            "1: jmp 2f                                    \n\t" \
++	    " call " #callback "                          \n\t" \
++            ".section __event_table, \"aw\"               \n\t" \
++            ".long " #id  ", 0, 1b, 2f                    \n\t" \
++            ".previous                                    \n\t" \
++            "2:                                           \n\t" \
++        : : : CLOBBER_LIST)
++
++#define ft_event0(id, callback)                                 \
++        __asm__ __volatile__(                                   \
++            "1: jmp 2f                                    \n\t" \
++	    " subl $4, %%esp                              \n\t" \
++            " movl $" #id  ", (%%esp)                     \n\t" \
++	    " call " #callback "                          \n\t" \
++	    " addl $4, %%esp                              \n\t" \
++            ".section __event_table, \"aw\"               \n\t" \
++            ".long " #id  ", 0, 1b, 2f                    \n\t" \
++            ".previous                                    \n\t" \
++            "2:                                           \n\t" \
++        : :  : CLOBBER_LIST)
++
++#define ft_event1(id, callback, param)                          \
++        __asm__ __volatile__(                                   \
++            "1: jmp 2f                                    \n\t" \
++	    " subl $8, %%esp                              \n\t" \
++	    " movl %0, 4(%%esp)                           \n\t" \
++            " movl $" #id  ", (%%esp)                     \n\t" \
++	    " call " #callback "                          \n\t" \
++	    " addl $8, %%esp                              \n\t" \
++            ".section __event_table, \"aw\"               \n\t" \
++            ".long " #id  ", 0, 1b, 2f                    \n\t" \
++            ".previous                                    \n\t" \
++            "2:                                           \n\t" \
++        : : "r" (param)  : CLOBBER_LIST)
++
++#define ft_event2(id, callback, param, param2)                  \
++        __asm__ __volatile__(                                   \
++            "1: jmp 2f                                    \n\t" \
++	    " subl $12, %%esp                             \n\t" \
++	    " movl %1, 8(%%esp)                           \n\t" \
++	    " movl %0, 4(%%esp)                           \n\t" \
++            " movl $" #id  ", (%%esp)                     \n\t" \
++	    " call " #callback "                          \n\t" \
++	    " addl $12, %%esp                             \n\t" \
++            ".section __event_table, \"aw\"               \n\t" \
++            ".long " #id  ", 0, 1b, 2f                    \n\t" \
++            ".previous                                    \n\t" \
++            "2:                                           \n\t" \
++        : : "r" (param), "r" (param2)  : CLOBBER_LIST)
++
++
++#define ft_event3(id, callback, p, p2, p3)                      \
++        __asm__ __volatile__(                                   \
++            "1: jmp 2f                                    \n\t" \
++	    " subl $16, %%esp                             \n\t" \
++	    " movl %1, 12(%%esp)                          \n\t" \
++	    " movl %1, 8(%%esp)                           \n\t" \
++	    " movl %0, 4(%%esp)                           \n\t" \
++            " movl $" #id  ", (%%esp)                     \n\t" \
++	    " call " #callback "                          \n\t" \
++	    " addl $16, %%esp                             \n\t" \
++            ".section __event_table, \"aw\"               \n\t" \
++            ".long " #id  ", 0, 1b, 2f                    \n\t" \
++            ".previous                                    \n\t" \
++            "2:                                           \n\t" \
++        : : "r" (p), "r" (p2), "r" (p3)  : CLOBBER_LIST)
++
++
++static inline unsigned long long ft_read_tsc(void)
++{
++	unsigned long long ret;
++	__asm__ __volatile__("rdtsc" : "=A" (ret));
++	return ret;
++}
++
++int ft_enable_event(unsigned long id);
++int ft_disable_event(unsigned long id);
++int ft_is_event_enabled(unsigned long id);
++int ft_disable_all_events(void);
++
++#endif
+diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
+new file mode 100644
+index 0000000..9bd361e
+--- /dev/null
++++ b/include/litmus/jobs.h
+@@ -0,0 +1,9 @@
++#ifndef __LITMUS_JOBS_H__
++#define __LITMUS_JOBS_H__
++
++void prepare_for_next_period(struct task_struct *t);
++void release_at(struct task_struct *t, lt_t start);
++long complete_job(void);
++
++#endif
++
+diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
+new file mode 100644
+index 0000000..5853ed5
+--- /dev/null
++++ b/include/litmus/litmus.h
+@@ -0,0 +1,200 @@
++/*
++ * Constant definitions related to
++ * scheduling policy.
++ */
++
++#ifndef _LINUX_LITMUS_H_
++#define _LINUX_LITMUS_H_
++
++#include <linux/jiffies.h>
++#include <litmus/sched_trace.h>
++
++typedef enum {
++	SCHED_LINUX 		=  0,
++	SCHED_GSN_EDF		= 10,
++	SCHED_PSN_EDF		= 11,
++	/*      Add your scheduling policy here */
++
++	SCHED_DEFAULT 		=  0,
++	SCHED_INVALID 		= -1,
++} spolicy;
++
++
++typedef enum {
++	LITMUS_RESERVED_RANGE = 1024,
++
++} sched_setup_cmd_t;
++
++/*	per-task modes */
++enum rt_task_mode_t {
++	BACKGROUND_TASK = 0,
++	LITMUS_RT_TASK  = 1
++};
++
++/*	Plugin boot options, for convenience */
++#define PLUGIN_LINUX  		"linux"
++#define PLUGIN_GSN_EDF		"gsn_edf"
++#define PLUGIN_PSN_EDF		"psn_edf"
++
++extern spolicy sched_policy;
++
++/*	RT mode start time	*/
++extern volatile unsigned long rt_start_time;
++
++#define TRACE(fmt, args...) \
++	sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args)
++
++#define TRACE_TASK(t, fmt, args...) \
++	TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
++
++#define TRACE_CUR(fmt, args...) \
++	TRACE_TASK(current, fmt, ## args)
++
++#define TRACE_BUG_ON(cond) \
++	do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
++			     "called from %p current=%s/%d state=%d " \
++			     "flags=%x partition=%d cpu=%d rtflags=%d"\
++			     " job=%u knp=%d timeslice=%u\n",		\
++	#cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
++	current->pid, current->state, current->flags,  \
++	get_partition(current), smp_processor_id(), get_rt_flags(current), \
++	current->rt_param.job_params.job_no, current->rt_param.kernel_np, \
++	current->time_slice\
++	); } while(0);
++
++
++/* in_list - is a given list_head queued on some list?
++ */
++static inline int in_list(struct list_head* list)
++{
++	return !(  /* case 1: deleted */
++		   (list->next == LIST_POISON1 &&
++		    list->prev == LIST_POISON2)
++		 ||
++		   /* case 2: initialized */
++		   (list->next == list &&
++		    list->prev == list)
++		);
++}
++
++typedef int (*prio_cmp_t)(struct task_struct* first,
++			  struct task_struct* second);
++
++typedef int (*list_cmp_t)(struct list_head*, struct list_head*);
++
++static inline unsigned int list_insert(struct list_head* new,
++				       struct list_head* head,
++				       list_cmp_t order_before)
++{
++	struct list_head *pos;
++	unsigned int passed = 0;
++
++	BUG_ON(!new);
++
++	/* find a spot where the new entry is less than the next */
++	list_for_each(pos, head) {
++		if (unlikely(order_before(new, pos))) {
++			/* pos is not less than new, thus insert here */
++			__list_add(new, pos->prev, pos);
++			goto out;
++		}
++		passed++;
++	}
++	/* if we get to this point either the list is empty or every
++	 * queued element is less than new.
++	 * Let's add new to the end. */
++	list_add_tail(new, head);
++ out:
++	return passed;
++}
++
++void list_qsort(struct list_head* list, list_cmp_t less_than);
++
++
++#define RT_PREEMPTIVE 		0x2050 /* =  P */
++#define RT_NON_PREEMPTIVE 	0x4e50 /* = NP */
++#define RT_EXIT_NP_REQUESTED	0x5251 /* = RQ */
++
++/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE
++ */
++int is_np(struct task_struct *t);
++
++/* request that the task should call sys_exit_np()
++ */
++void request_exit_np(struct task_struct *t);
++
++/* kill naughty tasks
++ */
++void scheduler_signal(struct task_struct *t, unsigned int signal);
++void send_scheduler_signals(void);
++void np_mem_kill(struct task_struct *t);
++
++void litmus_fork(struct task_struct *tsk);
++void litmus_exec(void);
++/* clean up real-time state of a task */
++void exit_litmus(struct task_struct *dead_tsk);
++
++long transition_to_rt(struct task_struct* tsk);
++long transition_to_be(struct task_struct* tsk);
++
++#define is_realtime(t) 		((t)->rt_param.is_realtime)
++#define rt_transition_pending(t) \
++	((t)->rt_param.transition_pending)
++
++/*	Realtime utility macros */
++#define get_rt_flags(t)		((t)->rt_param.flags)
++#define set_rt_flags(t,f) 	(t)->rt_param.flags=(f)
++#define get_exec_cost(t)  	((t)->rt_param.task_params.exec_cost)
++#define get_exec_time(t)	((t)->rt_param.job_params.exec_time)
++#define get_rt_period(t)	((t)->rt_param.task_params.period)
++#define get_partition(t) 	(t)->rt_param.task_params.cpu
++#define get_deadline(t)		((t)->rt_param.job_params.deadline)
++#define get_class(t)		((t)->rt_param.task_params.cls)
++
++inline static int budget_exhausted(struct task_struct* t)
++{
++	return get_exec_time(t) >= get_exec_cost(t);
++}
++
++
++#define is_hrt(t)     		\
++	((t)->rt_param.task_params.cls == RT_CLASS_HARD)
++#define is_srt(t)     		\
++	((t)->rt_param.task_params.cls == RT_CLASS_SOFT)
++#define is_be(t)      		\
++	((t)->rt_param.task_params.cls == RT_CLASS_BEST_EFFORT)
++
++#define get_release(t) ((t)->rt_param.job_params.release)
++
++/* Honor the flag in the preempt_count variable that is set
++ * when scheduling is in progress.
++ */
++#define is_running(t) 			\
++	((t)->state == TASK_RUNNING || 	\
++	 (t)->thread_info->preempt_count & PREEMPT_ACTIVE)
++
++#define is_blocked(t)       \
++	(!is_running(t))
++#define is_released(t, now)	\
++	(lt_before_eq(get_release(t), now))
++#define is_tardy(t, now)    \
++	(lt_before_eq((t)->rt_param.job_params.deadline, now))
++
++/* real-time comparison macros */
++#define earlier_deadline(a, b) (lt_before(\
++	(a)->rt_param.job_params.deadline,\
++	(b)->rt_param.job_params.deadline))
++#define earlier_release(a, b)  (lt_before(\
++	(a)->rt_param.job_params.release,\
++	(b)->rt_param.job_params.release))
++
++#define shorter_period(a, b) (lt_before(\
++      (a)->rt_param.task_params.period, \
++      (b)->rt_param.task_params.period))
++
++#define make_np(t) do {t->rt_param.kernel_np++;} while(0);
++#define take_np(t) do {t->rt_param.kernel_np--;} while(0);
++
++void srp_ceiling_block(void);
++
++#endif
+diff --git a/include/litmus/rm_common.h b/include/litmus/rm_common.h
+new file mode 100644
+index 0000000..11e8365
+--- /dev/null
++++ b/include/litmus/rm_common.h
+@@ -0,0 +1,44 @@
++/* rate monotonic helper functions.
++ */
++
++
++#ifndef __UNC_RM_COMMON_H__
++#define __UNC_RM_COMMON_H__
++
++#include <litmus/rt_domain.h>
++
++static inline int _rm_higher_prio(struct pcp_priority *p1,
++				  struct pcp_priority *p2)
++{
++	/* does the second task exist and is it a real-time task?  If
++	 * not, the first task (which is an RT task) has higher
++	 * priority.
++	 */
++
++	if (unlikely(!p2))
++		return 1;
++
++	if (p1->in_global_cs == p2->in_global_cs) {
++		/* tie break by RM priority */
++		if (p1->prio == p2->prio)
++			/* tie break equal periods by PID */
++			return p1->pid < p2->pid;
++		else
++			/* shorter period or lower index has higher priority */
++			return p1->prio < p2->prio;
++	} else
++		/* gcs always have higher priority */
++		return p1->in_global_cs > p2->in_global_cs;
++}
++
++
++void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched);
++
++int  rm_higher_prio(struct task_struct* first,
++		   struct task_struct* second);
++
++int  rm_ready_order(struct list_head* a, struct list_head* b);
++
++int  rm_preemption_needed(rt_domain_t* rt, struct task_struct *t);
++
++#endif
+diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
+new file mode 100644
+index 0000000..79b6034
+--- /dev/null
++++ b/include/litmus/rt_domain.h
+@@ -0,0 +1,94 @@
++/* CLEANUP: Add comments and make it less messy.
++ *
++ */
++
++#ifndef __UNC_RT_DOMAIN_H__
++#define __UNC_RT_DOMAIN_H__
++
++struct _rt_domain;
++
++typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
++typedef void (*release_at_t)(struct task_struct *t, lt_t start);
++
++typedef struct _rt_domain {
++	/* runnable rt tasks are in here */
++	rwlock_t 			ready_lock;
++	struct list_head 		ready_queue;
++
++	/* real-time tasks waiting for release are in here */
++	spinlock_t 			release_lock;
++	struct list_head 		release_queue;
++
++	/* how do we check if we need to kick another CPU? */
++	check_resched_needed_t		check_resched;
++
++	/* how are tasks ordered in the ready queue? */
++	list_cmp_t			order;
++} rt_domain_t;
++
++#define next_ready(rt) \
++	(list_entry((rt)->ready_queue.next, struct task_struct, rt_list))
++
++#define ready_jobs_pending(rt) \
++	(!list_empty(&(rt)->ready_queue))
++
++void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f,
++		    list_cmp_t order);
++
++void __add_ready(rt_domain_t* rt, struct task_struct *new);
++void __add_release(rt_domain_t* rt, struct task_struct *task);
++
++struct task_struct* __take_ready(rt_domain_t* rt);
++struct task_struct* __peek_ready(rt_domain_t* rt);
++
++void try_release_pending(rt_domain_t* rt);
++void __release_pending(rt_domain_t* rt);
++
++static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
++{
++	unsigned long flags;
++	/* first we need the write lock for rt_ready_queue */
++	write_lock_irqsave(&rt->ready_lock, flags);
++	__add_ready(rt, new);
++	write_unlock_irqrestore(&rt->ready_lock, flags);
++}
++
++static inline struct task_struct* take_ready(rt_domain_t* rt)
++{
++	unsigned long flags;
++	struct task_struct* ret;
++	/* first we need the write lock for rt_ready_queue */
++	write_lock_irqsave(&rt->ready_lock, flags);
++	ret = __take_ready(rt);
++	write_unlock_irqrestore(&rt->ready_lock, flags);
++	return ret;
++}
++
++
++static inline void add_release(rt_domain_t* rt, struct task_struct *task)
++{
++	unsigned long flags;
++	/* first we need the lock protecting the release queue */
++	spin_lock_irqsave(&rt->release_lock, flags);
++	__add_release(rt, task);
++	spin_unlock_irqrestore(&rt->release_lock, flags);
++}
++
++static inline int __jobs_pending(rt_domain_t* rt)
++{
++	return !list_empty(&rt->ready_queue);
++}
++
++static inline int jobs_pending(rt_domain_t* rt)
++{
++	unsigned long flags;
++	int ret;
++	/* first we need the read lock for rt_ready_queue */
++	read_lock_irqsave(&rt->ready_lock, flags);
++	ret = __jobs_pending(rt);
++	read_unlock_irqrestore(&rt->ready_lock, flags);
++	return ret;
++}
++
++
++#endif
+diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
+new file mode 100644
+index 0000000..37a4495
+--- /dev/null
++++ b/include/litmus/rt_param.h
+@@ -0,0 +1,177 @@
++/*
++ * Definition of the scheduler plugin interface.
++ *
++ */
++#ifndef _LINUX_RT_PARAM_H_
++#define _LINUX_RT_PARAM_H_
++
++/* Litmus time type. */
++typedef unsigned long long lt_t;
++
++static inline int lt_after(lt_t a, lt_t b)
++{
++	return ((long long) b) - ((long long) a) < 0;
++}
++#define lt_before(a, b) lt_after(b, a)
++
++static inline int lt_after_eq(lt_t a, lt_t b)
++{
++	return ((long long) a) - ((long long) b) >= 0;
++}
++#define lt_before_eq(a, b) lt_after_eq(b, a)
++
++/* different types of clients */
++typedef enum {
++	RT_CLASS_HARD,
++	RT_CLASS_SOFT,
++	RT_CLASS_BEST_EFFORT
++} task_class_t;
++
++struct rt_task {
++	lt_t 		exec_cost;
++	lt_t 		period;
++	lt_t		phase;
++	lt_t		prio;
++	unsigned int  	cpu;
++	task_class_t  	cls;
++};
++
++#define DPCP_WAIT 	0x1
++#define DPCP_COMPLETE	0x2
++
++/* don't export internal data structures to user space (liblitmus) */
++#ifdef __KERNEL__
++
++#include <linux/list.h>
++
++struct rt_job {
++	/* Time instant the job was or will be released.  */
++	lt_t	release;
++	/* What is the current deadline? */
++	lt_t   	deadline;
++	/* How much service has this job received so far?
++	 */
++	lt_t	exec_time;
++
++	/* Which job is this. This is used to let user space
++	 * specify which job to wait for, which is important if jobs
++	 * overrun. If we just call sys_sleep_next_period() then we
++	 * will unintentionally miss jobs after an overrun.
++	 *
++	 * Increase this sequence number when a job is released.
++	 */
++	unsigned int    job_no;
++
++	/* when did this job start executing? */
++	lt_t	exec_start;
++};
++
++
++/* make priority inheritance cleaner for PCP */
++struct pcp_priority {
++	lt_t	prio;
++	int	in_global_cs;
++	int	pid;
++};
++
++struct pcp_semaphore;
++
++/*	RT task parameters for scheduling extensions
++ *	These parameters are inherited during clone and therefore must
++ *	be explicitly set up before the task set is launched.
++ */
++struct rt_param {
++	/* is the task sleeping? */
++	unsigned int 		flags:8;
++
++	/* Real-time marker: 1 iff it is a LITMUS real-time task.
++	 */
++	unsigned int		is_realtime:1;
++
++	/* is a BE->RT or RT->BE transition pending? */
++	unsigned int		transition_pending:1;
++
++	/* is this task under control of litmus?
++	 *
++	 * this is necessary because otherwise signal delivery code
++	 * may try to wake up a task that is already queued in plugin
++	 * data structures.
++	 *
++	 * bbb: I believe this flag is fundamentally flawed and should be
++	 *      taken out in the redesign.
++	 */
++	unsigned int		litmus_controlled:1;
++
++	/* do we need to check for srp blocking? */
++	unsigned int		srp_non_recurse:1;
++
++	/* if a BE->RT transition failed, then this field contains the error */
++	unsigned long		transition_error;
++
++	/* user controlled parameters */
++	struct rt_task 		task_params;
++
++	/* timing parameters */
++	struct rt_job 		job_params;
++
++
++	/* task representing the current "inherited" task
++	 * priority, assigned by inherit_priority and
++	 * return_priority in the scheduler plugins.
++	 * could point to self if PI does not result in
++	 * an increased task priority.
++	 */
++	 struct task_struct*	inh_task;
++
++	/* Don't just dereference this pointer in kernel space!
++	 * It might very well point to junk or nothing at all.
++	 * NULL indicates that the task has not requested any non-preemptable
++	 * section support.
++	 * Not inherited upon fork.
++	 */
++	short* 			np_flag;
++
++	/* For the FMLP under PSN-EDF, it is required to make the task
++	 * non-preemptive from kernel space. In order not to interfere with
++	 * user space, this counter indicates the kernel space np setting.
++	 * kernel_np > 0 => task is non-preemptive
++	 */
++	unsigned int 		kernel_np;
++
++	/* This field can be used by plugins to store where the task
++	 * is currently scheduled. It is the responsibility of the
++	 * plugin to avoid race conditions.
++	 *
++	 * Used by GSN-EDF.
++	 */
++	int			scheduled_on;
++
++	/* This field can be used by plugins to store where the task
++	 * is currently linked. It is the responsibility of the plugin
++	 * to avoid race conditions.
++	 *
++	 * Used by GSN-EDF.
++	 */
++	int			linked_on;
++
++	/* Used by RM
++	 */
++	struct pcp_priority	pcp_prio;
++	struct pcp_priority*	cur_prio;
++	struct list_head	owned_semaphores;
++	struct pcp_semaphore*	blocked_on;
++
++	/* Fields saved before BE->RT transition.
++	 */
++	int old_policy;
++	int old_prio;
++};
++
++/*	Possible RT flags	*/
++#define RT_F_RUNNING		0x00000000
++#define RT_F_SLEEP		0x00000001
++#define RT_F_EXIT_SEM		0x00000008
++
++#endif
++
++#endif
+diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
+new file mode 100644
+index 0000000..337668f
+--- /dev/null
++++ b/include/litmus/sched_plugin.h
+@@ -0,0 +1,120 @@
++/*
++ * Definition of the scheduler plugin interface.
++ *
++ */
++#ifndef _LINUX_SCHED_PLUGIN_H_
++#define _LINUX_SCHED_PLUGIN_H_
++
++#include <linux/sched.h>
++#include <litmus/litmus.h>
++
++/* struct for semaphore with priority inheritance */
++struct pi_semaphore {
++	atomic_t count;
++	int sleepers;
++	wait_queue_head_t wait;
++	union {
++		/* highest-prio holder/waiter */
++		struct task_struct *task;
++		struct task_struct* cpu_task[NR_CPUS];
++	} hp;
++	/* current lock holder */
++	struct task_struct *holder;
++};
++
++int set_hp_task(struct pi_semaphore *sem, prio_cmp_t cmp);
++int set_hp_cpu_task(struct pi_semaphore *sem, int cpu, prio_cmp_t cmp);
++
++/********************* scheduler invocation ******************/
++
++/*  Plugin-specific realtime tick handler */
++typedef void (*scheduler_tick_t) (void);
++/* Plugin-specific scheduling decision function */
++typedef int (*schedule_t) (struct task_struct * prev,
++			   struct task_struct ** next);
++/* Clean up after the task switch has occurred.
++ * This function is called after every (even non-rt) task switch.
++ */
++typedef void (*finish_switch_t)(struct task_struct *prev);
++
++
++/********************* task state changes ********************/
++
++/* called to setup a new real-time task */
++typedef long (*prepare_task_t) (struct task_struct *task);
++/* called to re-introduce a task after blocking */
++typedef void (*wake_up_task_t) (struct task_struct *task);
++/* called to notify the plugin of a blocking real-time task.
++ * It will only be called for real-time tasks and before schedule is called. */
++typedef void (*task_blocks_t)  (struct task_struct *task);
++/* called when a real-time task exits. Free any allocated resources */
++typedef long (*tear_down_t)    (struct task_struct *);
++
++/* Called when the new_owner is released from the wait queue.
++ * It should now inherit the priority from sem, _before_ it gets re-added
++ * to any queue.
++ */
++typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
++				    struct task_struct *new_owner);
++
++/* Called when the current task releases a semaphore from which it might
++ * have inherited a priority.
++ */
++typedef long (*return_priority_t) (struct pi_semaphore *sem);
++
++/* Called when a task tries to acquire a semaphore and fails. Check if its
++ * priority is higher than that of the current holder.
++ */
++typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
++
++
++/********************* sys call backends  ********************/
++/* This function causes the caller to sleep until the next release */
++typedef long (*sleep_next_period_t) (void);
++
++struct sched_plugin {
++	struct list_head	list;
++	/* 	basic info 		*/
++	char 			*plugin_name;
++	unsigned int		srp_active:1;
++	unsigned int		pcp_active:1;
++
++	/* 	scheduler invocation 	*/
++	scheduler_tick_t 	scheduler_tick;
++	schedule_t 		schedule;
++	finish_switch_t 	finish_switch;
++
++	/*	syscall backend 	*/
++	sleep_next_period_t 	sleep_next_period;
++
++	/*	task state changes 	*/
++	prepare_task_t 		prepare_task;
++	wake_up_task_t 		wake_up_task;
++	task_blocks_t		task_blocks;
++	tear_down_t 		tear_down;
++
++	/*     priority inheritance 	*/
++	inherit_priority_t	inherit_priority;
++	return_priority_t	return_priority;
++	pi_block_t		pi_block;
++} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
++
++
++extern struct sched_plugin *curr_sched_plugin;
++
++int register_sched_plugin(struct sched_plugin* plugin);
++struct sched_plugin* find_sched_plugin(const char* name);
++int print_sched_plugins(char* buf, int max);
++
++static inline int pcp_active(void)
++{
++	return curr_sched_plugin->pcp_active;
++}
++
++static inline int srp_active(void)
++{
++	return curr_sched_plugin->srp_active;
++}
++
++
++#endif
+diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
+new file mode 100644
+index 0000000..f9938c2
+--- /dev/null
++++ b/include/litmus/sched_trace.h
+@@ -0,0 +1,31 @@
++/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
++ */
++#ifndef _LINUX_SCHED_TRACE_H_
++#define _LINUX_SCHED_TRACE_H_
++
++#include <linux/sched.h>
++
++/* dummies, need to be re-implemented */
++
++/* used in sched.c */
++#define  sched_trace_task_arrival(t)
++#define sched_trace_task_departure(t)
++#define sched_trace_task_preemption(t, by)
++#define sched_trace_task_scheduled(t)
++
++/* used in scheduler plugins */
++#define sched_trace_job_release(t)
++#define sched_trace_job_completion(t)
++
++
++#ifdef CONFIG_SCHED_DEBUG_TRACE
++void sched_trace_log_message(const char* fmt, ...);
++ 
++#else
++
++#define sched_trace_log_message(fmt, ...)
++
++#endif
++
++
++#endif
+diff --git a/include/litmus/trace.h b/include/litmus/trace.h
+new file mode 100644
+index 0000000..5c2c2c0
+--- /dev/null
++++ b/include/litmus/trace.h
+@@ -0,0 +1,106 @@
++
++#ifndef _SYS_TRACE_H_
++#define	_SYS_TRACE_H_
++
++#include <litmus/feather_trace.h>
++#include <litmus/feather_buffer.h>
++
++
++/*********************** TIMESTAMPS ************************/
++
++struct timestamp {
++	unsigned long		event;
++	unsigned long long	timestamp;
++	unsigned int		seq_no;
++	int			cpu;
++};
++
++
++/* buffer holding time stamps - will be provided by driver */
++extern struct ft_buffer* trace_ts_buf;
++
++/* save_timestamp:  stores current time as struct timestamp 
++ * in trace_ts_buf 
++ */
++asmlinkage void save_timestamp(unsigned long event);
++
++#define TIMESTAMP(id) ft_event0(id, save_timestamp)
++
++/* Convention for timestamps
++ * =========================
++ * 
++ * In order to process the trace files with a common tool, we use the following
++ * convention to measure execution times: The end time id of a code segment is
++ * always the next number after the start time event id.
++ */
++
++#define TS_SCHED_START 			TIMESTAMP(100)
++#define TS_SCHED_END			TIMESTAMP(101)
++#define TS_CXS_START			TIMESTAMP(102)
++#define TS_CXS_END			TIMESTAMP(103)
++
++#define TS_TICK_START  			TIMESTAMP(110)
++#define TS_TICK_END    			TIMESTAMP(111)
++
++#define TS_PLUGIN_SCHED_START		TIMESTAMP(120)
++#define TS_PLUGIN_SCHED_END		TIMESTAMP(121)
++
++#define TS_PLUGIN_TICK_START		TIMESTAMP(130)
++#define TS_PLUGIN_TICK_END		TIMESTAMP(131)
++
++#define TS_ENTER_NP_START		TIMESTAMP(140)
++#define TS_ENTER_NP_END			TIMESTAMP(141)
++
++#define TS_EXIT_NP_START		TIMESTAMP(150)
++#define TS_EXIT_NP_END			TIMESTAMP(151)
++
++#define TS_SRP_UP_START			TIMESTAMP(160)
++#define TS_SRP_UP_END			TIMESTAMP(161)
++#define TS_SRP_DOWN_START		TIMESTAMP(162)
++#define TS_SRP_DOWN_END			TIMESTAMP(163)
++
++#define TS_PI_UP_START			TIMESTAMP(170)
++#define TS_PI_UP_END			TIMESTAMP(171)
++#define TS_PI_DOWN_START		TIMESTAMP(172)
++#define TS_PI_DOWN_END			TIMESTAMP(173)
++
++#define TS_FIFO_UP_START		TIMESTAMP(180)
++#define TS_FIFO_UP_END			TIMESTAMP(181)
++#define TS_FIFO_DOWN_START		TIMESTAMP(182)
++#define TS_FIFO_DOWN_END		TIMESTAMP(183)
++
++#define PCP1	200
++#define PCP2	204
++
++#define DPCP	210
++#define MPCP	220
++#define FMLP	230
++#define SRPT	240
++
++#define TS_PCP_UP_START			TIMESTAMP(PCP1)
++#define TS_PCP_UP_END			TIMESTAMP(PCP1 + 1)
++#define TS_PCP1_DOWN_START		TIMESTAMP(PCP1 + 2)
++#define TS_PCP1_DOWN_END		TIMESTAMP(PCP1 + 3)
++#define TS_PCP2_DOWN_START		TIMESTAMP(PCP2 + 2)
++#define TS_PCP2_DOWN_END		TIMESTAMP(PCP2 + 3)
++
++
++#define TS_DPCP_INVOKE_START		TIMESTAMP(DPCP)
++#define TS_DPCP_INVOKE_END		TIMESTAMP(DPCP + 1)
++#define TS_DPCP_AGENT1_START		TIMESTAMP(DPCP + 2)
++#define TS_DPCP_AGENT1_END		TIMESTAMP(DPCP + 3)
++#define TS_DPCP_AGENT2_START		TIMESTAMP(DPCP + 4)
++#define TS_DPCP_AGENT2_END		TIMESTAMP(DPCP + 5)
++
++
++#define TS_MPCP_UP_START		TIMESTAMP(MPCP)
++#define TS_MPCP_UP_END			TIMESTAMP(MPCP + 1)
++#define TS_MPCP_DOWN_START		TIMESTAMP(MPCP + 2)
++#define TS_MPCP_DOWN_END		TIMESTAMP(MPCP + 3)
++
++
++#define TS_SRPT_START			TIMESTAMP(SRPT)
++#define TS_SRPT_END			TIMESTAMP(SRPT + 1)
++
++
++#endif /* !_SYS_TRACE_H_ */
+diff --git a/kernel/exit.c b/kernel/exit.c
+index fec12eb..8a0eb79 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -50,6 +50,8 @@
+ 
+ extern void sem_exit (void);
+ 
++extern void exit_od_table(struct task_struct* t);
++
+ static void exit_mm(struct task_struct * tsk);
+ 
+ static void __unhash_process(struct task_struct *p)
+@@ -916,6 +918,8 @@ fastcall NORET_TYPE void do_exit(long code)
+ 	if (unlikely(tsk->audit_context))
+ 		audit_free(tsk);
+ 
++	exit_od_table(tsk);
++
+ 	taskstats_exit(tsk, group_dead);
+ 
+ 	exit_mm(tsk);
+diff --git a/kernel/fork.c b/kernel/fork.c
+index d57118d..6fa6e03 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -57,6 +57,9 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+ 
++#include <litmus/litmus.h>
++#include <litmus/sched_plugin.h>
++
+ /*
+  * Protected counters by write_lock_irq(&tasklist_lock)
+  */
+@@ -118,6 +121,8 @@ void __put_task_struct(struct task_struct *tsk)
+ 	WARN_ON(atomic_read(&tsk->usage));
+ 	WARN_ON(tsk == current);
+ 
++	exit_litmus(tsk);
++
+ 	security_task_free(tsk);
+ 	free_uid(tsk->user);
+ 	put_group_info(tsk->group_info);
+diff --git a/kernel/sched.c b/kernel/sched.c
+index cca93cc..fb35f31 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -56,6 +56,12 @@
+ 
+ #include <asm/unistd.h>
+ 
++#include <litmus/litmus.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/sched_trace.h>
++#include <litmus/rt_param.h>
++#include <litmus/trace.h>
++
+ /*
+  * Convert user-nice values [ -20 ... 0 ... 19 ]
+  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+@@ -836,7 +842,7 @@ static int effective_prio(struct task_struct *p)
+ 	 * keep the priority unchanged. Otherwise, update priority
+ 	 * to the normal priority:
+ 	 */
+-	if (!rt_prio(p->prio))
++	if (!rt_prio(p->prio) && !is_realtime(p))
+ 		return p->normal_prio;
+ 	return p->prio;
+ }
+@@ -844,7 +850,7 @@ static int effective_prio(struct task_struct *p)
+ /*
+  * __activate_task - move a task to the runqueue.
+  */
+-static void __activate_task(struct task_struct *p, struct rq *rq)
++void __activate_task(struct task_struct *p, struct rq *rq)
+ {
+ 	struct prio_array *target = rq->active;
+ 
+@@ -999,7 +1005,7 @@ out:
+ /*
+  * deactivate_task - remove a task from the runqueue.
+  */
+-static void deactivate_task(struct task_struct *p, struct rq *rq)
++void deactivate_task(struct task_struct *p, struct rq *rq)
+ {
+ 	dec_nr_running(p, rq);
+ 	dequeue_task(p, p->array);
+@@ -1408,6 +1414,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+ #endif
+ 
+ 	rq = task_rq_lock(p, &flags);
++
++	if (is_realtime(p))
++		TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid);
++
+ 	old_state = p->state;
+ 	if (!(old_state & state))
+ 		goto out;
+@@ -1415,6 +1425,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+ 	if (p->array)
+ 		goto out_running;
+ 
++	sched_trace_task_arrival(p);
++	if (is_realtime(p)) {
++		curr_sched_plugin->wake_up_task(p);
++		goto out_running;
++	}
++
+ 	cpu = task_cpu(p);
+ 	this_cpu = smp_processor_id();
+ 
+@@ -1576,6 +1592,8 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
+ {
+ 	int cpu = get_cpu();
+ 
++	litmus_fork(p);
++
+ #ifdef CONFIG_SMP
+ 	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+ #endif
+@@ -1730,6 +1748,9 @@ void fastcall sched_exit(struct task_struct *p)
+ 	unsigned long flags;
+ 	struct rq *rq;
+ 
++	if (is_realtime(p))
++		return;
++
+ 	/*
+ 	 * If the child was a (relative-) CPU hog then decrease
+ 	 * the sleep_avg of the parent as well.
+@@ -1765,6 +1786,31 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
+ 	prepare_arch_switch(next);
+ }
+ 
++static void litmus_transition(struct task_struct *tsk, struct rq *rq)
++{
++	int wakeup = 0;
++	WARN_ON(tsk->state != TASK_STOPPED);
++
++	tsk->rt_param.transition_pending = 0;
++	if (is_realtime(tsk)) {
++		/* RT -> BE transition */
++		tsk->rt_param.transition_error = transition_to_be(tsk);
++		wakeup = tsk->rt_param.transition_error == 0;
++	} else {
++		/* BE -> RT transition */
++		tsk->rt_param.transition_error  = transition_to_rt(tsk);
++		/* If it was rejected as a real-time task, then
++		 * keep it running as a best-effort task.
++		 */
++		wakeup = tsk->rt_param.transition_error != 0;
++	}
++	if (wakeup) {
++		/* we still hold the runqueue lock */
++		tsk->state = TASK_RUNNING;
++		__activate_task(tsk, rq);
++	}
++}
++
+ /**
+  * finish_task_switch - clean up after a task-switch
+  * @rq: runqueue associated with task-switch
+@@ -1801,6 +1847,15 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
+ 	 */
+ 	prev_state = prev->state;
+ 	finish_arch_switch(prev);
++	/* Requeue previous real-time task before we drop the rq lock, because
++	 * that may lead to a preemption.
++	 */
++	curr_sched_plugin->finish_switch(prev);
++	sched_trace_task_scheduled(current);
++	if (rt_transition_pending(prev))
++		litmus_transition(prev, rq);
++	/* trace before IRQs are enabled */
++	TS_CXS_END;
+ 	finish_lock_switch(rq, prev);
+ 	if (mm)
+ 		mmdrop(mm);
+@@ -2095,6 +2150,10 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
+ 		     struct sched_domain *sd, enum idle_type idle,
+ 		     int *all_pinned)
+ {
++	/* Don't migrate LITMUS^RT tasks. */
++	if (is_realtime(p))
++		return 0;
++
+ 	/*
+ 	 * We do not migrate tasks that are:
+ 	 * 1) running (obviously), or
+@@ -3220,11 +3279,30 @@ void scheduler_tick(void)
+ 
+ 	update_cpu_clock(p, rq, now);
+ 
++	/* Real-time accounting is done by the plugin;
++	 * call Linux functions only for background tasks.
++	 */
+ 	if (p == rq->idle)
+-		/* Task on the idle queue */
+-		wake_priority_sleeper(rq);
+-	else
++			/* Task on the idle queue */
++			wake_priority_sleeper(rq);
++	else if (is_realtime(p)) {
++		/* time accounting for LITMUS^RT tasks */
++		p->rt_param.job_params.exec_time +=
++			now - p->rt_param.job_params.exec_start;
++		p->rt_param.job_params.exec_start = now;
++	} else
++		/* normal Linux tasks */
+ 		task_running_tick(rq, p);
++
++	/* check whether the RT scheduler plugin requires a call to
++	 * schedule
++	 */
++	TS_PLUGIN_TICK_START;
++	curr_sched_plugin->scheduler_tick();
++	TS_PLUGIN_TICK_END;
++
++	send_scheduler_signals();
++
+ #ifdef CONFIG_SMP
+ 	update_load(rq);
+ 	if (time_after_eq(jiffies, rq->next_balance))
+@@ -3406,6 +3484,7 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
+ 		sleep_type == SLEEP_INTERRUPTED);
+ }
+ 
++
+ /*
+  * schedule() is the main scheduler function.
+  */
+@@ -3420,6 +3499,7 @@ asmlinkage void __sched schedule(void)
+ 	long *switch_count;
+ 	struct rq *rq;
+ 
++
+ 	/*
+ 	 * Test if we are atomic.  Since do_exit() needs to call into
+ 	 * schedule() atomically, we ignore that path for now.
+@@ -3427,8 +3507,9 @@ asmlinkage void __sched schedule(void)
+ 	 */
+ 	if (unlikely(in_atomic() && !current->exit_state)) {
+ 		printk(KERN_ERR "BUG: scheduling while atomic: "
+-			"%s/0x%08x/%d\n",
+-			current->comm, preempt_count(), current->pid);
++		       "%s/0x%08x/%d %s\n",
++		       current->comm, preempt_count(), current->pid,
++		       is_realtime(current) ? "rt" : "non-rt");
+ 		debug_show_held_locks(current);
+ 		if (irqs_disabled())
+ 			print_irqtrace_events(current);
+@@ -3438,6 +3519,7 @@ asmlinkage void __sched schedule(void)
+ 
+ need_resched:
+ 	preempt_disable();
++	TS_SCHED_START;
+ 	prev = current;
+ 	release_kernel_lock(prev);
+ need_resched_nonpreemptible:
+@@ -3470,6 +3552,7 @@ need_resched_nonpreemptible:
+ 	spin_lock_irq(&rq->lock);
+ 
+ 	switch_count = &prev->nivcsw;
++	/* check for blocking tasks */
+ 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ 		switch_count = &prev->nvcsw;
+ 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+@@ -3478,11 +3561,60 @@ need_resched_nonpreemptible:
+ 		else {
+ 			if (prev->state == TASK_UNINTERRUPTIBLE)
+ 				rq->nr_uninterruptible++;
++
++			if (is_realtime(prev)) {
++				TRACE_TASK(prev, "blocks, state = %d\n",
++				      prev->state);
++				curr_sched_plugin->task_blocks(prev);
++				/* Enable this for all tasks to get _a lot_ of
++				 * data. Can be helpful for debugging.
++				 */
++				sched_trace_task_departure(prev);
++			}
++			/* only indirect switching is supported in the current
++			 * version of LITMUS
++			 */
+ 			deactivate_task(prev, rq);
+ 		}
+ 	}
+ 
++	next = NULL;
++
++	if (is_realtime(prev)) {
++		/* If we are invoked after scheduler_tick(), then
++		 * prev is charged a tiny amount of overhead time.
++		 * Since analysis has (or should have) accounted for
++		 * overheads, this is ok.
++		 */
++		prev->rt_param.job_params.exec_time +=
++			now - prev->rt_param.job_params.exec_start;
++		prev->rt_param.job_params.exec_start = now;
++	}
++
++	/* consult the real-time plugin */
++	TS_PLUGIN_SCHED_START;
++	curr_sched_plugin->schedule(prev, &next);
++	TS_PLUGIN_SCHED_END;
++
+ 	cpu = smp_processor_id();
++
++	if (prev != next && is_realtime(prev) && is_running(prev))
++		deactivate_task(prev, rq);
++	if (next && prev != next) {
++		__activate_task(next, rq);
++		set_task_cpu(next, cpu);
++	}
++
++	/* If the real-time plugin wants to switch to a specific task
++	 * it'll be on the rq and have the highest priority. There will
++	 * be exactly one such task, thus the selection of the next task
++	 * is unambiguous and the following code can only get
++	 * triggered if there are no RT tasks pending (on this CPU). Thus,
++	 * we may as well skip it.
++	 */
++	if (next)
++		goto switch_tasks;
++
+ 	if (unlikely(!rq->nr_running)) {
+ 		idle_balance(cpu, rq);
+ 		if (!rq->nr_running) {
+@@ -3546,12 +3678,17 @@ switch_tasks:
+ 	prev->timestamp = prev->last_ran = now;
+ 
+ 	sched_info_switch(prev, next);
++	TS_SCHED_END;
+ 	if (likely(prev != next)) {
++		TS_CXS_START;
++		if (is_running(prev))
++			sched_trace_task_preemption(prev, next);
+ 		next->timestamp = now;
+ 		rq->nr_switches++;
+ 		rq->curr = next;
+ 		++*switch_count;
+ 
++		next->rt_param.job_params.exec_start = now;
+ 		prepare_task_switch(rq, next);
+ 		prev = context_switch(rq, prev, next);
+ 		barrier();
+@@ -3561,8 +3698,11 @@ switch_tasks:
+ 		 * frame will be invalid.
+ 		 */
+ 		finish_task_switch(this_rq(), prev);
+-	} else
++	} else {
+ 		spin_unlock_irq(&rq->lock);
++	}
++
++	send_scheduler_signals();
+ 
+ 	prev = current;
+ 	if (unlikely(reacquire_kernel_lock(prev) < 0))
+@@ -3570,6 +3710,8 @@ switch_tasks:
+ 	preempt_enable_no_resched();
+ 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ 		goto need_resched;
++	if (srp_active())
++		srp_ceiling_block();
+ }
+ EXPORT_SYMBOL(schedule);
+ 
+@@ -3691,6 +3833,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ 	}
+ }
+ 
++
+ /**
+  * __wake_up - wake up threads blocked on a waitqueue.
+  * @q: the waitqueue
+@@ -3709,6 +3852,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
+ }
+ EXPORT_SYMBOL(__wake_up);
+ 
++
+ /*
+  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+  */
+@@ -3717,6 +3861,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+ 	__wake_up_common(q, mode, 1, 0, NULL);
+ }
+ 
++
+ /**
+  * __wake_up_sync - wake up threads blocked on a waitqueue.
+  * @q: the waitqueue
+@@ -3772,6 +3917,18 @@ void fastcall complete_all(struct completion *x)
+ }
+ EXPORT_SYMBOL(complete_all);
+ 
++void fastcall complete_n(struct completion *x, int n)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&x->wait.lock, flags);
++	x->done += n;
++	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
++			 n, 0, NULL);
++	spin_unlock_irqrestore(&x->wait.lock, flags);
++}
++EXPORT_SYMBOL(complete_n);
++
+ void fastcall __sched wait_for_completion(struct completion *x)
+ {
+ 	might_sleep();
+@@ -4175,7 +4332,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
+ }
+ 
+ /* Actually do priority change: must hold rq lock. */
+-static void __setscheduler(struct task_struct *p, int policy, int prio)
++void __setscheduler(struct task_struct *p, int policy, int prio)
+ {
+ 	BUG_ON(p->array);
+ 
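
The complete_n() helper added above lets the scheduler wake an exact number of completion waiters in one call instead of looping over complete(). Below is a minimal kernel-side sketch of such a batched wake-up; the release_group structure and release_all() are hypothetical, and only complete_n() itself comes from this patch (its declaration is presumably the two-line change to include/linux/completion.h).

#include <linux/completion.h>

/* Hypothetical container for a group of tasks waiting for a common release. */
struct release_group {
	struct completion barrier;	/* tasks block on this completion */
	int		  waiting;	/* number of queued waiters */
};

/* Wake exactly grp->waiting tasks in one shot rather than calling
 * complete() in a loop, using the complete_n() primitive added above. */
static void release_all(struct release_group *grp)
{
	complete_n(&grp->barrier, grp->waiting);
	grp->waiting = 0;
}
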
+diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c
+index 1281805..3f4d543 100644
+--- a/lib/semaphore-sleepers.c
++++ b/lib/semaphore-sleepers.c
+@@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem)
+ 		/*
+ 		 * With signals pending, this turns into
+ 		 * the trylock failure case - we won't be
+-		 * sleeping, and we* can't get the lock as
++		 * sleeping, and we can't get the lock as
+ 		 * it has contention. Just correct the count
+ 		 * and exit.
+ 		 */
+diff --git a/litmus/Makefile b/litmus/Makefile
+new file mode 100644
+index 0000000..db2518d
+--- /dev/null
++++ b/litmus/Makefile
+@@ -0,0 +1,9 @@
++#
++# Makefile for LITMUS^RT
++#
++
++obj-y     = sched_plugin.o litmus.o sched_trace.o \
++	    edf_common.o rm_common.o\
++            sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \
++	    trace.o ft_event.o rt_domain.o fdso.o \
++	    sched_rm.o sync.o jobs.o pcp.o
+diff --git a/litmus/edf_common.c b/litmus/edf_common.c
+new file mode 100644
+index 0000000..2a52835
+--- /dev/null
++++ b/litmus/edf_common.c
+@@ -0,0 +1,95 @@
++/*
++ * litmus/edf_common.c
++ *
++ * Common functions for EDF based scheduler.
++ */
++
++#include <linux/percpu.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++
++#include <litmus/litmus.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/sched_trace.h>
++
++
++#include <litmus/edf_common.h>
++
++/* edf_higher_prio -  returns true if first has a higher EDF priority
++ *                    than second. Deadline ties are broken by PID.
++ *
++ * first must not be NULL and must be a real-time task.
++ * second may be NULL or a non-rt task.
++ */
++int edf_higher_prio(struct task_struct* first,
++		    struct task_struct* second)
++{
++	struct task_struct *first_task = first;
++	struct task_struct *second_task = second;
++
++	/* Check for inherited priorities. Change task
++	 * used for comparison in such a case.
++	 */
++	if (first && first->rt_param.inh_task)
++		first_task = first->rt_param.inh_task;
++	if (second && second->rt_param.inh_task)
++		second_task = second->rt_param.inh_task;
++
++	return
++		/* does the second task exist and is it a real-time task?  If
++		 * not, the first task (which is a RT task) has higher
++		 * priority.
++		 */
++		!second_task || !is_realtime(second_task)  ||
++
++		/* is the deadline of the first task earlier?
++		 * Then it has higher priority.
++		 */
++		earlier_deadline(first_task, second_task) ||
++
++		/* Do we have a deadline tie?
++		 * Then break by PID.
++		 */
++		(get_deadline(first_task) == get_deadline(second_task) &&
++	        (first_task->pid < second_task->pid ||
++
++		/* If the PIDs are the same then the task with the inherited
++		 * priority wins.
++		 */
++		(first_task->pid == second_task->pid &&
++		 !second->rt_param.inh_task)));
++}
++
++int edf_ready_order(struct list_head* a, struct list_head* b)
++{
++	return edf_higher_prio(
++		list_entry(a, struct task_struct, rt_list),
++		list_entry(b, struct task_struct, rt_list));
++}
++
++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
++{
++	rt_domain_init(rt, resched, edf_ready_order);
++}
++
++/* need_to_preempt - check whether the task t needs to be preempted
++ *                   call only with irqs disabled and with  ready_lock acquired
++ *                   THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
++ */
++int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
++{
++	/* we need the read lock for edf_ready_queue */
++	/* no need to preempt if there is nothing pending */
++	if (!ready_jobs_pending(rt))
++		return 0;
++	/* we need to reschedule if t doesn't exist */
++	if (!t)
++		return 1;
++
++	/* NOTE: We cannot check for non-preemptibility since we
++	 *       don't know what address space we're currently in.
++	 */
++
++	/* make sure to get non-rt stuff out of the way */
++	return !is_realtime(t) || edf_higher_prio(next_ready(rt), t);
++}
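
The nested boolean expression in edf_higher_prio() encodes a three-level tie-break: earlier deadline first, then lower PID, then the task without an inherited priority. The following stand-alone sketch illustrates that ordering with plain integers; fake_task is a stand-in for task_struct and only shows the comparison order, not the inheritance handling.

#include <stdio.h>

/* Stand-in for the fields edf_higher_prio() actually compares. */
struct fake_task {
	long long deadline;
	int pid;
	int has_inh_task;	/* non-zero if the task runs with an inherited priority */
};

/* Mirrors the tie-break order of edf_higher_prio(): deadline, then PID,
 * then "no inherited priority" wins. */
static int edf_wins(const struct fake_task *a, const struct fake_task *b)
{
	if (a->deadline != b->deadline)
		return a->deadline < b->deadline;
	if (a->pid != b->pid)
		return a->pid < b->pid;
	return !b->has_inh_task;
}

int main(void)
{
	struct fake_task t1 = { 100, 17, 0 }, t2 = { 100, 42, 0 };
	printf("t1 has higher priority: %d\n", edf_wins(&t1, &t2)); /* prints 1 (PID tie-break) */
	return 0;
}
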
+diff --git a/litmus/fdso.c b/litmus/fdso.c
+new file mode 100644
+index 0000000..ded9918
+--- /dev/null
++++ b/litmus/fdso.c
+@@ -0,0 +1,289 @@
++/* fdso.c - file descriptor attached shared objects
++ *
++ * (c) 2007 B. Brandenburg, LITMUS^RT project
++ *
++ * Notes:
++ *   - object descriptor (OD) tables are not cloned during a fork.
++ *   - objects are created on-demand, and freed after the last reference
++ *     is dropped.
++ *   - for now, object types are hard coded.
++ *   - As long as we have live objects, we keep a reference to the inode.
++ */
++
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/mutex.h>
++#include <linux/file.h>
++#include <asm/uaccess.h>
++
++#include <litmus/fdso.h>
++
++extern struct fdso_ops pi_sem_ops;
++extern struct fdso_ops srp_sem_ops;
++extern struct fdso_ops pcp_sem_ops;
++extern struct fdso_ops mpcp_sem_ops;
++
++static const struct fdso_ops* fdso_ops[] = {
++	&pi_sem_ops,
++	&srp_sem_ops,
++	&pcp_sem_ops,
++	&mpcp_sem_ops,
++};
++
++static void* fdso_create(obj_type_t type)
++{
++	return fdso_ops[type]->create();
++}
++
++static void fdso_destroy(obj_type_t type, void* obj)
++{
++	fdso_ops[type]->destroy(obj);
++}
++
++static int fdso_open(struct od_table_entry* entry, void* __user config)
++{
++	if (fdso_ops[entry->obj->type]->open)
++		return fdso_ops[entry->obj->type]->open(entry, config);
++	else
++		return 0;
++}
++
++static int fdso_close(struct od_table_entry* entry)
++{
++	if (fdso_ops[entry->obj->type]->close)
++		return fdso_ops[entry->obj->type]->close(entry);
++	else
++		return 0;
++}
++
++/* inode must be locked already */
++static struct inode_obj_id* alloc_inode_obj(struct inode* inode,
++					    obj_type_t type,
++					    unsigned int id)
++{
++	struct inode_obj_id* obj;
++	void* raw_obj;
++
++	raw_obj = fdso_create(type);
++	if (!raw_obj)
++		return NULL;
++
++	obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL);
++	if (!obj)
++		return NULL;
++	INIT_LIST_HEAD(&obj->list);
++	atomic_set(&obj->count, 1);
++	obj->type  = type;
++	obj->id    = id;
++	obj->obj   = raw_obj;
++	obj->inode = inode;
++
++	list_add(&obj->list, &inode->i_obj_list);
++	atomic_inc(&inode->i_count);
++/*
++	printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n",
++	inode, type, id);
++*/
++	return obj;
++}
++
++/* inode must be locked already */
++static struct inode_obj_id* get_inode_obj(struct inode* inode,
++					  obj_type_t type,
++					  unsigned int id)
++{
++	struct list_head* pos;
++	struct inode_obj_id* obj = NULL;
++
++	list_for_each(pos, &inode->i_obj_list) {
++		obj = list_entry(pos, struct inode_obj_id, list);
++		if (obj->id == id && obj->type == type) {
++			atomic_inc(&obj->count);
++			return obj;
++		}
++	}
++/*
++	printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n",
++	       inode, type, id);
++*/
++	return NULL;
++}
++
++
++static void put_inode_obj(struct inode_obj_id* obj)
++{
++	struct inode* inode;
++	int let_go = 0;
++
++	inode = obj->inode;
++	if (atomic_dec_and_test(&obj->count)) {
++
++		mutex_lock(&inode->i_obj_mutex);
++		/* no new references can be obtained */
++		if (!atomic_read(&obj->count)) {
++			list_del(&obj->list);
++			fdso_destroy(obj->type, obj->obj);
++			kfree(obj);
++			let_go = 1;
++		}
++		mutex_unlock(&inode->i_obj_mutex);
++		if (let_go)
++			iput(inode);
++	}
++}
++
++static struct od_table_entry*  get_od_entry(struct task_struct* t)
++{
++	struct od_table_entry* table;
++	int i;
++
++
++	table = t->od_table;
++	if (!table) {
++		table = (struct od_table_entry*)
++			kzalloc(sizeof(struct  od_table_entry) *
++				MAX_OBJECT_DESCRIPTORS, GFP_KERNEL);
++		t->od_table = table;
++	}
++
++	for (i = 0; table &&  i < MAX_OBJECT_DESCRIPTORS; i++)
++		if (!table[i].used) {
++			table[i].used = 1;
++			return table + i;
++		}
++	return NULL;
++}
++
++static int put_od_entry(struct od_table_entry* od)
++{
++	put_inode_obj(od->obj);
++	od->used = 0;
++	return 0;
++}
++
++void exit_od_table(struct task_struct* t)
++{
++	int i;
++
++	if (t->od_table) {
++		for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
++			if (t->od_table[i].used)
++				put_od_entry(t->od_table + i);
++		kfree(t->od_table);
++		t->od_table = NULL;
++	}
++}
++
++static int do_sys_od_open(struct file* file, obj_type_t type, int id,
++			  void* __user config)
++{
++	int idx = 0, err;
++	struct inode* inode;
++	struct inode_obj_id* obj = NULL;
++	struct od_table_entry* entry;
++
++	inode = file->f_dentry->d_inode;
++
++	entry = get_od_entry(current);
++	if (!entry)
++		return -ENOMEM;
++
++	mutex_lock(&inode->i_obj_mutex);
++	obj = get_inode_obj(inode, type, id);
++	if (!obj)
++		obj = alloc_inode_obj(inode, type, id);
++	if (!obj) {
++		idx = -ENOMEM;
++		entry->used = 0;
++	} else {
++		entry->obj   = obj;
++		entry->extra = NULL;
++		idx = entry - current->od_table;
++	}
++
++	mutex_unlock(&inode->i_obj_mutex);
++
++	/* FIXME: What if the allocation failed? */
++	err = fdso_open(entry, config);
++	if (err < 0) {
++		/* The class rejected the open call.
++		 * We need to clean up and tell user space.
++		 */
++		put_od_entry(entry);
++		idx = err;
++	}
++
++	return idx;
++}
++
++
++struct od_table_entry* __od_lookup(int od)
++{
++	struct task_struct *t = current;
++
++	if (!t->od_table)
++		return NULL;
++	if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
++		return NULL;
++	if (!t->od_table[od].used)
++		return NULL;
++	return t->od_table + od;
++}
++
++
++asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config)
++{
++	int ret = 0;
++	struct file*  file;
++
++	/*
++	   1) get file from fd, get inode from file
++	   2) lock inode
++	   3) try to lookup object
++	   4) if not present create and enqueue object, inc inode refcnt
++	   5) increment refcnt of object
++	   6) alloc od_table_entry, setup ptrs
++	   7) unlock inode
++	   8) return offset in od_table as OD
++	 */
++
++	if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
++		ret = -EINVAL;
++		goto out;
++	}
++
++	file = fget(fd);
++	if (!file) {
++		ret = -EBADF;
++		goto out;
++	}
++
++	ret = do_sys_od_open(file, type, obj_id, config);
++
++	fput(file);
++
++out:
++	return ret;
++}
++
++
++asmlinkage int sys_od_close(int od)
++{
++	int ret = -EINVAL;
++	struct task_struct *t = current;
++
++	if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
++		return ret;
++
++	if (!t->od_table || !t->od_table[od].used)
++		return ret;
++
++
++	/* give the class a chance to reject the close
++	 */
++	ret = fdso_close(t->od_table + od);
++	if (ret == 0)
++		ret = put_od_entry(t->od_table + od);
++
++	return ret;
++}
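
From user space, the FDSO interface above is driven by naming a shared object through a file descriptor: sys_od_open() attaches (or lazily creates) an object of the requested type on the file's inode and returns an object descriptor, and sys_od_close() drops it. The sketch below shows that lifecycle under assumptions: od_open()/od_close() stand in for liblitmus wrappers whose exact names are not shown in this patch, and type 0 refers to the first entry of the fdso_ops[] table above (the PI semaphore).

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Assumed liblitmus-style wrappers around sys_od_open()/sys_od_close(). */
int od_open(int fd, int type, int obj_id, void *config);
int od_close(int od);

int main(void)
{
	int fd, od;

	/* Any file both tasks can open works as the shared namespace. */
	fd = open("/tmp/shared_ns", O_RDONLY | O_CREAT, 0666);
	if (fd < 0)
		return 1;

	od = od_open(fd, /* type = PI semaphore */ 0, /* obj_id = */ 0, NULL);
	if (od < 0) {
		fprintf(stderr, "od_open failed: %d\n", od);
		return 1;
	}

	/* ... use the object descriptor, e.g. with the PI semaphore calls ... */

	od_close(od);
	close(fd);
	return 0;
}
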
+diff --git a/litmus/ft_event.c b/litmus/ft_event.c
+new file mode 100644
+index 0000000..db9f4ea
+--- /dev/null
++++ b/litmus/ft_event.c
+@@ -0,0 +1,104 @@
++#include <linux/types.h>
++
++#include <litmus/feather_trace.h>
++
++/* the feather trace management functions assume 
++ * exclusive access to the event table
++ */
++
++
++#define BYTE_JUMP      0xeb
++#define BYTE_JUMP_LEN  0x02
++
++/* for each event, there is an entry in the event table */
++struct trace_event {
++	long 	id;
++	long	count;
++	long	start_addr;
++	long	end_addr;
++};
++
++extern struct trace_event  __start___event_table[];
++extern struct trace_event  __stop___event_table[];
++
++int ft_enable_event(unsigned long id) 
++{
++	struct trace_event* te = __start___event_table;
++	int count = 0;
++	char* delta;
++	unsigned char* instr;
++
++	while (te < __stop___event_table) {
++		if (te->id == id && ++te->count == 1) {
++			instr  = (unsigned char*) te->start_addr;
++			/* make sure we don't clobber something wrong */
++			if (*instr == BYTE_JUMP) {				
++				delta  = (((unsigned char*) te->start_addr) + 1);
++				*delta = 0;
++			}
++		}
++		if (te->id == id)
++			count++;
++		te++;		
++	}
++	return count;
++}
++
++int ft_disable_event(unsigned long id)
++{
++	struct trace_event* te = __start___event_table;
++	int count = 0;
++	char* delta;
++	unsigned char* instr;
++
++	while (te < __stop___event_table) {
++		if (te->id == id && --te->count == 0) {
++			instr  = (unsigned char*) te->start_addr;
++			if (*instr == BYTE_JUMP) {
++				delta  = (((unsigned char*) te->start_addr) + 1);
++				*delta = te->end_addr - te->start_addr - 
++					BYTE_JUMP_LEN;
++			}
++		}
++		if (te->id == id)
++			count++;
++		te++;		
++	}
++	return count;
++}
++
++int ft_disable_all_events(void)
++{
++	struct trace_event* te = __start___event_table;
++	int count = 0;
++	char* delta;
++	unsigned char* instr;
++
++	while (te < __stop___event_table) {
++		if (te->count) {
++			instr  = (unsigned char*) te->start_addr;
++			if (*instr == BYTE_JUMP) {
++				delta  = (((unsigned char*) te->start_addr) 
++					  + 1);
++				*delta = te->end_addr - te->start_addr - 
++					BYTE_JUMP_LEN;
++				te->count = 0;
++				count++;
++			}
++		}
++		te++;		
++	}
++	return count;
++}
++
++int ft_is_event_enabled(unsigned long id)
++{
++	struct trace_event* te = __start___event_table;
++
++	while (te < __stop___event_table) {
++		if (te->id == id)
++			return te->count;
++		te++;
++	}
++	return 0;
++}
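
ft_enable_event() and ft_disable_event() reference-count each event id and patch the short jump at every matching call site; both return the number of event-table entries carrying that id. A hedged kernel-side sketch of toggling one event follows; FT_EVENT_ID is a placeholder, since real event ids are assigned by the Feather-Trace macros in include/litmus/feather_trace.h, which are not shown here.

#include <linux/kernel.h>
#include <litmus/feather_trace.h>

#define FT_EVENT_ID 100		/* hypothetical event id */

/* Enable or disable all call sites carrying FT_EVENT_ID and report how
 * many entries of the event table matched. */
static void toggle_tracing(int on)
{
	int sites;

	if (on)
		sites = ft_enable_event(FT_EVENT_ID);
	else
		sites = ft_disable_event(FT_EVENT_ID);

	printk(KERN_INFO "feather-trace: %d call sites carry id %d\n",
	       sites, FT_EVENT_ID);
}
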
+diff --git a/litmus/jobs.c b/litmus/jobs.c
+new file mode 100644
+index 0000000..e294bc5
+--- /dev/null
++++ b/litmus/jobs.c
+@@ -0,0 +1,43 @@
++/* litmus/jobs.c - common job control code
++ */
++
++#include <linux/sched.h>
++
++#include <litmus/litmus.h>
++#include <litmus/jobs.h>
++
++void prepare_for_next_period(struct task_struct *t)
++{
++	BUG_ON(!t);
++	/* prepare next release */
++	t->rt_param.job_params.release   = t->rt_param.job_params.deadline;
++	t->rt_param.job_params.deadline += get_rt_period(t);
++	t->rt_param.job_params.exec_time = 0;
++	/* update job sequence number */
++	t->rt_param.job_params.job_no++;
++
++	/* don't confuse Linux */
++	t->time_slice = 1;
++}
++
++void release_at(struct task_struct *t, lt_t start)
++{
++	t->rt_param.job_params.deadline = start;
++	prepare_for_next_period(t);
++	set_rt_flags(t, RT_F_RUNNING);
++}
++
++
++/*
++ *	Deactivate current task until the beginning of the next period.
++ */
++long complete_job(void)
++{
++	/* Mark that we do not execute anymore */
++	set_rt_flags(current, RT_F_SLEEP);
++	/* Call schedule(); it will return when a new job arrives and
++	 * it also takes care of preparing for the next release.
++	 */
++	schedule();
++	return 0;
++}
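
complete_job() is the kernel half of the periodic task pattern: the task marks its current job as finished and suspends in schedule() until the next release. Below is a hedged user-space sketch of the matching loop; sleep_next_period() stands in for the liblitmus wrapper around sys_sleep_next_period(), and do_work() is the application-provided job body.

/* Assumed liblitmus wrapper around sys_sleep_next_period(). */
int sleep_next_period(void);

/* Application-provided job body (placeholder). */
void do_work(void);

/* Run a fixed number of jobs: compute, then suspend until the next release. */
static void periodic_main_loop(int jobs)
{
	int i;

	for (i = 0; i < jobs; i++) {
		do_work();
		sleep_next_period();
	}
}
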
+diff --git a/litmus/litmus.c b/litmus/litmus.c
+new file mode 100644
+index 0000000..77aad7d
+--- /dev/null
++++ b/litmus/litmus.c
+@@ -0,0 +1,830 @@
++/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS initialization
++ *             code, and the procfs interface.
++ */
++#include <asm/uaccess.h>
++#include <linux/uaccess.h>
++#include <linux/sysrq.h>
++
++#include <linux/module.h>
++#include <linux/proc_fs.h>
++
++
++#include <litmus/litmus.h>
++#include <linux/sched.h>
++#include <litmus/sched_plugin.h>
++
++#include <litmus/trace.h>
++
++/* Number of RT tasks that exist in the system */
++atomic_t rt_task_count 		= ATOMIC_INIT(0);
++static DEFINE_SPINLOCK(task_transition_lock);
++
++/* To send signals from the scheduler
++ * Must drop locks first.
++ */
++static LIST_HEAD(sched_sig_list);
++static DEFINE_SPINLOCK(sched_sig_list_lock);
++
++/*
++ * sys_set_rt_task_param
++ * @pid: PID of the task whose scheduling parameters are to be changed
++ * @param: New real-time extension parameters such as the execution cost and
++ *         period
++ * Syscall for manipulating a task's RT extension parameters
++ * Returns EFAULT  if param could not be copied from user space,
++ *         ESRCH   if pid does not correspond
++ *	           to a valid task,
++ *	   EINVAL  if pid or param is invalid, or if either
++ *	           the period or the execution cost is <= 0,
++ *	   EBUSY   if pid already refers to a real-time task,
++ *	   0       if successful.
++ *
++ * Only non-real-time tasks may be configured with this system call
++ * to avoid races with the scheduler. In practice, this means that a
++ * task's parameters must be set _before_ the task transitions to
++ * real-time mode via sys_task_mode_transition().
++ */
++asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
++{
++	struct rt_task tp;
++	struct task_struct *target;
++	int retval = -EINVAL;
++
++	printk("Setting up rt task parameters for process %d.\n", pid);
++
++	if (pid < 0 || param == 0) {
++		goto out;
++	}
++	if (copy_from_user(&tp, param, sizeof(tp))) {
++		retval = -EFAULT;
++		goto out;
++	}
++
++	/*      Task search and manipulation must be protected */
++	read_lock_irq(&tasklist_lock);
++	if (!(target = find_task_by_pid(pid))) {
++		retval = -ESRCH;
++		goto out_unlock;
++	}
++
++	if (is_realtime(target)) {
++		/* The task is already a real-time task.
++		 * We cannot allow parameter changes at this point.
++		 */
++		retval = -EBUSY;
++		goto out_unlock;
++	}
++
++	if (tp.exec_cost <= 0)
++		goto out_unlock;
++	if (tp.period <= 0)
++		goto out_unlock;
++	if (!cpu_online(tp.cpu))
++		goto out_unlock;
++	if (tp.period < tp.exec_cost)
++	{
++		printk(KERN_INFO "litmus: real-time task %d rejected "
++		       "because wcet > period\n", pid);
++		goto out_unlock;
++	}
++
++	target->rt_param.task_params = tp;
++
++	retval = 0;
++      out_unlock:
++	read_unlock_irq(&tasklist_lock);
++      out:
++	return retval;
++}
++
++/*	Getter of task's RT params
++ *	returns EINVAL if param is NULL or pid is invalid
++ *	returns ESRCH  if pid does not correspond to a valid task
++ *	returns EFAULT if copying of parameters has failed.
++ */
++asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
++{
++	int retval = -EINVAL;
++	struct task_struct *source;
++	struct rt_task lp;
++	if (param == 0 || pid < 0)
++		goto out;
++	read_lock(&tasklist_lock);
++	if (!(source = find_task_by_pid(pid))) {
++		retval = -ESRCH;
++		goto out_unlock;
++	}
++	lp = source->rt_param.task_params;
++	read_unlock(&tasklist_lock);
++	/* Do copying outside the lock */
++	retval =
++	    copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
++	return retval;
++      out_unlock:
++	read_unlock(&tasklist_lock);
++      out:
++	return retval;
++
++}
++
++/* sys_task_mode_transition
++ * @target_mode: The desired execution mode after the system call completes.
++ *               Either BACKGROUND_TASK or LITMUS_RT_TASK.
++ * 	Allow a normal task to become a real-time task, or vice versa.
++ *	Returns EINVAL	if an illegal transition was requested,
++ *		0	if the task mode was changed successfully,
++ *		other	if plugin failed.
++ */
++asmlinkage long sys_task_mode_transition(int target_mode)
++{
++	int retval = -EINVAL;
++	struct task_struct *t = current;
++
++	if (( is_realtime(t) && target_mode == BACKGROUND_TASK) ||
++	    (!is_realtime(t) && target_mode == LITMUS_RT_TASK)) {
++		TRACE_TASK(t, "attempts mode transition to %s\n",
++			   is_realtime(t) ? "best-effort" : "real-time");
++		preempt_disable();
++		t->rt_param.transition_pending = 1;
++		t->state = TASK_STOPPED;
++		preempt_enable_no_resched();
++
++		schedule();
++
++		retval = t->rt_param.transition_error;
++	}
++	return retval;
++}
++
++/* implemented in kernel/litmus_sem.c */
++void srp_ceiling_block(void);
++
++/*
++ *	This is the crucial function for the periodic task implementation.
++ *	It checks whether the task is periodic and whether this kind of
++ *	sleep is permitted, and then calls the plugin-specific sleep,
++ *	which puts the task into a wait queue.
++ *	returns 0 on successful wakeup
++ *	returns EPERM if current conditions do not permit such sleep
++ *	returns EINVAL if current task is not able to go to sleep
++ */
++asmlinkage long sys_sleep_next_period(void)
++{
++	int retval = -EPERM;
++	if (!is_realtime(current)) {
++		retval = -EINVAL;
++		goto out;
++	}
++	/* Task with negative or zero period cannot sleep */
++	if (get_rt_period(current) <= 0) {
++		retval = -EINVAL;
++		goto out;
++	}
++	/* The plugin has to put the task into an
++	 * appropriate queue and call schedule
++	 */
++	retval = curr_sched_plugin->sleep_next_period();
++      out:
++	return retval;
++}
++
++/*	This is an "improved" version of sys_sleep_next_period() that
++ *      addresses the problem of unintentionally missing a job after
++ *      an overrun.
++ *
++ *	returns 0 on successful wakeup
++ *	returns EPERM if current conditions do not permit such sleep
++ *	returns EINVAL if current task is not able to go to sleep
++ */
++asmlinkage long sys_wait_for_job_release(unsigned int job)
++{
++	int retval = -EPERM;
++	if (!is_realtime(current)) {
++		retval = -EINVAL;
++		goto out;
++	}
++
++	/* Task with negative or zero period cannot sleep */
++	if (get_rt_period(current) <= 0) {
++		retval = -EINVAL;
++		goto out;
++	}
++
++	retval = 0;
++
++	/* first wait until we have "reached" the desired job
++	 *
++	 * This implementation has at least two problems:
++	 *
++	 * 1) It doesn't gracefully handle the wrap around of
++	 *    job_no. Since LITMUS is a prototype, this is not much
++	 *    of a problem right now.
++	 *
++	 * 2) It is theoretically racy if a job release occurs
++	 *    between checking job_no and calling sleep_next_period().
++	 *    A proper solution would require adding another callback
++	 *    in the plugin structure and testing the condition with
++	 *    interrupts disabled.
++	 *
++	 * FIXME: At least problem 2 should be taken care of eventually.
++	 */
++	while (!retval && job > current->rt_param.job_params.job_no)
++	  /* If the last job overran then job <= job_no and we
++	   * don't send the task to sleep.
++	   */
++	  retval = curr_sched_plugin->sleep_next_period();
++      out:
++	return retval;
++}
++
++/*	This is a helper syscall to query the current job sequence number.
++ *
++ *	returns 0 on successful query
++ *	returns EPERM if task is not a real-time task.
++ *      returns EFAULT if &job is not a valid pointer.
++ */
++asmlinkage long sys_query_job_no(unsigned int __user *job)
++{
++	int retval = -EPERM;
++	if (is_realtime(current))
++		retval = put_user(current->rt_param.job_params.job_no, job);
++
++	return retval;
++}
++
++struct sched_sig {
++	struct list_head 	list;
++	struct task_struct*	task;
++	unsigned int		signal:31;
++	int			force:1;
++};
++
++static void __scheduler_signal(struct task_struct *t, unsigned int signo,
++			       int force)
++{
++	struct sched_sig* sig;
++
++	sig = kmalloc(sizeof(struct sched_sig), GFP_ATOMIC);
++	if (!sig) {
++		TRACE_TASK(t, "dropping signal %u\n", signo);
++		return;
++	}
++
++	spin_lock(&sched_sig_list_lock);
++
++	sig->signal = signo;
++	sig->force  = force;
++	sig->task   = t;
++	get_task_struct(t);
++	list_add(&sig->list, &sched_sig_list);
++
++	spin_unlock(&sched_sig_list_lock);
++}
++
++void scheduler_signal(struct task_struct *t, unsigned int signo)
++{
++	__scheduler_signal(t, signo, 0);
++}
++
++void force_scheduler_signal(struct task_struct *t, unsigned int signo)
++{
++	__scheduler_signal(t, signo, 1);
++}
++
++/* FIXME: get rid of the locking and do this on a per-processor basis */
++void send_scheduler_signals(void)
++{
++	unsigned long flags;
++	struct list_head *p, *extra;
++	struct siginfo info;
++	struct sched_sig* sig;
++	struct task_struct* t;
++	struct list_head claimed;
++
++	if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) {
++		if (list_empty(&sched_sig_list))
++			p = NULL;
++		else {
++			p = sched_sig_list.next;
++			list_del(&sched_sig_list);
++			INIT_LIST_HEAD(&sched_sig_list);
++		}
++		spin_unlock_irqrestore(&sched_sig_list_lock, flags);
++
++		/* abort if there are no signals */
++		if (!p)
++			return;
++
++		/* take signal list we just obtained */
++		list_add(&claimed, p);
++
++		list_for_each_safe(p, extra, &claimed) {
++			list_del(p);
++			sig = list_entry(p, struct sched_sig, list);
++			t = sig->task;
++			info.si_signo = sig->signal;
++			info.si_errno = 0;
++			info.si_code  = SI_KERNEL;
++			info.si_pid   = 1;
++			info.si_uid   = 0;
++			TRACE("sending signal %d to %d\n", info.si_signo,
++			      t->pid);
++			if (sig->force)
++				force_sig_info(sig->signal, &info, t);
++			else
++				send_sig_info(sig->signal, &info, t);
++			put_task_struct(t);
++			kfree(sig);
++		}
++	}
++
++}
++
++static inline void np_mem_error(struct task_struct* t, const char* reason)
++{
++	if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) {
++		TRACE("np section: %s => %s/%d killed\n",
++		      reason, t->comm, t->pid);
++		force_scheduler_signal(t, SIGKILL);
++	}
++}
++
++/*	sys_register_np_flag() allows real-time tasks to register an
++ *	np section indicator.
++ *	returns 0      if the flag was successfully registered
++ *	returns EINVAL if current task is not a real-time task
++ *	returns EFAULT if *flag couldn't be written
++ */
++asmlinkage long sys_register_np_flag(short __user *flag)
++{
++	int retval = -EINVAL;
++	short test_val = RT_PREEMPTIVE;
++
++	/* avoid races with the scheduler */
++	preempt_disable();
++	TRACE("reg_np_flag(%p) for %s/%d\n", flag,
++	      current->comm, current->pid);
++
++	/* Let's first try to write to the address.
++	 * That way it is initialized and any bugs
++	 * involving dangling pointers will caught
++	 * involving dangling pointers will be caught
++	 * NULL indicates disabling np section support
++	 * and should not be tested.
++	 */
++	if (flag)
++	  retval = poke_kernel_address(test_val, flag);
++	else
++	  retval = 0;
++	TRACE("reg_np_flag: retval=%d\n", retval);
++	if (unlikely(0 != retval))
++		np_mem_error(current, "np flag: not writable");
++	else
++	  /* the pointer is ok */
++	  current->rt_param.np_flag = flag;
++
++	preempt_enable();
++	return retval;
++}
++
++
++void request_exit_np(struct task_struct *t)
++{
++	int ret;
++	short flag;
++
++	/* We can only do this if t is actually currently scheduled on this CPU
++	 * because otherwise we are in the wrong address space. Thus make sure
++	 * to check.
++	 */
++        BUG_ON(t != current);
++
++	if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) {
++		TRACE_TASK(t, "request_exit_np(): BAD TASK!\n");
++		return;
++	}
++
++	flag = RT_EXIT_NP_REQUESTED;
++	ret  = poke_kernel_address(flag, t->rt_param.np_flag + 1);
++	TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid);
++	if (unlikely(0 != ret))
++		np_mem_error(current, "request_exit_np(): flag not writable");
++
++}
++
++
++int is_np(struct task_struct* t)
++{
++	int ret;
++	unsigned short flag = 0x5858; /* = XX, looks nicer in debug */
++
++        BUG_ON(t != current);
++
++	if (unlikely(t->rt_param.kernel_np))
++		return 1;
++	else if (unlikely(t->rt_param.np_flag == NULL) ||
++		 t->flags & PF_EXITING ||
++		 t->state == TASK_DEAD)
++		return 0;
++	else {
++		/* This is the tricky part. The process has registered a
++		 * non-preemptive section marker. We now need to check whether
++		 * it is set to to NON_PREEMPTIVE. Along the way we could
++		 * it is set to RT_NON_PREEMPTIVE. Along the way we could
++		 * kill the task) or that the location contains some garbage
++		 * value (=> also kill the task). Killing the task in any case
++		 * forces userspace to play nicely. Any bugs will be discovered
++		 * immediately.
++		 */
++		ret = probe_kernel_address(t->rt_param.np_flag, flag);
++		if (0 == ret && (flag == RT_NON_PREEMPTIVE ||
++				 flag == RT_PREEMPTIVE))
++			return flag != RT_PREEMPTIVE;
++		else {
++			/* either we could not read from the address or
++			 * it contained garbage => kill the process
++			 * FIXME: Should we cause a SEGFAULT instead?
++			 */
++			TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret,
++			      flag & 0xff, (flag >> 8) & 0xff, flag);
++			np_mem_error(t, "is_np() could not read");
++			return 0;
++		}
++	}
++}
++
++/*
++ *	sys_exit_np() allows a real-time task to signal that it left a
++ *      non-preemptable section. It will be called after the kernel requested a
++ *      callback in the preemption indicator flag.
++ *	returns 0      if the signal was valid and processed.
++ *	returns EINVAL if current task is not a real-time task
++ */
++asmlinkage long sys_exit_np(void)
++{
++	int retval = -EINVAL;
++
++	TS_EXIT_NP_START;
++
++	if (!is_realtime(current))
++		goto out;
++
++	TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid);
++	/* force rescheduling so that we can be preempted */
++	set_tsk_need_resched(current);
++	retval = 0;
++      out:
++
++	TS_EXIT_NP_END;
++	return retval;
++}
++
++void __setscheduler(struct task_struct *, int, int);
++
++/* p is a real-time task. Re-init its state as a best-effort task. */
++static void reinit_litmus_state(struct task_struct* p, int restore)
++{
++	struct rt_task  user_config = {};
++	__user short   *np_flag     = NULL;
++
++	if (restore) {
++		/* Save user-space provided configuration data.
++		 * FIXME: This is missing service levels for adaptive tasks.
++		 */
++		user_config = p->rt_param.task_params;
++		np_flag     = p->rt_param.np_flag;
++	}
++
++	/* We probably should not be inheriting any task's priority
++	 * at this point in time.
++	 */
++	WARN_ON(p->rt_param.inh_task);
++
++	/* We need to restore the priority of the task. */
++	__setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio);
++
++	/* Cleanup everything else. */
++	memset(&p->rt_param, 0, sizeof(p->rt_param));
++
++	/* Restore preserved fields. */
++	if (restore) {
++		p->rt_param.task_params = user_config;
++		p->rt_param.np_flag      = np_flag;
++	}
++}
++
++long transition_to_rt(struct task_struct* tsk)
++{
++	long retval;
++	long flags;
++
++	BUG_ON(is_realtime(tsk));
++
++	if (get_rt_period(tsk) == 0 ||
++	    get_exec_cost(tsk) > get_rt_period(tsk)) {
++		TRACE_TASK(tsk, "litmus prepare: invalid task parameters "
++			   "(%lu, %lu)\n",
++		       get_exec_cost(tsk), get_rt_period(tsk));
++		return -EINVAL;
++	}
++
++	if (!cpu_online(get_partition(tsk)))
++	{
++		TRACE_TASK(tsk, "litmus prepare: cpu %d is not online\n",
++			   get_partition(tsk));
++		return -EINVAL;
++	}
++
++	tsk->rt_param.old_prio   = tsk->rt_priority;
++	tsk->rt_param.old_policy = tsk->policy;
++	INIT_LIST_HEAD(&tsk->rt_list);
++
++	/* avoid scheduler plugin changing underneath us */
++	spin_lock_irqsave(&task_transition_lock, flags);
++	retval = curr_sched_plugin->prepare_task(tsk);
++
++	if (!retval) {
++		atomic_inc(&rt_task_count);
++		__setscheduler(tsk, SCHED_FIFO, MAX_RT_PRIO - 1);
++		tsk->rt_param.is_realtime 	= 1;
++		tsk->rt_param.litmus_controlled = 1;
++	}
++	spin_unlock_irqrestore(&task_transition_lock, flags);
++
++	return retval;
++}
++
++long transition_to_be(struct task_struct* tsk)
++{
++	BUG_ON(!is_realtime(tsk));
++
++	curr_sched_plugin->tear_down(tsk);
++	atomic_dec(&rt_task_count);
++	reinit_litmus_state(tsk, 1);
++	return 0;
++}
++
++
++/* Switching a plugin in use is tricky.
++ * We must watch out that no real-time tasks exist
++ * (and that none is created in parallel) and that the plugin is not
++ * currently in use on any processor (in theory).
++ *
++ * For now, we don't enforce the second part since it is unlikely to cause
++ * any trouble by itself as long as we don't unload modules.
++ */
++int switch_sched_plugin(struct sched_plugin* plugin)
++{
++	long flags;
++	int ret = 0;
++
++	BUG_ON(!plugin);
++
++	/* stop task transitions */
++	spin_lock_irqsave(&task_transition_lock, flags);
++
++	/* don't switch if there are active real-time tasks */
++	if (atomic_read(&rt_task_count) == 0) {
++		printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
++		curr_sched_plugin = plugin;
++	} else
++		ret = -EBUSY;
++
++	spin_unlock_irqrestore(&task_transition_lock, flags);
++	return ret;
++}
++
++/* Called upon fork.
++ * p is the newly forked task.
++ */
++void litmus_fork(struct task_struct* p)
++{
++	if (is_realtime(p))
++		/* clean out any litmus related state, don't preserve anything*/
++		reinit_litmus_state(p, 0);
++}
++
++/* Called upon execve().
++ * current is doing the exec.
++ * Don't let address space specific stuff leak.
++ */
++void litmus_exec(void)
++{
++	struct task_struct* p = current;
++
++	if (is_realtime(p)) {
++		WARN_ON(p->rt_param.inh_task);
++		p->rt_param.np_flag = NULL;
++	}
++}
++
++void exit_litmus(struct task_struct *dead_tsk)
++{
++	if (is_realtime(dead_tsk))
++		transition_to_be(dead_tsk);
++}
++
++
++void list_qsort(struct list_head* list, list_cmp_t less_than)
++{
++	struct list_head lt;
++	struct list_head geq;
++	struct list_head *pos, *extra, *pivot;
++	int n_lt = 0, n_geq = 0;
++	BUG_ON(!list);
++
++	if (list->next == list)
++		return;
++
++	INIT_LIST_HEAD(&lt);
++	INIT_LIST_HEAD(&geq);
++
++	pivot = list->next;
++	list_del(pivot);
++	list_for_each_safe(pos, extra, list) {
++		list_del(pos);
++		if (less_than(pos, pivot)) {
++			list_add(pos, &lt);
++			n_lt++;
++		} else {
++			list_add(pos, &geq);
++			n_geq++;
++		}
++	}
++	if (n_lt < n_geq) {
++		list_qsort(&lt, less_than);
++		list_qsort(&geq, less_than);
++	} else {
++		list_qsort(&geq, less_than);
++		list_qsort(&lt, less_than);
++	}
++	list_splice(&geq, list);
++	list_add(pivot, list);
++	list_splice(&lt, list);
++}
++
++#ifdef CONFIG_MAGIC_SYSRQ
++int sys_kill(int pid, int sig);
++
++static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
++{
++	struct task_struct *t;
++	read_lock(&tasklist_lock);
++	for_each_process(t) {
++		if (is_realtime(t)) {
++			sys_kill(t->pid, SIGKILL);
++		}
++	}
++	read_unlock(&tasklist_lock);
++}
++
++static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
++	.handler	= sysrq_handle_kill_rt_tasks,
++	.help_msg	= "Quit-rt-tasks",
++	.action_msg	= "sent SIGKILL to all real-time tasks",
++};
++#endif
++
++static int proc_read_stats(char *page, char **start,
++			   off_t off, int count,
++			   int *eof, void *data)
++{
++	int len;
++
++	len = snprintf(page, PAGE_SIZE,
++		       "real-time task count = %d\n",
++		       atomic_read(&rt_task_count));
++	return len;
++}
++
++static int proc_read_plugins(char *page, char **start,
++			   off_t off, int count,
++			   int *eof, void *data)
++{
++	int len;
++
++	len = print_sched_plugins(page, PAGE_SIZE);
++	return len;
++}
++
++static int proc_read_curr(char *page, char **start,
++			  off_t off, int count,
++			  int *eof, void *data)
++{
++	int len;
++
++	len = snprintf(page, PAGE_SIZE, "%s\n", curr_sched_plugin->plugin_name);
++	return len;
++}
++
++static int proc_write_curr(struct file *file,
++			   const char *buffer,
++			   unsigned long count,
++			   void *data)
++{
++	int len, ret;
++	char name[65];
++	struct sched_plugin* found;
++
++	if(count > 64)
++		len = 64;
++	else
++		len = count;
++
++	if(copy_from_user(name, buffer, len))
++		return -EFAULT;
++
++	name[len] = '\0';
++	/* chomp name */
++	if (len > 1 && name[len - 1] == '\n')
++		name[len - 1] = '\0';
++
++	found = find_sched_plugin(name);
++
++	if (found) {
++		ret = switch_sched_plugin(found);
++		if (ret != 0)
++			printk(KERN_INFO "Could not switch plugin: %d\n", ret);
++	} else
++		printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
++
++	return len;
++}
++
++
++static struct proc_dir_entry *litmus_dir = NULL,
++	*curr_file = NULL,
++	*stat_file = NULL,
++	*plugs_file = NULL;
++
++static int __init init_litmus_proc(void)
++{
++	litmus_dir = proc_mkdir("litmus", NULL);
++	if (!litmus_dir) {
++		printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
++		return -ENOMEM;
++	}
++	litmus_dir->owner = THIS_MODULE;
++
++	curr_file = create_proc_entry("active_plugin",
++				      0644, litmus_dir);
++	if (!curr_file) {
++		printk(KERN_ERR "Could not allocate active_plugin "
++		       "procfs entry.\n");
++		return -ENOMEM;
++	}
++	curr_file->owner = THIS_MODULE;
++	curr_file->read_proc  = proc_read_curr;
++	curr_file->write_proc = proc_write_curr;
++
++	stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
++					   proc_read_stats, NULL);
++
++	plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir,
++					   proc_read_plugins, NULL);
++
++	return 0;
++}
++
++static void exit_litmus_proc(void)
++{
++	if (plugs_file)
++		remove_proc_entry("plugins", litmus_dir);
++	if (stat_file)
++		remove_proc_entry("stats", litmus_dir);
++	if (curr_file)
++		remove_proc_entry("active_plugin", litmus_dir);
++	if (litmus_dir)
++		remove_proc_entry("litmus", NULL);
++}
++
++extern struct sched_plugin linux_sched_plugin;
++
++static int __init _init_litmus(void)
++{
++	/*      Common initialization. The task transition lock is used
++	 *      to ensure that only a single mode change or plugin switch
++	 *      is in progress at any time.
++	 */
++	printk("Starting LITMUS^RT kernel\n");
++
++	register_sched_plugin(&linux_sched_plugin);
++
++#ifdef CONFIG_MAGIC_SYSRQ
++	/* offer some debugging help */
++	if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op))
++		printk("Registered kill rt tasks magic sysrq.\n");
++	else
++		printk("Could not register kill rt tasks magic sysrq.\n");
++#endif
++
++	init_litmus_proc();
++
++	return 0;
++}
++
++static void _exit_litmus(void)
++{
++	exit_litmus_proc();
++}
++
++module_init(_init_litmus);
++module_exit(_exit_litmus);
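
Putting the syscalls above together, a task is admitted in two steps: its rt_task parameters are set while it is still best-effort (sys_set_rt_task_param() refuses real-time tasks with -EBUSY), and the actual BE -> RT switch happens via sys_task_mode_transition(), which ends up in transition_to_rt(). The sketch below shows that sequence from user space under assumptions: the wrapper names, the cut-down struct, and the LITMUS_RT_TASK value are placeholders, while the field names exec_cost, period, and cpu match the checks in sys_set_rt_task_param() above.

#include <sys/types.h>
#include <unistd.h>

/* Simplified stand-in for struct rt_task from litmus/rt_param.h. */
struct rt_task {
	unsigned long long exec_cost;
	unsigned long long period;
	int cpu;
};

/* Assumed liblitmus-style wrappers. */
int set_rt_task_param(pid_t pid, struct rt_task *param);
int task_mode_transition(int target_mode);
#define LITMUS_RT_TASK 1	/* placeholder for the real constant */

/* Configure parameters first, then request the BE -> RT transition. */
int become_periodic(unsigned long long wcet, unsigned long long period, int cpu)
{
	struct rt_task params = {
		.exec_cost = wcet,	/* must not exceed period */
		.period    = period,
		.cpu       = cpu,	/* must be online */
	};

	if (set_rt_task_param(getpid(), &params) != 0)
		return -1;
	return task_mode_transition(LITMUS_RT_TASK);
}
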
+diff --git a/litmus/litmus_sem.c b/litmus/litmus_sem.c
+new file mode 100644
+index 0000000..7179b43
+--- /dev/null
++++ b/litmus/litmus_sem.c
+@@ -0,0 +1,551 @@
++/*
++ * PI semaphores and SRP implementations.
++ * Much of the code here is borrowed from include/asm-i386/semaphore.h.
++ *
++ * NOTE: This implementation is very much a prototype and horribly insecure. It
++ *       is intended to be a proof of concept, not a feature-complete solution.
++ */
++
++#include <asm/atomic.h>
++#include <asm/semaphore.h>
++#include <linux/sched.h>
++#include <linux/wait.h>
++#include <linux/spinlock.h>
++#include <litmus/litmus.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/edf_common.h>
++
++#include <litmus/fdso.h>
++
++#include <litmus/trace.h>
++
++/* ************************************************************************** */
++/*                          PRIORITY INHERITANCE                              */
++/* ************************************************************************** */
++
++static  void* create_pi_semaphore(void)
++{
++	struct pi_semaphore* sem;
++	int i;
++
++	sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL);
++	if (!sem)
++		return NULL;
++	atomic_set(&sem->count, 1);
++	sem->sleepers = 0;
++	init_waitqueue_head(&sem->wait);
++	sem->hp.task = NULL;
++	sem->holder = NULL;
++	for (i = 0; i < NR_CPUS; i++)
++		sem->hp.cpu_task[i] = NULL;
++	return sem;
++}
++
++static void destroy_pi_semaphore(void* sem)
++{
++	/* XXX assert invariants */
++	kfree(sem);
++}
++
++struct fdso_ops pi_sem_ops = {
++	.create  = create_pi_semaphore,
++	.destroy = destroy_pi_semaphore
++};
++
++struct wq_pair {
++	struct task_struct*  tsk;
++	struct pi_semaphore* sem;
++};
++
++static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
++			   void *key)
++{
++	struct wq_pair* wqp   = (struct wq_pair*) wait->private;
++	set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
++	curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk);
++	TRACE_TASK(wqp->tsk,
++		   "woken up by rt_pi_wake_up() (RT_F_EXIT_SEM, PI)\n");
++	/* point to task for default_wake_function() */
++	wait->private = wqp->tsk;
++	default_wake_function(wait, mode, sync, key);
++
++	/* Always return true since we know that if we encountered a task
++	 * that was already running the wake_up raced with the schedule in
++	 * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
++	 * immediately and own the lock. We must not wake up another task in
++	 * any case.
++	 */
++	return 1;
++}
++
++/* caller is responsible for locking */
++int set_hp_task(struct pi_semaphore *sem, prio_cmp_t higher_prio)
++{
++	struct list_head	*tmp, *next;
++	struct task_struct 	*queued;
++	int ret = 0;
++
++	sem->hp.task = NULL;
++	list_for_each_safe(tmp, next, &sem->wait.task_list) {
++		queued  = ((struct wq_pair*)
++			list_entry(tmp, wait_queue_t,
++				   task_list)->private)->tsk;
++
++		/* Compare task prios, find high prio task. */
++		if (higher_prio(queued, sem->hp.task)) {
++			sem->hp.task = queued;
++			ret = 1;
++		}
++	}
++	return ret;
++}
++
++/* caller is responsible for locking */
++int set_hp_cpu_task(struct pi_semaphore *sem, int cpu, prio_cmp_t higher_prio)
++{
++	struct list_head	*tmp, *next;
++	struct task_struct 	*queued;
++	int ret = 0;
++
++	sem->hp.cpu_task[cpu] = NULL;
++	list_for_each_safe(tmp, next, &sem->wait.task_list) {
++		queued  = ((struct wq_pair*)
++			list_entry(tmp, wait_queue_t,
++				   task_list)->private)->tsk;
++
++		/* Compare task prios, find high prio task. */
++		if (get_partition(queued) == cpu &&
++		    higher_prio(queued, sem->hp.cpu_task[cpu])) {
++			sem->hp.cpu_task[cpu] = queued;
++			ret = 1;
++		}
++	}
++	return ret;
++}
++
++int do_pi_down(struct pi_semaphore* sem)
++{
++	unsigned long flags;
++	struct task_struct *tsk = current;
++	struct wq_pair pair;
++	int suspended = 1;
++	wait_queue_t wait = {
++		.private = &pair,
++		.func    = rt_pi_wake_up,
++		.task_list = {NULL, NULL}
++	};
++
++	pair.tsk = tsk;
++	pair.sem = sem;
++	spin_lock_irqsave(&sem->wait.lock, flags);
++
++	if (atomic_dec_return(&sem->count) < 0 ||
++	    waitqueue_active(&sem->wait)) {
++		/* we need to suspend */
++		tsk->state = TASK_UNINTERRUPTIBLE;
++		add_wait_queue_exclusive_locked(&sem->wait, &wait);
++
++		TRACE_CUR("suspends on PI lock %p\n", sem);
++		curr_sched_plugin->pi_block(sem, tsk);
++
++		/* release lock before sleeping */
++		spin_unlock_irqrestore(&sem->wait.lock, flags);
++
++		TS_PI_DOWN_END;
++		preempt_enable_no_resched();
++
++
++		/* We depend on the FIFO order of the wait queue. Thus, we
++		 * don't need to recheck when we wake up; we are guaranteed
++		 * to hold the lock since there is only one wake-up per
++		 * release.
++		 */
++		schedule();
++
++		TRACE_CUR("woke up, now owns PI lock %p\n", sem);
++
++		/* try_to_wake_up() set our state to TASK_RUNNING,
++		 * all we need to do is to remove our wait queue entry
++		 */
++		remove_wait_queue(&sem->wait, &wait);
++	} else {
++		/* no priority inheritance necessary, since there are no queued
++		 * tasks.
++		 */
++		suspended = 0;
++		TRACE_CUR("acquired PI lock %p, no contention\n", sem);
++		sem->holder  = tsk;
++		sem->hp.task = tsk;
++		curr_sched_plugin->inherit_priority(sem, tsk);
++		spin_unlock_irqrestore(&sem->wait.lock, flags);
++	}
++	return suspended;
++}
++
++void do_pi_up(struct pi_semaphore* sem)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&sem->wait.lock, flags);
++
++	TRACE_CUR("releases PI lock %p\n", sem);
++	curr_sched_plugin->return_priority(sem);
++	sem->holder = NULL;
++	if (atomic_inc_return(&sem->count) < 1)
++		/* there is a task queued */
++		wake_up_locked(&sem->wait);
++
++	spin_unlock_irqrestore(&sem->wait.lock, flags);
++}
++
++asmlinkage long sys_pi_down(int sem_od)
++{
++	long ret = 0;
++	struct pi_semaphore * sem;
++	int suspended = 0;
++
++	preempt_disable();
++	TS_PI_DOWN_START;
++
++	sem = lookup_pi_sem(sem_od);
++	if (sem)
++		suspended = do_pi_down(sem);
++	else
++		ret = -EINVAL;
++
++	if (!suspended) {
++		TS_PI_DOWN_END;
++		preempt_enable();
++	}
++
++	return ret;
++}
++
++asmlinkage long sys_pi_up(int sem_od)
++{
++	long ret = 0;
++	struct pi_semaphore * sem;
++
++	preempt_disable();
++	TS_PI_UP_START;
++
++	sem = lookup_pi_sem(sem_od);
++	if (sem)
++		do_pi_up(sem);
++	else
++		ret = -EINVAL;
++
++
++	TS_PI_UP_END;
++	preempt_enable();
++
++	return ret;
++}
++
++
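
The PI path above suspends a contending task in FIFO order inside do_pi_down() and hands priorities around via the plugin's inherit_priority()/return_priority() hooks. A hedged user-space sketch of the critical-section bracket follows; pi_down()/pi_up() stand in for liblitmus wrappers around sys_pi_down()/sys_pi_up(), and od is an object descriptor for a PI semaphore obtained through the FDSO interface.

/* Assumed liblitmus wrappers around sys_pi_down()/sys_pi_up(). */
int pi_down(int od);
int pi_up(int od);

/* Bracket a critical section with the PI semaphore identified by od. */
static void with_pi_lock(int od, void (*critical_section)(void))
{
	pi_down(od);		/* may suspend; the holder inherits our priority */
	critical_section();
	pi_up(od);		/* wakes at most one FIFO-ordered waiter */
}
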
++/* ************************************************************************** */
++/*                          STACK RESOURCE POLICY                             */
++/* ************************************************************************** */
++
++
++struct srp_priority {
++	struct list_head	list;
++        unsigned int 		period;
++	pid_t			pid;
++};
++
++#define list2prio(l) list_entry(l, struct srp_priority, list)
++
++/* SRP task priority comparison function. Smaller periods have higher
++ * priority; ties are broken by PID. Special case: period == 0 <=> no priority
++ */
++static int srp_higher_prio(struct srp_priority* first,
++			   struct srp_priority* second)
++{
++	if (!first->period)
++		return 0;
++	else
++		return  !second->period ||
++			first->period < second->period || (
++			first->period == second->period &&
++			first->pid < second->pid);
++}
++
++struct srp {
++	struct list_head	ceiling;
++	wait_queue_head_t	ceiling_blocked;
++};
++
++
++atomic_t srp_objects_in_use = ATOMIC_INIT(0);
++
++DEFINE_PER_CPU(struct srp, srp);
++
++
++/* Initialize SRP semaphores at boot time. */
++static int __init srp_init(void)
++{
++	int i;
++
++	printk("Initializing SRP per-CPU ceilings...");
++	for (i = 0; i < NR_CPUS; i++) {
++		init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
++		INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
++	}
++	printk(" done!\n");
++
++	return 0;
++}
++module_init(srp_init);
++
++
++#define system_ceiling(srp) list2prio(srp->ceiling.next)
++
++
++#define UNDEF_SEM -2
++
++
++/* struct for uniprocessor SRP "semaphore" */
++struct srp_semaphore {
++	struct srp_priority ceiling;
++	struct task_struct* owner;
++	int cpu; /* cpu associated with this "semaphore" and resource */
++};
++
++#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling) 
++
++static int srp_exceeds_ceiling(struct task_struct* first,
++			       struct srp* srp)
++{
++	return list_empty(&srp->ceiling) ||
++	       get_rt_period(first) < system_ceiling(srp)->period ||
++	       (get_rt_period(first) == system_ceiling(srp)->period &&
++		first->pid < system_ceiling(srp)->pid) || 
++		ceiling2sem(system_ceiling(srp))->owner == first;
++}
++
++static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
++{
++	struct list_head *pos;
++	if (in_list(&prio->list)) {
++		printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
++		       "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
++		return;
++	}
++	list_for_each(pos, &srp->ceiling)
++		if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
++			__list_add(&prio->list, pos->prev, pos);
++			return;
++		}
++
++	list_add_tail(&prio->list, &srp->ceiling);
++}
++
++
++static void* create_srp_semaphore(void)
++{
++	struct srp_semaphore* sem;
++
++	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
++	if (!sem)
++		return NULL;
++
++	INIT_LIST_HEAD(&sem->ceiling.list);
++	sem->ceiling.period = 0;
++	sem->cpu     = UNDEF_SEM;
++	sem->owner   = NULL;
++	atomic_inc(&srp_objects_in_use);
++	return sem;
++}
++
++static noinline int open_srp_semaphore(struct od_table_entry* entry, void* __user arg)
++{
++	struct srp_semaphore* sem = (struct srp_semaphore*) entry->obj->obj;
++	int ret = 0;
++	struct task_struct* t = current;
++	struct srp_priority t_prio;
++
++	TRACE("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
++	if (!srp_active())
++		return -EBUSY;
++
++	if (sem->cpu == UNDEF_SEM)
++		sem->cpu = get_partition(t);
++	else if (sem->cpu != get_partition(t))
++		ret = -EPERM;
++
++	if (ret == 0) {
++		t_prio.period = get_rt_period(t);
++		t_prio.pid    = t->pid;
++		if (srp_higher_prio(&t_prio, &sem->ceiling)) {
++			sem->ceiling.period = t_prio.period;
++			sem->ceiling.pid    = t_prio.pid;
++		}
++	}
++
++	return ret;
++}
++
++static void destroy_srp_semaphore(void* sem)
++{
++	/* XXX invariants */
++	atomic_dec(&srp_objects_in_use);
++	kfree(sem);
++}
++
++struct fdso_ops srp_sem_ops = {
++	.create  = create_srp_semaphore,
++	.open    = open_srp_semaphore,
++	.destroy = destroy_srp_semaphore
++};
++
++
++void do_srp_down(struct srp_semaphore* sem)
++{
++	/* Update ceiling. */
++	srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
++	WARN_ON(sem->owner != NULL);
++	sem->owner = current;
++	TRACE_CUR("acquired srp 0x%p\n", sem);
++}
++
++void do_srp_up(struct srp_semaphore* sem)
++{	
++	/* Determine new system priority ceiling for this CPU. */
++	WARN_ON(!in_list(&sem->ceiling.list));
++	if (in_list(&sem->ceiling.list))
++		list_del(&sem->ceiling.list);
++
++	sem->owner = NULL;
++
++	/* Wake tasks on this CPU, if they exceed current ceiling. */
++	TRACE_CUR("released srp 0x%p\n", sem);
++	wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
++}
++
++/* Adjust the system-wide priority ceiling if resource is claimed. */
++asmlinkage long sys_srp_down(int sem_od)
++{
++	int cpu;
++	int ret = -EINVAL;
++	struct srp_semaphore* sem;
++
++	/* disabling preemptions is sufficient protection since
++	 * SRP is strictly per CPU and we don't interfere with any
++	 * interrupt handlers
++	 */
++	preempt_disable();
++	TS_SRP_DOWN_START;
++
++	cpu = smp_processor_id();
++	sem = lookup_srp_sem(sem_od);
++	if (sem && sem->cpu == cpu) {
++		do_srp_down(sem);
++		ret = 0;
++	}
++
++	TS_SRP_DOWN_END;
++	preempt_enable();
++	return ret;
++}
++
++/* Adjust the system-wide priority ceiling if resource is freed. */
++asmlinkage long sys_srp_up(int sem_od)
++{
++	int cpu;
++	int ret = -EINVAL;
++	struct srp_semaphore* sem;
++
++	preempt_disable();
++	TS_SRP_UP_START;
++
++	cpu = smp_processor_id();
++	sem = lookup_srp_sem(sem_od);
++
++	if (sem && sem->cpu == cpu) {
++		do_srp_up(sem);
++		ret = 0;
++	}
++
++	TS_SRP_UP_END;
++	preempt_enable();
++	return ret;
++}
++
++asmlinkage long sys_reg_task_srp_sem(int sem_od)
++{
++	/* unused */
++	return 0;
++}
++
++static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
++		       void *key)
++{
++	int cpu = smp_processor_id();
++	struct task_struct *tsk = wait->private;
++	if (cpu != get_partition(tsk))
++		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
++			   get_partition(tsk));
++	else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
++		return default_wake_function(wait, mode, sync, key);
++	return 0;
++}
++
++
++
++static void do_ceiling_block(struct task_struct *tsk)
++{
++	wait_queue_t wait = {
++		.private   = tsk,
++		.func      = srp_wake_up,
++		.task_list = {NULL, NULL}
++	};
++
++	tsk->state = TASK_UNINTERRUPTIBLE;
++	add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
++	tsk->rt_param.srp_non_recurse = 1;
++	preempt_enable_no_resched();
++	schedule();
++	preempt_disable();
++	tsk->rt_param.srp_non_recurse = 0;
++	remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
++}
++
++/* Wait for current task priority to exceed system-wide priority ceiling.
++ */
++void srp_ceiling_block(void)
++{
++	struct task_struct *tsk = current;
++	
++	TS_SRPT_START;
++
++	/* Only applies to real-time tasks, but optimize for the RT-task case. */
++	if (unlikely(!is_realtime(tsk)))
++		return;
++
++	/* Avoid recursive ceiling blocking. */
++	if (unlikely(tsk->rt_param.srp_non_recurse))
++		return;
++
++	/* Bail out early if there aren't any SRP resources around. */
++	if (likely(!atomic_read(&srp_objects_in_use)))
++		return;
++
++	preempt_disable();
++	if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
++		TRACE_CUR("is priority ceiling blocked.\n");
++		TS_SRPT_END;
++		while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
++			do_ceiling_block(tsk);
++		TRACE_CUR("finally exceeds system ceiling.\n");
++	} else {
++		TS_SRPT_END;
++		TRACE_CUR("is not priority ceiling blocked\n");	
++	}
++	preempt_enable();
++}
++
++/* ************************************************************************** */
++
++
++
+diff --git a/litmus/pcp.c b/litmus/pcp.c
+new file mode 100644
+index 0000000..06030d4
+--- /dev/null
++++ b/litmus/pcp.c
+@@ -0,0 +1,764 @@
++/* pcp.c -- Implementations of the PCP, D-PCP, and M-PCP.
++ *
++ */
++#include <asm/uaccess.h>
++#include <linux/wait.h>
++#include <linux/list.h>
++#include <linux/sched.h>
++#include <linux/spinlock.h>
++#include <linux/completion.h>
++
++#include <litmus/sched_plugin.h>
++#include <litmus/litmus.h>
++#include <litmus/rm_common.h>
++#include <litmus/fdso.h>
++#include <litmus/trace.h>
++
++/* from sched_rm.c */
++void rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio);
++
++#define GLOBAL_SEM  -1
++#define UNDEF_SEM -2
++
++#define get_prio(t) 		((t)->rt_param.cur_prio)
++#define get_base_prio(t)	(&((t)->rt_param.pcp_prio))
++
++
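++/* A dpcp_request represents one remote invocation under the D-PCP: it is
++ * queued in priority order at the semaphore's home CPU, and the issuing
++ * task blocks on 'done' until the agent has executed the request.
++ */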
++struct dpcp_request {
++	struct list_head	list;
++	struct completion	done;
++	long 			arg;
++	lt_t			prio;
++	int			pid;
++};
++
++struct pcp_semaphore {
++	int 			cpu;
++
++	/* waiting tasks */
++	wait_queue_head_t 	blocked;
++	struct pcp_priority*	blocked_prio;
++
++	/* system ceiling support */
++	struct list_head	list;
++	struct pcp_priority	ceiling;
++
++	/* task_struct owned_semaphore list */
++	struct list_head	owned_list;
++
++	/* Current lock holder.
++	 * NULL implies unlocked.
++	 */
++	struct task_struct*	holder;
++
++	/* D-PCP support */
++	spinlock_t		dpcp_lock;
++	struct list_head 	dpcp_requests;
++	int			dpcp_count;
++	struct dpcp_request*	dpcp_current;
++	struct completion	dpcp_job;
++	struct task_struct*	dpcp_agent;
++};
++
++static DEFINE_PER_CPU(spinlock_t, pcp_lock);
++static DEFINE_PER_CPU(struct list_head, sys_ceiling);
++
++static noinline void init_pcp_sem(struct pcp_semaphore *sem, int cpu)
++{
++	sem->cpu = cpu;
++	init_waitqueue_head(&sem->blocked);
++	INIT_LIST_HEAD(&sem->list);
++	INIT_LIST_HEAD(&sem->owned_list);
++	INIT_LIST_HEAD(&sem->dpcp_requests);
++	sem->holder = NULL;
++	sem->dpcp_current = NULL;
++	sem->blocked_prio = NULL;
++	sem->ceiling      = (struct pcp_priority) {ULLONG_MAX, 0, INT_MAX};
++	init_completion(&sem->dpcp_job);
++	spin_lock_init(&sem->dpcp_lock);
++	sem->dpcp_count = 0;
++	sem->dpcp_agent = NULL;
++}
++
++static noinline int tsk_pcp_higher_prio(struct task_struct* t,
++					struct pcp_priority* p2)
++{
++	return _rm_higher_prio(t->rt_param.cur_prio, p2);
++}
++
++static noinline struct pcp_semaphore* get_ceiling(int cpu)
++{
++	struct list_head *ceil_list = &per_cpu(sys_ceiling, cpu);
++	if (list_empty(ceil_list))
++		return NULL;
++	return list_entry(ceil_list->next, struct pcp_semaphore, list);
++}
++
++static noinline void raise_ceiling(struct pcp_semaphore* sem, int cpu)
++{
++	struct list_head *ceil_list = &per_cpu(sys_ceiling, cpu);
++	list_add(&sem->list, ceil_list);
++}
++
++static noinline int exceeds_ceiling(struct task_struct* t,
++				    struct pcp_semaphore* ceil)
++{
++	return !ceil || ceil->holder == t ||
++		tsk_pcp_higher_prio(t, &ceil->ceiling);
++}
++
++static noinline void give_priority(struct task_struct* t, struct pcp_semaphore* sem)
++{
++	struct pcp_semaphore* next;
++	/* sem->blocked_prio can be NULL, but _rm_higher_prio() handles that */
++
++	/* only update if we actually exceed existing priorities */
++	if (_rm_higher_prio(get_prio(t), sem->blocked_prio) &&
++	    _rm_higher_prio(get_prio(t), get_base_prio(sem->holder))) {
++		/* we need to register our priority */
++		sem->blocked_prio = get_prio(t);
++
++		/* only update task if it results in a priority increase */
++		if (_rm_higher_prio(get_prio(t), get_prio(sem->holder))) {
++			/* update prio */
++			TRACE("PCP: %s/%d inherits from %s/%d\n",
++				sem->holder->comm, sem->holder->pid,
++				t->comm, t->pid);
++			rm_set_prio(sem->holder, get_prio(t));
++			/* check if recipient is blocked, too */
++			next = sem->holder->rt_param.blocked_on;
++			if (next)
++				/* Transitive priority inheritance.
++				 * Recurse.
++				 */
++				give_priority(sem->holder, next);
++		}
++	}
++}
++
++static noinline long local_pcp_down(struct pcp_semaphore *sem)
++{
++	long ret = 0;
++	struct task_struct* t = current;
++	struct pcp_semaphore* ceiling;
++	int cpu;
++	int ceiling_passed = 0;
++
++	/* don't allow recursive locking */
++	if (sem->holder == t)
++		return -EINVAL;
++
++	cpu = smp_processor_id();
++	if (cpu != sem->cpu) {
++		preempt_enable();
++		return -EPERM;
++	}
++
++
++	/* first we need to pass the local system ceiling */
++	while (!ceiling_passed) {
++		ceiling = get_ceiling(cpu);
++		TRACE_TASK(t, "PCP: I want %p, ceiling is %p\n", sem, ceiling);
++		ceiling_passed = exceeds_ceiling(t, ceiling);
++		if (!ceiling_passed) {
++			/* block on sys_ceiling */
++			DECLARE_WAITQUEUE(waitq, t);
++			TRACE_TASK(t, "blocks on PCP system ceiling\n");
++			add_wait_queue(&ceiling->blocked, &waitq);
++			/* initiate priority inheritance */
++			give_priority(t, ceiling);
++			t->rt_param.blocked_on = ceiling;
++			t->state = TASK_UNINTERRUPTIBLE;
++			preempt_enable_no_resched();
++			TS_PCP1_DOWN_END;
++			schedule();
++			preempt_disable();
++			t->rt_param.blocked_on = NULL;
++			remove_wait_queue(&ceiling->blocked, &waitq);
++		} else {
++			if (ceiling)
++				TRACE_TASK(t,
++					   "system ceiling passed: {%llu, %d, %d} < "
++					   "{%llu, %d, %d}\n",
++					   ceiling->ceiling.prio,
++					   ceiling->ceiling.in_global_cs,
++					   ceiling->ceiling.pid,
++					   t->rt_param.cur_prio->prio,
++					   t->rt_param.cur_prio->in_global_cs,
++					   t->rt_param.cur_prio->pid
++					);
++			else
++				TRACE_TASK(t,
++					   "system ceiling passed: NULL < "
++					   "{%llu, %d, %d}\n",
++					   t->rt_param.cur_prio->prio,
++					   t->rt_param.cur_prio->in_global_cs,
++					   t->rt_param.cur_prio->pid
++					);
++			TS_PCP1_DOWN_END;
++		}
++	}
++
++	TS_PCP2_DOWN_START;
++	/* Since we have passed the priority ceiling the semaphore cannot be
++	 * in use. If it were in use then the ceiling would be at least as high
++	 * as our priority.
++	 */
++	WARN_ON(sem->holder);
++
++	TRACE_TASK(t, "taking PCP semaphore 0x%p, owner:%p\n", sem, sem->holder);
++
++	/* We can become the owner. */
++	sem->holder = t;
++	list_add(&sem->owned_list, &t->rt_param.owned_semaphores);
++
++	/* We need to update the system ceiling, but only
++	 * if the new ceiling is higher than the old.
++	 */
++	ceiling = get_ceiling(cpu);
++	/* if the priorities are equal then t already owns ceiling,
++	 * otherwise it would not have gotten past the system ceiling
++	 */
++	if (!ceiling || _rm_higher_prio(&sem->ceiling, &ceiling->ceiling)) {
++		raise_ceiling(sem, cpu);
++		TRACE_TASK(t, "raised ceiling on %d\n", cpu);
++	}
++
++	TS_PCP2_DOWN_END;
++	return ret;
++}
++
++static noinline struct pcp_priority* fetch_highest_prio(struct task_struct *t)
++{
++	struct pcp_priority *prio;
++	struct list_head* pos;
++	struct pcp_semaphore* sem;
++
++	/* base case is that the task uses its normal priority */
++	prio = get_base_prio(t);
++
++	/* now search the list of semaphores that we own for a higher priority
++	 * to inherit
++	 */
++	list_for_each(pos, &t->rt_param.owned_semaphores) {
++		sem = list_entry(pos, struct pcp_semaphore, owned_list);
++		/* sem->blocked_prio could be NULL */
++		if (!_rm_higher_prio(prio, sem->blocked_prio))
++			prio = sem->blocked_prio;
++	}
++	return prio;
++}
++
++static noinline long local_pcp_up(struct pcp_semaphore *sem)
++{
++	long ret = 0;
++	struct task_struct* t = current;
++	int cpu;
++
++	cpu = smp_processor_id();
++
++	if (cpu != sem->cpu)
++		return -EPERM;
++
++	if (sem->holder == t) {
++		TRACE_TASK(t, "giving up PCP semaphore 0x%p.\n", sem);
++
++		/* we need to unblock all tasks in the wait_queue */
++		wake_up_all(&sem->blocked);
++
++		/* unlock semaphore */
++		sem->holder = NULL;
++		list_del(&sem->owned_list);
++
++		/* remove from system ceiling list */
++		if (in_list(&sem->list))
++			list_del(&sem->list);
++
++		if (sem->blocked_prio == get_prio(t)) {
++			/* We are currently inheriting  from this
++			 * semaphore. We need to figure out which priority
++			 * we should fall back to.
++			 */
++			 TRACE_TASK(t, "giving up inherited prio.\n");
++			 rm_set_prio(t, fetch_highest_prio(t));
++		}
++		/* reset semaphore priority inheritance */
++		sem->blocked_prio = NULL;
++	} else {
++		TRACE_TASK(t, "local_pcp_up EINVAL 0x%p.\n", sem);
++		ret = -EINVAL;
++	}
++
++	TS_PCP_UP_END;
++	return ret;
++}
++
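++/* Map a wait-queue entry back to the blocked task it represents
++ * (stored in wait_queue_t->private).
++ */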
++static noinline struct task_struct* wqlist2task(struct list_head* l)
++{
++	return (struct task_struct*)
++		list_entry(l, wait_queue_t, task_list)->private;
++}
++
++static noinline int wait_order(struct list_head* la, struct list_head* lb)
++{
++	return rm_higher_prio(wqlist2task(la), wqlist2task(lb));
++}
++
++/* The default function is too picky.
++ * We really only want to wake up one task.
++ */
++int single_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
++{
++	int ret = default_wake_function(wait, mode, sync, key);
++	if (!ret)
++		TRACE("Overriding default_wake_function() return code.\n");
++	return 1;
++}
++
++static noinline long global_pcp_down(struct pcp_semaphore* sem)
++{
++	unsigned long flags;
++	long ret = 0;
++	struct task_struct* t = current;
++
++	/* don't allow recursive locking */
++	if (sem->holder == t)
++		return -EINVAL;
++
++	spin_lock_irqsave(&sem->blocked.lock, flags);
++
++	/* Get the global priority. Do this before
++	 * we block, so that we wake up as a high-priority task.
++	 */
++	t->rt_param.pcp_prio.in_global_cs = 1;
++	rm_set_prio(t, &t->rt_param.pcp_prio);
++
++	if (sem->holder) {
++		/* semaphore is not free. We need to block. */
++		DECLARE_WAITQUEUE(waitq, t);
++		TRACE_TASK(t, "blocks on MPCP semaphore %p.\n", sem);
++		waitq.flags = WQ_FLAG_EXCLUSIVE;
++		waitq.func  = single_wake_function;
++		/* insert ordered by priority */
++		list_insert(&waitq.task_list, &sem->blocked.task_list,
++			    wait_order);
++		t->state = TASK_UNINTERRUPTIBLE;
++		spin_unlock_irqrestore(&sem->blocked.lock, flags);		
++		preempt_enable_no_resched();
++		TS_MPCP_DOWN_END;
++		
++		schedule();
++
++		preempt_disable();
++		/* once we wake up we are the owner of the lock */
++		spin_lock_irqsave(&sem->blocked.lock, flags);
++		remove_wait_queue_locked(&sem->blocked, &waitq);
++	} else {
++		/* semaphore is free. We can proceed. */
++		TS_MPCP_DOWN_END;
++		sem->holder = t;
++	}
++	if (sem->holder != t) {
++		if (sem->holder)
++			TRACE("expected %s/%d, but I am %s/%d\n",
++			      sem->holder->comm, sem->holder->pid, t->comm, t->pid);
++		else
++			TRACE("expected NULL, but I am %s/%d\n",
++			      t->comm, t->pid);
++	}
++	TRACE_TASK(t, "acquired MPCP semaphore %p.\n", sem);
++
++
++	spin_unlock_irqrestore(&sem->blocked.lock, flags);
++	return ret;
++}
++
++static noinline long global_pcp_up(struct pcp_semaphore* sem)
++{
++	unsigned long flags;
++	long ret = 0;
++	struct task_struct* t = current;
++
++	if (sem->holder != t)
++		return -EINVAL;
++
++	TRACE_TASK(t, "releasing MPCP semaphore %p.\n", sem);
++
++	spin_lock_irqsave(&sem->blocked.lock, flags);
++	if (waitqueue_active(&sem->blocked)) {
++		/* pass ownership on */
++		sem->holder = wqlist2task(sem->blocked.task_list.next);
++		TRACE_TASK(t, "waking up next (=%s/%d) on MPCP semaphore %p.\n",
++			   sem->holder->comm, sem->holder->pid, sem);
++		/* wake up first */
++		wake_up_locked(&sem->blocked);
++	} else
++		sem->holder = NULL;
++
++	/* restore our own priority */
++	t->rt_param.pcp_prio.in_global_cs = 0;
++	rm_set_prio(t, &t->rt_param.pcp_prio);
++
++	TS_MPCP_UP_END;
++	spin_unlock_irqrestore(&sem->blocked.lock, flags);	
++	return ret;
++}
++
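++/* Order pending D-PCP requests by priority value; under RM a smaller value
++ * (shorter period) means higher priority.
++ */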
++static noinline int request_order(struct list_head* la, struct list_head* lb)
++{
++	struct dpcp_request *a, *b;
++	a = list_entry(la, struct dpcp_request, list);
++	b = list_entry(lb, struct dpcp_request, list);
++	return a->prio < b->prio;
++}
++
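++/* dpcp_invoke() - issue a remote request on a D-PCP semaphore.
++ * The request is queued at the semaphore in priority order, the agent is
++ * woken if it was idle, and the caller blocks until the agent reports
++ * completion via dpcp_agent(..., DPCP_COMPLETE, ...).
++ */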
++static noinline long dpcp_invoke(struct pcp_semaphore* sem, long arg)
++{
++	unsigned long flags;
++	long ret = 0;
++	struct task_struct* t = current, *a;
++	struct dpcp_request req;
++
++	spin_lock_irqsave(&sem->dpcp_lock, flags);
++
++	init_completion(&req.done);
++	req.arg  = arg;
++	req.prio = t->rt_param.pcp_prio.prio;
++	req.pid  = t->rt_param.pcp_prio.pid;
++
++	list_insert(&req.list, &sem->dpcp_requests,
++		    request_order);
++
++	if (!(sem->dpcp_count++)) {
++		/* agent needs to be awakened */
++		TRACE_TASK(t, "waking DPCP agent for %p.\n", sem);
++		if (sem->dpcp_agent) {
++			a = sem->dpcp_agent;
++			/* set agent priority */
++			a->rt_param.pcp_prio.in_global_cs = 1;
++			a->rt_param.pcp_prio.prio = req.prio;
++			rm_set_prio(a, &a->rt_param.pcp_prio);
++		}
++		complete(&sem->dpcp_job);
++	}
++
++	spin_unlock_irqrestore(&sem->dpcp_lock, flags);
++	TRACE_TASK(t, "blocking on DPCP sem %p.\n", sem);
++	preempt_enable_no_resched();
++	TS_DPCP_INVOKE_END;
++
++	wait_for_completion(&req.done);
++
++	preempt_disable();
++	/* we don't need to clean up, the remote agent did that for us */
++	return ret;
++}
++
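++/* dpcp_agent() - executed by the agent task on the semaphore's home CPU.
++ * With DPCP_COMPLETE, the previously served request is completed; with
++ * DPCP_WAIT, the agent picks (or sleeps for) the next pending request and
++ * inherits the priority of the issuing task while serving it.
++ */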
++static noinline long dpcp_agent(struct pcp_semaphore* sem, long flags, long *arg)
++{
++	unsigned long spinflags;
++	long ret = 0;
++	struct task_struct* t = current;
++
++	spin_lock_irqsave(&sem->dpcp_lock, spinflags);
++
++	/* defend against multiple concurrent agents */
++	if (sem->dpcp_agent && sem->dpcp_agent != t) {
++		spin_unlock_irqrestore(&sem->dpcp_lock, spinflags);
++		return -EBUSY;
++	} else
++		sem->dpcp_agent = t;
++
++	if (sem->cpu != get_partition(t)) {
++		int cpu = smp_processor_id();
++		spin_unlock_irqrestore(&sem->dpcp_lock, spinflags);
++		printk(KERN_CRIT 
++		       "dpcp_agent: sem->cpu: %d, but agent "
++		       "is on %d, and part=%d\n",
++		       sem->cpu, cpu, get_partition(t));
++		return -EINVAL;
++	}
++
++	if ((flags & DPCP_COMPLETE) && sem->dpcp_current) {
++		TRACE_TASK(t, "completing DPCP sem %p.\n", sem);
++		/* we need to release the holder */
++		complete(&sem->dpcp_current->done);
++		sem->dpcp_count--;
++		sem->dpcp_current = NULL;
++	}
++
++	if (flags & DPCP_WAIT) {
++		do {
++			if (sem->dpcp_count) {
++				/* pass ownership on */
++				sem->dpcp_current = list_entry(
++					sem->dpcp_requests.next, 
++					struct dpcp_request, list);
++				list_del(sem->dpcp_requests.next);
++				t->rt_param.pcp_prio.in_global_cs = 1;
++				t->rt_param.pcp_prio.prio = 
++					sem->dpcp_current->prio;
++				t->rt_param.pcp_prio.pid = sem->dpcp_current->pid;
++				rm_set_prio(t, &t->rt_param.pcp_prio);
++				TS_DPCP_AGENT2_END;
++			} else {
++				/* need to wait */
++				spin_unlock_irqrestore(&sem->dpcp_lock, 
++						       spinflags);
++				TRACE_TASK(t, "agent waiting for "
++					   "DPCP sem %p.\n", sem);
++
++				preempt_enable_no_resched();
++				TS_DPCP_AGENT2_END;
++				ret = wait_for_completion_interruptible(&sem->dpcp_job);
++				preempt_disable();
++				TRACE_TASK(t, "got DPCP job on sem %p, "
++					   "ret=%d.\n", sem, ret);
++				spin_lock_irqsave(&sem->dpcp_lock, spinflags);
++				if (ret != 0) {
++					/* FIXME: set priority */
++					break;
++				}
++			}
++		} while (!sem->dpcp_current);	
++		if (ret == 0)
++			*arg = sem->dpcp_current->arg;
++	} else {
++		/* restore our own priority */
++		t->rt_param.pcp_prio.in_global_cs = 0;
++		t->rt_param.pcp_prio.prio = ULLONG_MAX;
++		rm_set_prio(t, &t->rt_param.pcp_prio);			
++		sem->dpcp_agent = NULL;
++	}
++
++	spin_unlock_irqrestore(&sem->dpcp_lock, spinflags);
++	return ret;
++}
++
++
++/* system calls */
++
++asmlinkage long sys_pcp_down(int sem_od)
++{
++	long ret = 0;
++	struct pcp_semaphore * sem;
++
++	preempt_disable();
++	TS_MPCP_DOWN_START;
++	TS_PCP1_DOWN_START;
++
++	if (!is_realtime(current)) {
++		ret =  -EPERM;
++		goto out;
++	}
++
++	sem = lookup_pcp_sem(sem_od);
++	if (sem) {
++		if (sem->cpu != GLOBAL_SEM)
++			ret = local_pcp_down(sem);
++		else
++			ret = global_pcp_down(sem);
++	} else
++		ret = -EINVAL;
++
++out:
++	preempt_enable();
++	return ret;
++}
++
++asmlinkage long sys_pcp_up(int sem_od)
++{
++	long ret = 0;
++	struct pcp_semaphore * sem;
++
++	preempt_disable();
++	TS_PCP_UP_START;
++	TS_MPCP_UP_START;
++
++	if (!is_realtime(current)) {
++		ret = -EPERM;
++		goto out;
++	}
++
++	sem = lookup_pcp_sem(sem_od);
++	if (sem) {
++		if (sem->cpu != GLOBAL_SEM)
++			ret = local_pcp_up(sem);
++		else
++			ret = global_pcp_up(sem);
++	} else
++		ret = -EINVAL;
++
++out:
++	preempt_enable();
++	return ret;
++}
++
++
++asmlinkage long sys_dpcp_invoke(int sem_od, long arg)
++{
++	long ret = 0;
++	struct pcp_semaphore * sem;
++
++	preempt_disable();
++	TS_DPCP_INVOKE_START;
++
++	if (!is_realtime(current)) {
++		ret = -EPERM;
++		goto out;
++	}
++
++	sem = lookup_pcp_sem(sem_od);
++	if (sem) {
++		ret = dpcp_invoke(sem, arg);
++	} else
++		ret = -EINVAL;
++
++out:
++	preempt_enable();
++	return ret;
++}
++
++asmlinkage long sys_dpcp_agent(int sem_od, long flags, long __user *__arg)
++{
++	long ret = 0;
++	long arg;
++	struct pcp_semaphore * sem;
++
++	preempt_disable();
++	TS_DPCP_AGENT1_START;
++
++	if (!is_realtime(current)) {
++		ret = -EPERM;
++		goto out;
++	}
++
++	sem = lookup_pcp_sem(sem_od);
++	if (sem) {
++		TS_DPCP_AGENT1_END;
++		if (flags & DPCP_COMPLETE) {
++			TS_PCP_UP_START;			
++			local_pcp_up(sem);
++		}
++		TS_DPCP_AGENT2_START;
++		ret = dpcp_agent(sem, flags, &arg);
++		if (ret == 0 && (flags & DPCP_WAIT)) {			
++			ret = put_user(arg, __arg);
++			if (ret == 0) {
++				TS_PCP1_DOWN_START;
++				local_pcp_down(sem);
++			}
++		}
++	} else
++		ret = -EINVAL;
++
++out:
++	preempt_enable();
++	return ret;
++}
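++
++/* Protocol sketch (illustrative; the user-space loop and wrapper names are
++ * assumptions, not part of this patch): an agent task assigned to the
++ * semaphore's home partition serves requests issued by clients on other CPUs:
++ *
++ *	client:				agent:
++ *	  dpcp_invoke(od, arg);		  dpcp_agent(od, DPCP_WAIT, &arg);
++ *					  for (;;) {
++ *						<critical section for arg>
++ *						dpcp_agent(od,
++ *						    DPCP_WAIT | DPCP_COMPLETE,
++ *						    &arg);
++ *					  }
++ *
++ * While a request is being served the agent inherits the requester's
++ * priority and holds the local PCP semaphore on the home CPU.
++ */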
++
++
++/* FDSO callbacks */
++
++static noinline  void* create_pcp_semaphore(void)
++{
++	struct pcp_semaphore* sem;
++
++	sem = kmalloc(sizeof(struct pcp_semaphore), GFP_KERNEL);
++	if (!sem)
++		return NULL;
++	init_pcp_sem(sem, UNDEF_SEM);
++	TRACE("allocated PCP semaphore %p\n", sem);
++	return sem;
++}
++
++static noinline void destroy_pcp_semaphore(void* obj)
++{
++	struct pcp_semaphore* sem = (struct pcp_semaphore*) obj;
++	WARN_ON(sem->holder);
++	WARN_ON(in_list(&sem->list));
++	kfree(sem);
++}
++
++static noinline void update_pcp_ceiling(struct pcp_semaphore* sem, struct task_struct* t, int global)
++{
++	struct pcp_priority prio = {get_rt_period(t), 1, t->pid};
++	if (global && !sem->ceiling.in_global_cs)
++		sem->ceiling.in_global_cs = 1;
++	if (_rm_higher_prio(&prio, &sem->ceiling))
++		sem->ceiling = prio;
++}
++
++static noinline int open_pcp_semaphore(struct od_table_entry* entry, void __user *__arg)
++{
++	struct pcp_semaphore* sem = (struct pcp_semaphore*) entry->obj->obj;
++	int *arg = (int*) __arg;
++	struct task_struct* t = current;
++	int cpu= get_partition(t);
++	
++	TRACE("opening PCP semaphore %p, cpu=%d\n", sem, sem->cpu);
++	if (!pcp_active())
++		return -EBUSY;
++
++	if (arg && get_user(cpu, arg) != 0)
++		return -EFAULT;
++
++	if (sem->cpu == UNDEF_SEM)
++		sem->cpu = cpu;
++
++	update_pcp_ceiling(sem, t, sem->cpu != get_partition(t));
++
++	return 0;
++}
++
++static noinline void update_mpcp_ceiling(struct pcp_semaphore* sem, struct task_struct* t)
++{
++	struct pcp_priority prio = {get_rt_period(t), 1, t->pid};
++	if (_rm_higher_prio(&prio, &sem->ceiling))
++		sem->ceiling = prio;
++}
++
++static noinline int open_mpcp_semaphore(struct od_table_entry* entry, void* __user arg)
++{
++	struct pcp_semaphore* sem = (struct pcp_semaphore*) entry->obj->obj;
++	int ret = 0;
++	struct task_struct* t = current;
++
++	if (!pcp_active())
++		return -EBUSY;
++
++	if (sem->cpu == UNDEF_SEM)
++		sem->cpu = GLOBAL_SEM;
++
++	update_mpcp_ceiling(sem, t);
++
++	return ret;
++}
++
++struct fdso_ops pcp_sem_ops = {
++	.create  = create_pcp_semaphore,
++	.destroy = destroy_pcp_semaphore,
++	.open    = open_pcp_semaphore
++};
++
++struct fdso_ops mpcp_sem_ops = {
++	.create  = create_pcp_semaphore,
++	.destroy = destroy_pcp_semaphore,
++	.open    = open_mpcp_semaphore
++};
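++
++/* Both object types share create/destroy; only open() differs. Opening a
++ * PCP semaphore assigns it to a CPU (the caller's partition by default, or
++ * a CPU passed in from user space), while opening an MPCP semaphore marks
++ * it as global; both update the semaphore's priority ceiling based on the
++ * opening task's period. The object descriptor handed out by the FDSO layer
++ * is what user space passes to sys_pcp_down() and sys_pcp_up().
++ */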
++
++static noinline int __init pcp_boot_init(void)
++{
++	int i;
++
++	printk("Initializing PCP per-CPU ceilings...");
++	for (i = 0; i < NR_CPUS; i++) {
++		INIT_LIST_HEAD(&per_cpu(sys_ceiling, i));
++		per_cpu(pcp_lock, i) = __SPIN_LOCK_UNLOCKED(pcp_lock);
++	}
++	printk(" done!\n");
++
++	return 0;
++}
++
++module_init(pcp_boot_init);
+diff --git a/litmus/rm_common.c b/litmus/rm_common.c
+new file mode 100644
+index 0000000..9bf21fd
+--- /dev/null
++++ b/litmus/rm_common.c
+@@ -0,0 +1,76 @@
++/*
++ * litmus/rm_common.c
++ *
++ * Common functions for RM based schedulers.
++ *
++ * FIXME: Too much code duplication with edf_common.c
++ */
++
++#include <linux/percpu.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++
++#include <litmus/litmus.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/sched_trace.h>
++
++
++#include <litmus/rm_common.h>
++
++/* rm_higher_prio -  returns true if first has a higher RM priority
++ *                   than second. Period ties are broken by PID.
++ *
++ * first must not be NULL and must be a real-time task.
++ * second may be NULL or a non-rt task.
++ */
++int rm_higher_prio(struct task_struct* first,
++		   struct task_struct* second)
++{
++	struct pcp_priority *p1, *p2;
++
++	/* verify assumptions in DEBUG build */
++	BUG_ON(!first);
++	BUG_ON(!is_realtime(first));
++	BUG_ON(second && !is_realtime(second) && second->rt_param.cur_prio);
++
++	p1 = first->rt_param.cur_prio;
++
++	/* if second is not a real-time task, then cur_prio is NULL */
++	p2 = second ? second->rt_param.cur_prio : NULL;
++	return _rm_higher_prio(p1, p2);
++}
++
++int rm_ready_order(struct list_head* a, struct list_head* b)
++{
++	return rm_higher_prio(
++		list_entry(a, struct task_struct, rt_list),
++		list_entry(b, struct task_struct, rt_list));
++}
++
++
++void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
++{
++	rt_domain_init(rt, resched, rm_ready_order);
++}
++
++/* need_to_preempt - check whether the task t needs to be preempted
++ *                   call only with irqs disabled and with  ready_lock acquired
++ *                   THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
++ */
++int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t)
++{
++	/* we need the read lock for the ready queue */
++	/* no need to preempt if there is nothing pending */
++	if (!ready_jobs_pending(rt))
++		return 0;
++	/* we need to reschedule if t doesn't exist */
++	if (!t)
++		return 1;
++
++	/* NOTE: We cannot check for non-preemptibility since we
++	 *       don't know what address space we're currently in.
++	 */
++
++	/* make sure to get non-rt stuff out of the way */
++	return !is_realtime(t) || rm_higher_prio(next_ready(rt), t);
++}
+diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
+new file mode 100644
+index 0000000..fe7bd29
+--- /dev/null
++++ b/litmus/rt_domain.c
+@@ -0,0 +1,130 @@
++/*
++ * litmus/rt_domain.c
++ *
++ * LITMUS real-time infrastructure. This file contains the
++ * functions that manipulate RT domains. RT domains are an abstraction
++ * of a ready queue and a release queue.
++ */
++
++#include <linux/percpu.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++
++#include <litmus/litmus.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/sched_trace.h>
++
++#include <litmus/rt_domain.h>
++
++
++static int dummy_resched(rt_domain_t *rt)
++{
++	return 0;
++}
++
++static int dummy_order(struct list_head* a, struct list_head* b)
++{
++	return 0;
++}
++
++int release_order(struct list_head* a, struct list_head* b)
++{
++	return earlier_release(
++		list_entry(a, struct task_struct, rt_list),
++		list_entry(b, struct task_struct, rt_list));
++}
++
++
++void rt_domain_init(rt_domain_t *rt,
++		    check_resched_needed_t f,
++		    list_cmp_t order)
++{
++	BUG_ON(!rt);
++	if (!f)
++		f = dummy_resched;
++	if (!order)
++		order = dummy_order;
++	INIT_LIST_HEAD(&rt->ready_queue);
++	INIT_LIST_HEAD(&rt->release_queue);
++	rt->ready_lock  	= RW_LOCK_UNLOCKED;
++	rt->release_lock 	= SPIN_LOCK_UNLOCKED;
++	rt->check_resched 	= f;
++	rt->order		= order;
++}
++
++/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
++ * @new:      the newly released task
++ */
++void __add_ready(rt_domain_t* rt, struct task_struct *new)
++{
++	TRACE("rt: adding %s/%d (%llu, %llu) to ready queue at %llu\n",
++	      new->comm, new->pid, get_exec_cost(new), get_rt_period(new),
++	      sched_clock());
++
++	if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order))
++		rt->check_resched(rt);
++}
++
++struct task_struct* __take_ready(rt_domain_t* rt)
++{
++	struct task_struct *t = __peek_ready(rt);
++
++	/* kick it out of the ready list */
++	if (t)
++		list_del(&t->rt_list);
++	return t;
++}
++
++struct task_struct* __peek_ready(rt_domain_t* rt)
++{
++	if (!list_empty(&rt->ready_queue))
++		return next_ready(rt);
++	else
++		return NULL;
++}
++
++/* add_release - add a real-time task to the rt release queue.
++ * @task:        the sleeping task
++ */
++void __add_release(rt_domain_t* rt, struct task_struct *task)
++{
++	TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to release queue\n",
++	      task->comm, task->pid, get_exec_cost(task), get_rt_period(task),
++	      get_release(task));
++
++	list_insert(&task->rt_list, &rt->release_queue, release_order);
++}
++
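++/* __release_pending - move all jobs whose release time has passed from the
++ *                     release queue to the ready queue. The caller must
++ *                     serialize access to the release queue.
++ */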
++void __release_pending(rt_domain_t* rt)
++{
++	struct list_head *pos, *save;
++	struct task_struct   *queued;
++	lt_t now = sched_clock();
++	list_for_each_safe(pos, save, &rt->release_queue) {
++		queued = list_entry(pos, struct task_struct, rt_list);
++		if (likely(is_released(queued, now))) {
++			/* this one is ready to go*/
++			list_del(pos);
++			set_rt_flags(queued, RT_F_RUNNING);
++
++			sched_trace_job_release(queued);
++
++			/* now it can be picked up */
++			barrier();
++			add_ready(rt, queued);
++		}
++		else
++			/* the release queue is ordered */
++			break;
++	}
++}
++
++void try_release_pending(rt_domain_t* rt)
++{
++	unsigned long flags;
++
++	if (spin_trylock_irqsave(&rt->release_lock, flags)) {
++		__release_pending(rt);
++		spin_unlock_irqrestore(&rt->release_lock, flags);
++	}
++}
+diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
+new file mode 100644
+index 0000000..314f8a1
+--- /dev/null
++++ b/litmus/sched_gsn_edf.c
+@@ -0,0 +1,733 @@
++/*
++ * litmus/sched_gsn_edf.c
++ *
++ * Implementation of the GSN-EDF scheduling algorithm.
++ *
++ * This version uses the simple approach and serializes all scheduling
++ * decisions by the use of a queue lock. This is probably not the
++ * best way to do it, but it should suffice for now.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/percpu.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++
++#include <litmus/litmus.h>
++#include <litmus/jobs.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/edf_common.h>
++#include <litmus/sched_trace.h>
++
++#include <linux/module.h>
++
++/* Overview of GSN-EDF operations.
++ *
++ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
++ * description only covers how the individual operations are implemented in
++ * LITMUS.
++ *
++ * link_task_to_cpu(T, cpu) 	- Low-level operation to update the linkage
++ *                                structure (NOT the actually scheduled
++ *                                task). If there is another linked task To
++ *                                already it will set To->linked_on = NO_CPU
++ *                                (thereby removing its association with this
++ *                                CPU). However, it will not requeue the
++ *                                previously linked task (if any). It will set
++ *                                T's state to RT_F_RUNNING and check whether
++ *                                it is already running somewhere else. If T
++ *                                is scheduled somewhere else it will link
++ *                                it to that CPU instead (and pull the linked
++ *                                task to cpu). T may be NULL.
++ *
++ * unlink(T)			- Unlink removes T from all scheduler data
++ *                                structures. If it is linked to some CPU it
++ *                                will link NULL to that CPU. If it is
++ *                                currently queued in the gsnedf queue it will
++ *                                be removed from the T->rt_list. It is safe to
++ *                                call unlink(T) if T is not linked. T may not
++ *                                be NULL.
++ *
++ * requeue(T)			- Requeue will insert T into the appropriate
++ *                                queue. If the system is in real-time mode and
++ *                                T is released already, it will go into the
++ *                                ready queue. If the system is not in
++ *                                real-time mode, then T will go into the
++ *                                release queue. If T's release time is in the
++ *                                future, it will go into the release
++ *                                queue. That means that T's release time/job
++ *                                no/etc. has to be updated before requeue(T)
++ *                                is called. It is not safe to call requeue(T)
++ *                                when T is already queued. T may not be NULL.
++ *
++ * gsnedf_job_arrival(T)	- This is the catch all function when T enters
++ *                                the system after either a suspension or at a
++ *                                job release. It will queue T (which means it
++ *                                is not safe to call gsnedf_job_arrival(T) if
++ *                                T is already queued) and then check whether a
++ *                                preemption is necessary. If a preemption is
++ *                                necessary it will update the linkage
++ *                                accordingly and cause the scheduler to be
++ *                                invoked (either via an IPI or need_resched).
++ *                                It is safe to call gsnedf_job_arrival(T) if
++ *                                T's next job has not actually been released
++ *                                yet (release time in the future). T will be
++ *                                put on the release queue in that case.
++ *
++ * job_completion(T)		- Take care of everything that needs to be done
++ *                                to prepare T for its next release and place
++ *                                it in the right queue with
++ *                                gsnedf_job_arrival().
++ *
++ *
++ * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
++ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
++ * the functions will automatically promote a pending task from the ready
++ * queue to become the newly linked task. This is the job of the calling
++ * function (by means of __take_ready()).
++ */
++
++
++/* cpu_entry_t - maintain the linked and scheduled state
++ */
++typedef struct  {
++	int 			cpu;
++	struct task_struct*	linked;		/* only RT tasks */
++	struct task_struct*	scheduled;	/* only RT tasks */
++	struct list_head	list;
++	atomic_t		will_schedule;	/* prevent unneeded IPIs */
++} cpu_entry_t;
++DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
++
++#define set_will_schedule() \
++	(atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
++#define clear_will_schedule() \
++	(atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
++#define test_will_schedule(cpu) \
++	(atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
++
++
++#define NO_CPU 0xffffffff
++
++/* The gsnedf_lock is used to serialize all scheduling events. It protects
++ * the ready and release queues, the CPU priority queue, and the per-CPU
++ * linkage state.
++ */
++static DEFINE_SPINLOCK(gsnedf_lock);
++/* the cpus queue themselves according to priority in here */
++static LIST_HEAD(gsnedf_cpu_queue);
++
++static rt_domain_t gsnedf;
++
++
++/* update_cpu_position - Move the cpu entry to the correct place to maintain
++ *                       order in the cpu queue. Caller must hold gsnedf lock.
++ *
++ *						This really should be a heap.
++ */
++static void update_cpu_position(cpu_entry_t *entry)
++{
++	cpu_entry_t *other;
++	struct list_head *pos;
++
++	if (likely(in_list(&entry->list)))
++		list_del(&entry->list);
++	/* if we do not execute real-time jobs we just move
++	 * to the end of the queue
++	 */
++	if (entry->linked) {
++		list_for_each(pos, &gsnedf_cpu_queue) {
++			other = list_entry(pos, cpu_entry_t, list);
++			if (edf_higher_prio(entry->linked, other->linked)) {
++				__list_add(&entry->list, pos->prev, pos);
++				return;
++			}
++		}
++	}
++	/* if we get this far we have the lowest priority job */
++	list_add_tail(&entry->list, &gsnedf_cpu_queue);
++}
++
++/* link_task_to_cpu - Update the link of a CPU.
++ *                    Handles the case where the to-be-linked task is already
++ *                    scheduled on a different CPU.
++ */
++static noinline void link_task_to_cpu(struct task_struct* linked,
++				      cpu_entry_t *entry)
++{
++	cpu_entry_t *sched;
++	struct task_struct* tmp;
++	int on_cpu;
++
++	BUG_ON(linked && !is_realtime(linked));
++
++	/* Currently linked task is set to be unlinked. */
++	if (entry->linked) {
++		entry->linked->rt_param.linked_on = NO_CPU;
++	}
++
++	/* Link new task to CPU. */
++	if (linked) {
++		set_rt_flags(linked, RT_F_RUNNING);
++		/* handle the case that the task is already scheduled somewhere! */
++		on_cpu = linked->rt_param.scheduled_on;
++		if (on_cpu != NO_CPU) {
++			sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
++			/* this should only happen if not linked already */
++			BUG_ON(sched->linked == linked);
++
++			/* If we are already scheduled on the CPU to which we
++			 * wanted to link, we don't need to do the swap --
++			 * we just link ourselves to the CPU and depend on
++			 * the caller to get things right.
++			 */
++			if (entry != sched) {
++				tmp = sched->linked;
++				linked->rt_param.linked_on = sched->cpu;
++				sched->linked = linked;
++				update_cpu_position(sched);
++				linked = tmp;
++			}
++		}
++		if (linked) /* might be NULL due to swap */
++			linked->rt_param.linked_on = entry->cpu;
++	}
++	entry->linked = linked;
++	update_cpu_position(entry);
++}
++
++/* unlink - Make sure a task is not linked any longer to an entry
++ *          where it was linked before. Must hold gsnedf_lock.
++ */
++static noinline void unlink(struct task_struct* t)
++{
++    	cpu_entry_t *entry;
++
++	if (unlikely(!t)) {
++		TRACE_BUG_ON(!t);
++		return;
++	}
++
++	if (t->rt_param.linked_on != NO_CPU) {
++		/* unlink */
++		entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
++		t->rt_param.linked_on = NO_CPU;
++		link_task_to_cpu(NULL, entry);
++	} else if (in_list(&t->rt_list)) {
++		/* This is an interesting situation: t is scheduled,
++		 * but was just recently unlinked.  It cannot be
++		 * linked anywhere else (because then it would have
++		 * been relinked to this CPU), thus it must be in some
++		 * queue. We must remove it from the list in this
++		 * case.
++		 */
++		list_del(&t->rt_list);
++	}
++}
++
++
++/* preempt - force a CPU to reschedule
++ */
++static noinline void preempt(cpu_entry_t *entry)
++{
++	/* We cannot make the is_np() decision here if it is a remote CPU
++	 * because requesting exit_np() requires that we currently use the
++	 * address space of the task. Thus, in the remote case we just send
++	 * the IPI and let schedule() handle the problem.
++	 */
++
++	if (smp_processor_id() == entry->cpu) {
++		if (entry->scheduled && is_np(entry->scheduled))
++			request_exit_np(entry->scheduled);
++		else
++			set_tsk_need_resched(current);
++	} else
++		/* in case it is a remote CPU we have to defer the
++		 * decision to the remote CPU
++		 * FIXME: We could save a few IPI's here if we leave the flag
++		 * set when we are waiting for a np_exit().
++		 */
++		if (!test_will_schedule(entry->cpu))
++			smp_send_reschedule(entry->cpu);
++}
++
++/* requeue - Put an unlinked task into gsn-edf domain.
++ *           Caller must hold gsnedf_lock.
++ */
++static noinline void requeue(struct task_struct* task)
++{
++	BUG_ON(!task);
++	/* sanity check rt_list before insertion */
++	BUG_ON(in_list(&task->rt_list));
++
++	if (get_rt_flags(task) == RT_F_SLEEP) {
++		/* this task has expired;
++		 * _schedule has already taken care of updating
++		 * the release time and deadline.
++		 * We just need to check whether it has been released yet.
++		 */
++		if (is_released(task, sched_clock()))
++			__add_ready(&gsnedf, task);
++		else {
++			/* it has got to wait */
++			__add_release(&gsnedf, task);
++		}
++
++	} else
++		/* this is a forced preemption:
++		 * the task stays in the ready queue;
++		 * we only need to make it available to others again
++		 */
++		__add_ready(&gsnedf, task);
++}
++
++/* gsnedf_job_arrival: task is either resumed or released */
++static noinline void gsnedf_job_arrival(struct task_struct* task)
++{
++	cpu_entry_t* last;
++
++	BUG_ON(list_empty(&gsnedf_cpu_queue));
++	BUG_ON(!task);
++
++	/* first queue arriving job */
++	requeue(task);
++
++	/* then check for any necessary preemptions */
++	last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list);
++	if (edf_preemption_needed(&gsnedf, last->linked)) {
++		/* preemption necessary */
++		task = __take_ready(&gsnedf);
++		TRACE("job_arrival: task %d linked to %d\n",
++		      task->pid, last->cpu);
++		if (last->linked)
++			requeue(last->linked);
++
++		link_task_to_cpu(task, last);
++		preempt(last);
++	}
++}
++
++/* check for current job releases */
++static noinline  void gsnedf_release_jobs(void)
++{
++	struct list_head *pos, *save;
++	struct task_struct   *queued;
++	lt_t now = sched_clock();
++
++
++	list_for_each_safe(pos, save, &gsnedf.release_queue) {
++		queued = list_entry(pos, struct task_struct, rt_list);
++		if (likely(is_released(queued, now))) {
++			/* this one is ready to go*/
++			list_del(pos);
++			set_rt_flags(queued, RT_F_RUNNING);
++
++			sched_trace_job_release(queued);
++			gsnedf_job_arrival(queued);
++		}
++		else
++			/* the release queue is ordered */
++			break;
++	}
++}
++
++/* gsnedf_scheduler_tick - this function is called for every local timer
++ *                       interrupt.
++ *
++ *                   It checks whether the current task has exhausted its
++ *                   budget and, on CPU 0, releases pending jobs, which may
++ *                   in turn trigger preemptions.
++ */
++static void gsnedf_scheduler_tick(void)
++{
++	unsigned long 		flags;
++	struct task_struct*	t = current;
++
++	if (is_realtime(t) && budget_exhausted(t)) {
++		if (!is_np(t)) {
++			/* np tasks will be preempted when they become
++			 * preemptable again
++			 */
++			set_tsk_need_resched(t);
++			set_will_schedule();
++			TRACE("gsnedf_scheduler_tick: "
++			      "%d is preemptable "
++			      " => FORCE_RESCHED\n", t->pid);
++		} else {
++			TRACE("gsnedf_scheduler_tick: "
++			      "%d is non-preemptable, "
++			      "preemption delayed.\n", t->pid);
++			request_exit_np(t);
++		}
++	}
++
++	/* only the first CPU needs to release jobs */
++	if (smp_processor_id() == 0) {
++		spin_lock_irqsave(&gsnedf_lock, flags);
++
++		/* Try to release pending jobs */
++		gsnedf_release_jobs();
++
++		/* We don't need to check linked != scheduled since
++		 * set_tsk_need_resched() has already been called by preempt()
++		 * if necessary.
++		 */
++
++		spin_unlock_irqrestore(&gsnedf_lock, flags);
++	}
++}
++
++/* caller holds gsnedf_lock */
++static noinline void job_completion(struct task_struct *t)
++{
++	BUG_ON(!t);
++
++	sched_trace_job_completion(t);
++
++	TRACE_TASK(t, "job_completion().\n");
++
++	/* set flags */
++	set_rt_flags(t, RT_F_SLEEP);
++	/* prepare for next period */
++	prepare_for_next_period(t);
++	/* unlink */
++	unlink(t);
++	/* requeue
++	 * But don't requeue a blocking task. */
++	if (is_running(t))
++		gsnedf_job_arrival(t);
++}
++
++
++/* Getting schedule() right is a bit tricky. schedule() may not make any
++ * assumptions on the state of the current task since it may be called for a
++ * number of reasons. The reasons include: scheduler_tick() determined that a
++ * reschedule is necessary, sys_exit_np() was called, some Linux subsystem
++ * requested it, or even (in the worst case) there is a bug
++ * hidden somewhere. Thus, we must take extreme care to determine what the
++ * current state is.
++ *
++ * The CPU could currently be scheduling a task (or not), be linked (or not).
++ *
++ * The following assertions for the scheduled task could hold:
++ *
++ *      - !is_running(scheduled)        // the job blocks
++ *	- scheduled->timeslice == 0	// the job completed (forcefully)
++ *	- get_rt_flag() == RT_F_SLEEP	// the job completed (by syscall)
++ * 	- linked != scheduled		// we need to reschedule (for any reason)
++ * 	- is_np(scheduled)		// rescheduling must be delayed,
++ *					   sys_exit_np must be requested
++ *
++ * Any of these can occur together.
++ */
++static int gsnedf_schedule(struct task_struct * prev,
++			 struct task_struct ** next)
++{
++	cpu_entry_t* 	entry = &__get_cpu_var(gsnedf_cpu_entries);
++	int 			out_of_time, sleep, preempt, np, exists, blocks;
++
++	/* Will be released in finish_switch. */
++	spin_lock(&gsnedf_lock);
++	clear_will_schedule();
++
++	/* sanity checking */
++	BUG_ON(entry->scheduled && entry->scheduled != prev);
++	BUG_ON(entry->scheduled && !is_realtime(prev));
++	BUG_ON(is_realtime(prev) && !entry->scheduled);
++
++	/* (0) Determine state */
++	exists      = entry->scheduled != NULL;
++	blocks      = exists && !is_running(entry->scheduled);
++	out_of_time = exists && budget_exhausted(entry->scheduled);
++	np 	    = exists && is_np(entry->scheduled);
++	sleep	    = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
++	preempt     = entry->scheduled != entry->linked;
++
++	/* If a task blocks we have no choice but to reschedule.
++	 */
++	if (blocks)
++		unlink(entry->scheduled);
++
++	/* Request a sys_exit_np() call if we would like to preempt but cannot.
++	 * We need to make sure to update the link structure anyway in case
++	 * that we are still linked. Multiple calls to request_exit_np() don't
++	 * hurt.
++	 */
++	if (np && (out_of_time || preempt || sleep)) {
++		unlink(entry->scheduled);
++		request_exit_np(entry->scheduled);
++	}
++
++	/* Any task that is preemptable and either exhausts its execution
++	 * budget or wants to sleep completes. We may have to reschedule after
++	 * this.
++	 */
++	if (!np && (out_of_time || sleep))
++		job_completion(entry->scheduled);
++
++	/* Link pending task if we became unlinked.
++	 */
++	if (!entry->linked)
++		link_task_to_cpu(__take_ready(&gsnedf), entry);
++
++	/* The final scheduling decision. Do we need to switch for some reason?
++	 * If linked differs from scheduled, select linked as next.
++	 */
++	if ((!np || blocks) &&
++	    entry->linked != entry->scheduled) {
++		/* Schedule a linked job? */
++		if (entry->linked)
++			*next = entry->linked;
++	} else
++		/* Only override the Linux scheduler if we have a real-time
++		 * task scheduled that needs to continue.
++		 */
++		if (exists)
++			*next = prev;
++
++	spin_unlock(&gsnedf_lock);
++
++	/* don't race with a concurrent switch */
++	if (*next && prev != *next)
++		while ((*next)->rt_param.scheduled_on != NO_CPU)
++			cpu_relax();
++	return 0;
++}
++
++
++/* _finish_switch - we just finished the switch away from prev
++ */
++static void gsnedf_finish_switch(struct task_struct *prev)
++{
++	cpu_entry_t* 	entry = &__get_cpu_var(gsnedf_cpu_entries);
++
++	entry->scheduled = is_realtime(current) ? current : NULL;
++
++	prev->rt_param.scheduled_on    = NO_CPU;
++	current->rt_param.scheduled_on = smp_processor_id();
++}
++
++
++/*	Prepare a task for running in RT mode.
++ *	Enqueues the task into the master queue data structure.
++ *	Returns
++ *		-EPERM  if the task is not in state TASK_STOPPED
++ */
++static long gsnedf_prepare_task(struct task_struct * t)
++{
++	unsigned long 		flags;
++	TRACE("gsn edf: prepare task %d\n", t->pid);
++
++	if (t->state == TASK_STOPPED) {
++		t->rt_param.scheduled_on       = NO_CPU;
++		t->rt_param.linked_on          = NO_CPU;
++
++		/* delay by 1ms */
++		release_at(t, sched_clock() + 1000000);
++
++		/* The task must be in state TASK_RUNNING while it is queued;
++		 * otherwise the signal code will try to wake it up, with
++		 * fatal consequences.
++		 */
++		t->state = TASK_RUNNING;
++		spin_lock_irqsave(&gsnedf_lock, flags);
++		t->rt_param.litmus_controlled = 1;
++		requeue(t);
++		spin_unlock_irqrestore(&gsnedf_lock, flags);
++		return 0;
++	}
++	else
++		return -EPERM;
++}
++
++static void gsnedf_wake_up_task(struct task_struct *task)
++{
++	unsigned long flags;
++	lt_t now;
++	/* We must determine whether the task should go into the release
++	 * queue or into the ready queue. It may enter the ready queue
++	 * if it has credit left in its time slice and has not yet reached
++	 * its deadline. If it is now past its deadline we assume this is the
++	 * arrival of a new sporadic job and thus put it in the ready queue
++	 * anyway. If it has zero budget and the next release is in the future
++	 * it has to go to the release queue.
++	 */
++	TRACE("gsnedf: %d unsuspends with budget=%d\n",
++	      task->pid, task->time_slice);
++
++	spin_lock_irqsave(&gsnedf_lock, flags);
++	if (!task->rt_param.litmus_controlled) {
++		task->rt_param.litmus_controlled = 1;
++		/* We need to take suspensions because of semaphores into
++		 * account! If a job resumes after being suspended due to acquiring
++		 * a semaphore, it should never be treated as a new job release.
++		 */
++		if (get_rt_flags(task) == RT_F_EXIT_SEM) {
++			set_rt_flags(task, RT_F_RUNNING);
++		} else {
++			now = sched_clock();
++			if (is_tardy(task, now)) {
++				/* new sporadic release */
++				release_at(task, now);
++				sched_trace_job_release(task);
++			}
++			else if (task->time_slice)
++				/* came back in time before deadline
++				 */
++				set_rt_flags(task, RT_F_RUNNING);
++		}
++		task->state = TASK_RUNNING;
++		gsnedf_job_arrival(task);
++	}
++	spin_unlock_irqrestore(&gsnedf_lock, flags);
++}
++
++static void gsnedf_task_blocks(struct task_struct *t)
++{
++	unsigned long flags;
++
++	/* unlink if necessary */
++	spin_lock_irqsave(&gsnedf_lock, flags);
++	unlink(t);
++	t->rt_param.litmus_controlled = 0;
++	spin_unlock_irqrestore(&gsnedf_lock, flags);
++
++	BUG_ON(!is_realtime(t));
++	TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice);
++	BUG_ON(t->rt_list.next != LIST_POISON1);
++	BUG_ON(t->rt_list.prev != LIST_POISON2);
++}
++
++
++/* When _tear_down is called, the task should not be in any queue any more
++ * as it must have blocked first. We don't have any internal state for the task,
++ * it is all in the task_struct.
++ */
++static long gsnedf_tear_down(struct task_struct * t)
++{
++	BUG_ON(!is_realtime(t));
++        TRACE_TASK(t, "RIP\n");
++	BUG_ON(t->array);
++	BUG_ON(t->rt_list.next != LIST_POISON1);
++	BUG_ON(t->rt_list.prev != LIST_POISON2);
++	return 0;
++}
++
++static long gsnedf_pi_block(struct pi_semaphore *sem,
++			    struct task_struct *new_waiter)
++{
++	/* This callback has to handle the situation where a new waiter is
++	 * added to the wait queue of the semaphore.
++	 *
++	 * We must check whether it has a higher priority than the currently
++	 * highest-priority task, and then potentially reschedule.
++	 */
++
++	BUG_ON(!new_waiter);
++
++	if (edf_higher_prio(new_waiter, sem->hp.task)) {
++		TRACE_TASK(new_waiter, " boosts priority\n");
++		/* called with IRQs disabled */
++		spin_lock(&gsnedf_lock);
++		/* store new highest-priority task */
++		sem->hp.task = new_waiter;
++		if (sem->holder) {
++			/* let holder inherit */
++			sem->holder->rt_param.inh_task = new_waiter;
++			unlink(sem->holder);
++			gsnedf_job_arrival(sem->holder);
++		}
++		spin_unlock(&gsnedf_lock);
++	}
++
++	return 0;
++}
++
++static long gsnedf_inherit_priority(struct pi_semaphore *sem,
++				    struct task_struct *new_owner)
++{
++	/* We don't need to acquire the gsnedf_lock since at the time of this
++	 * call new_owner isn't actually scheduled yet (it's still sleeping)
++	 * and since the calling function already holds sem->wait.lock, which
++	 * prevents concurrent sem->hp.task changes.
++	 */
++
++	if (sem->hp.task && sem->hp.task != new_owner) {
++		new_owner->rt_param.inh_task = sem->hp.task;
++		TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
++			   sem->hp.task->comm, sem->hp.task->pid);
++	} else
++		TRACE_TASK(new_owner,
++			   "cannot inherit priority, "
++			   "no higher priority job waits.\n");
++	return 0;
++}
++
++/* This function is called on a semaphore release, and assumes that
++ * the current task is also the semaphore holder.
++ */
++static long gsnedf_return_priority(struct pi_semaphore *sem)
++{
++	struct task_struct* t = current;
++	int ret = 0;
++
++        /* Find new highest-priority semaphore task
++	 * if holder task is the current hp.task.
++	 *
++	 * Calling function holds sem->wait.lock.
++	 */
++	if (t == sem->hp.task)
++		set_hp_task(sem, edf_higher_prio);
++
++	TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
++
++	if (t->rt_param.inh_task) {
++		/* interrupts already disabled by PI code */
++		spin_lock(&gsnedf_lock);
++
++		/* Reset inh_task to NULL. */
++		t->rt_param.inh_task = NULL;
++
++		/* Check if rescheduling is necessary */
++		unlink(t);
++		gsnedf_job_arrival(t);
++		spin_unlock(&gsnedf_lock);
++	}
++
++	return ret;
++}
++
++/*	Plugin object	*/
++static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
++	.plugin_name		= "GSN-EDF",
++	.scheduler_tick		= gsnedf_scheduler_tick,
++	.prepare_task		= gsnedf_prepare_task,
++	.sleep_next_period	= complete_job,
++	.tear_down		= gsnedf_tear_down,
++	.schedule		= gsnedf_schedule,
++	.finish_switch 		= gsnedf_finish_switch,
++	.wake_up_task		= gsnedf_wake_up_task,
++	.task_blocks		= gsnedf_task_blocks,
++	.inherit_priority	= gsnedf_inherit_priority,
++	.return_priority	= gsnedf_return_priority,
++	.pi_block		= gsnedf_pi_block
++};
++
++
++static int __init init_gsn_edf(void)
++{
++	int cpu;
++	cpu_entry_t *entry;
++
++	/* initialize CPU state */
++	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
++		entry = &per_cpu(gsnedf_cpu_entries, cpu);
++		atomic_set(&entry->will_schedule, 0);
++		entry->linked    = NULL;
++		entry->scheduled = NULL;
++		entry->cpu 	 = cpu;
++		INIT_LIST_HEAD(&entry->list);
++	}
++
++	edf_domain_init(&gsnedf, NULL);
++	return register_sched_plugin(&gsn_edf_plugin);
++}
++
++
++module_init(init_gsn_edf);
+diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
+new file mode 100644
+index 0000000..f05fc56
+--- /dev/null
++++ b/litmus/sched_plugin.c
+@@ -0,0 +1,169 @@
++/* sched_plugin.c -- core infrastructure for the scheduler plugin system
++ *
++ * This file includes the initialization of the plugin system, the no-op Linux
++ * scheduler plugin and some dummy functions.
++ */
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++
++#include <litmus/litmus.h>
++#include <litmus/sched_plugin.h>
++
++
++/*************************************************************
++ *                   Dummy plugin functions                  *
++ *************************************************************/
++
++static void litmus_dummy_finish_switch(struct task_struct * prev)
++{
++}
++
++static int litmus_dummy_schedule(struct task_struct * prev,
++				 struct task_struct** next)
++{
++	return 0;
++}
++
++static void litmus_dummy_scheduler_tick(void)
++{
++}
++
++static long litmus_dummy_prepare_task(struct task_struct *t)
++{
++	return -ENOSYS;
++}
++
++static void litmus_dummy_wake_up_task(struct task_struct *task)
++{
++	printk(KERN_WARNING "task %d: unhandled real-time wake up!\n",
++	  task->pid);
++}
++
++static void litmus_dummy_task_blocks(struct task_struct *task)
++{
++}
++
++static long litmus_dummy_tear_down(struct task_struct *task)
++{
++	return 0;
++}
++
++static long litmus_dummy_sleep_next_period(void)
++{
++	return -ENOSYS;
++}
++
++static long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
++					  struct task_struct *new_owner)
++{
++	return -ENOSYS;
++}
++
++static long litmus_dummy_return_priority(struct pi_semaphore *sem)
++{
++	return -ENOSYS;
++}
++
++static long litmus_dummy_pi_block(struct pi_semaphore *sem,
++				  struct task_struct *new_waiter)
++{
++	return -ENOSYS;
++}
++
++
++/* The default scheduler plugin. It doesn't do anything and lets Linux do its
++ * job.
++ */
++struct sched_plugin linux_sched_plugin = {
++	.plugin_name = "Linux",
++	.scheduler_tick = litmus_dummy_scheduler_tick,
++	.prepare_task = litmus_dummy_prepare_task,
++	.tear_down = litmus_dummy_tear_down,
++	.wake_up_task = litmus_dummy_wake_up_task,
++	.task_blocks = litmus_dummy_task_blocks,
++	.sleep_next_period = litmus_dummy_sleep_next_period,
++	.schedule = litmus_dummy_schedule,
++	.finish_switch = litmus_dummy_finish_switch,
++	.inherit_priority = litmus_dummy_inherit_priority,
++	.return_priority = litmus_dummy_return_priority,
++	.pi_block = litmus_dummy_pi_block
++};
++
++/*
++ *	The reference to the current plugin that is used to schedule tasks
++ *	within the system. It stores references to the actual function
++ *	implementations. It should be initialized by calling "init_***_plugin()".
++ */
++struct sched_plugin *curr_sched_plugin = &linux_sched_plugin;
++
++/* the list of registered scheduling plugins */
++static LIST_HEAD(sched_plugins);
++static DEFINE_SPINLOCK(sched_plugins_lock);
++
++#define CHECK(func) {\
++	if (!plugin->func) \
++		plugin->func = litmus_dummy_ ## func;}
++
++/* FIXME: get reference to module  */
++int register_sched_plugin(struct sched_plugin* plugin)
++{
++	printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
++	       plugin->plugin_name);
++
++	/* make sure we don't trip over null pointers later */
++	CHECK(finish_switch);
++	CHECK(schedule);
++	CHECK(scheduler_tick);
++	CHECK(wake_up_task);
++	CHECK(tear_down);
++	CHECK(task_blocks);
++	CHECK(prepare_task);
++	CHECK(sleep_next_period);
++	CHECK(inherit_priority);
++	CHECK(return_priority);
++	CHECK(pi_block);
++
++	spin_lock(&sched_plugins_lock);
++	list_add(&plugin->list, &sched_plugins);
++	spin_unlock(&sched_plugins_lock);
++
++	return 0;
++}
++
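++/* Example: a plugin registers itself from its module initialization hook;
++ * see init_gsn_edf() in litmus/sched_gsn_edf.c earlier in this patch, which
++ * calls register_sched_plugin(&gsn_edf_plugin).
++ */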
++
++/* FIXME: reference counting, etc. */
++struct sched_plugin* find_sched_plugin(const char* name)
++{
++	struct list_head *pos;
++	struct sched_plugin *plugin;
++
++	spin_lock(&sched_plugins_lock);
++	list_for_each(pos, &sched_plugins) {
++		plugin = list_entry(pos, struct sched_plugin, list);
++		if (!strcmp(plugin->plugin_name, name))
++		    goto out_unlock;
++	}
++	plugin = NULL;
++
++out_unlock:
++	spin_unlock(&sched_plugins_lock);
++	return plugin;
++}
++
++int print_sched_plugins(char* buf, int max)
++{
++	int count = 0;
++	struct list_head *pos;
++	struct sched_plugin *plugin;
++
++	spin_lock(&sched_plugins_lock);
++	list_for_each(pos, &sched_plugins) {
++		plugin = list_entry(pos, struct sched_plugin, list);
++		count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
++		if (max - count <= 0)
++			break;
++	}
++	spin_unlock(&sched_plugins_lock);
++	return 	count;
++}
+diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
+new file mode 100644
+index 0000000..27f4b5c
+--- /dev/null
++++ b/litmus/sched_psn_edf.c
+@@ -0,0 +1,458 @@
++
++/*
++ * litmus/sched_psn_edf.c
++ *
++ * Implementation of the PSN-EDF scheduler plugin.
++ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
++ *
++ * Suspensions and non-preemptable sections are supported.
++ * Priority inheritance is not supported.
++ */
++
++#include <linux/percpu.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++
++#include <linux/module.h>
++
++#include <litmus/litmus.h>
++#include <litmus/jobs.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/edf_common.h>
++
++
++typedef struct {
++	rt_domain_t 		domain;
++	int          		cpu;
++	struct task_struct* 	scheduled; /* only RT tasks */
++	spinlock_t   		lock;      /* protects the domain and
++                                            * serializes scheduling decisions
++					    */
++} psnedf_domain_t;
++
++DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
++
++#define local_edf		(&__get_cpu_var(psnedf_domains).domain)
++#define local_pedf		(&__get_cpu_var(psnedf_domains))
++#define remote_edf(cpu)		(&per_cpu(psnedf_domains, cpu).domain)
++#define remote_pedf(cpu)	(&per_cpu(psnedf_domains, cpu))
++#define task_edf(task)		remote_edf(get_partition(task))
++#define task_pedf(task)		remote_pedf(get_partition(task))
++
++
++static void psnedf_domain_init(psnedf_domain_t* pedf,
++				 check_resched_needed_t check,
++				 int cpu)
++{
++	edf_domain_init(&pedf->domain, check);
++	pedf->cpu      		= cpu;
++	pedf->lock     		= SPIN_LOCK_UNLOCKED;
++	pedf->scheduled		= NULL;
++}
++
++static void requeue(struct task_struct* t, rt_domain_t *edf)
++{
++	/* only requeue if t is actually running */
++	BUG_ON(!is_running(t));
++
++	if (t->state != TASK_RUNNING)
++		TRACE_TASK(t, "requeue: !TASK_RUNNING");
++
++	set_rt_flags(t, RT_F_RUNNING);
++	if (is_released(t, sched_clock()))
++		__add_ready(edf, t);
++	else
++		__add_release(edf, t); /* it has got to wait */
++}
++
++/* we assume the lock is being held */
++static void preempt(psnedf_domain_t *pedf)
++{
++	if (smp_processor_id() == pedf->cpu) {
++		if (pedf->scheduled && is_np(pedf->scheduled))
++			request_exit_np(pedf->scheduled);
++		else
++			set_tsk_need_resched(current);
++	} else
++		/* in case it is a remote CPU we have to defer the
++		 * decision to the remote CPU
++		 */
++		smp_send_reschedule(pedf->cpu);
++}
++
++/* This check is trivial in partitioned systems as we only have to consider
++ * the CPU of the partition.
++ */
++static int psnedf_check_resched(rt_domain_t *edf)
++{
++	psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
++	int ret = 0;
++
++	/* because this is a callback from rt_domain_t we already hold
++	 * the necessary lock for the ready queue
++	 */
++	if (edf_preemption_needed(edf, pedf->scheduled)) {
++		preempt(pedf);
++		ret = 1;
++	}
++	return ret;
++}
++
++
++static void psnedf_scheduler_tick(void)
++{
++	unsigned long       flags;
++	struct task_struct *t            = current;
++	rt_domain_t        *edf          = local_edf;
++	psnedf_domain_t    *pedf         = local_pedf;
++
++	/* Check for inconsistency. We don't need the lock for this since
++	 * ->scheduled is only changed in schedule, which obviously is not
++	 *  executing in parallel on this CPU
++	 */
++	BUG_ON(is_realtime(t) && t != pedf->scheduled);
++
++	if (is_realtime(t) && budget_exhausted(t)) {
++		if (!is_np(t))
++			set_tsk_need_resched(t);
++		else {
++			TRACE("psnedf_scheduler_tick: "
++			      "%d is non-preemptable, "
++			      "preemption delayed.\n", t->pid);
++			request_exit_np(t);
++		}
++	}
++
++	spin_lock_irqsave(&pedf->lock, flags);
++	__release_pending(edf);
++	if (edf_preemption_needed(edf, t))
++		set_tsk_need_resched(t);
++	spin_unlock_irqrestore(&pedf->lock, flags);
++}
++
++static void job_completion(struct task_struct* t)
++{
++	TRACE_TASK(t, "job_completion().\n");
++	set_rt_flags(t, RT_F_SLEEP);
++	prepare_for_next_period(t);
++}
++
++static int psnedf_schedule(struct task_struct * prev,
++			     struct task_struct ** next)
++{
++	psnedf_domain_t* 	pedf = local_pedf;
++	rt_domain_t*		edf  = &pedf->domain;
++
++	int 			out_of_time, sleep, preempt,
++					np, exists, blocks, resched;
++
++	spin_lock(&pedf->lock);
++
++	/* sanity checking */
++	BUG_ON(pedf->scheduled && pedf->scheduled != prev);
++	BUG_ON(pedf->scheduled && !is_realtime(prev));
++
++	/* (0) Determine state */
++	exists      = pedf->scheduled != NULL;
++	blocks      = exists && !is_running(pedf->scheduled);
++	out_of_time = exists && budget_exhausted(pedf->scheduled);
++	np 	    = exists && is_np(pedf->scheduled);
++	sleep	    = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
++	preempt     = edf_preemption_needed(edf, prev);
++
++	/* If we need to preempt do so.
++	 * The following checks set resched to 1 in case of special
++	 * circumstances.
++	 */
++	resched = preempt;
++
++	/* If a task blocks we have no choice but to reschedule.
++	 */
++	if (blocks)
++		resched = 1;
++
++	/* Request a sys_exit_np() call if we would like to preempt but cannot.
++	 * Multiple calls to request_exit_np() don't hurt.
++	 */
++	if (np && (out_of_time || preempt || sleep))
++		request_exit_np(pedf->scheduled);
++
++	/* Any task that is preemptable and either exhausts its execution
++	 * budget or wants to sleep completes. We may have to reschedule after
++	 * this.
++	 */
++	if (!np && (out_of_time || sleep)) {
++		job_completion(pedf->scheduled);
++		resched = 1;
++	}
++
++	/* The final scheduling decision. Do we need to switch for some reason?
++	 * Switch if we are in RT mode and have no task or if we need to
++	 * resched.
++	 */
++	*next = NULL;
++	if ((!np || blocks) && (resched || !exists)) {
++		/* Take care of a previously scheduled
++		 * job by taking it out of the Linux runqueue.
++		 */
++		if (pedf->scheduled) {
++			/* as opposed to global schedulers that switch without
++			 * a lock being held we can requeue already here since
++			 * no other CPU will schedule from this domain.
++			 */
++			if (!blocks)
++				requeue(pedf->scheduled, edf);
++		}
++		*next = __take_ready(edf);
++	} else
++		/* Only override Linux scheduler if we have a real-time task
++		 * scheduled that needs to continue.
++		 */
++		if (exists)
++			*next = prev;
++
++	if (*next)
++		set_rt_flags(*next, RT_F_RUNNING);
++
++	pedf->scheduled = *next;
++	spin_unlock(&pedf->lock);
++	return 0;
++}
++
++
++/*	Prepare a task for running in RT mode
++ *	Enqueues the task into the master queue data structure
++ *	returns
++ *		-EPERM  if task is not TASK_STOPPED
++ */
++static long psnedf_prepare_task(struct task_struct * t)
++{
++	rt_domain_t* 		edf  = task_edf(t);
++	psnedf_domain_t* 	pedf = task_pedf(t);
++	unsigned long		flags;
++
++	TRACE("[%d] psn edf: prepare task %d on CPU %d\n",
++		smp_processor_id(), t->pid, get_partition(t));
++	if (t->state == TASK_STOPPED) {
++
++		/* 1ms delay */
++		release_at(t, sched_clock() + 1000000);
++
++		/* The task must be in the TASK_RUNNING state while it is in our
++		 * queues, otherwise the signal code will try to wake it up, with
++		 * fatal consequences.
++		 */
++		t->state = TASK_RUNNING;
++		spin_lock_irqsave(&pedf->lock, flags);
++		t->rt_param.litmus_controlled = 1;
++		__add_release(edf, t);
++		spin_unlock_irqrestore(&pedf->lock, flags);
++		return 0;
++	} else
++		return -EPERM;
++}
++
++static void psnedf_wake_up_task(struct task_struct *task)
++{
++	unsigned long		flags;
++	psnedf_domain_t* 	pedf = task_pedf(task);
++	rt_domain_t* 		edf  = task_edf(task);
++	lt_t			now;
++
++	TRACE("psnedf: %d unsuspends with budget=%d\n",
++	      task->pid, task->time_slice);
++
++	spin_lock_irqsave(&pedf->lock, flags);
++	if (!task->rt_param.litmus_controlled) {
++		BUG_ON(in_list(&task->rt_list));
++		task->rt_param.litmus_controlled = 1;
++		/* We need to take suspensions because of semaphores into
++		 * account! If a job resumes after being suspended due to acquiring
++		 * a semaphore, it should never be treated as a new job release.
++		 */
++		now = sched_clock();
++		if (is_tardy(task, now) &&
++		    get_rt_flags(task) != RT_F_EXIT_SEM) {
++			/* new sporadic release */
++			release_at(task, now);
++			sched_trace_job_release(task);
++		}
++		task->state = TASK_RUNNING;
++		requeue(task, edf);
++	}
++	spin_unlock_irqrestore(&pedf->lock, flags);
++}
++
++static void psnedf_task_blocks(struct task_struct *t)
++{
++	BUG_ON(!is_realtime(t));
++	/* not really anything to do since it can only block if
++	 * it is running, and when it is not running it is not in any
++	 * queue anyway.
++	 */
++	TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
++	BUG_ON(in_list(&t->rt_list));
++	t->rt_param.litmus_controlled = 0;
++}
++
++
++/* When _tear_down is called, the task should not be in any queue any more
++ * as it must have blocked first. We don't have any internal state for the task,
++ * it is all in the task_struct.
++ */
++static long psnedf_tear_down(struct task_struct * t)
++{
++	BUG_ON(!is_realtime(t));
++	TRACE_TASK(t, "tear down called\n");
++	BUG_ON(t->array);
++	BUG_ON(in_list(&t->rt_list));
++	return 0;
++}
++
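++/* Called when new_waiter blocks on the semaphore. If new_waiter has a higher
++ * priority than the current highest-priority waiter of its partition, record
++ * it; if the semaphore holder runs on the same partition, let the holder
++ * inherit the new priority and check whether a preemption is needed.
++ */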
++static long psnedf_pi_block(struct pi_semaphore *sem,
++			    struct task_struct *new_waiter)
++{
++	psnedf_domain_t* 	pedf;
++	rt_domain_t*		edf;
++	struct task_struct*	t;
++	int cpu  = get_partition(new_waiter);
++
++	BUG_ON(!new_waiter);
++
++	if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
++		TRACE_TASK(new_waiter, " boosts priority\n");
++		pedf = task_pedf(new_waiter);
++		edf  = task_edf(new_waiter);
++
++		/* interrupts already disabled */
++		spin_lock(&pedf->lock);
++
++		/* store new highest-priority task */
++		sem->hp.cpu_task[cpu] = new_waiter;
++		if (sem->holder &&
++		    get_partition(sem->holder) == get_partition(new_waiter)) {
++			/* let holder inherit */
++			sem->holder->rt_param.inh_task = new_waiter;
++			t = sem->holder;
++			if (in_list(&t->rt_list)) {
++				/* queued in domain*/
++				list_del(&t->rt_list);
++				/* readd to make priority change take place */
++				if (is_released(t, sched_clock()))
++					__add_ready(edf, t);
++				else
++					__add_release(edf, t);
++			}
++		}
++
++		/* check if we need to reschedule */
++		if (edf_preemption_needed(edf, current))
++			preempt(pedf);
++
++		spin_unlock(&pedf->lock);
++	}
++
++	return 0;
++}
++
++static long psnedf_inherit_priority(struct pi_semaphore *sem,
++				    struct task_struct *new_owner)
++{
++	int cpu  = get_partition(new_owner);
++
++	/* FIXME: This doesn't look correct at all!
++	 *        Why do we inherit in any case???
++	 */
++	new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
++	if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
++		TRACE_TASK(new_owner,
++			   "inherited priority from %s/%d\n",
++			   sem->hp.cpu_task[cpu]->comm,
++			   sem->hp.cpu_task[cpu]->pid);
++	} else
++		TRACE_TASK(new_owner,
++			   "cannot inherit priority: "
++			   "no higher priority job waits on this CPU!\n");
++	/* make new owner non-preemptable as required by FMLP under
++	 * PSN-EDF.
++	 */
++	make_np(new_owner);
++	return 0;
++}
++
++
++/* This function is called on a semaphore release, and assumes that
++ * the current task is also the semaphore holder.
++ */
++static long psnedf_return_priority(struct pi_semaphore *sem)
++{
++	struct task_struct* 	t    = current;
++	psnedf_domain_t* 	pedf = task_pedf(t);
++	rt_domain_t*		edf  = task_edf(t);
++	int 			ret  = 0;
++	int			cpu  = get_partition(current);
++
++
++        /* Find new highest-priority semaphore task
++	 * if holder task is the current hp.cpu_task[cpu].
++	 *
++	 * Calling function holds sem->wait.lock.
++	 */
++	if (t == sem->hp.cpu_task[cpu])
++		set_hp_cpu_task(sem, cpu, edf_higher_prio);
++
++	take_np(t);
++	if (current->rt_param.inh_task) {
++		TRACE_CUR("return priority of %s/%d\n",
++			  current->rt_param.inh_task->comm,
++			  current->rt_param.inh_task->pid);
++		spin_lock(&pedf->lock);
++
++		/* Reset inh_task to NULL. */
++		current->rt_param.inh_task = NULL;
++
++		/* check if we need to reschedule */
++		if (edf_preemption_needed(edf, current))
++			preempt(pedf);
++
++		spin_unlock(&pedf->lock);
++	} else
++		TRACE_CUR(" no priority to return %p\n", sem);
++
++	return ret;
++}
++
++
++/*	Plugin object	*/
++static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
++	.plugin_name		= "PSN-EDF",
++	.srp_active		= 1,
++	.scheduler_tick		= psnedf_scheduler_tick,
++	.prepare_task		= psnedf_prepare_task,
++	.sleep_next_period	= complete_job,
++	.tear_down		= psnedf_tear_down,
++	.schedule		= psnedf_schedule,
++	.wake_up_task		= psnedf_wake_up_task,
++	.task_blocks		= psnedf_task_blocks,
++	.pi_block		= psnedf_pi_block,
++	.inherit_priority	= psnedf_inherit_priority,
++	.return_priority	= psnedf_return_priority
++};
++
++
++static int __init init_psn_edf(void)
++{
++	int i;
++
++	for (i = 0; i < NR_CPUS; i++)
++	{
++		psnedf_domain_init(remote_pedf(i),
++				   psnedf_check_resched, i);
++		printk("PSN-EDF: CPU partition %d initialized.\n", i);
++	}
++	return register_sched_plugin(&psn_edf_plugin);
++}
++
++
++
++module_init(init_psn_edf);
+diff --git a/litmus/sched_rm.c b/litmus/sched_rm.c
+new file mode 100644
+index 0000000..57acde4
+--- /dev/null
++++ b/litmus/sched_rm.c
+@@ -0,0 +1,397 @@
++
++/* RM implementation.
++ * Will support the M-PCP eventually.
++ */
++
++#include <linux/percpu.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++
++#include <linux/module.h>
++
++#include <litmus/litmus.h>
++#include <litmus/jobs.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/rm_common.h>
++
++
++typedef struct {
++	rt_domain_t 		domain;
++	int          		cpu;
++	struct task_struct* 	scheduled; /* only RT tasks */
++	spinlock_t   		lock;      /* protects the domain and
++                                            * serializes scheduling decisions
++					    */
++} rm_domain_t;
++
++DEFINE_PER_CPU(rm_domain_t, rm_domains);
++
++#define local_dom		(&__get_cpu_var(rm_domains).domain)
++#define local_part		(&__get_cpu_var(rm_domains))
++#define remote_dom(cpu)		(&per_cpu(rm_domains, cpu).domain)
++#define remote_part(cpu)	(&per_cpu(rm_domains, cpu))
++#define task_dom(task)		remote_dom(get_partition(task))
++#define task_part(task)		remote_part(get_partition(task))
++
++
++static void prm_domain_init(rm_domain_t* part,
++			   check_resched_needed_t check,
++			   int cpu)
++{
++	rm_domain_init(&part->domain, check);
++	part->cpu      		= cpu;
++	part->lock     		= SPIN_LOCK_UNLOCKED;
++	part->scheduled		= NULL;
++}
++
++static void requeue(struct task_struct* t, rt_domain_t *dom)
++{
++	/* only requeue if t is actually running */
++	BUG_ON(!is_running(t));
++
++	if (t->state != TASK_RUNNING)
++		TRACE_TASK(t, "requeue: !TASK_RUNNING");
++
++	set_rt_flags(t, RT_F_RUNNING);
++	if (is_released(t, sched_clock()))
++		__add_ready(dom, t);
++	else
++		__add_release(dom, t); /* it has got to wait */
++}
++
++/* we assume the lock is being held */
++static void preempt(rm_domain_t *part)
++{
++	if (smp_processor_id() == part->cpu) {
++		if (part->scheduled && is_np(part->scheduled))
++			request_exit_np(part->scheduled);
++		else
++			set_tsk_need_resched(current);
++	} else
++		/* in case it is a remote CPU we have to defer the
++		 * decision to the remote CPU
++		 */
++		smp_send_reschedule(part->cpu);
++}
++
++/* This check is trivial in partitioned systems as we only have to consider
++ * the CPU of the partition.
++ */
++static int rm_check_resched(rt_domain_t *dom)
++{
++	rm_domain_t *part = container_of(dom, rm_domain_t, domain);
++	int ret = 0;
++
++	/* because this is a callback from rt_domain_t we already hold
++	 * the necessary lock for the ready queue
++	 */
++	if (rm_preemption_needed(dom, part->scheduled)) {
++		preempt(part);
++		ret = 1;
++	}
++	return ret;
++}
++
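++/* Change t's effective priority. If t is queued in its domain, requeue it so
++ * that the new priority takes effect; otherwise check whether a preemption is
++ * now required.
++ */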
++static void __rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio,
++			  rm_domain_t* part)
++{
++	t->rt_param.cur_prio = new_prio;
++	if (in_list(&t->rt_list)) {
++		list_del(&t->rt_list);
++		requeue(t, &part->domain);
++	} else
++		rm_check_resched(&part->domain);
++}
++
++/* call only with IRQs disabled */
++void rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio)
++{
++	unsigned long flags;
++	rm_domain_t *part = task_part(t);
++
++	BUG_ON(!is_realtime(t));
++	spin_lock_irqsave(&part->lock, flags);
++	__rm_set_prio(t, new_prio, part);
++	spin_unlock_irqrestore(&part->lock, flags);
++}
++
++static void rm_scheduler_tick(void)
++{
++	unsigned long       flags;
++	struct task_struct *t            = current;
++	rt_domain_t        *dom          = local_dom;
++	rm_domain_t    	   *part       	 = local_part;
++
++	/* Check for inconsistency. We don't need the lock for this since
++	 * ->scheduled is only changed in schedule, which obviously is not
++	 *  executing in parallel on this CPU
++	 */
++	BUG_ON(is_realtime(t) && t != part->scheduled);
++
++/*	if (is_realtime(t) && budget_exhausted(t)) {
++		if (!is_np(t))
++			set_tsk_need_resched(t);
++		else {
++			TRACE("rm_scheduler_tick: "
++			      "%d is non-preemptable, "
++			      "preemption delayed.\n", t->pid);
++			request_exit_np(t);
++		}
++	}
++*/
++	spin_lock_irqsave(&part->lock, flags);
++	__release_pending(dom);
++	if (rm_preemption_needed(dom, t))
++		set_tsk_need_resched(t);
++	spin_unlock_irqrestore(&part->lock, flags);
++}
++
++static void job_completion(struct task_struct* t)
++{
++	TRACE_TASK(t, "job_completion().\n");
++	set_rt_flags(t, RT_F_SLEEP);
++	prepare_for_next_period(t);
++}
++
++static int rm_schedule(struct task_struct * prev,
++			     struct task_struct ** next)
++{
++	rm_domain_t* 	part = local_part;
++	rt_domain_t*	dom  = &part->domain;
++
++	int 		sleep, preempt,
++			np, exists, blocks, resched;
++//	int		out_of_time;
++
++	spin_lock(&part->lock);
++
++	/* sanity checking */
++	BUG_ON(part->scheduled && part->scheduled != prev);
++	BUG_ON(part->scheduled && !is_realtime(prev));
++
++	/* (0) Determine state */
++	exists      = part->scheduled != NULL;
++	blocks      = exists && !is_running(part->scheduled);
++//	out_of_time = exists && budget_exhausted(part->scheduled);
++#define out_of_time 0
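++	/* Note: budget enforcement is currently disabled in this plugin;
++	 * out_of_time is hard-wired to zero.
++	 */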
++	np 	    = exists && is_np(part->scheduled);
++	sleep	    = exists && get_rt_flags(part->scheduled) == RT_F_SLEEP;
++	preempt     = rm_preemption_needed(dom, prev);
++
++	/* If we need to preempt do so.
++	 * The following checks set resched to 1 in case of special
++	 * circumstances.
++	 */
++	resched = preempt;
++
++	/* If a task blocks we have no choice but to reschedule.
++	 */
++	if (blocks)
++		resched = 1;
++
++	/* Request a sys_exit_np() call if we would like to preempt but cannot.
++	 * Multiple calls to request_exit_np() don't hurt.
++	 */
++	if (np && (out_of_time || preempt || sleep))
++		request_exit_np(part->scheduled);
++
++	/* Any task that is preemptable and either exhausts its execution
++	 * budget or wants to sleep completes. We may have to reschedule after
++	 * this.
++	 */
++	if (!np && (out_of_time || sleep)) {
++		job_completion(part->scheduled);
++		resched = 1;
++	}
++
++	/* The final scheduling decision. Do we need to switch for some reason?
++	 * Switch if we are in RT mode and have no task or if we need to
++	 * resched.
++	 */
++	*next = NULL;
++	if ((!np || blocks) && (resched || !exists)) {
++		/* Take care of a previously scheduled
++		 * job by taking it out of the Linux runqueue.
++		 */
++		if (part->scheduled) {
++			/* as opposed to global schedulers that switch without
++			 * a lock being held we can requeue already here since
++			 * no other CPU will schedule from this domain.
++			 */
++			if (!blocks)
++				requeue(part->scheduled, dom);
++		}
++		*next = __take_ready(dom);
++	} else
++		/* Only override Linux scheduler if we have a real-time task
++		 * scheduled that needs to continue.
++		 */
++		if (exists)
++			*next = prev;
++
++	if (*next)
++		set_rt_flags(*next, RT_F_RUNNING);
++
++	part->scheduled = *next;
++	spin_unlock(&part->lock);
++	return 0;
++}
++
++
++/*	Prepare a task for running in RT mode
++ *	Enqueues the task into the master queue data structure
++ *	returns
++ *		-EPERM  if task is not TASK_STOPPED
++ */
++static long rm_prepare_task(struct task_struct * t)
++{
++	rt_domain_t* 		dom  = task_dom(t);
++	rm_domain_t* 	part = task_part(t);
++	unsigned long		flags;
++
++	TRACE("[%d] P-RM: prepare task %d on CPU %d\n",
++		smp_processor_id(), t->pid, get_partition(t));
++	if (t->state == TASK_STOPPED) {
++//FIXME		if (!t->rt_param.task_params.prio) {
++			TRACE_TASK(t, "using rate-monotonic prio assignment\n");
++			t->rt_param.pcp_prio.prio = get_rt_period(t);
++//		} else {
++//			TRACE_TASK(t, "using user-defined static prio assignment\n");
++//			t->rt_param.pcp_prio.prio = t->rt_param.task_params.prio;
++//		}
++		t->rt_param.pcp_prio.in_global_cs  = 0;
++		t->rt_param.pcp_prio.pid           = t->pid;
++		t->rt_param.cur_prio = &t->rt_param.pcp_prio;
++		INIT_LIST_HEAD(&t->rt_param.owned_semaphores);
++		/* 1ms delay */
++		release_at(t, sched_clock() + 1000000);
++
++		/* The task must be in the TASK_RUNNING state while it is in our
++		 * queues, otherwise the signal code will try to wake it up, with
++		 * fatal consequences.
++		 */
++		t->state = TASK_RUNNING;
++
++		spin_lock_irqsave(&part->lock, flags);
++		t->rt_param.litmus_controlled = 1;
++		__add_release(dom, t);
++		spin_unlock_irqrestore(&part->lock, flags);
++		return 0;
++	} else
++		return -EPERM;
++}
++
++static void rm_wake_up_task(struct task_struct *task)
++{
++	unsigned long		flags;
++	rm_domain_t* 	part = task_part(task);
++	rt_domain_t* 		dom  = task_dom(task);
++
++	TRACE_TASK(task, "P-RM: unsuspends.\n");
++
++	spin_lock_irqsave(&part->lock, flags);
++	if (!task->rt_param.litmus_controlled) {
++		BUG_ON(in_list(&task->rt_list));
++		task->rt_param.litmus_controlled = 1;
++		task->state = TASK_RUNNING;
++		requeue(task, dom);
++	}
++	spin_unlock_irqrestore(&part->lock, flags);
++}
++
++static void rm_task_blocks(struct task_struct *t)
++{
++	BUG_ON(!is_realtime(t));
++	/* not really anything to do since it can only block if
++	 * it is running, and when it is not running it is not in any
++	 * queue anyway.
++	 */
++	TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
++	BUG_ON(in_list(&t->rt_list));
++	t->rt_param.litmus_controlled = 0;
++}
++
++
++/* When _tear_down is called, the task should not be in any queue any more
++ * as it must have blocked first. We don't have any internal state for the task,
++ * it is all in the task_struct.
++ */
++static long rm_tear_down(struct task_struct * t)
++{
++	BUG_ON(!is_realtime(t));
++	TRACE_TASK(t, "tear down called\n");
++	BUG_ON(t->array);
++	BUG_ON(in_list(&t->rt_list));
++	return 0;
++}
++
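++/* Priority value used to boost the priority of a job while it holds a
++ * semaphore (see rm_inherit_priority() and rm_return_priority()).
++ */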
++static struct pcp_priority boosted = {0, 1, INT_MAX};
++
++static long rm_pi_block(struct pi_semaphore *sem,
++			struct task_struct *new_waiter)
++{
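++	/* Nothing to do when a task blocks on a semaphore: with priority
++	 * boosting, priorities change only when the semaphore is acquired
++	 * (rm_inherit_priority) and released (rm_return_priority).
++	 */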
++	return 0;
++}
++
++static long rm_inherit_priority(struct pi_semaphore *sem,
++				struct task_struct *new_owner)
++{
++	rm_set_prio(new_owner, &boosted);
++	TRACE_TASK(new_owner, "priority boosted");
++	make_np(new_owner);
++	return 0;
++}
++
++
++/* This function is called on a semaphore release, and assumes that
++ * the current task is also the semaphore holder.
++ */
++static long rm_return_priority(struct pi_semaphore *sem)
++{
++	struct task_struct* 	t    = current;
++
++	take_np(t);
++	/* reset prio to trigger resched if required */
++	rm_set_prio(t, &t->rt_param.pcp_prio);
++	TRACE_TASK(t, "prio boost ended");
++	return 0;
++}
++
++/*	Plugin object	*/
++static struct sched_plugin p_rm_plugin __cacheline_aligned_in_smp = {
++	.plugin_name		= "P-RM",
++	/* PCP and SRP don't really work together, but this is something the
++	 * user has to get right for the moment. 
++	 * System will not crash and burn, but timing correctness is not ensured.
++	 * Just don't use both APIs at the same time for now.
++	 */
++	.pcp_active		= 1, 
++	.srp_active		= 1,
++	.scheduler_tick		= rm_scheduler_tick,
++	.prepare_task		= rm_prepare_task,
++	.sleep_next_period	= complete_job,
++	.tear_down		= rm_tear_down,
++	.schedule		= rm_schedule,
++	.wake_up_task		= rm_wake_up_task,
++	.task_blocks		= rm_task_blocks,
++	.pi_block		= rm_pi_block,
++	.inherit_priority	= rm_inherit_priority,
++	.return_priority	= rm_return_priority
++};
++
++static int __init init_rm(void)
++{
++	int i;
++
++	for (i = 0; i < NR_CPUS; i++)
++	{
++		prm_domain_init(remote_part(i),
++				rm_check_resched, i);
++		printk("P-RM: CPU partition %d initialized.\n", i);
++	}
++	return register_sched_plugin(&p_rm_plugin);
++}
++
++
++
++module_init(init_rm);
+diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
+new file mode 100644
+index 0000000..0976e83
+--- /dev/null
++++ b/litmus/sched_trace.c
+@@ -0,0 +1,541 @@
++/* sched_trace.c -- record scheduling events to a byte stream.
++ *
++ * TODO: Move ring buffer to a lockfree implementation.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <asm/semaphore.h>
++#include <asm/uaccess.h>
++#include <linux/module.h>
++
++#include <litmus/sched_trace.h>
++#include <litmus/litmus.h>
++
++
++typedef struct {
++        /*	guard read and write pointers			*/
++	spinlock_t 	lock;
++	/*	guard against concurrent freeing of buffer 	*/
++	rwlock_t	del_lock;
++
++	/*	memory allocated for ring buffer		*/
++	unsigned long	order;
++	char*  		buf;
++	char*		end;
++
++	/*	Read/write pointer. May not cross.
++	 *	They point to the position of next write and
++	 *	last read.
++	 */
++	char* 		writep;
++	char*		readp;
++
++} ring_buffer_t;
++
++#define EMPTY_RING_BUFFER {	\
++	.lock     = SPIN_LOCK_UNLOCKED,		\
++	.del_lock = RW_LOCK_UNLOCKED,  		\
++	.buf      = NULL,      			\
++	.end      = NULL,			\
++	.writep   = NULL,			\
++	.readp    = NULL			\
++}
++
++void rb_init(ring_buffer_t* buf)
++{
++	*buf = (ring_buffer_t) EMPTY_RING_BUFFER;
++}
++
++int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
++{
++	unsigned long flags;
++	int error = 0;
++	char *mem;
++
++	/* do memory allocation while not atomic */
++	mem = (char *) __get_free_pages(GFP_KERNEL, order);
++	if (!mem)
++		return -ENOMEM;
++	write_lock_irqsave(&buf->del_lock, flags);
++	BUG_ON(buf->buf);
++	buf->buf = mem;
++	buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
++	memset(buf->buf, 0xff, buf->end - buf->buf);
++	buf->order = order;
++	buf->writep = buf->buf + 1;
++	buf->readp  = buf->buf;
++	write_unlock_irqrestore(&buf->del_lock, flags);
++	return error;
++}
++
++int rb_free_buf(ring_buffer_t* buf)
++{
++	unsigned long flags;
++	int error = 0;
++	write_lock_irqsave(&buf->del_lock, flags);
++	BUG_ON(!buf->buf);
++	free_pages((unsigned long) buf->buf, buf->order);
++	buf->buf    = NULL;
++	buf->end    = NULL;
++	buf->writep = NULL;
++	buf->readp  = NULL;
++	write_unlock_irqrestore(&buf->del_lock, flags);
++	return error;
++}
++
++/* Assumption: concurrent writes are serialized externally
++ *
++ * Will only succeed if there is enough space for all len bytes.
++ */
++int rb_put(ring_buffer_t* buf, char* mem, size_t len)
++{
++	unsigned long flags;
++	char *r, *w;
++	int error = 0;
++	read_lock_irqsave(&buf->del_lock, flags);
++	if (!buf->buf) {
++		error = -ENODEV;
++		goto out;
++	}
++	spin_lock(&buf->lock);
++	r = buf->readp;
++	w = buf->writep;
++	spin_unlock(&buf->lock);
++	if (r < w && buf->end - w >= len - 1) {
++		/* easy case: there is enough space in the buffer
++		 * to write it in one continuous chunk */
++		memcpy(w, mem, len);
++		w += len;
++		if (w > buf->end)
++			/* special case: fit exactly into buffer
++			 * w is now buf->end + 1
++			 */
++			w = buf->buf;
++	} else if (w < r && r - w >= len) { /* >= len because r and w may not cross */
++		/* we are constrained by the read pointer but there
++		 * is enough space
++		 */
++		memcpy(w, mem, len);
++		w += len;
++	} else if (r <= w && buf->end - w < len - 1) {
++		/* the wrap around case: there may or may not be space */
++		if ((buf->end - w) + (r - buf->buf) >= len - 1) {
++			/* copy chunk that fits at the end */
++			memcpy(w, mem, buf->end - w + 1);
++			mem += buf->end - w + 1;
++			len -= (buf->end - w + 1);
++			w = buf->buf;
++			/* copy the rest */
++			memcpy(w, mem, len);
++			w += len;
++		}
++		else
++			error = -ENOMEM;
++	} else {
++		error = -ENOMEM;
++	}
++	if (!error) {
++		spin_lock(&buf->lock);
++		buf->writep = w;
++		spin_unlock(&buf->lock);
++	}
++ out:
++	read_unlock_irqrestore(&buf->del_lock, flags);
++	return error;
++}
++
++/* Assumption: concurrent reads are serialized externally */
++int rb_get(ring_buffer_t* buf, char* mem, size_t len)
++{
++	unsigned long flags;
++	char *r, *w;
++	int error = 0;
++	read_lock_irqsave(&buf->del_lock, flags);
++	if (!buf->buf) {
++		error = -ENODEV;
++		goto out;
++	}
++	spin_lock(&buf->lock);
++	r = buf->readp;
++	w = buf->writep;
++	spin_unlock(&buf->lock);
++
++	if (w <= r && buf->end - r >= len) {
++		/* easy case: there is enough data in the buffer
++		 * to get it in one chunk */
++		memcpy(mem, r + 1, len);
++		r += len;
++		error = len;
++
++	} else if (r + 1 < w && w - r - 1 >= len) {
++		/* we are constrained by the write pointer but
++		 * there is enough data
++		 */
++		memcpy(mem, r + 1, len);
++		r += len;
++		error = len;
++
++	} else if (r + 1 < w && w - r - 1 < len) {
++		/* we are constrained by the write pointer and there
++		 * is not enough data
++		 */
++		memcpy(mem, r + 1, w - r - 1);
++		error = w - r - 1;
++		r    += w - r - 1;
++
++	} else if (w <= r && buf->end - r < len) {
++		/* the wrap around case: there may or may not be enough data
++		 * first let's get what is available
++		 */
++		memcpy(mem, r + 1, buf->end - r);
++		error += (buf->end - r);
++		mem   += (buf->end - r);
++		len   -= (buf->end - r);
++		r     += (buf->end - r);
++
++		if (w > buf->buf) {
++			/* there is more to get */
++			r = buf->buf - 1;
++			if (w - r >= len) {
++				/* plenty */
++				memcpy(mem, r + 1, len);
++				error += len;
++				r     += len;
++			} else {
++				memcpy(mem, r + 1, w - r - 1);
++				error += w - r - 1;
++				r     += w - r - 1;
++			}
++		}
++	} /* nothing available */
++
++	if (error > 0) {
++		spin_lock(&buf->lock);
++		buf->readp = r;
++		spin_unlock(&buf->lock);
++	}
++ out:
++	read_unlock_irqrestore(&buf->del_lock, flags);
++	return error;
++}
++
++
++
++/******************************************************************************/
++/*                        DEVICE FILE DRIVER                                  */
++/******************************************************************************/
++
++
++
++/* Allocate a buffer of about 1 MB per CPU
++ * (order 8 = 2^8 pages of 4 KB each on i386).
++ */
++#define BUFFER_ORDER 8
++
++typedef struct {
++	ring_buffer_t 		buf;
++	atomic_t		reader_cnt;
++	struct semaphore	reader_mutex;
++} trace_buffer_t;
++
++
++/* This does not initialize the semaphore!! */
++
++#define EMPTY_TRACE_BUFFER \
++	{ .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
++
++static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
++
++#ifdef CONFIG_SCHED_DEBUG_TRACE
++static spinlock_t		log_buffer_lock = SPIN_LOCK_UNLOCKED;
++#endif
++static trace_buffer_t 		log_buffer = EMPTY_TRACE_BUFFER;
++
++static void init_buffers(void)
++{
++	int i;
++
++	for (i = 0; i < NR_CPUS; i++) {
++		rb_init(&per_cpu(trace_buffer, i).buf);
++		init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
++		atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
++	}
++	/* only initialize the mutex, the rest was initialized as part
++	 * of the static initialization macro
++	 */
++	init_MUTEX(&log_buffer.reader_mutex);
++}
++
++static int trace_release(struct inode *in, struct file *filp)
++{
++	int error 		= -EINVAL;
++	trace_buffer_t* buf 	= filp->private_data;
++
++	BUG_ON(!filp->private_data);
++
++	if (down_interruptible(&buf->reader_mutex)) {
++		error = -ERESTARTSYS;
++		goto out;
++	}
++
++	/*	last release must deallocate buffers 	*/
++	if (atomic_dec_return(&buf->reader_cnt) == 0)
++		error = rb_free_buf(&buf->buf);
++	else
++		error = 0;
++
++	up(&buf->reader_mutex);
++ out:
++	return error;
++}
++
++static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
++		      loff_t *f_pos)
++{
++	/* 	we ignore f_pos, this is strictly sequential */
++
++	ssize_t error = -EINVAL;
++	char*   mem;
++	trace_buffer_t *buf = filp->private_data;
++
++	if (down_interruptible(&buf->reader_mutex)) {
++		error = -ERESTARTSYS;
++		goto out;
++	}
++
++	if (len > 64 * 1024)
++		len = 64 * 1024;
++	mem = kmalloc(len, GFP_KERNEL);
++	if (!mem) {
++		error = -ENOMEM;
++		goto out_unlock;
++	}
++
++	error = rb_get(&buf->buf, mem, len);
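++	/* block (by polling) until data becomes available or a signal arrives */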
++	while (!error) {
++		set_current_state(TASK_INTERRUPTIBLE);
++		schedule_timeout(110);
++		if (signal_pending(current))
++			error = -ERESTARTSYS;
++		else
++			error = rb_get(&buf->buf, mem, len);
++	}
++
++	if (error > 0 && copy_to_user(to, mem, error))
++		error = -EFAULT;
++
++	kfree(mem);
++ out_unlock:
++	up(&buf->reader_mutex);
++ out:
++	return error;
++}
++
++
++/* trace_open - Open one of the per-CPU sched_trace buffers.
++ */
++static int trace_open(struct inode *in, struct file *filp)
++{
++	int error 		= -EINVAL;
++	int cpu   		= MINOR(in->i_rdev);
++	trace_buffer_t* buf;
++
++	if (!cpu_online(cpu)) {
++		printk(KERN_WARNING "sched trace: "
++			"CPU #%d is not online. (open failed)\n", cpu);
++		error = -ENODEV;
++		goto out;
++	}
++
++	buf = &per_cpu(trace_buffer, cpu);
++
++	if (down_interruptible(&buf->reader_mutex)) {
++		error = -ERESTARTSYS;
++		goto out;
++	}
++
++	/*	first open must allocate buffers 	*/
++	if (atomic_inc_return(&buf->reader_cnt) == 1) {
++		if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
++		{
++			atomic_dec(&buf->reader_cnt);
++			goto out_unlock;
++		}
++	}
++
++	error = 0;
++	filp->private_data = buf;
++
++ out_unlock:
++	up(&buf->reader_mutex);
++ out:
++	return error;
++}
++
++/* log_open - open the global log message ring buffer.
++ */
++static int log_open(struct inode *in, struct file *filp)
++{
++	int error 		= -EINVAL;
++	trace_buffer_t* buf;
++
++	buf = &log_buffer;
++
++	if (down_interruptible(&buf->reader_mutex)) {
++		error = -ERESTARTSYS;
++		goto out;
++	}
++
++	/*	first open must allocate buffers 	*/
++	if (atomic_inc_return(&buf->reader_cnt) == 1) {
++		if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
++		{
++			atomic_dec(&buf->reader_cnt);
++			goto out_unlock;
++		}
++	}
++
++	error = 0;
++	filp->private_data = buf;
++
++ out_unlock:
++	up(&buf->reader_mutex);
++ out:
++	return error;
++}
++
++/******************************************************************************/
++/*                          Device Registration                               */
++/******************************************************************************/
++
++/* the major numbers are from the unassigned/local use block
++ *
++ * This should be converted to dynamic allocation at some point...
++ */
++#define TRACE_MAJOR	250
++#define LOG_MAJOR	251
++
++/* trace_fops - The file operations for accessing the per-CPU scheduling event
++ *              trace buffers.
++ */
++struct file_operations trace_fops = {
++	.owner   = THIS_MODULE,
++	.open    = trace_open,
++	.release = trace_release,
++	.read    = trace_read,
++};
++
++/* log_fops  - The file operations for accessing the global LITMUS log message
++ *             buffer.
++ *
++ * Except for opening the device file it uses the same operations as trace_fops.
++ */
++struct file_operations log_fops = {
++	.owner   = THIS_MODULE,
++	.open    = log_open,
++	.release = trace_release,
++	.read    = trace_read,
++};
++
++static int __init register_buffer_dev(const char* name,
++				      struct file_operations* fops,
++				      int major, int count)
++{
++	dev_t  trace_dev;
++	struct cdev *cdev;
++	int error = 0;
++
++	trace_dev = MKDEV(major, 0);
++	error     = register_chrdev_region(trace_dev, count, name);
++	if (error)
++	{
++		printk(KERN_WARNING "sched trace: "
++		       "Could not register major/minor number %d\n", major);
++		return error;
++	}
++	cdev = cdev_alloc();
++	if (!cdev) {
++		printk(KERN_WARNING "sched trace: "
++			"Could not get a cdev for %s.\n", name);
++		return -ENOMEM;
++	}
++	cdev->owner = THIS_MODULE;
++	cdev->ops   = fops;
++	error = cdev_add(cdev, trace_dev, count);
++	if (error) {
++		printk(KERN_WARNING "sched trace: "
++			"add_cdev failed for %s.\n", name);
++		return -ENOMEM;
++	}
++	return error;
++
++}
++
++static int __init init_sched_trace(void)
++{
++	int error1 = 0, error2 = 0;
++
++	printk("Initializing scheduler trace device\n");
++	init_buffers();
++
++	error1 = register_buffer_dev("schedtrace", &trace_fops,
++				    TRACE_MAJOR, NR_CPUS);
++
++	error2 = register_buffer_dev("litmus_log", &log_fops,
++				     LOG_MAJOR, 1);
++	if (error1 || error2)
++		return min(error1, error2);
++	else
++		return 0;
++}
++
++module_init(init_sched_trace);
++
++/******************************************************************************/
++/*                                KERNEL API                                  */
++/******************************************************************************/
++
++/* The per-CPU buffer used to format LITMUS log messages. Don't put it on the
++ * stack; it is too big for that and the kernel gets very picky with nested
++ * interrupts and small stacks.
++ */
++
++#ifdef CONFIG_SCHED_DEBUG_TRACE
++
++#define MSG_SIZE 255
++static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
++
++/* sched_trace_log_message - This is the only function that accesses the
++ *                           log buffer inside the kernel for writing.
++ *                           Concurrent access to it is serialized via the
++ *                           log_buffer_lock.
++ *
++ *                           The maximum length of a formatted message is 255.
++ */
++void sched_trace_log_message(const char* fmt, ...)
++{
++	unsigned long 	flags;
++	va_list 	args;
++	size_t		len;
++	char*		buf;
++
++	va_start(args, fmt);
++	local_irq_save(flags);
++
++	/* format message */
++	buf = __get_cpu_var(fmt_buffer);
++	len = vscnprintf(buf, MSG_SIZE, fmt, args);
++
++	spin_lock(&log_buffer_lock);
++	/* Don't copy the trailing null byte, we don't want null bytes
++	 * in a text file.
++	 */
++	rb_put(&log_buffer.buf, buf, len);
++	spin_unlock(&log_buffer_lock);
++
++	local_irq_restore(flags);
++	va_end(args);
++}
++
++#endif
++
+diff --git a/litmus/sync.c b/litmus/sync.c
+new file mode 100644
+index 0000000..4405228
+--- /dev/null
++++ b/litmus/sync.c
+@@ -0,0 +1,84 @@
++/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
++ *
++ *
++ */
++
++#include <asm/atomic.h>
++#include <asm/uaccess.h>
++#include <linux/spinlock.h>
++#include <linux/list.h>
++#include <linux/sched.h>
++#include <linux/completion.h>
++
++#include <litmus/litmus.h>
++#include <litmus/jobs.h>
++
++static DECLARE_COMPLETION(ts_release);
++
++static long do_wait_for_ts_release(void)
++{
++	long ret = 0;
++	
++	/* If the interruption races with a release, the completion object
++	 * may have a non-zero counter. To avoid this problem, this should
++	 * be replaced by wait_for_completion().
++	 *
++	 * For debugging purposes, this is interruptible for now. 
++	 */
++	ret = wait_for_completion_interruptible(&ts_release);
++
++	return ret;
++}
++
++
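++/* Assign each task that is waiting on ts_release a release time of start plus
++ * its phase, then wake all of them up.
++ */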
++static long do_release_ts(lt_t start)
++{
++	int  task_count = 0;
++	unsigned long flags;
++	struct list_head	*pos;
++	struct task_struct 	*t;	
++	
++
++	spin_lock_irqsave(&ts_release.wait.lock, flags);
++		
++	list_for_each(pos, &ts_release.wait.task_list) {
++		t = (struct task_struct*) list_entry(pos, 
++						     struct __wait_queue, 
++						     task_list)->private;
++		task_count++;
++		release_at(t, start + t->rt_param.task_params.phase);
++	}
++	
++	spin_unlock_irqrestore(&ts_release.wait.lock, flags);
++
++	complete_n(&ts_release, task_count);
++
++	return task_count;
++}
++
++
++asmlinkage long sys_wait_for_ts_release(void)
++{
++	long ret = -EPERM;
++	struct task_struct *t = current;
++
++	if (is_realtime(t))
++		ret = do_wait_for_ts_release();
++	
++	return ret;
++}
++
++
++asmlinkage long sys_release_ts(lt_t __user *__delay)
++{
++	long ret;
++	lt_t delay;
++
++	/* FIXME: check capabilities... */
++	
++	ret = copy_from_user(&delay, __delay, sizeof(lt_t));
++	if (ret == 0)
++		ret = do_release_ts(sched_clock() + delay);
++
++	return ret;
++}
+diff --git a/litmus/trace.c b/litmus/trace.c
+new file mode 100644
+index 0000000..bcdf103
+--- /dev/null
++++ b/litmus/trace.c
+@@ -0,0 +1,302 @@
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <asm/semaphore.h>
++#include <asm/uaccess.h>
++#include <linux/module.h>
++
++#include <litmus/trace.h>
++
++/******************************************************************************/
++/*                          Allocation                                        */
++/******************************************************************************/
++
++struct ft_buffer* trace_ts_buf = NULL;
++
++static unsigned int ts_seq_no = 0;
++
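++/* Feather-Trace callback: store the event ID, the current TSC value, a global
++ * sequence number, and the CPU in the timestamp buffer (if a slot is free).
++ */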
++feather_callback void save_timestamp(unsigned long event)
++{
++	unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no);
++	struct timestamp *ts;
++	if (ft_buffer_start_write(trace_ts_buf, (void**)  &ts)) {		
++		ts->event     = event;
++		ts->timestamp = ft_read_tsc();
++		ts->seq_no    = seq_no;
++		ts->cpu       = raw_smp_processor_id();
++		ft_buffer_finish_write(trace_ts_buf, ts);
++	}
++}
++
++static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
++{
++	struct ft_buffer* buf;
++	size_t total = (size + 1) * count;
++	char* mem;
++	int order = 0, pages = 1;
++
++	buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL);
++	if (!buf)
++		return NULL;
++
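++	/* round the requested size up to whole pages and determine the
++	 * smallest sufficient power-of-two page count (the allocation order)
++	 */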
++	total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
++	while (pages < total) {
++		order++;
++		pages *= 2;
++	}
++
++	mem = (char*) __get_free_pages(GFP_KERNEL, order);
++	if (!mem) {
++		kfree(buf);
++		return NULL;
++	}
++	
++	if (!init_ft_buffer(buf, count, size, 			     
++			    mem + (count * size),  /* markers at the end */
++			    mem)) {                /* buffer objects     */
++		free_pages((unsigned long) mem, order);
++		kfree(buf);
++		return NULL;
++	}
++	return buf;
++}
++
++static void free_ft_buffer(struct ft_buffer* buf)
++{
++	int order = 0, pages = 1;
++	size_t total;
++
++	if (buf) {
++		total = (buf->slot_size + 1) * buf->slot_count;
++		total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
++		while (pages < total) {
++			order++;
++			pages *= 2;
++		}
++		free_pages((unsigned long) buf->buffer_mem, order);
++		kfree(buf);
++	}
++}
++
++
++/******************************************************************************/
++/*                        DEVICE FILE DRIVER                                  */
++/******************************************************************************/
++
++#define NO_TIMESTAMPS 262144
++
++static DECLARE_MUTEX(feather_lock);
++static int use_count = 0;
++
++static int trace_release(struct inode *in, struct file *filp)
++{
++	int err 		= -EINVAL;
++
++	if (down_interruptible(&feather_lock)) {
++		err = -ERESTARTSYS;
++		goto out;
++	}
++
++	printk(KERN_ALERT "%s/%d disconnects from feather trace device. "
++	       "use_count=%d\n",
++	       current->comm, current->pid, use_count);
++
++	if (use_count == 1) {
++		/* disable events */
++		ft_disable_all_events();
++		
++		/* wait for any pending events to complete */
++		set_current_state(TASK_UNINTERRUPTIBLE);
++		schedule_timeout(HZ);
++		
++		printk(KERN_ALERT "Failed trace writes: %u\n", 
++		       trace_ts_buf->failed_writes);
++	
++		free_ft_buffer(trace_ts_buf);
++		trace_ts_buf = NULL;
++	}
++
++	use_count--;
++	err = 0;
++	up(&feather_lock);
++out:
++	return err;
++}
++
++
++static ssize_t trace_read(struct file *filp, char __user *to, size_t len, 
++		      loff_t *f_pos)
++{
++	/* 	we ignore f_pos, this is strictly sequential */	
++	ssize_t error = 0;
++	struct timestamp ts;
++
++	if (down_interruptible(&feather_lock)) {
++		error = -ERESTARTSYS;
++		goto out;
++	}
++
++	
++	while (len >= sizeof(struct timestamp)) {
++		if (ft_buffer_read(trace_ts_buf, &ts)) {
++			if (copy_to_user(to, &ts, sizeof(struct timestamp))) {
++				error = -EFAULT;
++				break;
++			} else {
++				len    -= sizeof(struct timestamp);
++				to     += sizeof(struct timestamp);
++				error  += sizeof(struct timestamp);
++			}
++	        } else {
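++			/* no timestamp available yet: sleep briefly and retry,
++			 * unless a signal is pending
++			 */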
++			set_current_state(TASK_INTERRUPTIBLE);
++			schedule_timeout(50);
++			if (signal_pending(current)) {
++				error = -ERESTARTSYS;
++				break;
++			}
++		}
++	}
++	up(&feather_lock);
++out:
++	return error;
++}
++
++#define ENABLE_CMD 	0
++#define DISABLE_CMD 	1
++
++static ssize_t trace_write(struct file *filp, const char __user *from, 
++			   size_t len, loff_t *f_pos) 
++{
++	ssize_t error = -EINVAL;
++	unsigned long cmd;
++	unsigned long id;
++	
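++	/* expected input: one command word (ENABLE_CMD or DISABLE_CMD)
++	 * followed by one or more event IDs, all of type long
++	 */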
++	if (len % sizeof(long) || len < 2 * sizeof(long))
++		goto out;
++
++	if (copy_from_user(&cmd, from, sizeof(long))) {
++		error = -EFAULT;
++	        goto out;
++	}
++	len  -= sizeof(long);
++	from += sizeof(long);
++
++	if (cmd != ENABLE_CMD && cmd != DISABLE_CMD) 
++		goto out;
++
++	if (down_interruptible(&feather_lock)) {
++		error = -ERESTARTSYS;
++		goto out;
++	}
++	
++	error = sizeof(long);
++	while (len) {
++		if (copy_from_user(&id, from, sizeof(long))) {
++			error = -EFAULT;
++			break;
++		}
++		len  -= sizeof(long);
++		from += sizeof(long);
++		if (cmd) {
++			printk(KERN_INFO 
++			       "Disabling feather-trace event %lu.\n", id);
++			ft_disable_event(id);
++		} else {
++			printk(KERN_INFO 
++			       "Enabling feather-trace event %lu.\n", id);
++			ft_enable_event(id);
++		}
++		error += sizeof(long);
++	}
++	
++	up(&feather_lock);
++ out:
++	return error;
++}
++
++static int trace_open(struct inode *in, struct file *filp) 
++{
++	int err = 0;
++        unsigned int count = NO_TIMESTAMPS;
++
++	if (down_interruptible(&feather_lock)) {
++		err = -ERESTARTSYS;
++		goto out;
++	}
++       
++	while (count && !trace_ts_buf) {
++		printk("trace: trying to allocate %u time stamps.\n", count);
++		trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp));
++		count /= 2;
++	}
++	if (!trace_ts_buf)
++		err = -ENOMEM;
++	else
++		use_count++;
++
++	up(&feather_lock);
++out:
++	return err;
++}
++
++/******************************************************************************/
++/*                          Device Registration                               */
++/******************************************************************************/
++
++#define FT_TRACE_MAJOR	252
++
++struct file_operations ft_trace_fops = {
++	.owner   = THIS_MODULE,
++	.open    = trace_open,
++	.release = trace_release,
++	.write   = trace_write,
++	.read    = trace_read,
++};
++
++
++static int __init register_buffer_dev(const char* name,
++				      struct file_operations* fops, 
++				      int major, int count) 
++{
++	dev_t   trace_dev;
++	struct cdev *cdev;	
++	int error = 0;
++
++	trace_dev = MKDEV(major, 0);
++	error     = register_chrdev_region(trace_dev, count, name);
++	if (error)
++	{
++		printk(KERN_WARNING "trace: "
++		       "Could not register major/minor number %d\n", major);
++		return error;
++	}
++	cdev = cdev_alloc();
++	if (!cdev) {
++		printk(KERN_WARNING "trace: "
++			"Could not get a cdev for %s.\n", name);
++		return -ENOMEM;
++	}
++	cdev->owner = THIS_MODULE;
++	cdev->ops   = fops;
++	error = cdev_add(cdev, trace_dev, count);
++	if (error) {
++		printk(KERN_WARNING "trace: "
++			"add_cdev failed for %s.\n", name);
++		return -ENOMEM;		
++	}
++	return error;
++
++}
++
++static int __init init_sched_trace(void) 
++{
++	int error = 0;
++
++	printk("Initializing Feather-Trace device\n");
++	/* dummy entry to make linker happy */
++	ft_event0(666, save_timestamp);
++
++	error = register_buffer_dev("ft_trace", &ft_trace_fops, 
++				    FT_TRACE_MAJOR, 1);
++	return error;
++}
++
++module_init(init_sched_trace);
diff --git a/index.html b/index.html
index 4e251e9..af6bef4 100644
--- a/index.html
+++ b/index.html
@@ -126,6 +126,23 @@
 	<cite>Proceedings of the 14th IEEE International Conference on Embedded and Real-Time Computing Systems and Applications</cite>, to appear, August 2008. 
 	<a href="http://www.cs.unc.edu/~anderson/papers/rtcsa08.ps">Postscript</a>. <a href="http://www.cs.unc.edu/~anderson/papers/rtcsa08.pdf">PDF</a>.
 	</p>
+	<p><strong>Note:</strong> The work described in this paper took place in a branch that is currently not part of
+	   the main distribution. For reference, we provide the branch as a separate download:
+	</p>
+ 	  <ul>
+	 <li>
+	 	<a href="download/RTCSA08/litmus-rt-RTCSA08.patch">litmus-rt-RTCSA08.patch</a>
+	 </li>
+	 <li>
+	 	<a href="download/RTCSA08/liblitmus-RTCSA08.tgz">liblitmus-RTCSA08.tgz</a>
+	 </li>
+	 <li><a href="download/RTCSA08/SHA256SUMS">SHA256 check sums</a>
+	 </li>
+         </ul>
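+	 <p>The downloads can be verified against the provided check sums, for example by running
+	    <tt>sha256sum -c SHA256SUMS</tt> (GNU coreutils) in a directory that contains all three files.
+	 </p>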
+	 <p>Please don't use this version for active development. If you are interested in this work, it would be best 
+	 to first port the desired features to LITMUS<sup>RT</sup> 2008 and merge them into the main distribution.
+	 </p>
+
       </li>
 
       <li>
-- 
cgit v1.2.2