From b1e1fea67bca3796d5f9133a92c300ec4fa93a4f Mon Sep 17 00:00:00 2001 From: Jeremy Erickson Date: Thu, 30 Aug 2012 21:01:47 -0400 Subject: Bjoern's Dissertation Code with Priority Donation --- Makefile | 4 +- arch/arm/Kconfig | 8 + arch/arm/include/asm/timex.h | 2 + arch/arm/include/asm/unistd.h | 3 + arch/arm/kernel/calls.S | 12 + arch/arm/kernel/smp.c | 4 + arch/arm/mach-realview/include/mach/timex.h | 27 + arch/x86/Kconfig | 8 + arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/feather_trace.h | 17 + arch/x86/include/asm/feather_trace_32.h | 79 ++ arch/x86/include/asm/feather_trace_64.h | 67 ++ arch/x86/include/asm/hw_irq.h | 3 + arch/x86/include/asm/irq_vectors.h | 5 + arch/x86/include/asm/processor.h | 4 + arch/x86/include/asm/unistd_32.h | 6 +- arch/x86/include/asm/unistd_64.h | 4 + arch/x86/kernel/Makefile | 2 + arch/x86/kernel/cpu/intel_cacheinfo.c | 17 + arch/x86/kernel/entry_64.S | 2 + arch/x86/kernel/ft_event.c | 118 ++ arch/x86/kernel/irqinit.c | 3 + arch/x86/kernel/smp.c | 27 + arch/x86/kernel/syscall_table_32.S | 12 + drivers/tty/vt/consolemap_deftbl.c | 86 ++ drivers/tty/vt/defkeymap.c | 262 +++++ fs/exec.c | 13 +- fs/inode.c | 2 + include/linux/completion.h | 1 + include/linux/fs.h | 21 +- include/linux/hrtimer.h | 32 + include/linux/sched.h | 19 +- include/linux/smp.h | 5 + include/linux/tick.h | 5 + include/litmus/bheap.h | 77 ++ include/litmus/budget.h | 8 + include/litmus/clustered.h | 44 + include/litmus/debug_trace.h | 37 + include/litmus/edf_common.h | 33 + include/litmus/fdso.h | 77 ++ include/litmus/feather_buffer.h | 94 ++ include/litmus/feather_trace.h | 65 ++ include/litmus/fp_common.h | 105 ++ include/litmus/ftdev.h | 55 + include/litmus/jobs.h | 9 + include/litmus/litmus.h | 292 +++++ include/litmus/litmus_proc.h | 25 + include/litmus/locking.h | 28 + include/litmus/preempt.h | 165 +++ include/litmus/rt_domain.h | 182 ++++ include/litmus/rt_param.h | 228 ++++ include/litmus/sched_plugin.h | 117 ++ include/litmus/sched_plugin.h.rej | 22 + include/litmus/sched_trace.h | 200 ++++ include/litmus/srp.h | 28 + include/litmus/trace.h | 129 +++ include/litmus/unistd_32.h | 21 + include/litmus/unistd_64.h | 33 + include/litmus/wait.h | 57 + kernel/exit.c | 4 + kernel/fork.c | 7 + kernel/hrtimer.c | 95 ++ kernel/printk.c | 14 +- kernel/sched.c | 127 ++- kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 2 +- kernel/time/tick-sched.c | 47 + litmus/Kconfig | 185 ++++ litmus/Makefile | 30 + litmus/bheap.c | 314 ++++++ litmus/budget.c | 111 ++ litmus/clustered.c | 111 ++ litmus/ctrldev.c | 150 +++ litmus/edf_common.c | 143 +++ litmus/fdso.c | 297 ++++++ litmus/fp_common.c | 119 +++ litmus/ft_event.c | 43 + litmus/ftdev.c | 446 ++++++++ litmus/jobs.c | 43 + litmus/litmus.c | 555 ++++++++++ litmus/litmus_proc.c | 347 ++++++ litmus/locking.c | 186 ++++ litmus/preempt.c | 131 +++ litmus/rt_domain.c | 357 +++++++ litmus/sched_cedf.c | 1526 ++++++++++++++++++++++++++ litmus/sched_cedf.c.rej | 53 + litmus/sched_gfl_split_namechange.c | 1149 ++++++++++++++++++++ litmus/sched_gsn_edf.c | 1286 ++++++++++++++++++++++ litmus/sched_gsn_edf_split_namechange.c | 1165 ++++++++++++++++++++ litmus/sched_litmus.c | 328 ++++++ litmus/sched_litmus.c.rej | 11 + litmus/sched_pfair.c | 1056 ++++++++++++++++++ litmus/sched_pfp.c | 1542 +++++++++++++++++++++++++++ litmus/sched_plugin.c | 233 ++++ litmus/sched_psn_edf.c | 917 ++++++++++++++++ litmus/sched_task_trace.c | 241 +++++ litmus/sched_trace.c | 252 +++++ litmus/srp.c | 295 +++++ litmus/sync.c | 104 ++ litmus/trace.c | 213 ++++ 100 
files changed, 17213 insertions(+), 36 deletions(-) create mode 100644 arch/x86/include/asm/feather_trace.h create mode 100644 arch/x86/include/asm/feather_trace_32.h create mode 100644 arch/x86/include/asm/feather_trace_64.h create mode 100644 arch/x86/kernel/ft_event.c create mode 100644 drivers/tty/vt/consolemap_deftbl.c create mode 100644 drivers/tty/vt/defkeymap.c create mode 100644 include/litmus/bheap.h create mode 100644 include/litmus/budget.h create mode 100644 include/litmus/clustered.h create mode 100644 include/litmus/debug_trace.h create mode 100644 include/litmus/edf_common.h create mode 100644 include/litmus/fdso.h create mode 100644 include/litmus/feather_buffer.h create mode 100644 include/litmus/feather_trace.h create mode 100644 include/litmus/fp_common.h create mode 100644 include/litmus/ftdev.h create mode 100644 include/litmus/jobs.h create mode 100644 include/litmus/litmus.h create mode 100644 include/litmus/litmus_proc.h create mode 100644 include/litmus/locking.h create mode 100644 include/litmus/preempt.h create mode 100644 include/litmus/rt_domain.h create mode 100644 include/litmus/rt_param.h create mode 100644 include/litmus/sched_plugin.h create mode 100644 include/litmus/sched_plugin.h.rej create mode 100644 include/litmus/sched_trace.h create mode 100644 include/litmus/srp.h create mode 100644 include/litmus/trace.h create mode 100644 include/litmus/unistd_32.h create mode 100644 include/litmus/unistd_64.h create mode 100644 include/litmus/wait.h create mode 100644 litmus/Kconfig create mode 100644 litmus/Makefile create mode 100644 litmus/bheap.c create mode 100644 litmus/budget.c create mode 100644 litmus/clustered.c create mode 100644 litmus/ctrldev.c create mode 100644 litmus/edf_common.c create mode 100644 litmus/fdso.c create mode 100644 litmus/fp_common.c create mode 100644 litmus/ft_event.c create mode 100644 litmus/ftdev.c create mode 100644 litmus/jobs.c create mode 100644 litmus/litmus.c create mode 100644 litmus/litmus_proc.c create mode 100644 litmus/locking.c create mode 100644 litmus/preempt.c create mode 100644 litmus/rt_domain.c create mode 100644 litmus/sched_cedf.c create mode 100644 litmus/sched_cedf.c.rej create mode 100644 litmus/sched_gfl_split_namechange.c create mode 100644 litmus/sched_gsn_edf.c create mode 100644 litmus/sched_gsn_edf_split_namechange.c create mode 100644 litmus/sched_litmus.c create mode 100644 litmus/sched_litmus.c.rej create mode 100644 litmus/sched_pfair.c create mode 100644 litmus/sched_pfp.c create mode 100644 litmus/sched_plugin.c create mode 100644 litmus/sched_psn_edf.c create mode 100644 litmus/sched_task_trace.c create mode 100644 litmus/sched_trace.c create mode 100644 litmus/srp.c create mode 100644 litmus/sync.c create mode 100644 litmus/trace.c diff --git a/Makefile b/Makefile index 860c26af52c3..8e53f47a311b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 36 -EXTRAVERSION = +EXTRAVERSION =-litmus2010 NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* @@ -659,7 +659,7 @@ export mod_strip_cmd ifeq ($(KBUILD_EXTMOD),) -core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9c26ba7244fb..babad6d7681a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1808,3 +1808,11 @@ source "security/Kconfig" source "crypto/Kconfig" 
source "lib/Kconfig" + +config ARCH_HAS_SEND_PULL_TIMERS + def_bool n + +config ARCH_HAS_FEATHER_TRACE + def_bool n + +source "litmus/Kconfig" diff --git a/arch/arm/include/asm/timex.h b/arch/arm/include/asm/timex.h index 3be8de3adaba..8a102a383a36 100644 --- a/arch/arm/include/asm/timex.h +++ b/arch/arm/include/asm/timex.h @@ -16,9 +16,11 @@ typedef unsigned long cycles_t; +#ifndef get_cycles static inline cycles_t get_cycles (void) { return 0; } +#endif #endif diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index c891eb76c0e3..625b30490624 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -397,6 +397,9 @@ #define __NR_fanotify_mark (__NR_SYSCALL_BASE+368) #define __NR_prlimit64 (__NR_SYSCALL_BASE+369) +#define __NR_LITMUS (__NR_SYSCALL_BASE+370) +#include + /* * The following SWIs are ARM private. */ diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 5c26eccef998..b99087ac85b9 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -379,6 +379,18 @@ CALL(sys_fanotify_init) CALL(sys_fanotify_mark) CALL(sys_prlimit64) +/* 370 */ CALL(sys_set_rt_task_param) + CALL(sys_get_rt_task_param) + CALL(sys_complete_job) + CALL(sys_od_open) + CALL(sys_od_close) +/* 375 */ CALL(sys_litmus_lock) + CALL(sys_litmus_unlock) + CALL(sys_query_job_no) + CALL(sys_wait_for_job_release) + CALL(sys_wait_for_ts_release) +/* 380 */ CALL(sys_release_ts) + CALL(sys_null_call) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 40dc74f2b27f..b72fbf3d043c 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -38,6 +38,8 @@ #include #include +#include + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -533,6 +535,8 @@ asmlinkage void __exception do_IPI(struct pt_regs *regs) * nothing more to do - eveything is * done on the interrupt return path */ + /* LITMUS^RT: take action based on scheduler state */ + sched_state_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/mach-realview/include/mach/timex.h b/arch/arm/mach-realview/include/mach/timex.h index 4eeb069373c2..e8bcc40d1f08 100644 --- a/arch/arm/mach-realview/include/mach/timex.h +++ b/arch/arm/mach-realview/include/mach/timex.h @@ -21,3 +21,30 @@ */ #define CLOCK_TICK_RATE (50000000 / 16) + +#if defined(CONFIG_MACH_REALVIEW_PB11MP) || defined(CONFIG_MACH_REALVIEW_PB1176) + +static inline unsigned long realview_get_arm11_cp15_ccnt(void) +{ + unsigned long cycles; + /* Read CP15 CCNT register. */ + asm volatile ("mrc p15, 0, %0, c15, c12, 1" : "=r" (cycles)); + return cycles; +} + +#define get_cycles realview_get_arm11_cp15_ccnt + +#elif defined(CONFIG_MACH_REALVIEW_PBA8) + + +static inline unsigned long realview_get_a8_cp15_ccnt(void) +{ + unsigned long cycles; + /* Read CP15 CCNT register. 
*/ + asm volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); + return cycles; +} + +#define get_cycles realview_get_a8_cp15_ccnt + +#endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..5181ed3a211a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2142,3 +2142,11 @@ source "crypto/Kconfig" source "arch/x86/kvm/Kconfig" source "lib/Kconfig" + +config ARCH_HAS_FEATHER_TRACE + def_bool y + +config ARCH_HAS_SEND_PULL_TIMERS + def_bool y + +source "litmus/Kconfig" diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98f..5d07dea2ebb8 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,6 +13,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) +BUILD_INTERRUPT(pull_timers_interrupt,PULL_TIMERS_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) diff --git a/arch/x86/include/asm/feather_trace.h b/arch/x86/include/asm/feather_trace.h new file mode 100644 index 000000000000..4fd31633405d --- /dev/null +++ b/arch/x86/include/asm/feather_trace.h @@ -0,0 +1,17 @@ +#ifndef _ARCH_FEATHER_TRACE_H +#define _ARCH_FEATHER_TRACE_H + +#include + +static inline unsigned long long ft_timestamp(void) +{ + return __native_read_tsc(); +} + +#ifdef CONFIG_X86_32 +#include "feather_trace_32.h" +#else +#include "feather_trace_64.h" +#endif + +#endif diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h new file mode 100644 index 000000000000..70202f90f169 --- /dev/null +++ b/arch/x86/include/asm/feather_trace_32.h @@ -0,0 +1,79 @@ +/* Do not directly include this file. 
Include feather_trace.h instead */ + +#define feather_callback __attribute__((regparm(0))) + +/* + * make the compiler reload any register that is not saved in + * a cdecl function call + */ +#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" + +#define ft_event(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " call " #callback " \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event0(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $4, %%esp \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $4, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event1(id, callback, param) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $8, %%esp \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $8, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param) : CLOBBER_LIST) + +#define ft_event2(id, callback, param, param2) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $12, %%esp \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $12, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param), "r" (param2) : CLOBBER_LIST) + + +#define ft_event3(id, callback, p, p2, p3) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $16, %%esp \n\t" \ + " movl %2, 12(%%esp) \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $16, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) + diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h new file mode 100644 index 000000000000..54ac2aeb3a28 --- /dev/null +++ b/arch/x86/include/asm/feather_trace_64.h @@ -0,0 +1,67 @@ +/* Do not directly include this file. 
Include feather_trace.h instead */ + +/* regparm is the default on x86_64 */ +#define feather_callback + +# define _EVENT_TABLE(id,from,to) \ + ".section __event_table, \"aw\"\n\t" \ + ".balign 8\n\t" \ + ".quad " #id ", 0, " #from ", " #to " \n\t" \ + ".previous \n\t" + +/* + * x86_64 callee only owns rbp, rbx, r12 -> r15 + * the called can freely modify the others + */ +#define CLOBBER_LIST "memory", "cc", "rdi", "rsi", "rdx", "rcx", \ + "r8", "r9", "r10", "r11", "rax" + +#define ft_event(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event0(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event1(id, callback, param) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq %0, %%rsi \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : "r" (param) : CLOBBER_LIST) + +#define ft_event2(id, callback, param, param2) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq %1, %%rdx \n\t" \ + " movq %0, %%rsi \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : "r" (param), "r" (param2) : CLOBBER_LIST) + +#define ft_event3(id, callback, p, p2, p3) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq %2, %%rcx \n\t" \ + " movq %1, %%rdx \n\t" \ + " movq %0, %%rsi \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f230..c17411503f28 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -53,6 +53,8 @@ extern void threshold_interrupt(void); extern void call_function_interrupt(void); extern void call_function_single_interrupt(void); +extern void pull_timers_interrupt(void); + /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) extern unsigned long io_apic_irqs; @@ -122,6 +124,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); extern void smp_call_function_single_interrupt(struct pt_regs *); +extern void smp_pull_timers_interrupt(struct pt_regs *); #ifdef CONFIG_X86_32 extern void smp_invalidate_interrupt(struct pt_regs *); #else diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e2ca30092557..6143ebeeebfa 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -108,6 +108,11 @@ */ #define LOCAL_TIMER_VECTOR 0xef +/* + * LITMUS^RT pull timers IRQ vector + */ +#define PULL_TIMERS_VECTOR 0xee + /* * Generic system vector for platform specific use */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbebaa..ebaa04a8d3af 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -169,6 +169,10 @@ extern void print_cpu_info(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; +#ifdef CONFIG_SYSFS +extern int 
get_shared_cpu_map(cpumask_var_t mask, + unsigned int cpu, int index); +#endif extern void detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e8ba0e..b7ba19acd3f8 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -347,9 +347,13 @@ #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_LITMUS 341 + +#include "litmus/unistd_32.h" + #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 341 + NR_litmus_syscalls #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 363e9b8a715b..332bf3c9c84c 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -670,6 +670,10 @@ __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_prlimit64 302 __SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_LITMUS 303 + +#include "litmus/unistd_64.h" + #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fedf32a8c3ec..6890dbb9ac15 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -118,6 +118,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_FEATHER_TRACE) += ft_event.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 898c2f4eab88..3fec7d9bfd62 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -758,6 +758,23 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) +/* returns CPUs that share the index cache with cpu */ +int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index) +{ + int ret = 0; + struct _cpuid4_info *this_leaf; + + if (index >= num_cache_leaves) { + index = num_cache_leaves - 1; + ret = index; + } + + this_leaf = CPUID4_INFO_IDX(cpu,index); + cpumask_copy(mask, to_cpumask(this_leaf->shared_cpu_map)); + + return ret; +} + #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbba..115e8951e8c8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1016,6 +1016,8 @@ apicinterrupt CALL_FUNCTION_VECTOR \ call_function_interrupt smp_call_function_interrupt apicinterrupt RESCHEDULE_VECTOR \ reschedule_interrupt smp_reschedule_interrupt +apicinterrupt PULL_TIMERS_VECTOR \ + pull_timers_interrupt smp_pull_timers_interrupt #endif apicinterrupt ERROR_APIC_VECTOR \ diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c new file mode 100644 index 000000000000..37cc33252713 --- /dev/null +++ b/arch/x86/kernel/ft_event.c @@ -0,0 +1,118 @@ +#include + +#include + +/* the feather trace management functions assume + * exclusive access to the event table + */ + +#ifndef CONFIG_DEBUG_RODATA + +#define BYTE_JUMP 0xeb +#define BYTE_JUMP_LEN 0x02 + +/* for each event, there is an entry in the event table */ +struct trace_event { + long id; + long count; + long start_addr; + long end_addr; +}; + +extern struct trace_event __start___event_table[]; 
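/*
 * Illustrative usage sketch (not part of the patch): how the event table
 * declared here lines up with the ft_event*() macros from
 * arch/x86/include/asm/feather_trace_{32,64}.h.  Each macro emits a short
 * "jmp 2f" over the call sequence and records <id, 0, start_addr, end_addr>
 * in the __event_table section; ft_enable_event() patches the jump
 * displacement to 0 when the first user enables the id, so the call runs,
 * and ft_disable_event() restores the displacement.  The event id (42) and
 * the callback below are made-up example names, not symbols from the patch.
 */
#include <litmus/feather_trace.h>	/* ft_event1(), feather_callback, ft_timestamp() */

static unsigned long long last_stamp;

/* callbacks use the calling convention selected by 'feather_callback' */
feather_callback void example_probe(unsigned long id, unsigned long arg)
{
	last_stamp = ft_timestamp();	/* TSC-based timestamp */
	(void) id;
	(void) arg;
}

static void example_hot_path(unsigned long job_no)
{
	/* compiles to a two-byte jump while event 42 is disabled */
	ft_event1(42, example_probe, job_no);
}

/* elsewhere: ft_enable_event(42) activates the probe, ft_disable_event(42)
 * deactivates it, and ft_is_event_enabled(42) reports the current count. */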
+extern struct trace_event __stop___event_table[]; + +/* Workaround: if no events are defined, then the event_table section does not + * exist and the above references cause linker errors. This could probably be + * fixed by adjusting the linker script, but it is easier to maintain for us if + * we simply create a dummy symbol in the event table section. + */ +int __event_table_dummy[0] __attribute__ ((section("__event_table"))); + +int ft_enable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && ++te->count == 1) { + instr = (unsigned char*) te->start_addr; + /* make sure we don't clobber something wrong */ + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = 0; + } + } + if (te->id == id) + count++; + te++; + } + + printk(KERN_DEBUG "ft_enable_event: enabled %d events\n", count); + return count; +} + +int ft_disable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && --te->count == 0) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + } + } + if (te->id == id) + count++; + te++; + } + + printk(KERN_DEBUG "ft_disable_event: disabled %d events\n", count); + return count; +} + +int ft_disable_all_events(void) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->count) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + te->count = 0; + count++; + } + } + te++; + } + return count; +} + +int ft_is_event_enabled(unsigned long id) +{ + struct trace_event* te = __start___event_table; + + while (te < __stop___event_table) { + if (te->id == id) + return te->count; + te++; + } + return 0; +} + +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc578..9772b1a0f9a4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -189,6 +189,9 @@ static void __init smp_intr_init(void) alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + /* IPI for hrtimer pulling on remote cpus */ + alloc_intr_gate(PULL_TIMERS_VECTOR, pull_timers_interrupt); + /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d6..74cca6014c0e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -23,6 +23,10 @@ #include #include +#include +#include +#include + #include #include #include @@ -118,6 +122,7 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } + TS_SEND_RESCHED_START(cpu); apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } @@ -147,6 +152,16 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } +/* trigger timers on remote cpu */ +void smp_send_pull_timers(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN_ON(1); + return; + } + apic->send_IPI_mask(cpumask_of(cpu), PULL_TIMERS_VECTOR); 
+} + /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -198,7 +213,10 @@ static void native_smp_send_stop(void) void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + /* LITMUS^RT: this IPI might need to trigger the sched state machine. */ + sched_state_ipi(); inc_irq_stat(irq_resched_count); + TS_SEND_RESCHED_END; /* * KVM uses this interrupt to force a cpu out of guest mode */ @@ -222,6 +240,15 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) irq_exit(); } +extern void hrtimer_pull(void); + +void smp_pull_timers_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + TRACE("pull timer interrupt\n"); + hrtimer_pull(); +} + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8f..37702905f658 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,15 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_set_rt_task_param /* LITMUS^RT 341 */ + .long sys_get_rt_task_param + .long sys_complete_job + .long sys_od_open + .long sys_od_close + .long sys_litmus_lock + .long sys_litmus_unlock + .long sys_query_job_no + .long sys_wait_for_job_release + .long sys_wait_for_ts_release + .long sys_release_ts + .long sys_null_call diff --git a/drivers/tty/vt/consolemap_deftbl.c b/drivers/tty/vt/consolemap_deftbl.c new file mode 100644 index 000000000000..5f141383566b --- /dev/null +++ b/drivers/tty/vt/consolemap_deftbl.c @@ -0,0 +1,86 @@ +/* + * Do not edit this file; it was automatically generated by + * + * conmakehash drivers/tty/vt/cp437.uni > [this file] + * + */ + +#include + +u8 dfont_unicount[256] = +{ + 1, 1, 1, 1, 2, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, + 2, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 1, 1, 1, 1, 2, + 1, 1, 1, 1, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 5, 1, 2, 2, 4, 1, 1, + 1, 5, 1, 2, 1, 1, 1, 5, + 1, 1, 2, 1, 1, 4, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 3, + 1, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 2, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 2, 1, + 2, 1, 2, 2, 1, 2, 2, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 1 +}; + +u16 dfont_unitable[303] = +{ + 0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x25c6, 0x2663, 0x2660, + 0x2022, 0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, + 0x263c, 0x00a4, 0x25b6, 0x25ba, 0x25c0, 0x25c4, 0x2195, 0x203c, + 0x00b6, 0x00a7, 0x25ac, 0x21a8, 0x2191, 0x2193, 0x2192, 0x2190, + 0x221f, 0x2194, 0x25b2, 0x25bc, 0x0020, 0x0021, 0x0022, 0x00a8, + 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x00b4, 0x0028, 0x0029, + 0x002a, 0x002b, 0x002c, 0x00b8, 0x002d, 0x00ad, 0x002e, 0x002f, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, + 0x0040, 0x0041, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x0042, 0x0043, + 0x00a9, 0x0044, 0x00d0, 0x0045, 0x00c8, 0x00ca, 0x00cb, 0x0046, + 0x0047, 0x0048, 0x0049, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x004a, + 0x004b, 0x212a, 0x004c, 
0x004d, 0x004e, 0x004f, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x0050, 0x0051, 0x0052, 0x00ae, 0x0053, 0x0054, + 0x0055, 0x00d9, 0x00da, 0x00db, 0x0056, 0x0057, 0x0058, 0x0059, + 0x00dd, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x23bd, + 0xf804, 0x0060, 0x0061, 0x00e3, 0x0062, 0x0063, 0x0064, 0x0065, + 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, + 0x006e, 0x006f, 0x00f5, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, + 0x0075, 0x0076, 0x0077, 0x0078, 0x00d7, 0x0079, 0x00fd, 0x007a, + 0x007b, 0x007c, 0x00a6, 0x007d, 0x007e, 0x2302, 0x00c7, 0x00fc, + 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, + 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x212b, 0x00c9, + 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, + 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, + 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, + 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, 0x2591, + 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, + 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, 0x2514, + 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, + 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568, + 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, + 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, + 0x03b2, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03bc, + 0x03c4, 0x03a6, 0x00d8, 0x0398, 0x03a9, 0x2126, 0x03b4, 0x00f0, + 0x221e, 0x03c6, 0x00f8, 0x03b5, 0x2208, 0x2229, 0x2261, 0x00b1, + 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, + 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0xfffd, 0x00a0 +}; diff --git a/drivers/tty/vt/defkeymap.c b/drivers/tty/vt/defkeymap.c new file mode 100644 index 000000000000..d2208dfe3f67 --- /dev/null +++ b/drivers/tty/vt/defkeymap.c @@ -0,0 +1,262 @@ +/* Do not edit this file! 
It was automatically generated by */ +/* loadkeys --mktable defkeymap.map > defkeymap.c */ + +#include +#include +#include + +u_short plain_map[NR_KEYS] = { + 0xf200, 0xf01b, 0xf031, 0xf032, 0xf033, 0xf034, 0xf035, 0xf036, + 0xf037, 0xf038, 0xf039, 0xf030, 0xf02d, 0xf03d, 0xf07f, 0xf009, + 0xfb71, 0xfb77, 0xfb65, 0xfb72, 0xfb74, 0xfb79, 0xfb75, 0xfb69, + 0xfb6f, 0xfb70, 0xf05b, 0xf05d, 0xf201, 0xf702, 0xfb61, 0xfb73, + 0xfb64, 0xfb66, 0xfb67, 0xfb68, 0xfb6a, 0xfb6b, 0xfb6c, 0xf03b, + 0xf027, 0xf060, 0xf700, 0xf05c, 0xfb7a, 0xfb78, 0xfb63, 0xfb76, + 0xfb62, 0xfb6e, 0xfb6d, 0xf02c, 0xf02e, 0xf02f, 0xf700, 0xf30c, + 0xf703, 0xf020, 0xf207, 0xf100, 0xf101, 0xf102, 0xf103, 0xf104, + 0xf105, 0xf106, 0xf107, 0xf108, 0xf109, 0xf208, 0xf209, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf03c, 0xf10a, + 0xf10b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf01c, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short shift_map[NR_KEYS] = { + 0xf200, 0xf01b, 0xf021, 0xf040, 0xf023, 0xf024, 0xf025, 0xf05e, + 0xf026, 0xf02a, 0xf028, 0xf029, 0xf05f, 0xf02b, 0xf07f, 0xf009, + 0xfb51, 0xfb57, 0xfb45, 0xfb52, 0xfb54, 0xfb59, 0xfb55, 0xfb49, + 0xfb4f, 0xfb50, 0xf07b, 0xf07d, 0xf201, 0xf702, 0xfb41, 0xfb53, + 0xfb44, 0xfb46, 0xfb47, 0xfb48, 0xfb4a, 0xfb4b, 0xfb4c, 0xf03a, + 0xf022, 0xf07e, 0xf700, 0xf07c, 0xfb5a, 0xfb58, 0xfb43, 0xfb56, + 0xfb42, 0xfb4e, 0xfb4d, 0xf03c, 0xf03e, 0xf03f, 0xf700, 0xf30c, + 0xf703, 0xf020, 0xf207, 0xf10a, 0xf10b, 0xf10c, 0xf10d, 0xf10e, + 0xf10f, 0xf110, 0xf111, 0xf112, 0xf113, 0xf213, 0xf203, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf03e, 0xf10a, + 0xf10b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf20b, 0xf601, 0xf602, 0xf117, 0xf600, 0xf20a, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short altgr_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf040, 0xf200, 0xf024, 0xf200, 0xf200, + 0xf07b, 0xf05b, 0xf05d, 0xf07d, 0xf05c, 0xf200, 0xf200, 0xf200, + 0xfb71, 0xfb77, 0xf918, 0xfb72, 0xfb74, 0xfb79, 0xfb75, 0xfb69, + 0xfb6f, 0xfb70, 0xf200, 0xf07e, 0xf201, 0xf702, 0xf914, 0xfb73, + 0xf917, 0xf919, 0xfb67, 0xfb68, 0xfb6a, 0xfb6b, 0xfb6c, 0xf200, + 0xf200, 0xf200, 0xf700, 0xf200, 0xfb7a, 0xfb78, 0xf916, 0xfb76, + 0xf915, 0xfb6e, 0xfb6d, 0xf200, 0xf200, 0xf200, 0xf700, 0xf30c, + 0xf703, 0xf200, 0xf207, 0xf50c, 0xf50d, 0xf50e, 0xf50f, 0xf510, + 0xf511, 0xf512, 0xf513, 0xf514, 0xf515, 0xf208, 0xf202, 0xf911, + 0xf912, 0xf913, 0xf30b, 0xf90e, 0xf90f, 0xf910, 0xf30a, 0xf90b, + 0xf90c, 0xf90d, 0xf90a, 0xf310, 0xf206, 0xf200, 0xf07c, 0xf516, + 0xf517, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short ctrl_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf000, 0xf01b, 0xf01c, 0xf01d, 0xf01e, + 0xf01f, 0xf07f, 0xf200, 0xf200, 0xf01f, 0xf200, 
0xf008, 0xf200, + 0xf011, 0xf017, 0xf005, 0xf012, 0xf014, 0xf019, 0xf015, 0xf009, + 0xf00f, 0xf010, 0xf01b, 0xf01d, 0xf201, 0xf702, 0xf001, 0xf013, + 0xf004, 0xf006, 0xf007, 0xf008, 0xf00a, 0xf00b, 0xf00c, 0xf200, + 0xf007, 0xf000, 0xf700, 0xf01c, 0xf01a, 0xf018, 0xf003, 0xf016, + 0xf002, 0xf00e, 0xf00d, 0xf200, 0xf20e, 0xf07f, 0xf700, 0xf30c, + 0xf703, 0xf000, 0xf207, 0xf100, 0xf101, 0xf102, 0xf103, 0xf104, + 0xf105, 0xf106, 0xf107, 0xf108, 0xf109, 0xf208, 0xf204, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf200, 0xf10a, + 0xf10b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf01c, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short shift_ctrl_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf000, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf01f, 0xf200, 0xf200, 0xf200, + 0xf011, 0xf017, 0xf005, 0xf012, 0xf014, 0xf019, 0xf015, 0xf009, + 0xf00f, 0xf010, 0xf200, 0xf200, 0xf201, 0xf702, 0xf001, 0xf013, + 0xf004, 0xf006, 0xf007, 0xf008, 0xf00a, 0xf00b, 0xf00c, 0xf200, + 0xf200, 0xf200, 0xf700, 0xf200, 0xf01a, 0xf018, 0xf003, 0xf016, + 0xf002, 0xf00e, 0xf00d, 0xf200, 0xf200, 0xf200, 0xf700, 0xf30c, + 0xf703, 0xf200, 0xf207, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf208, 0xf200, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short alt_map[NR_KEYS] = { + 0xf200, 0xf81b, 0xf831, 0xf832, 0xf833, 0xf834, 0xf835, 0xf836, + 0xf837, 0xf838, 0xf839, 0xf830, 0xf82d, 0xf83d, 0xf87f, 0xf809, + 0xf871, 0xf877, 0xf865, 0xf872, 0xf874, 0xf879, 0xf875, 0xf869, + 0xf86f, 0xf870, 0xf85b, 0xf85d, 0xf80d, 0xf702, 0xf861, 0xf873, + 0xf864, 0xf866, 0xf867, 0xf868, 0xf86a, 0xf86b, 0xf86c, 0xf83b, + 0xf827, 0xf860, 0xf700, 0xf85c, 0xf87a, 0xf878, 0xf863, 0xf876, + 0xf862, 0xf86e, 0xf86d, 0xf82c, 0xf82e, 0xf82f, 0xf700, 0xf30c, + 0xf703, 0xf820, 0xf207, 0xf500, 0xf501, 0xf502, 0xf503, 0xf504, + 0xf505, 0xf506, 0xf507, 0xf508, 0xf509, 0xf208, 0xf209, 0xf907, + 0xf908, 0xf909, 0xf30b, 0xf904, 0xf905, 0xf906, 0xf30a, 0xf901, + 0xf902, 0xf903, 0xf900, 0xf310, 0xf206, 0xf200, 0xf83c, 0xf50a, + 0xf50b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf01c, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf210, 0xf211, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short ctrl_alt_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf811, 0xf817, 0xf805, 0xf812, 0xf814, 0xf819, 0xf815, 0xf809, + 0xf80f, 0xf810, 0xf200, 0xf200, 0xf201, 0xf702, 0xf801, 0xf813, + 0xf804, 0xf806, 0xf807, 0xf808, 0xf80a, 0xf80b, 0xf80c, 0xf200, + 0xf200, 0xf200, 0xf700, 0xf200, 0xf81a, 0xf818, 
0xf803, 0xf816, + 0xf802, 0xf80e, 0xf80d, 0xf200, 0xf200, 0xf200, 0xf700, 0xf30c, + 0xf703, 0xf200, 0xf207, 0xf500, 0xf501, 0xf502, 0xf503, 0xf504, + 0xf505, 0xf506, 0xf507, 0xf508, 0xf509, 0xf208, 0xf200, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf20c, 0xf206, 0xf200, 0xf200, 0xf50a, + 0xf50b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf20c, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +ushort *key_maps[MAX_NR_KEYMAPS] = { + plain_map, shift_map, altgr_map, NULL, + ctrl_map, shift_ctrl_map, NULL, NULL, + alt_map, NULL, NULL, NULL, + ctrl_alt_map, NULL +}; + +unsigned int keymap_count = 7; + +/* + * Philosophy: most people do not define more strings, but they who do + * often want quite a lot of string space. So, we statically allocate + * the default and allocate dynamically in chunks of 512 bytes. + */ + +char func_buf[] = { + '\033', '[', '[', 'A', 0, + '\033', '[', '[', 'B', 0, + '\033', '[', '[', 'C', 0, + '\033', '[', '[', 'D', 0, + '\033', '[', '[', 'E', 0, + '\033', '[', '1', '7', '~', 0, + '\033', '[', '1', '8', '~', 0, + '\033', '[', '1', '9', '~', 0, + '\033', '[', '2', '0', '~', 0, + '\033', '[', '2', '1', '~', 0, + '\033', '[', '2', '3', '~', 0, + '\033', '[', '2', '4', '~', 0, + '\033', '[', '2', '5', '~', 0, + '\033', '[', '2', '6', '~', 0, + '\033', '[', '2', '8', '~', 0, + '\033', '[', '2', '9', '~', 0, + '\033', '[', '3', '1', '~', 0, + '\033', '[', '3', '2', '~', 0, + '\033', '[', '3', '3', '~', 0, + '\033', '[', '3', '4', '~', 0, + '\033', '[', '1', '~', 0, + '\033', '[', '2', '~', 0, + '\033', '[', '3', '~', 0, + '\033', '[', '4', '~', 0, + '\033', '[', '5', '~', 0, + '\033', '[', '6', '~', 0, + '\033', '[', 'M', 0, + '\033', '[', 'P', 0, +}; + +char *funcbufptr = func_buf; +int funcbufsize = sizeof(func_buf); +int funcbufleft = 0; /* space left */ + +char *func_table[MAX_NR_FUNC] = { + func_buf + 0, + func_buf + 5, + func_buf + 10, + func_buf + 15, + func_buf + 20, + func_buf + 25, + func_buf + 31, + func_buf + 37, + func_buf + 43, + func_buf + 49, + func_buf + 55, + func_buf + 61, + func_buf + 67, + func_buf + 73, + func_buf + 79, + func_buf + 85, + func_buf + 91, + func_buf + 97, + func_buf + 103, + func_buf + 109, + func_buf + 115, + func_buf + 120, + func_buf + 125, + func_buf + 130, + func_buf + 135, + func_buf + 140, + func_buf + 145, + NULL, + NULL, + func_buf + 149, + NULL, +}; + +struct kbdiacruc accent_table[MAX_DIACR] = { + {'`', 'A', 0300}, {'`', 'a', 0340}, + {'\'', 'A', 0301}, {'\'', 'a', 0341}, + {'^', 'A', 0302}, {'^', 'a', 0342}, + {'~', 'A', 0303}, {'~', 'a', 0343}, + {'"', 'A', 0304}, {'"', 'a', 0344}, + {'O', 'A', 0305}, {'o', 'a', 0345}, + {'0', 'A', 0305}, {'0', 'a', 0345}, + {'A', 'A', 0305}, {'a', 'a', 0345}, + {'A', 'E', 0306}, {'a', 'e', 0346}, + {',', 'C', 0307}, {',', 'c', 0347}, + {'`', 'E', 0310}, {'`', 'e', 0350}, + {'\'', 'E', 0311}, {'\'', 'e', 0351}, + {'^', 'E', 0312}, {'^', 'e', 0352}, + {'"', 'E', 0313}, {'"', 'e', 0353}, + {'`', 'I', 0314}, {'`', 'i', 0354}, + {'\'', 'I', 0315}, {'\'', 'i', 0355}, + {'^', 'I', 0316}, {'^', 'i', 0356}, + {'"', 'I', 0317}, {'"', 'i', 0357}, + {'-', 'D', 0320}, {'-', 'd', 0360}, + {'~', 'N', 0321}, {'~', 'n', 0361}, + {'`', 'O', 0322}, {'`', 'o', 0362}, + {'\'', 'O', 0323}, {'\'', 'o', 0363}, + {'^', 'O', 0324}, 
{'^', 'o', 0364}, + {'~', 'O', 0325}, {'~', 'o', 0365}, + {'"', 'O', 0326}, {'"', 'o', 0366}, + {'/', 'O', 0330}, {'/', 'o', 0370}, + {'`', 'U', 0331}, {'`', 'u', 0371}, + {'\'', 'U', 0332}, {'\'', 'u', 0372}, + {'^', 'U', 0333}, {'^', 'u', 0373}, + {'"', 'U', 0334}, {'"', 'u', 0374}, + {'\'', 'Y', 0335}, {'\'', 'y', 0375}, + {'T', 'H', 0336}, {'t', 'h', 0376}, + {'s', 's', 0337}, {'"', 'y', 0377}, + {'s', 'z', 0337}, {'i', 'j', 0377}, +}; + +unsigned int accent_table_size = 68; diff --git a/fs/exec.c b/fs/exec.c index 6d2b6f936858..56536ad0e7cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include @@ -55,6 +55,8 @@ #include #include +#include + #include #include #include @@ -78,7 +80,7 @@ int __register_binfmt(struct linux_binfmt * fmt, int insert) insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); - return 0; + return 0; } EXPORT_SYMBOL(__register_binfmt); @@ -1064,7 +1066,7 @@ void setup_new_exec(struct linux_binprm * bprm) group */ current->self_exec_id++; - + flush_signal_handlers(current, 0); flush_old_files(current->files); } @@ -1154,8 +1156,8 @@ int check_unsafe_exec(struct linux_binprm *bprm) return res; } -/* - * Fill the binprm structure from the inode. +/* + * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). @@ -1367,6 +1369,7 @@ int do_execve(const char * filename, goto out_unmark; sched_exec(); + litmus_exec(); bprm->file = file; bprm->filename = filename; diff --git a/fs/inode.c b/fs/inode.c index 86464332e590..d4fe9c031864 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -266,6 +266,8 @@ void inode_init_once(struct inode *inode) #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); #endif + INIT_LIST_HEAD(&inode->i_obj_list); + mutex_init(&inode->i_obj_mutex); } EXPORT_SYMBOL(inode_init_once); diff --git a/include/linux/completion.h b/include/linux/completion.h index 51e3145196f6..c63950e8a863 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -90,6 +90,7 @@ extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); +extern void complete_n(struct completion *, int n); /** * INIT_COMPLETION: - reinitialize a completion structure diff --git a/include/linux/fs.h b/include/linux/fs.h index 63d069bd80b7..29a672458d27 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -16,8 +16,8 @@ * nr_file rlimit, so it's safe to set up a ridiculously high absolute * upper limit on files-per-process. * - * Some programs (notably those using select()) may have to be - * recompiled to take full advantage of the new limits.. + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. */ /* Fixed constants first: */ @@ -172,7 +172,7 @@ struct inodes_stat_t { #define SEL_EX 4 /* public flags for file_system_type */ -#define FS_REQUIRES_DEV 1 +#define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." 
for staleness */ @@ -470,7 +470,7 @@ struct iattr { */ #include -/** +/** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has @@ -480,7 +480,7 @@ struct iattr { * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by - * writepage(); + * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. @@ -721,6 +721,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +struct inode_obj_id_table; struct inode { struct hlist_node i_hash; @@ -784,6 +785,8 @@ struct inode { struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif + struct list_head i_obj_list; + struct mutex i_obj_mutex; void *i_private; /* fs or device private pointer */ }; @@ -997,10 +1000,10 @@ static inline int file_check_writeable(struct file *filp) #define MAX_NON_LFS ((1UL<<31) - 1) -/* Page cache limit. The filesystems should put that into their s_maxbytes - limits, otherwise bad things can happen in VM. */ +/* Page cache limit. The filesystems should put that into their s_maxbytes + limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 -#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) +#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL #endif @@ -2145,7 +2148,7 @@ extern int may_open(struct path *, int, int); extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); - + /* fs/dcache.c -- generic fs support functions */ extern int is_subdir(struct dentry *, struct dentry *); extern int path_is_under(struct path *, struct path *); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0c1b857d3d..76da541c1f66 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -167,6 +167,7 @@ struct hrtimer_clock_base { * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @to_pull: LITMUS^RT list of timers to be pulled on this cpu */ struct hrtimer_cpu_base { raw_spinlock_t lock; @@ -180,8 +181,32 @@ struct hrtimer_cpu_base { unsigned long nr_hangs; ktime_t max_hang_time; #endif + struct list_head to_pull; }; +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS + +#define HRTIMER_START_ON_INACTIVE 0 +#define HRTIMER_START_ON_QUEUED 1 + +/* + * struct hrtimer_start_on_info - save timer info on remote cpu + * @list: list of hrtimer_start_on_info on remote cpu (to_pull) + * @timer: timer to be triggered on remote cpu + * @time: time event + * @mode: timer mode + * @state: activity flag + */ +struct hrtimer_start_on_info { + struct list_head list; + struct hrtimer *timer; + ktime_t time; + enum hrtimer_mode mode; + atomic_t state; +}; + +#endif + static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->_expires = time; @@ -348,6 +373,13 @@ __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, const enum hrtimer_mode mode, int wakeup); +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS +extern void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info); +extern int hrtimer_start_on(int cpu, struct 
hrtimer_start_on_info *info, + struct hrtimer *timer, ktime_t time, + const enum hrtimer_mode mode); +#endif + extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e2a6db2d7dd..c9ac4fc837ba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,7 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_LITMUS 6 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 @@ -94,6 +95,9 @@ struct sched_param { #include +#include +#include + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -1159,6 +1163,7 @@ struct sched_rt_entity { }; struct rcu_node; +struct od_table_entry; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -1243,9 +1248,9 @@ struct task_struct { unsigned long stack_canary; #endif - /* + /* * pointers to (original) parent process, youngest child, younger sibling, - * older sibling, respectively. (p->father can be replaced with + * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ struct task_struct *real_parent; /* real parent process */ @@ -1453,6 +1458,13 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* LITMUS RT parameters and state */ + struct rt_param rt_param; + + /* references to PI semaphores, etc. */ + struct od_table_entry *od_table; + #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; @@ -2014,7 +2026,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return ret; -} +} extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); @@ -2290,6 +2302,7 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) static inline void set_tsk_need_resched(struct task_struct *tsk) { set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + sched_state_will_schedule(tsk); } static inline void clear_tsk_need_resched(struct task_struct *tsk) diff --git a/include/linux/smp.h b/include/linux/smp.h index cfa2d20e35f1..f86d40768e7f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -79,6 +79,11 @@ void __smp_call_function_single(int cpuid, struct call_single_data *data, int smp_call_function_any(const struct cpumask *mask, void (*func)(void *info), void *info, int wait); +/* + * sends a 'pull timer' event to a remote CPU + */ +extern void smp_send_pull_timers(int cpu); + /* * Generic and arch helpers */ diff --git a/include/linux/tick.h b/include/linux/tick.h index b232ccc0ee29..1e29bd5b18af 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -74,6 +74,11 @@ extern int tick_is_oneshot_available(void); extern struct tick_device *tick_get_device(int cpu); # ifdef CONFIG_HIGH_RES_TIMERS +/* LITMUS^RT tick alignment */ +#define LINUX_DEFAULT_TICKS 0 +#define LITMUS_ALIGNED_TICKS 1 +#define LITMUS_STAGGERED_TICKS 2 + extern int tick_init_highres(void); extern int tick_program_event(ktime_t expires, int force); extern void tick_setup_sched_timer(void); diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h new file mode 100644 index 000000000000..cf4864a498d8 --- /dev/null +++ b/include/litmus/bheap.h @@ -0,0 +1,77 @@ +/* bheaps.h -- Binomial Heaps + * + * (c) 2008, 2009 Bjoern Brandenburg + */ + 
+#ifndef BHEAP_H +#define BHEAP_H + +#define NOT_IN_HEAP UINT_MAX + +struct bheap_node { + struct bheap_node* parent; + struct bheap_node* next; + struct bheap_node* child; + + unsigned int degree; + void* value; + struct bheap_node** ref; +}; + +struct bheap { + struct bheap_node* head; + /* We cache the minimum of the heap. + * This speeds up repeated peek operations. + */ + struct bheap_node* min; +}; + +typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b); + +void bheap_init(struct bheap* heap); +void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value); + +static inline int bheap_node_in_heap(struct bheap_node* h) +{ + return h->degree != NOT_IN_HEAP; +} + +static inline int bheap_empty(struct bheap* heap) +{ + return heap->head == NULL && heap->min == NULL; +} + +/* insert (and reinitialize) a node into the heap */ +void bheap_insert(bheap_prio_t higher_prio, + struct bheap* heap, + struct bheap_node* node); + +/* merge addition into target */ +void bheap_union(bheap_prio_t higher_prio, + struct bheap* target, + struct bheap* addition); + +struct bheap_node* bheap_peek(bheap_prio_t higher_prio, + struct bheap* heap); + +struct bheap_node* bheap_take(bheap_prio_t higher_prio, + struct bheap* heap); + +void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap); +int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node); + +void bheap_delete(bheap_prio_t higher_prio, + struct bheap* heap, + struct bheap_node* node); + +/* allocate from memcache */ +struct bheap_node* bheap_node_alloc(int gfp_flags); +void bheap_node_free(struct bheap_node* hn); + +/* allocate a heap node for value and insert into the heap */ +int bheap_add(bheap_prio_t higher_prio, struct bheap* heap, + void* value, int gfp_flags); + +void* bheap_take_del(bheap_prio_t higher_prio, + struct bheap* heap); +#endif diff --git a/include/litmus/budget.h b/include/litmus/budget.h new file mode 100644 index 000000000000..732530e63491 --- /dev/null +++ b/include/litmus/budget.h @@ -0,0 +1,8 @@ +#ifndef _LITMUS_BUDGET_H_ +#define _LITMUS_BUDGET_H_ + +/* Update the per-processor enforcement timer (arm/reproram/cancel) for + * the next task. */ +void update_enforcement_timer(struct task_struct* t); + +#endif diff --git a/include/litmus/clustered.h b/include/litmus/clustered.h new file mode 100644 index 000000000000..0c18dcb15e6c --- /dev/null +++ b/include/litmus/clustered.h @@ -0,0 +1,44 @@ +#ifndef CLUSTERED_H +#define CLUSTERED_H + +/* Which cache level should be used to group CPUs into clusters? + * GLOBAL_CLUSTER means that all CPUs form a single cluster (just like under + * global scheduling). + */ +enum cache_level { + GLOBAL_CLUSTER = 0, + L1_CLUSTER = 1, + L2_CLUSTER = 2, + L3_CLUSTER = 3 +}; + +int parse_cache_level(const char *str, enum cache_level *level); +const char* cache_level_name(enum cache_level level); + +/* expose a cache level in a /proc dir */ +struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent, + enum cache_level* level); + + + +struct scheduling_cluster { + unsigned int id; + /* list of CPUs that are part of this cluster */ + struct list_head cpus; +}; + +struct cluster_cpu { + unsigned int id; /* which CPU is this? */ + struct list_head cluster_list; /* List of the CPUs in this cluster. */ + struct scheduling_cluster* cluster; /* The cluster that this CPU belongs to. 
*/ +}; + +int get_cluster_size(enum cache_level level); + +int assign_cpus_to_clusters(enum cache_level level, + struct scheduling_cluster* clusters[], + unsigned int num_clusters, + struct cluster_cpu* cpus[], + unsigned int num_cpus); + +#endif diff --git a/include/litmus/debug_trace.h b/include/litmus/debug_trace.h new file mode 100644 index 000000000000..48d086d5a44c --- /dev/null +++ b/include/litmus/debug_trace.h @@ -0,0 +1,37 @@ +#ifndef LITMUS_DEBUG_TRACE_H +#define LITMUS_DEBUG_TRACE_H + +#ifdef CONFIG_SCHED_DEBUG_TRACE +void sched_trace_log_message(const char* fmt, ...); +void dump_trace_buffer(int max); +#else + +#define sched_trace_log_message(fmt, ...) + +#endif + +extern atomic_t __log_seq_no; + +#ifdef CONFIG_SCHED_DEBUG_TRACE_CALLER +#define TRACE_PREFIX "%d P%d [%s@%s:%d]: " +#define TRACE_ARGS atomic_add_return(1, &__log_seq_no), \ + raw_smp_processor_id(), \ + __FUNCTION__, __FILE__, __LINE__ +#else +#define TRACE_PREFIX "%d P%d: " +#define TRACE_ARGS atomic_add_return(1, &__log_seq_no), \ + raw_smp_processor_id() +#endif + +#define TRACE(fmt, args...) \ + sched_trace_log_message(TRACE_PREFIX fmt, \ + TRACE_ARGS, ## args) + +#define TRACE_TASK(t, fmt, args...) \ + TRACE("(%s/%d:%d) " fmt, (t)->comm, (t)->pid, \ + (t)->rt_param.job_params.job_no, ##args) + +#define TRACE_CUR(fmt, args...) \ + TRACE_TASK(current, fmt, ## args) + +#endif diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h new file mode 100644 index 000000000000..2c4266f77c03 --- /dev/null +++ b/include/litmus/edf_common.h @@ -0,0 +1,33 @@ +/* + * EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. + * + */ + +#ifndef __UNC_EDF_COMMON_H__ +#define __UNC_EDF_COMMON_H__ + +#include + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int edf_higher_prio(struct task_struct* first, + struct task_struct* second); + +#ifdef CONFIG_LITMUS_LOCKING +/* priority comparison without priority inheritance */ +int edf_higher_base_prio(struct task_struct* first, + struct task_struct* second); + +int edf_pending_order(struct bheap_node* a, struct bheap_node* b); +#endif + +int edf_ready_order(struct bheap_node* a, struct bheap_node* b); + +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +#endif diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h new file mode 100644 index 000000000000..d1ee0d1142d8 --- /dev/null +++ b/include/litmus/fdso.h @@ -0,0 +1,77 @@ +/* fdso.h - file descriptor attached shared objects + * + * (c) 2007--2011 B. 
Brandenburg, LITMUS^RT project + */ + +#ifndef _LINUX_FDSO_H_ +#define _LINUX_FDSO_H_ + +#include +#include + +#include +#include + +#define MAX_OBJECT_DESCRIPTORS 85 + +typedef enum { + MIN_OBJ_TYPE = 0, + + FMLP_SEM = 0, + SRP_SEM = 1, + + MPCP_SEM = 2, + MPCP_VS_SEM = 3, + DPCP_SEM = 4, + + OMLP_SEM = 5, + + MAX_OBJ_TYPE = 5 +} obj_type_t; + +struct inode_obj_id { + struct list_head list; + atomic_t count; + struct inode* inode; + + obj_type_t type; + void* obj; + unsigned int id; +}; + +struct fdso_ops; + +struct od_table_entry { + unsigned int used; + + struct inode_obj_id* obj; + const struct fdso_ops* class; +}; + +struct fdso_ops { + int (*create)(void** obj_ref, obj_type_t type, void* __user); + void (*destroy)(obj_type_t type, void*); + int (*open) (struct od_table_entry*, void* __user); + int (*close) (struct od_table_entry*); +}; + +/* translate a userspace supplied od into the raw table entry + * returns NULL if od is invalid + */ +struct od_table_entry* get_entry_for_od(int od); + +/* translate a userspace supplied od into the associated object + * returns NULL if od is invalid + */ +static inline void* od_lookup(int od, obj_type_t type) +{ + struct od_table_entry* e = get_entry_for_od(od); + return e && e->obj->type == type ? e->obj->obj : NULL; +} + +#define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM)) +#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM)) +#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID)) + + +#endif diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h new file mode 100644 index 000000000000..6c18277fdfc9 --- /dev/null +++ b/include/litmus/feather_buffer.h @@ -0,0 +1,94 @@ +#ifndef _FEATHER_BUFFER_H_ +#define _FEATHER_BUFFER_H_ + +/* requires UINT_MAX and memcpy */ + +#define SLOT_FREE 0 +#define SLOT_BUSY 1 +#define SLOT_READY 2 + +struct ft_buffer { + unsigned int slot_count; + unsigned int slot_size; + + int free_count; + unsigned int write_idx; + unsigned int read_idx; + + char* slots; + void* buffer_mem; + unsigned int failed_writes; +}; + +static inline int init_ft_buffer(struct ft_buffer* buf, + unsigned int slot_count, + unsigned int slot_size, + char* slots, + void* buffer_mem) +{ + int i = 0; + if (!slot_count || UINT_MAX % slot_count != slot_count - 1) { + /* The slot count must divide UNIT_MAX + 1 so that when it + * wraps around the index correctly points to 0. 
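+ * Worked example (32-bit unsigned indices assumed): UINT_MAX + 1 = 2^32,
+ * so any power-of-two slot_count such as 256 passes the check
+ * (UINT_MAX % 256 == 255 == slot_count - 1), whereas slot_count == 100
+ * fails it (UINT_MAX % 100 == 95) and the buffer is rejected.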
+ */ + return 0; + } else { + buf->slot_count = slot_count; + buf->slot_size = slot_size; + buf->slots = slots; + buf->buffer_mem = buffer_mem; + buf->free_count = slot_count; + buf->write_idx = 0; + buf->read_idx = 0; + buf->failed_writes = 0; + for (i = 0; i < slot_count; i++) + buf->slots[i] = SLOT_FREE; + return 1; + } +} + +static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr) +{ + int free = fetch_and_dec(&buf->free_count); + unsigned int idx; + if (free <= 0) { + fetch_and_inc(&buf->free_count); + *ptr = 0; + fetch_and_inc(&buf->failed_writes); + return 0; + } else { + idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count; + buf->slots[idx] = SLOT_BUSY; + *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size; + return 1; + } +} + +static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr) +{ + unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size; + buf->slots[idx] = SLOT_READY; +} + + +/* exclusive reader access is assumed */ +static inline int ft_buffer_read(struct ft_buffer* buf, void* dest) +{ + unsigned int idx; + if (buf->free_count == buf->slot_count) + /* nothing available */ + return 0; + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size, + buf->slot_size); + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + return 1; + } else + return 0; +} + + +#endif diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h new file mode 100644 index 000000000000..028dfb206fb0 --- /dev/null +++ b/include/litmus/feather_trace.h @@ -0,0 +1,65 @@ +#ifndef _FEATHER_TRACE_H_ +#define _FEATHER_TRACE_H_ + +#include + +int ft_enable_event(unsigned long id); +int ft_disable_event(unsigned long id); +int ft_is_event_enabled(unsigned long id); +int ft_disable_all_events(void); + +/* atomic_* funcitons are inline anyway */ +static inline int fetch_and_inc(int *val) +{ + return atomic_add_return(1, (atomic_t*) val) - 1; +} + +static inline int fetch_and_dec(int *val) +{ + return atomic_sub_return(1, (atomic_t*) val) + 1; +} + +/* Don't use rewriting implementation if kernel text pages are read-only. + * Ftrace gets around this by using the identity mapping, but that's more + * effort that is warrented right now for Feather-Trace. + * Eventually, it may make sense to replace Feather-Trace with ftrace. + */ +#if defined(CONFIG_ARCH_HAS_FEATHER_TRACE) && !defined(CONFIG_DEBUG_RODATA) + +#include + +#else /* !__ARCH_HAS_FEATHER_TRACE */ + +/* provide default implementation */ + +#include /* for get_cycles() */ + +static inline unsigned long long ft_timestamp(void) +{ + return get_cycles(); +} + +#define feather_callback + +#define MAX_EVENTS 1024 + +extern int ft_events[MAX_EVENTS]; + +#define ft_event(id, callback) \ + if (ft_events[id]) callback(); + +#define ft_event0(id, callback) \ + if (ft_events[id]) callback(id); + +#define ft_event1(id, callback, param) \ + if (ft_events[id]) callback(id, param); + +#define ft_event2(id, callback, param, param2) \ + if (ft_events[id]) callback(id, param, param2); + +#define ft_event3(id, callback, p, p2, p3) \ + if (ft_events[id]) callback(id, p, p2, p3); + +#endif /* __ARCH_HAS_FEATHER_TRACE */ + +#endif diff --git a/include/litmus/fp_common.h b/include/litmus/fp_common.h new file mode 100644 index 000000000000..dd1f7bf1e347 --- /dev/null +++ b/include/litmus/fp_common.h @@ -0,0 +1,105 @@ +/* Fixed-priority scheduler support. 
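+ *
+ * Ready jobs are kept in one bheap per priority level plus a bitmask
+ * with one bit per level; a bit is set while the corresponding queue is
+ * non-empty. fpq_find() below scans the bitmask word by word and uses
+ * __ffs() to return the lowest non-empty index. Illustrative example
+ * (64-bit words assumed): priority 70 maps to bit 70 % 64 == 6 of
+ * bitmask word 70 / 64 == 1.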
+ */ + +#ifndef __FP_COMMON_H__ +#define __FP_COMMON_H__ + +#include + +#include + + +void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int fp_higher_prio(struct task_struct* first, + struct task_struct* second); + +int fp_ready_order(struct bheap_node* a, struct bheap_node* b); + +#define FP_PRIO_BIT_WORDS (LITMUS_MAX_PRIORITY / BITS_PER_LONG) + +#if (LITMUS_MAX_PRIORITY % BITS_PER_LONG) +#error LITMUS_MAX_PRIORITY must be a multiple of BITS_PER_LONG +#endif + +/* bitmask-inexed priority queue */ +struct fp_prio_queue { + unsigned long bitmask[FP_PRIO_BIT_WORDS]; + struct bheap queue[LITMUS_MAX_PRIORITY]; +}; + +void fp_prio_queue_init(struct fp_prio_queue* q); + +static inline void fpq_set(struct fp_prio_queue* q, unsigned int index) +{ + unsigned long *word = q->bitmask + (index / BITS_PER_LONG); + __set_bit(index % BITS_PER_LONG, word); +} + +static inline void fpq_clear(struct fp_prio_queue* q, unsigned int index) +{ + unsigned long *word = q->bitmask + (index / BITS_PER_LONG); + __clear_bit(index % BITS_PER_LONG, word); +} + +static inline unsigned int fpq_find(struct fp_prio_queue* q) +{ + int i; + + /* loop optimizer should unroll this */ + for (i = 0; i < FP_PRIO_BIT_WORDS; i++) + if (q->bitmask[i]) + return __ffs(q->bitmask[i]) + i * BITS_PER_LONG; + + return LITMUS_MAX_PRIORITY; /* nothing found */ +} + +static inline void fp_prio_add(struct fp_prio_queue* q, struct task_struct* t, unsigned int index) +{ + + BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node)); + + fpq_set(q, index); + bheap_insert(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node); +} + +static inline void fp_prio_remove(struct fp_prio_queue* q, struct task_struct* t, unsigned int index) +{ + BUG_ON(!is_queued(t)); + + bheap_delete(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node); + if (likely(bheap_empty(&q->queue[index]))) + fpq_clear(q, index); +} + +static inline struct task_struct* fp_prio_peek(struct fp_prio_queue* q) +{ + unsigned int idx = fpq_find(q); + struct bheap_node* hn; + + if (idx < LITMUS_MAX_PRIORITY) { + hn = bheap_peek(fp_ready_order, &q->queue[idx]); + return bheap2task(hn); + } else + return NULL; +} + +static inline struct task_struct* fp_prio_take(struct fp_prio_queue* q) +{ + unsigned int idx = fpq_find(q); + struct bheap_node* hn; + + if (idx < LITMUS_MAX_PRIORITY) { + hn = bheap_take(fp_ready_order, &q->queue[idx]); + if (likely(bheap_empty(&q->queue[idx]))) + fpq_clear(q, idx); + return bheap2task(hn); + } else + return NULL; +} + +int fp_preemption_needed(struct fp_prio_queue* q, struct task_struct *t); + + +#endif diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h new file mode 100644 index 000000000000..0b959874dd70 --- /dev/null +++ b/include/litmus/ftdev.h @@ -0,0 +1,55 @@ +#ifndef _LITMUS_FTDEV_H_ +#define _LITMUS_FTDEV_H_ + +#include +#include +#include +#include + +#define FTDEV_ENABLE_CMD 0 +#define FTDEV_DISABLE_CMD 1 + +struct ftdev; + +/* return 0 if buffer can be opened, otherwise -$REASON */ +typedef int (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no); +/* return 0 on success, otherwise -$REASON */ +typedef int (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no); +typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no); +/* Let devices handle writes from userspace. No synchronization provided. 
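+ * The callback receives the minor's ft_buffer, the requested length, and
+ * the raw user-space pointer; validating and copying the data (e.g. via
+ * copy_from_user()) is presumably left entirely to the device.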
*/ +typedef ssize_t (*ftdev_write_t)(struct ft_buffer* buf, size_t len, const char __user *from); + +struct ftdev_event; + +struct ftdev_minor { + struct ft_buffer* buf; + unsigned int readers; + struct mutex lock; + /* FIXME: filter for authorized events */ + struct ftdev_event* events; + struct device* device; + struct ftdev* ftdev; +}; + +struct ftdev { + dev_t major; + struct cdev cdev; + struct class* class; + const char* name; + struct ftdev_minor* minor; + unsigned int minor_cnt; + ftdev_alloc_t alloc; + ftdev_free_t free; + ftdev_can_open_t can_open; + ftdev_write_t write; +}; + +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size); +void free_ft_buffer(struct ft_buffer* buf); + +int ftdev_init( struct ftdev* ftdev, struct module* owner, + const int minor_cnt, const char* name); +void ftdev_exit(struct ftdev* ftdev); +int register_ftdev(struct ftdev* ftdev); + +#endif diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h new file mode 100644 index 000000000000..9bd361ef3943 --- /dev/null +++ b/include/litmus/jobs.h @@ -0,0 +1,9 @@ +#ifndef __LITMUS_JOBS_H__ +#define __LITMUS_JOBS_H__ + +void prepare_for_next_period(struct task_struct *t); +void release_at(struct task_struct *t, lt_t start); +long complete_job(void); + +#endif + diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h new file mode 100644 index 000000000000..31ac72eddef7 --- /dev/null +++ b/include/litmus/litmus.h @@ -0,0 +1,292 @@ +/* + * Constant definitions related to + * scheduling policy. + */ + +#ifndef _LINUX_LITMUS_H_ +#define _LINUX_LITMUS_H_ + +#include + +#ifdef CONFIG_RELEASE_MASTER +extern atomic_t release_master_cpu; +#endif + +/* in_list - is a given list_head queued on some list? + */ +static inline int in_list(struct list_head* list) +{ + return !( /* case 1: deleted */ + (list->next == LIST_POISON1 && + list->prev == LIST_POISON2) + || + /* case 2: initialized */ + (list->next == list && + list->prev == list) + ); +} + +#define NO_CPU 0xffffffff + +void litmus_fork(struct task_struct *tsk); +void litmus_exec(void); +/* clean up real-time state of a task */ +void exit_litmus(struct task_struct *dead_tsk); + +long litmus_admit_task(struct task_struct *tsk); +void litmus_exit_task(struct task_struct *tsk); + +#define is_realtime(t) ((t)->policy == SCHED_LITMUS) +#define rt_transition_pending(t) \ + ((t)->rt_param.transition_pending) + +#define tsk_rt(t) (&(t)->rt_param) + +/* Realtime utility macros */ +#define get_rt_flags(t) (tsk_rt(t)->flags) +#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f)) +#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost) +#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time) +#define get_rt_period(t) (tsk_rt(t)->task_params.period) +#define get_rt_phase(t) (tsk_rt(t)->task_params.phase) +#define get_partition(t) (tsk_rt(t)->task_params.cpu) +#define get_priority(t) (tsk_rt(t)->task_params.priority) +#define get_deadline(t) (tsk_rt(t)->job_params.deadline) +#define get_release(t) (tsk_rt(t)->job_params.release) +#define get_class(t) (tsk_rt(t)->task_params.cls) + +#define is_priority_boosted(t) (tsk_rt(t)->priority_boosted) +#define get_boost_start(t) (tsk_rt(t)->boost_start_time) + +inline static int budget_exhausted(struct task_struct* t) +{ + return get_exec_time(t) >= get_exec_cost(t); +} + +inline static lt_t budget_remaining(struct task_struct* t) +{ + if (!budget_exhausted(t)) + return get_exec_cost(t) - get_exec_time(t); + else + /* avoid overflow */ + return 0; +} + +#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy 
!= NO_ENFORCEMENT) + +#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \ + == PRECISE_ENFORCEMENT) + +#define is_hrt(t) \ + (tsk_rt(t)->task_params.cls == RT_CLASS_HARD) +#define is_srt(t) \ + (tsk_rt(t)->task_params.cls == RT_CLASS_SOFT) +#define is_be(t) \ + (tsk_rt(t)->task_params.cls == RT_CLASS_BEST_EFFORT) + +/* Our notion of time within LITMUS: kernel monotonic time. */ +static inline lt_t litmus_clock(void) +{ + return ktime_to_ns(ktime_get()); +} + +/* A macro to convert from nanoseconds to ktime_t. */ +#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t) + +#define get_domain(t) (tsk_rt(t)->domain) + +/* Honor the flag in the preempt_count variable that is set + * when scheduling is in progress. + */ +#define is_running(t) \ + ((t)->state == TASK_RUNNING || \ + task_thread_info(t)->preempt_count & PREEMPT_ACTIVE) + +#define is_blocked(t) \ + (!is_running(t)) +#define is_released(t, now) \ + (lt_before_eq(get_release(t), now)) +#define is_tardy(t, now) \ + (lt_before_eq(tsk_rt(t)->job_params.deadline, now)) + +/* real-time comparison macros */ +#define earlier_deadline(a, b) (lt_before(\ + (a)->rt_param.job_params.deadline,\ + (b)->rt_param.job_params.deadline)) +#define earlier_release(a, b) (lt_before(\ + (a)->rt_param.job_params.release,\ + (b)->rt_param.job_params.release)) + +void preempt_if_preemptable(struct task_struct* t, int on_cpu); + +#ifdef CONFIG_LITMUS_LOCKING +void srp_ceiling_block(void); +#else +#define srp_ceiling_block() /* nothing */ +#endif + +#define bheap2task(hn) ((struct task_struct*) hn->value) + +static inline struct control_page* get_control_page(struct task_struct *t) +{ + return tsk_rt(t)->ctrl_page; +} + +static inline int has_control_page(struct task_struct* t) +{ + return tsk_rt(t)->ctrl_page != NULL; +} + +#ifdef CONFIG_NP_SECTION + +static inline int is_kernel_np(struct task_struct *t) +{ + return tsk_rt(t)->kernel_np; +} + +static inline int is_user_np(struct task_struct *t) +{ + return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->sched.np.flag : 0; +} + +static inline void request_exit_np(struct task_struct *t) +{ + if (is_user_np(t)) { + /* Set the flag that tells user space to call + * into the kernel at the end of a critical section. */ + if (likely(tsk_rt(t)->ctrl_page)) { + TRACE_TASK(t, "setting delayed_preemption flag\n"); + tsk_rt(t)->ctrl_page->sched.np.preempt = 1; + } + } +} + +static inline void make_np(struct task_struct *t) +{ + tsk_rt(t)->kernel_np++; +} + +/* Caller should check if preemption is necessary when + * the function return 0. + */ +static inline int take_np(struct task_struct *t) +{ + return --tsk_rt(t)->kernel_np; +} + +/* returns 0 if remote CPU needs an IPI to preempt, 1 if no IPI is required */ +static inline int request_exit_np_atomic(struct task_struct *t) +{ + union np_flag old, new; + int ok; + + if (tsk_rt(t)->ctrl_page) { + old.raw = tsk_rt(t)->ctrl_page->sched.raw; + if (old.np.flag == 0) { + /* no longer non-preemptive */ + return 0; + } else if (old.np.preempt) { + /* already set, nothing for us to do */ + TRACE_TASK(t, "not setting np.preempt flag again\n"); + return 1; + } else { + /* non preemptive and flag not set */ + new.raw = old.raw; + new.np.preempt = 1; + /* if we get old back, then we atomically set the flag */ + ok = cmpxchg(&tsk_rt(t)->ctrl_page->sched.raw, old.raw, new.raw) == old.raw; + /* If we raced with a concurrent change, then so be + * it. Deliver it by IPI. 
We don't want an unbounded + * retry loop here since tasks might exploit that to + * keep the kernel busy indefinitely. */ + TRACE_TASK(t, "request_exit_np => %d\n", ok); + return ok; + } + } else + return 0; +} + +#else + +static inline int is_kernel_np(struct task_struct* t) +{ + return 0; +} + +static inline int is_user_np(struct task_struct* t) +{ + return 0; +} + +static inline void request_exit_np(struct task_struct *t) +{ + /* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */ + BUG(); +} + +static inline int request_exist_np_atomic(struct task_struct *t) +{ + return 0; +} + +#endif + +static inline void clear_exit_np(struct task_struct *t) +{ + if (likely(tsk_rt(t)->ctrl_page)) + tsk_rt(t)->ctrl_page->sched.np.preempt = 0; +} + +static inline int is_np(struct task_struct *t) +{ +#ifdef CONFIG_SCHED_DEBUG_TRACE + int kernel, user; + kernel = is_kernel_np(t); + user = is_user_np(t); + if (kernel || user) + TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n", + + kernel, user); + return kernel || user; +#else + return unlikely(is_kernel_np(t) || is_user_np(t)); +#endif +} + +static inline int is_present(struct task_struct* t) +{ + return t && tsk_rt(t)->present; +} + + +/* make the unit explicit */ +typedef unsigned long quanta_t; + +enum round { + FLOOR, + CEIL +}; + + +/* Tick period is used to convert ns-specified execution + * costs and periods into tick-based equivalents. + */ +extern ktime_t tick_period; + +static inline quanta_t time2quanta(lt_t time, enum round round) +{ + s64 quantum_length = ktime_to_ns(tick_period); + + if (do_div(time, quantum_length) && round == CEIL) + time++; + return (quanta_t) time; +} + +/* By how much is cpu staggered behind CPU 0? */ +u64 cpu_stagger_offset(int cpu); + +#define TS_SYSCALL_IN_START \ + if (has_control_page(current)) \ + __TS_SYSCALL_IN_START(&get_control_page(current)->ts_syscall_start); + +#endif diff --git a/include/litmus/litmus_proc.h b/include/litmus/litmus_proc.h new file mode 100644 index 000000000000..6800e725d48c --- /dev/null +++ b/include/litmus/litmus_proc.h @@ -0,0 +1,25 @@ +#include +#include + +int __init init_litmus_proc(void); +void exit_litmus_proc(void); + +/* + * On success, returns 0 and sets the pointer to the location of the new + * proc dir entry, otherwise returns an error code and sets pde to NULL. + */ +long make_plugin_proc_dir(struct sched_plugin* plugin, + struct proc_dir_entry** pde); + +/* + * Plugins should deallocate all child proc directory entries before + * calling this, to avoid memory leaks. + */ +void remove_plugin_proc_dir(struct sched_plugin* plugin); + + +/* Copy at most size-1 bytes from ubuf into kbuf, null-terminate buf, and + * remove a '\n' if present. Returns the number of bytes that were read or + * -EFAULT. */ +int copy_and_chomp(char *kbuf, unsigned long ksize, + __user const char* ubuf, unsigned long ulength); diff --git a/include/litmus/locking.h b/include/litmus/locking.h new file mode 100644 index 000000000000..4d7b870cb443 --- /dev/null +++ b/include/litmus/locking.h @@ -0,0 +1,28 @@ +#ifndef LITMUS_LOCKING_H +#define LITMUS_LOCKING_H + +struct litmus_lock_ops; + +/* Generic base struct for LITMUS^RT userspace semaphores. + * This structure should be embedded in protocol-specific semaphores. + */ +struct litmus_lock { + struct litmus_lock_ops *ops; + int type; +}; + +struct litmus_lock_ops { + /* Current task tries to obtain / drop a reference to a lock. + * Optional methods, allowed by default. 
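+ * A protocol implementation presumably embeds struct litmus_lock in its
+ * own semaphore type (struct srp_semaphore in litmus/srp.h does so) and
+ * fills in this ops table when the plugin's allocate_lock() callback
+ * creates the lock.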
*/ + int (*open)(struct litmus_lock*, void* __user); + int (*close)(struct litmus_lock*); + + /* Current tries to lock/unlock this lock (mandatory methods). */ + int (*lock)(struct litmus_lock*); + int (*unlock)(struct litmus_lock*); + + /* The lock is no longer being referenced (mandatory method). */ + void (*deallocate)(struct litmus_lock*); +}; + +#endif diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h new file mode 100644 index 000000000000..f3cf29ad87ee --- /dev/null +++ b/include/litmus/preempt.h @@ -0,0 +1,165 @@ +#ifndef LITMUS_PREEMPT_H +#define LITMUS_PREEMPT_H + +#include +#include +#include +#include + +#include + +extern DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state); + +//#ifdef CONFIG_DEBUG_KERNEL +#if 0 +const char* sched_state_name(int s); +#define TRACE_STATE(fmt, args...) TRACE("SCHED_STATE " fmt, args) +#else +#define TRACE_STATE(fmt, args...) /* ignore */ +#endif + +#define VERIFY_SCHED_STATE(x) \ + do { int __s = get_sched_state(); \ + if ((__s & (x)) == 0) \ + TRACE_STATE("INVALID s=0x%x (%s) not " \ + "in 0x%x (%s) [%s]\n", \ + __s, sched_state_name(__s), \ + (x), #x, __FUNCTION__); \ + } while (0); + +#define TRACE_SCHED_STATE_CHANGE(x, y, cpu) \ + TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n", \ + cpu, (x), sched_state_name(x), \ + (y), sched_state_name(y)) + + +typedef enum scheduling_state { + TASK_SCHEDULED = (1 << 0), /* The currently scheduled task is the one that + * should be scheduled, and the processor does not + * plan to invoke schedule(). */ + SHOULD_SCHEDULE = (1 << 1), /* A remote processor has determined that the + * processor should reschedule, but this has not + * been communicated yet (IPI still pending). */ + WILL_SCHEDULE = (1 << 2), /* The processor has noticed that it has to + * reschedule and will do so shortly. */ + TASK_PICKED = (1 << 3), /* The processor is currently executing schedule(), + * has selected a new task to schedule, but has not + * yet performed the actual context switch. */ + PICKED_WRONG_TASK = (1 << 4), /* The processor has not yet performed the context + * switch, but a remote processor has already + * determined that a higher-priority task became + * eligible after the task was picked. 
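+ * Taken together, the presumed common sequence is
+ *   TASK_SCHEDULED -> SHOULD_SCHEDULE (remote) or WILL_SCHEDULE (local)
+ *   -> TASK_PICKED -> TASK_SCHEDULED,
+ * while a late-arriving higher-priority job turns TASK_PICKED into
+ * PICKED_WRONG_TASK, which sched_state_validate_switch() below resolves
+ * by setting WILL_SCHEDULE and forcing another pass through schedule().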
*/ +} sched_state_t; + +static inline sched_state_t get_sched_state_on(int cpu) +{ + return atomic_read(&per_cpu(resched_state, cpu)); +} + +static inline sched_state_t get_sched_state(void) +{ + return atomic_read(&__get_cpu_var(resched_state)); +} + +static inline int is_in_sched_state(int possible_states) +{ + return get_sched_state() & possible_states; +} + +static inline int cpu_is_in_sched_state(int cpu, int possible_states) +{ + return get_sched_state_on(cpu) & possible_states; +} + +static inline void set_sched_state(sched_state_t s) +{ + TRACE_SCHED_STATE_CHANGE(get_sched_state(), s, smp_processor_id()); + atomic_set(&__get_cpu_var(resched_state), s); +} + +static inline int sched_state_transition(sched_state_t from, sched_state_t to) +{ + sched_state_t old_state; + + old_state = atomic_cmpxchg(&__get_cpu_var(resched_state), from, to); + if (old_state == from) { + TRACE_SCHED_STATE_CHANGE(from, to, smp_processor_id()); + return 1; + } else + return 0; +} + +static inline int sched_state_transition_on(int cpu, + sched_state_t from, + sched_state_t to) +{ + sched_state_t old_state; + + old_state = atomic_cmpxchg(&per_cpu(resched_state, cpu), from, to); + if (old_state == from) { + TRACE_SCHED_STATE_CHANGE(from, to, cpu); + return 1; + } else + return 0; +} + +/* Plugins must call this function after they have decided which job to + * schedule next. IMPORTANT: this function must be called while still holding + * the lock that is used to serialize scheduling decisions. + * + * (Ideally, we would like to use runqueue locks for this purpose, but that + * would lead to deadlocks with the migration code.) + */ +static inline void sched_state_task_picked(void) +{ + VERIFY_SCHED_STATE(WILL_SCHEDULE); + + /* WILL_SCHEDULE has only a local tansition => simple store is ok */ + set_sched_state(TASK_PICKED); +} + +static inline void sched_state_entered_schedule(void) +{ + /* Update state for the case that we entered schedule() not due to + * set_tsk_need_resched() */ + set_sched_state(WILL_SCHEDULE); +} + +/* Called by schedule() to check if the scheduling decision is still valid + * after a context switch. Returns 1 if the CPU needs to reschdule. */ +static inline int sched_state_validate_switch(void) +{ + int left_state_ok = 0; + + VERIFY_SCHED_STATE(PICKED_WRONG_TASK | TASK_PICKED); + + if (is_in_sched_state(TASK_PICKED)) { + /* Might be good; let's try to transition out of this + * state. This must be done atomically since remote processors + * may try to change the state, too. */ + left_state_ok = sched_state_transition(TASK_PICKED, TASK_SCHEDULED); + } + + if (!left_state_ok) { + /* We raced with a higher-priority task arrival => not + * valid. The CPU needs to reschedule. */ + set_sched_state(WILL_SCHEDULE); + return 1; + } else + return 0; +} + +/* State transition events. See litmus/preempt.c for details. */ +void sched_state_will_schedule(struct task_struct* tsk); +void sched_state_ipi(void); +/* Cause a CPU (remote or local) to reschedule. */ +void litmus_reschedule(int cpu); +void litmus_reschedule_local(void); + +#ifdef CONFIG_DEBUG_KERNEL +void sched_state_plugin_check(void); +#else +#define sched_state_plugin_check() /* no check */ +#endif + +#endif diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h new file mode 100644 index 000000000000..ac249292e866 --- /dev/null +++ b/include/litmus/rt_domain.h @@ -0,0 +1,182 @@ +/* CLEANUP: Add comments and make it less messy. 
+ * + */ + +#ifndef __UNC_RT_DOMAIN_H__ +#define __UNC_RT_DOMAIN_H__ + +#include + +#define RELEASE_QUEUE_SLOTS 127 /* prime */ + +struct _rt_domain; + +typedef int (*check_resched_needed_t)(struct _rt_domain *rt); +typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks); + +struct release_queue { + /* each slot maintains a list of release heaps sorted + * by release time */ + struct list_head slot[RELEASE_QUEUE_SLOTS]; +}; + +typedef struct _rt_domain { + /* runnable rt tasks are in here */ + raw_spinlock_t ready_lock; + struct bheap ready_queue; + + /* real-time tasks waiting for release are in here */ + raw_spinlock_t release_lock; + struct release_queue release_queue; + +#ifdef CONFIG_RELEASE_MASTER + int release_master; +#endif + + /* for moving tasks to the release queue */ + raw_spinlock_t tobe_lock; + struct list_head tobe_released; + + /* how do we check if we need to kick another CPU? */ + check_resched_needed_t check_resched; + + /* how do we release jobs? */ + release_jobs_t release_jobs; + + /* how are tasks ordered in the ready queue? */ + bheap_prio_t order; +} rt_domain_t; + +struct release_heap { + /* list_head for per-time-slot list */ + struct list_head list; + lt_t release_time; + /* all tasks to be released at release_time */ + struct bheap heap; + /* used to trigger the release */ + struct hrtimer timer; + +#ifdef CONFIG_RELEASE_MASTER + /* used to delegate releases */ + struct hrtimer_start_on_info info; +#endif + /* required for the timer callback */ + rt_domain_t* dom; +}; + + +static inline struct task_struct* __next_ready(rt_domain_t* rt) +{ + struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue); + if (hn) + return bheap2task(hn); + else + return NULL; +} + +void rt_domain_init(rt_domain_t *rt, bheap_prio_t order, + check_resched_needed_t check, + release_jobs_t relase); + +void __add_ready(rt_domain_t* rt, struct task_struct *new); +void __merge_ready(rt_domain_t* rt, struct bheap *tasks); +void __add_release(rt_domain_t* rt, struct task_struct *task); + +static inline struct task_struct* __take_ready(rt_domain_t* rt) +{ + struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue); + if (hn) + return bheap2task(hn); + else + return NULL; +} + +static inline struct task_struct* __peek_ready(rt_domain_t* rt) +{ + struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue); + if (hn) + return bheap2task(hn); + else + return NULL; +} + +static inline int is_queued(struct task_struct *t) +{ + BUG_ON(!tsk_rt(t)->heap_node); + return bheap_node_in_heap(tsk_rt(t)->heap_node); +} + +static inline void remove(rt_domain_t* rt, struct task_struct *t) +{ + bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node); +} + +static inline void add_ready(rt_domain_t* rt, struct task_struct *new) +{ + unsigned long flags; + /* first we need the write lock for rt_ready_queue */ + raw_spin_lock_irqsave(&rt->ready_lock, flags); + __add_ready(rt, new); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); +} + +static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + raw_spin_lock_irqsave(&rt->ready_lock, flags); + __merge_ready(rt, tasks); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); +} + +static inline struct task_struct* take_ready(rt_domain_t* rt) +{ + unsigned long flags; + struct task_struct* ret; + /* first we need the write lock for rt_ready_queue */ + raw_spin_lock_irqsave(&rt->ready_lock, flags); + ret = __take_ready(rt); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); + return 
ret; +} + + +static inline void add_release(rt_domain_t* rt, struct task_struct *task) +{ + unsigned long flags; + raw_spin_lock_irqsave(&rt->tobe_lock, flags); + __add_release(rt, task); + raw_spin_unlock_irqrestore(&rt->tobe_lock, flags); +} + +#ifdef CONFIG_RELEASE_MASTER +void __add_release_on(rt_domain_t* rt, struct task_struct *task, + int target_cpu); + +static inline void add_release_on(rt_domain_t* rt, + struct task_struct *task, + int target_cpu) +{ + unsigned long flags; + raw_spin_lock_irqsave(&rt->tobe_lock, flags); + __add_release_on(rt, task, target_cpu); + raw_spin_unlock_irqrestore(&rt->tobe_lock, flags); +} +#endif + +static inline int __jobs_pending(rt_domain_t* rt) +{ + return !bheap_empty(&rt->ready_queue); +} + +static inline int jobs_pending(rt_domain_t* rt) +{ + unsigned long flags; + int ret; + /* first we need the write lock for rt_ready_queue */ + raw_spin_lock_irqsave(&rt->ready_lock, flags); + ret = !bheap_empty(&rt->ready_queue); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); + return ret; +} + +#endif diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h new file mode 100644 index 000000000000..a23ce1524051 --- /dev/null +++ b/include/litmus/rt_param.h @@ -0,0 +1,228 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_RT_PARAM_H_ +#define _LINUX_RT_PARAM_H_ + +/* Litmus time type. */ +typedef unsigned long long lt_t; + +static inline int lt_after(lt_t a, lt_t b) +{ + return ((long long) b) - ((long long) a) < 0; +} +#define lt_before(a, b) lt_after(b, a) + +static inline int lt_after_eq(lt_t a, lt_t b) +{ + return ((long long) a) - ((long long) b) >= 0; +} +#define lt_before_eq(a, b) lt_after_eq(b, a) + +/* different types of clients */ +typedef enum { + RT_CLASS_HARD, + RT_CLASS_SOFT, + RT_CLASS_BEST_EFFORT +} task_class_t; + +typedef enum { + NO_ENFORCEMENT, /* job may overrun unhindered */ + QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */ + PRECISE_ENFORCEMENT /* NOT IMPLEMENTED - enforced with hrtimers */ +} budget_policy_t; + +#define LITMUS_MAX_PRIORITY 512 + +struct rt_task { + lt_t exec_cost; + lt_t period; + lt_t phase; + unsigned int cpu; + unsigned int priority; + task_class_t cls; + budget_policy_t budget_policy; /* ignored by pfair */ +}; + +union np_flag { + uint32_t raw; + struct { + /* Is the task currently in a non-preemptive section? */ + uint32_t flag:31; + /* Should the task call into the scheduler? */ + uint32_t preempt:1; + } np; +}; + +/* The definition of the data that is shared between the kernel and real-time + * tasks via a shared page (see litmus/ctrldev.c). + * + * WARNING: User space can write to this, so don't trust + * the correctness of the fields! + * + * This servees two purposes: to enable efficient signaling + * of non-preemptive sections (user->kernel) and + * delayed preemptions (kernel->user), and to export + * some real-time relevant statistics such as preemption and + * migration data to user space. We can't use a device to export + * statistics because we want to avoid system call overhead when + * determining preemption/migration overheads). 
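+ *
+ * Presumed interplay of the np bits (cf. is_user_np() and
+ * request_exit_np() in litmus.h): user space sets sched.np.flag while it
+ * is inside a non-preemptive section; if the kernel wants to preempt the
+ * task in the meantime it sets sched.np.preempt, and user space checks
+ * that bit when leaving the section to decide whether it must call into
+ * the kernel right away.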
+ */ +struct control_page { + volatile union np_flag sched; + + /* locking overhead tracing: time stamp prior to system call */ + uint64_t ts_syscall_start; /* Feather-Trace cycles */ + + /* to be extended */ +}; + +/* don't export internal data structures to user space (liblitmus) */ +#ifdef __KERNEL__ + +struct _rt_domain; +struct bheap_node; +struct release_heap; + +struct rt_job { + /* Time instant the the job was or will be released. */ + lt_t release; + /* What is the current deadline? */ + lt_t deadline; + + /* How much service has this job received so far? */ + lt_t exec_time; + + /* Which job is this. This is used to let user space + * specify which job to wait for, which is important if jobs + * overrun. If we just call sys_sleep_next_period() then we + * will unintentionally miss jobs after an overrun. + * + * Increase this sequence number when a job is released. + */ + unsigned int job_no; +}; + +struct pfair_param; + +/* RT task parameters for scheduling extensions + * These parameters are inherited during clone and therefore must + * be explicitly set up before the task set is launched. + */ +struct rt_param { + /* is the task sleeping? */ + unsigned int flags:8; + + /* do we need to check for srp blocking? */ + unsigned int srp_non_recurse:1; + + /* is the task present? (true if it can be scheduled) */ + unsigned int present:1; + +#ifdef CONFIG_LITMUS_LOCKING + /* Is the task being priority-boosted by a locking protocol? */ + unsigned int priority_boosted:1; + /* If so, when did this start? */ + lt_t boost_start_time; +#endif + + /* user controlled parameters */ + struct rt_task task_params; + + /* timing parameters */ + struct rt_job job_params; + + /* task representing the current "inherited" task + * priority, assigned by inherit_priority and + * return priority in the scheduler plugins. + * could point to self if PI does not result in + * an increased task priority. + */ + struct task_struct* inh_task; + +#ifdef CONFIG_NP_SECTION + /* For the FMLP under PSN-EDF, it is required to make the task + * non-preemptive from kernel space. In order not to interfere with + * user space, this counter indicates the kernel space np setting. + * kernel_np > 0 => task is non-preemptive + */ + unsigned int kernel_np; +#endif + + /* This field can be used by plugins to store where the task + * is currently scheduled. It is the responsibility of the + * plugin to avoid race conditions. + * + * This used by GSN-EDF and PFAIR. + */ + volatile int scheduled_on; + + /* Is the stack of the task currently in use? This is updated by + * the LITMUS core. + * + * Be careful to avoid deadlocks! + */ + volatile int stack_in_use; + + /* This field can be used by plugins to store where the task + * is currently linked. It is the responsibility of the plugin + * to avoid race conditions. + * + * Used by GSN-EDF. + */ + volatile int linked_on; + + /* PFAIR/PD^2 state. Allocated on demand. */ + struct pfair_param* pfair; + + /* Fields saved before BE->RT transition. + */ + int old_policy; + int old_prio; + + /* ready queue for this task */ + struct _rt_domain* domain; + + /* heap element for this task + * + * Warning: Don't statically allocate this node. The heap + * implementation swaps these between tasks, thus after + * dequeuing from a heap you may end up with a different node + * then the one you had when enqueuing the task. For the same + * reason, don't obtain and store references to this node + * other than this pointer (which is updated by the heap + * implementation). 
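+ * (This is also why bheap_node_init() takes a struct bheap_node** and
+ * why struct bheap_node carries a ref back-pointer: the heap code keeps
+ * this field up to date as nodes are swapped.)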
+ */ + struct bheap_node* heap_node; + struct release_heap* rel_heap; + +#ifdef CONFIG_LITMUS_LOCKING + /* task in heap of pending jobs -- used by C-EDF for priority donation */ + struct bheap_node* pending_node; + /* is the job in a critical section or a wait queue?*/ + unsigned int request_incomplete; + /* is the job currently a donor? */ + unsigned int is_donor; + /* is this job suspended, waiting to become eligible? */ + unsigned int waiting_eligible; + + int pending_on; +#endif + + /* Used by rt_domain to queue task in release list. + */ + struct list_head list; + + /* Pointer to the page shared between userspace and kernel. */ + struct control_page * ctrl_page; +}; + +/* Possible RT flags */ +#define RT_F_RUNNING 0x00000000 +#define RT_F_SLEEP 0x00000001 +#define RT_F_EXIT_SEM 0x00000008 + +#endif + +#endif diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h new file mode 100644 index 000000000000..b5d1ae7bc3b6 --- /dev/null +++ b/include/litmus/sched_plugin.h @@ -0,0 +1,117 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_SCHED_PLUGIN_H_ +#define _LINUX_SCHED_PLUGIN_H_ + +#include + +#ifdef CONFIG_LITMUS_LOCKING +#include +#endif + +/************************ setup/tear down ********************/ + +typedef long (*activate_plugin_t) (void); +typedef long (*deactivate_plugin_t) (void); + + + +/********************* scheduler invocation ******************/ + +/* Plugin-specific realtime tick handler */ +typedef void (*scheduler_tick_t) (struct task_struct *cur); +/* Novell make sched decision function */ +typedef struct task_struct* (*schedule_t)(struct task_struct * prev); +/* Clean up after the task switch has occured. + * This function is called after every (even non-rt) task switch. + */ +typedef void (*finish_switch_t)(struct task_struct *prev); + + +/********************* task state changes ********************/ + +/* Called to setup a new real-time task. + * Release the first job, enqueue, etc. + * Task may already be running. + */ +typedef void (*task_new_t) (struct task_struct *task, + int on_rq, + int running); + +/* Called to re-introduce a task after blocking. + * Can potentially be called multiple times. + */ +typedef void (*task_wake_up_t) (struct task_struct *task); +/* called to notify the plugin of a blocking real-time task + * it will only be called for real-time tasks and before schedule is called */ +typedef void (*task_block_t) (struct task_struct *task); +/* Called when a real-time task exits or changes to a different scheduling + * class. + * Free any allocated resources + */ +typedef void (*task_exit_t) (struct task_struct *); + +/* called early before the caller holds the runqueue lock */ +typedef void (*pre_setsched_t) (struct task_struct *, int policy); + + +/* Called when the current task attempts to create a new lock of a given + * protocol type. 
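+ * The type argument presumably corresponds to the obj_type_t constants
+ * in litmus/fdso.h (FMLP_SEM, SRP_SEM, MPCP_SEM, ...), and config is an
+ * uninterpreted user-space pointer whose meaning is protocol-specific.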
*/ +typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type, + void* __user config); + + +/********************* sys call backends ********************/ +/* This function causes the caller to sleep until the next release */ +typedef long (*complete_job_t) (void); + +typedef long (*admit_task_t)(struct task_struct* tsk); + +typedef void (*release_at_t)(struct task_struct *t, lt_t start); + +struct sched_plugin { + struct list_head list; + /* basic info */ + char *plugin_name; + + /* setup */ + activate_plugin_t activate_plugin; + deactivate_plugin_t deactivate_plugin; + + /* scheduler invocation */ + scheduler_tick_t tick; + schedule_t schedule; + finish_switch_t finish_switch; + + /* syscall backend */ + complete_job_t complete_job; + release_at_t release_at; + + /* task state changes */ + admit_task_t admit_task; + + task_new_t task_new; + task_wake_up_t task_wake_up; + task_block_t task_block; + task_exit_t task_exit; + + pre_setsched_t pre_setsched; + +#ifdef CONFIG_LITMUS_LOCKING + /* locking protocols */ + allocate_lock_t allocate_lock; +#endif +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + + +extern struct sched_plugin *litmus; + +int register_sched_plugin(struct sched_plugin* plugin); +struct sched_plugin* find_sched_plugin(const char* name); +int print_sched_plugins(char* buf, int max); + +extern struct sched_plugin linux_sched_plugin; + +#endif diff --git a/include/litmus/sched_plugin.h.rej b/include/litmus/sched_plugin.h.rej new file mode 100644 index 000000000000..47e0c27c5061 --- /dev/null +++ b/include/litmus/sched_plugin.h.rej @@ -0,0 +1,22 @@ +--- include/litmus/sched_plugin.h ++++ include/litmus/sched_plugin.h +@@ -53,6 +53,10 @@ + */ + typedef void (*task_exit_t) (struct task_struct *); + ++/* called early before the caller holds the runqueue lock */ ++typedef void (*pre_setsched_t) (struct task_struct *, int policy); ++ ++ + /* Called when the current task attempts to create a new lock of a given + * protocol type. */ + typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type, +@@ -93,6 +97,8 @@ + task_block_t task_block; + task_exit_t task_exit; + ++ pre_setsched_t pre_setsched; ++ + #ifdef CONFIG_LITMUS_LOCKING + /* locking protocols */ + allocate_lock_t allocate_lock; diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h new file mode 100644 index 000000000000..7ca34cb13881 --- /dev/null +++ b/include/litmus/sched_trace.h @@ -0,0 +1,200 @@ +/* + * sched_trace.h -- record scheduler events to a byte stream for offline analysis. + */ +#ifndef _LINUX_SCHED_TRACE_H_ +#define _LINUX_SCHED_TRACE_H_ + +/* all times in nanoseconds */ + +struct st_trace_header { + u8 type; /* Of what type is this record? */ + u8 cpu; /* On which CPU was it recorded? */ + u16 pid; /* PID of the task. */ + u32 job; /* The job sequence number. */ +}; + +#define ST_NAME_LEN 16 +struct st_name_data { + char cmd[ST_NAME_LEN];/* The name of the executable of this process. */ +}; + +struct st_param_data { /* regular params */ + u32 wcet; + u32 period; + u32 phase; + u8 partition; + u8 class; + u8 __unused[2]; +}; + +struct st_release_data { /* A job is was/is going to be released. */ + u64 release; /* What's the release time? */ + u64 deadline; /* By when must it finish? */ +}; + +struct st_assigned_data { /* A job was asigned to a CPU. */ + u64 when; + u8 target; /* Where should it execute? */ + u8 __unused[7]; +}; + +struct st_switch_to_data { /* A process was switched to on a given CPU. */ + u64 when; /* When did this occur? 
*/ + u32 exec_time; /* Time the current job has executed. */ + u8 __unused[4]; + +}; + +struct st_switch_away_data { /* A process was switched away from on a given CPU. */ + u64 when; + u64 exec_time; +}; + +struct st_completion_data { /* A job completed. */ + u64 when; + u8 forced:1; /* Set to 1 if job overran and kernel advanced to the + * next task automatically; set to 0 otherwise. + */ + u8 __uflags:7; + u8 __unused[7]; +}; + +struct st_block_data { /* A task blocks. */ + u64 when; + u64 __unused; +}; + +struct st_resume_data { /* A task resumes. */ + u64 when; + u64 __unused; +}; + +struct st_action_data { + u64 when; + u8 action; + u8 __unused[7]; +}; + +struct st_sys_release_data { + u64 when; + u64 release; +}; + +#define DATA(x) struct st_ ## x ## _data x; + +typedef enum { + ST_NAME = 1, /* Start at one, so that we can spot + * uninitialized records. */ + ST_PARAM, + ST_RELEASE, + ST_ASSIGNED, + ST_SWITCH_TO, + ST_SWITCH_AWAY, + ST_COMPLETION, + ST_BLOCK, + ST_RESUME, + ST_ACTION, + ST_SYS_RELEASE +} st_event_record_type_t; + +struct st_event_record { + struct st_trace_header hdr; + union { + u64 raw[2]; + + DATA(name); + DATA(param); + DATA(release); + DATA(assigned); + DATA(switch_to); + DATA(switch_away); + DATA(completion); + DATA(block); + DATA(resume); + DATA(action); + DATA(sys_release); + } data; +}; + +#undef DATA + +#ifdef __KERNEL__ + +#include +#include + +#ifdef CONFIG_SCHED_TASK_TRACE + +#define SCHED_TRACE(id, callback, task) \ + ft_event1(id, callback, task) +#define SCHED_TRACE2(id, callback, task, xtra) \ + ft_event2(id, callback, task, xtra) + +/* provide prototypes; needed on sparc64 */ +#ifndef NO_TASK_TRACE_DECLS +feather_callback void do_sched_trace_task_name(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_param(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_release(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_switch_to(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_switch_away(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_completion(unsigned long id, + struct task_struct* task, + unsigned long forced); +feather_callback void do_sched_trace_task_block(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_resume(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_action(unsigned long id, + struct task_struct* task, + unsigned long action); +feather_callback void do_sched_trace_sys_release(unsigned long id, + lt_t* start); + +#endif + +#else + +#define SCHED_TRACE(id, callback, task) /* no tracing */ +#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */ + +#endif + + +#define SCHED_TRACE_BASE_ID 500 + + +#define sched_trace_task_name(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, do_sched_trace_task_name, t) +#define sched_trace_task_param(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, do_sched_trace_task_param, t) +#define sched_trace_task_release(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, do_sched_trace_task_release, t) +#define sched_trace_task_switch_to(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, do_sched_trace_task_switch_to, t) +#define sched_trace_task_switch_away(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, do_sched_trace_task_switch_away, t) +#define sched_trace_task_completion(t, forced) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, do_sched_trace_task_completion, t, 
\ + (unsigned long) forced) +#define sched_trace_task_block(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, do_sched_trace_task_block, t) +#define sched_trace_task_resume(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, do_sched_trace_task_resume, t) +#define sched_trace_action(t, action) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, do_sched_trace_action, t, \ + (unsigned long) action); +/* when is a pointer, it does not need an explicit cast to unsigned long */ +#define sched_trace_sys_release(when) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, do_sched_trace_sys_release, when) + + +#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */ + +#endif /* __KERNEL__ */ + +#endif diff --git a/include/litmus/srp.h b/include/litmus/srp.h new file mode 100644 index 000000000000..c9a4552b2bf3 --- /dev/null +++ b/include/litmus/srp.h @@ -0,0 +1,28 @@ +#ifndef LITMUS_SRP_H +#define LITMUS_SRP_H + +struct srp_semaphore; + +struct srp_priority { + struct list_head list; + unsigned int priority; + pid_t pid; +}; +#define list2prio(l) list_entry(l, struct srp_priority, list) + +/* struct for uniprocessor SRP "semaphore" */ +struct srp_semaphore { + struct litmus_lock litmus_lock; + struct srp_priority ceiling; + struct task_struct* owner; + int cpu; /* cpu associated with this "semaphore" and resource */ +}; + +/* map a task to its SRP preemption level priority */ +typedef unsigned int (*srp_prioritization_t)(struct task_struct* t); +/* Must be updated by each plugin that uses SRP.*/ +extern srp_prioritization_t get_srp_prio; + +struct srp_semaphore* allocate_srp_semaphore(void); + +#endif diff --git a/include/litmus/trace.h b/include/litmus/trace.h new file mode 100644 index 000000000000..d6829c416912 --- /dev/null +++ b/include/litmus/trace.h @@ -0,0 +1,129 @@ +#ifndef _SYS_TRACE_H_ +#define _SYS_TRACE_H_ + +#ifdef CONFIG_SCHED_OVERHEAD_TRACE + +#include +#include + + +/*********************** TIMESTAMPS ************************/ + +enum task_type_marker { + TSK_BE, + TSK_RT, + TSK_UNKNOWN +}; + +struct timestamp { + uint64_t timestamp; + uint32_t seq_no; + uint8_t cpu; + uint8_t event; + uint8_t task_type; +}; + +/* tracing callbacks */ +feather_callback void save_timestamp(unsigned long event); +feather_callback void save_timestamp_def(unsigned long event, unsigned long type); +feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr); +feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu); +feather_callback void save_task_latency(unsigned long event, unsigned long when_ptr); +feather_callback void save_timestamp_time(unsigned long event, unsigned long time_ptr); + +#define TIMESTAMP(id) ft_event0(id, save_timestamp) + +#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, (unsigned long) def) + +#define TTIMESTAMP(id, task) \ + ft_event1(id, save_timestamp_task, (unsigned long) task) + +#define CTIMESTAMP(id, cpu) \ + ft_event1(id, save_timestamp_cpu, (unsigned long) cpu) + +#define LTIMESTAMP(id, task) \ + ft_event1(id, save_task_latency, (unsigned long) task) + +#define TIMESTAMP_TIME(id, time_ptr) \ + ft_event1(id, save_timestamp_time, (unsigned long) time_ptr) + +#define TIMESTAMP_PID(id) ft_event0(id, save_timestamp_pid) + +#else /* !CONFIG_SCHED_OVERHEAD_TRACE */ + +#define TIMESTAMP(id) /* no tracing */ + +#define DTIMESTAMP(id, def) /* no tracing */ + +#define TTIMESTAMP(id, task) /* no tracing */ + +#define CTIMESTAMP(id, cpu) /* no tracing */ + +#define LTIMESTAMP(id, when_ptr) /* no tracing */ + +#define TIMESTAMP_TIME(id, time_ptr) /* no 
tracing */ + +#define TIMESTAMP_PID(id) /* no tracing */ + +#endif + + +/* Convention for timestamps + * ========================= + * + * In order to process the trace files with a common tool, we use the following + * convention to measure execution times: The end time id of a code segment is + * always the next number after the start time event id. + */ + +#define __TS_SYSCALL_IN_START(p) TIMESTAMP_TIME(10, p) +#define TS_SYSCALL_IN_END TIMESTAMP_PID(11) + +#define TS_SYSCALL_OUT_START TIMESTAMP_PID(20) +#define TS_SYSCALL_OUT_END TIMESTAMP_PID(21) + +#define TS_LOCK_START TIMESTAMP_PID(30) +#define TS_LOCK_END TIMESTAMP_PID(31) + +#define TS_LOCK_SUSPEND TIMESTAMP_PID(38) +#define TS_LOCK_RESUME TIMESTAMP_PID(39) + +#define TS_UNLOCK_START TIMESTAMP_PID(40) +#define TS_UNLOCK_END TIMESTAMP_PID(41) + +#define TS_SCHED_START DTIMESTAMP(100, TSK_UNKNOWN) /* we only + * care + * about + * next */ +#define TS_SCHED_END(t) TTIMESTAMP(101, t) +#define TS_SCHED2_START(t) TTIMESTAMP(102, t) +#define TS_SCHED2_END(t) TTIMESTAMP(103, t) + +#define TS_CXS_START(t) TTIMESTAMP(104, t) +#define TS_CXS_END(t) TTIMESTAMP(105, t) + +#define TS_RELEASE_START DTIMESTAMP(106, TSK_RT) +#define TS_RELEASE_END DTIMESTAMP(107, TSK_RT) + +#define TS_TICK_START(t) TTIMESTAMP(110, t) +#define TS_TICK_END(t) TTIMESTAMP(111, t) + + +#define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */ +#define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */ + +#define TS_PLUGIN_TICK_START /* TIMESTAMP(130) */ +#define TS_PLUGIN_TICK_END /* TIMESTAMP(131) */ + +#define TS_ENTER_NP_START TIMESTAMP(140) +#define TS_ENTER_NP_END TIMESTAMP(141) + +#define TS_EXIT_NP_START TIMESTAMP(150) +#define TS_EXIT_NP_END TIMESTAMP(151) + +#define TS_SEND_RESCHED_START(c) CTIMESTAMP(190, c) +#define TS_SEND_RESCHED_END DTIMESTAMP(191, TSK_UNKNOWN) + +#define TS_RELEASE_LATENCY(when) LTIMESTAMP(208, &(when)) + +#endif /* !_SYS_TRACE_H_ */ diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h new file mode 100644 index 000000000000..94264c27d9ac --- /dev/null +++ b/include/litmus/unistd_32.h @@ -0,0 +1,21 @@ +/* + * included from arch/x86/include/asm/unistd_32.h + * + * LITMUS^RT syscalls with "relative" numbers + */ +#define __LSC(x) (__NR_LITMUS + x) + +#define __NR_set_rt_task_param __LSC(0) +#define __NR_get_rt_task_param __LSC(1) +#define __NR_complete_job __LSC(2) +#define __NR_od_open __LSC(3) +#define __NR_od_close __LSC(4) +#define __NR_litmus_lock __LSC(5) +#define __NR_litmus_unlock __LSC(6) +#define __NR_query_job_no __LSC(7) +#define __NR_wait_for_job_release __LSC(8) +#define __NR_wait_for_ts_release __LSC(9) +#define __NR_release_ts __LSC(10) +#define __NR_null_call __LSC(11) + +#define NR_litmus_syscalls 12 diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h new file mode 100644 index 000000000000..d5ced0d2642c --- /dev/null +++ b/include/litmus/unistd_64.h @@ -0,0 +1,33 @@ +/* + * included from arch/x86/include/asm/unistd_64.h + * + * LITMUS^RT syscalls with "relative" numbers + */ +#define __LSC(x) (__NR_LITMUS + x) + +#define __NR_set_rt_task_param __LSC(0) +__SYSCALL(__NR_set_rt_task_param, sys_set_rt_task_param) +#define __NR_get_rt_task_param __LSC(1) +__SYSCALL(__NR_get_rt_task_param, sys_get_rt_task_param) +#define __NR_complete_job __LSC(2) +__SYSCALL(__NR_complete_job, sys_complete_job) +#define __NR_od_open __LSC(3) +__SYSCALL(__NR_od_open, sys_od_open) +#define __NR_od_close __LSC(4) +__SYSCALL(__NR_od_close, sys_od_close) +#define __NR_litmus_lock __LSC(5) 
+__SYSCALL(__NR_litmus_lock, sys_litmus_lock) +#define __NR_litmus_unlock __LSC(6) +__SYSCALL(__NR_litmus_unlock, sys_litmus_unlock) +#define __NR_query_job_no __LSC(7) +__SYSCALL(__NR_query_job_no, sys_query_job_no) +#define __NR_wait_for_job_release __LSC(8) +__SYSCALL(__NR_wait_for_job_release, sys_wait_for_job_release) +#define __NR_wait_for_ts_release __LSC(9) +__SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release) +#define __NR_release_ts __LSC(10) +__SYSCALL(__NR_release_ts, sys_release_ts) +#define __NR_null_call __LSC(11) +__SYSCALL(__NR_null_call, sys_null_call) + +#define NR_litmus_syscalls 12 diff --git a/include/litmus/wait.h b/include/litmus/wait.h new file mode 100644 index 000000000000..ce1347c355f8 --- /dev/null +++ b/include/litmus/wait.h @@ -0,0 +1,57 @@ +#ifndef _LITMUS_WAIT_H_ +#define _LITMUS_WAIT_H_ + +struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq); + +/* wrap regular wait_queue_t head */ +struct __prio_wait_queue { + wait_queue_t wq; + + /* some priority point */ + lt_t priority; + /* break ties in priority by lower tie_breaker */ + unsigned int tie_breaker; +}; + +typedef struct __prio_wait_queue prio_wait_queue_t; + +static inline void init_prio_waitqueue_entry(prio_wait_queue_t *pwq, + struct task_struct* t, + lt_t priority) +{ + init_waitqueue_entry(&pwq->wq, t); + pwq->priority = priority; + pwq->tie_breaker = 0; +} + +static inline void init_prio_waitqueue_entry_tie(prio_wait_queue_t *pwq, + struct task_struct* t, + lt_t priority, + unsigned int tie_breaker) +{ + init_waitqueue_entry(&pwq->wq, t); + pwq->priority = priority; + pwq->tie_breaker = tie_breaker; +} + +unsigned int __add_wait_queue_prio_exclusive( + wait_queue_head_t* head, + prio_wait_queue_t *new); + +static inline unsigned int add_wait_queue_prio_exclusive( + wait_queue_head_t* head, + prio_wait_queue_t *new) +{ + unsigned long flags; + unsigned int passed; + + spin_lock_irqsave(&head->lock, flags); + passed = __add_wait_queue_prio_exclusive(head, new); + + spin_unlock_irqrestore(&head->lock, flags); + + return passed; +} + + +#endif diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..b9d3bc6c21ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,6 +56,8 @@ #include #include +extern void exit_od_table(struct task_struct *t); + static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) @@ -960,6 +962,8 @@ NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + exit_od_table(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408d..ab7f29d906c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,9 @@ #include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -183,6 +186,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + exit_litmus(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -266,6 +270,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack = ti; + /* Don't let the new task be a real-time task. 
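+ * (litmus_fork() presumably clears any real-time state inherited from a
+ * real-time parent so that the child starts out as an ordinary task.)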
*/ + litmus_fork(tsk); + err = prop_local_init_single(&tsk->dirties); if (err) goto out; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6cf..cb49883b64e5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -46,6 +46,8 @@ #include #include +#include + #include #include @@ -1042,6 +1044,98 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) } EXPORT_SYMBOL_GPL(hrtimer_start); +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS + +/** + * hrtimer_start_on_info_init - Initialize hrtimer_start_on_info + */ +void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info) +{ + memset(info, 0, sizeof(struct hrtimer_start_on_info)); + atomic_set(&info->state, HRTIMER_START_ON_INACTIVE); +} + +/** + * hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu + */ +void hrtimer_pull(void) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_start_on_info *info; + struct list_head *pos, *safe, list; + + raw_spin_lock(&base->lock); + list_replace_init(&base->to_pull, &list); + raw_spin_unlock(&base->lock); + + list_for_each_safe(pos, safe, &list) { + info = list_entry(pos, struct hrtimer_start_on_info, list); + TRACE("pulled timer 0x%x\n", info->timer); + list_del(pos); + hrtimer_start(info->timer, info->time, info->mode); + } +} + +/** + * hrtimer_start_on - trigger timer arming on remote cpu + * @cpu: remote cpu + * @info: save timer information for enqueuing on remote cpu + * @timer: timer to be pulled + * @time: expire time + * @mode: timer mode + */ +int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info, + struct hrtimer *timer, ktime_t time, + const enum hrtimer_mode mode) +{ + unsigned long flags; + struct hrtimer_cpu_base* base; + int in_use = 0, was_empty; + + /* serialize access to info through the timer base */ + lock_hrtimer_base(timer, &flags); + + in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE); + if (!in_use) { + INIT_LIST_HEAD(&info->list); + info->timer = timer; + info->time = time; + info->mode = mode; + /* mark as in use */ + atomic_set(&info->state, HRTIMER_START_ON_QUEUED); + } + + unlock_hrtimer_base(timer, &flags); + + if (!in_use) { + /* initiate pull */ + preempt_disable(); + if (cpu == smp_processor_id()) { + /* start timer locally; we may get called + * with rq->lock held, do not wake up anything + */ + TRACE("hrtimer_start_on: starting on local CPU\n"); + __hrtimer_start_range_ns(info->timer, info->time, + 0, info->mode, 0); + } else { + TRACE("hrtimer_start_on: pulling to remote CPU\n"); + base = &per_cpu(hrtimer_bases, cpu); + raw_spin_lock_irqsave(&base->lock, flags); + was_empty = list_empty(&base->to_pull); + list_add(&info->list, &base->to_pull); + raw_spin_unlock_irqrestore(&base->lock, flags); + if (was_empty) + /* only send IPI if other no else + * has done so already + */ + smp_send_pull_timers(cpu); + } + preempt_enable(); + } + return in_use; +} + +#endif /** * hrtimer_try_to_cancel - try to deactivate a timer @@ -1634,6 +1728,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) cpu_base->clock_base[i].cpu_base = cpu_base; hrtimer_init_hres(cpu_base); + INIT_LIST_HEAD(&cpu_base->to_pull); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..9dc8ea140426 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -73,6 +73,13 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +/* + * divert printk() messages when there is a LITMUS^RT debug listener + */ +#include +int trace_override = 0; 
+int trace_recurse = 0; + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -735,6 +742,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); + /* if LITMUS^RT tracer is active divert printk() msgs */ + if (trace_override && !trace_recurse) + TRACE("%s", printk_buf); p = printk_buf; @@ -804,7 +814,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Try to acquire and then immediately release the * console semaphore. The release will do all the * actual magic (print out buffers, wake up klogd, - * etc). + * etc). * * The acquire_console_semaphore_for_printk() function * will release 'logbuf_lock' regardless of whether it @@ -1067,7 +1077,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { - if (waitqueue_active(&log_wait)) + if (!trace_override && waitqueue_active(&log_wait)) __raw_get_cpu_var(printk_pending) = 1; } diff --git a/kernel/sched.c b/kernel/sched.c index dc85ceb90832..1f5327f8c012 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,6 +79,11 @@ #include "sched_cpupri.h" #include "workqueue_sched.h" +#include +#include + +static void litmus_tick(struct rq*, struct task_struct*); + #define CREATE_TRACE_POINTS #include @@ -405,6 +410,12 @@ struct rt_rq { #endif }; +/* Litmus related fields in a runqueue */ +struct litmus_rq { + unsigned long nr_running; + struct task_struct *prev; +}; + #ifdef CONFIG_SMP /* @@ -471,6 +482,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; + struct litmus_rq litmus; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -566,8 +578,14 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ + /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36; + * the scheduler can "forget" to renable the runqueue clock in some + * cases. LITMUS^RT amplifies the effects of this problem. Hence, we + * turn it off to avoid stalling clocks. 
*/ + /* if (test_tsk_need_resched(p)) rq->skip_clock_update = 1; + */ } static inline int cpu_of(struct rq *rq) @@ -1042,6 +1060,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) raw_spin_lock(&rq->lock); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); + litmus_tick(rq, rq->curr); raw_spin_unlock(&rq->lock); return HRTIMER_NORESTART; @@ -1840,7 +1859,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&litmus_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1920,6 +1939,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "../litmus/sched_litmus.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif @@ -2352,6 +2372,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, unsigned long en_flags = ENQUEUE_WAKEUP; struct rq *rq; + if (is_realtime(p)) + TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); + this_cpu = get_cpu(); smp_wmb(); @@ -2366,7 +2389,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, orig_cpu = cpu; #ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) + if (unlikely(task_running(rq, p)) || is_realtime(p)) goto out_activate; /* @@ -2428,6 +2451,8 @@ out_activate: out_running: ttwu_post_activation(p, rq, wake_flags, success); out: + if (is_realtime(p)) + TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); task_rq_unlock(rq, &flags); put_cpu(); @@ -2532,7 +2557,8 @@ void sched_fork(struct task_struct *p, int clone_flags) * Revert to default priority/policy on fork if requested. 
*/ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || + p->policy == SCHED_LITMUS) { p->policy = SCHED_NORMAL; p->normal_prio = p->static_prio; } @@ -2748,6 +2774,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + litmus->finish_switch(prev); + prev->rt_param.stack_in_use = NO_CPU; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ @@ -2777,6 +2805,15 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev) { if (prev->sched_class->pre_schedule) prev->sched_class->pre_schedule(rq, prev); + + /* LITMUS^RT: not a very clean hack. We need to save the prev task + * as our scheduling decisions rely on it (as we drop the rq lock + * something in prev can change...); there is no way to escape + * this hack apart from modifying pick_next_task(rq, _prev_) or + * falling back on the previous solution of decoupling + * scheduling decisions + */ + rq->litmus.prev = prev; } /* rq->lock is NOT held, but preemption is disabled */ @@ -3578,18 +3615,26 @@ void scheduler_tick(void) sched_clock_tick(); + TS_TICK_START(current); + raw_spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); + + /* litmus_tick may force current to resched */ + litmus_tick(rq, curr); + raw_spin_unlock(&rq->lock); perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + if (!is_realtime(current)) + trigger_load_balance(rq, cpu); #endif + TS_TICK_END(current); } notrace unsigned long get_parent_ip(unsigned long addr) @@ -3716,12 +3761,20 @@ pick_next_task(struct rq *rq) /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: - */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { + + * NOT IN LITMUS^RT! + + * This breaks many assumptions in the plugins. + * Do not uncomment without thinking long and hard + * about how this affects global plugins such as GSN-EDF.
+ + if (rq->nr_running == rq->cfs.nr_running) { + TRACE("taking shortcut in pick_next_task()\n"); p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } + */ class = sched_class_highest; for ( ; ; ) { @@ -3748,6 +3801,7 @@ asmlinkage void __sched schedule(void) need_resched: preempt_disable(); + sched_state_entered_schedule(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_note_context_switch(cpu); @@ -3755,6 +3809,8 @@ need_resched: release_kernel_lock(prev); need_resched_nonpreemptible: + TS_SCHED_START; + sched_trace_task_switch_away(prev); schedule_debug(prev); @@ -3803,7 +3859,10 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; + TS_SCHED_END(next); + TS_CXS_START(next); context_switch(rq, prev, next); /* unlocks the rq */ + TS_CXS_END(current); /* * The context switch have flipped the stack from under us * and restored the local variables which were saved when @@ -3812,17 +3871,23 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { + TS_SCHED_END(prev); raw_spin_unlock_irq(&rq->lock); + } + + sched_trace_task_switch_to(current); post_schedule(rq); - if (unlikely(reacquire_kernel_lock(prev))) + if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; preempt_enable_no_resched(); if (need_resched()) goto need_resched; + + srp_ceiling_block(); } EXPORT_SYMBOL(schedule); @@ -4108,6 +4173,17 @@ void complete_all(struct completion *x) } EXPORT_SYMBOL(complete_all); +void complete_n(struct completion *x, int n) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += n; + __wake_up_common(&x->wait, TASK_NORMAL, n, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_n); + static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { @@ -4550,7 +4626,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + if (p->policy == SCHED_LITMUS) + p->sched_class = &litmus_sched_class; + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; @@ -4595,7 +4673,7 @@ recheck: if (policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + policy != SCHED_IDLE && policy != SCHED_LITMUS) return -EINVAL; } @@ -4610,6 +4688,8 @@ recheck: return -EINVAL; if (rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; + if (policy == SCHED_LITMUS && policy == p->policy) + return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: @@ -4650,6 +4730,12 @@ recheck: return retval; } + if (policy == SCHED_LITMUS) { + retval = litmus_admit_task(p); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4692,10 +4778,19 @@ recheck: p->sched_reset_on_fork = reset_on_fork; + if (p->policy == SCHED_LITMUS) + litmus_exit_task(p); + oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); + if (policy == SCHED_LITMUS) { + p->rt_param.stack_in_use = running ? 
rq->cpu : NO_CPU; + p->rt_param.present = running; + litmus->task_new(p, on_rq, running); + } + if (running) p->sched_class->set_curr_task(rq); if (on_rq) { @@ -4755,6 +4850,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; +#ifdef CONFIG_LITMUS_LOCKING + /* Hack to allow plugin to call into schedule + * prio to a setscheduler() call. */ + if (is_realtime(current)) + litmus->pre_setsched(current, policy); +#endif + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -4865,10 +4967,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); p = find_process_by_pid(pid); - if (!p) { + /* Don't set affinity if task not found and for LITMUS tasks */ + if (!p || is_realtime(p)) { rcu_read_unlock(); put_online_cpus(); - return -ESRCH; + return p ? -EPERM : -ESRCH; } /* Prevent p going away */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index db3f674ca49d..e0e8d5ca3c98 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1654,7 +1654,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; - if (unlikely(rt_prio(p->prio))) + if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d10c80ebb67a..e40e7fe43170 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1013,7 +1013,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) { resched_task(rq->curr); return; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd1..bb2d8b7850a3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -767,6 +767,46 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +/** + * tick_set_quanta_type - get the quanta type as a boot option + * Default is standard setup with ticks staggered over first + * half of tick period. 
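 *
 * Worked example (annotation, not part of the original comment): with a
 * 1 ms tick period and four CPUs, "quanta=aligned" gives every CPU an
 * offset of 0, "quanta=staggered" gives offsets of 0, 250, 500 and 750 us,
 * and the default spreads ticks over the first half of the period, i.e.,
 * offsets of 0, 125, 250 and 375 us (see cpu_stagger_offset() below).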
+ */ +int quanta_type = LINUX_DEFAULT_TICKS; +static int __init tick_set_quanta_type(char *str) +{ + if (strcmp("aligned", str) == 0) { + quanta_type = LITMUS_ALIGNED_TICKS; + printk(KERN_INFO "LITMUS^RT: setting aligned quanta\n"); + } + else if (strcmp("staggered", str) == 0) { + quanta_type = LITMUS_STAGGERED_TICKS; + printk(KERN_INFO "LITMUS^RT: setting staggered quanta\n"); + } + return 1; +} +__setup("quanta=", tick_set_quanta_type); + +u64 cpu_stagger_offset(int cpu) +{ + u64 offset = 0; + switch (quanta_type) { + case LITMUS_ALIGNED_TICKS: + offset = 0; + break; + case LITMUS_STAGGERED_TICKS: + offset = ktime_to_ns(tick_period); + do_div(offset, num_possible_cpus()); + offset *= cpu; + break; + default: + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= cpu; + } + return offset; +} + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -774,6 +814,7 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); + u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -784,6 +825,12 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset must be set correctly to achieve desired quanta type. */ + offset = cpu_stagger_offset(smp_processor_id()); + + /* Add the correct offset to expiration time */ + hrtimer_add_expires_ns(&ts->sched_timer, offset); + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, diff --git a/litmus/Kconfig b/litmus/Kconfig new file mode 100644 index 000000000000..ad8dc8308cf0 --- /dev/null +++ b/litmus/Kconfig @@ -0,0 +1,185 @@ +menu "LITMUS^RT" + +menu "Scheduling" + +config PLUGIN_CEDF + bool "Clustered-EDF" + depends on X86 && SYSFS + default y + help + Include the Clustered EDF (C-EDF) plugin in the kernel. + This is appropriate for large platforms with shared caches. + On smaller platforms (e.g., ARM PB11MPCore), using C-EDF + makes little sense since there aren't any shared caches. + +config PLUGIN_PFAIR + bool "PFAIR" + depends on HIGH_RES_TIMERS && !NO_HZ + default y + help + Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel. + The PFAIR plugin requires high resolution timers (for staggered quanta) + and does not support NO_HZ (quanta could be missed when the system is idle). + + If unsure, say Yes. + +config RELEASE_MASTER + bool "Release-master Support" + depends on ARCH_HAS_SEND_PULL_TIMERS + default n + help + Allow one processor to act as a dedicated interrupt processor + that services all timer interrupts, but that does not schedule + real-time tasks. See RTSS'09 paper for details + (http://www.cs.unc.edu/~anderson/papers.html). + Currently only supported by GSN-EDF. + +endmenu + +menu "Real-Time Synchronization" + +config NP_SECTION + bool "Non-preemptive section support" + default n + help + Allow tasks to become non-preemptable. + Note that plugins still need to explicitly support non-preemptivity. + Currently, only GSN-EDF and PSN-EDF have such support. + + This is required to support locking protocols such as the FMLP. + If disabled, all tasks will be considered preemptable at all times. + +config LITMUS_LOCKING + bool "Support for real-time locking protocols" + depends on NP_SECTION + default n + help + Enable LITMUS^RT's deterministic multiprocessor real-time + locking protocols. 
+ + Say Yes if you want to include locking protocols such as the FMLP and + Baker's SRP. + +endmenu + +menu "Tracing" + +config FEATHER_TRACE + bool "Feather-Trace Infrastructure" + default y + help + Feather-Trace basic tracing infrastructure. Includes device file + driver and instrumentation point support. + + There are actually two implementations of Feather-Trace. + 1) A slower, but portable, default implementation. + 2) Architecture-specific implementations that rewrite kernel .text at runtime. + + If enabled, Feather-Trace will be based on 2) if available (currently only for x86). + However, if DEBUG_RODATA=y, then Feather-Trace will choose option 1) in any case + to avoid problems with write-protected .text pages. + + Bottom line: to avoid increased overheads, choose DEBUG_RODATA=n. + + Note that this option only enables the basic Feather-Trace infrastructure; + you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to + actually enable any events. + +config SCHED_TASK_TRACE + bool "Trace real-time tasks" + depends on FEATHER_TRACE + default y + help + Include support for the sched_trace_XXX() tracing functions. This + allows the collection of real-time task events such as job + completions, job releases, early completions, etc. This results in a + small overhead in the scheduling code. Disable if the overhead is not + acceptable (e.g., benchmarking). + + Say Yes for debugging. + Say No for overhead tracing. + +config SCHED_TASK_TRACE_SHIFT + int "Buffer size for sched_trace_xxx() events" + depends on SCHED_TASK_TRACE + range 8 13 + default 9 + help + + Select the buffer size of sched_trace_xxx() events as a power of two. + These buffers are statically allocated as per-CPU data. Each event + requires 24 bytes storage plus one additional flag byte. Too large + buffers can cause issues with the per-cpu allocator (and waste + memory). Too small buffers can cause scheduling events to be lost. The + "right" size is workload dependent and depends on the number of tasks, + each task's period, each task's number of suspensions, and how often + the buffer is flushed. + + Examples: 12 => 4k events + 10 => 1k events + 8 => 512 events + +config SCHED_OVERHEAD_TRACE + bool "Record timestamps for overhead measurements" + depends on FEATHER_TRACE + default n + help + Export event stream for overhead tracing. + Say Yes for overhead tracing. + +config SCHED_DEBUG_TRACE + bool "TRACE() debugging" + default y + help + Include support for sched_trace_log_message(), which is used to + implement TRACE(). If disabled, no TRACE() messages will be included + in the kernel, and no overheads due to debugging statements will be + incurred by the scheduler. Disable if the overhead is not acceptable + (e.g. benchmarking). + + Say Yes for debugging. + Say No for overhead tracing. + +config SCHED_DEBUG_TRACE_SHIFT + int "Buffer size for TRACE() buffer" + depends on SCHED_DEBUG_TRACE + range 14 22 + default 18 + help + + Select the amount of memory needed for the TRACE() buffer, as a + power of two. The TRACE() buffer is global and statically allocated. If + the buffer is too small, there will be holes in the TRACE() log if the + buffer-flushing task is starved. + + The default should be sufficient for most systems. Increase the buffer + size if the log contains holes. Reduce the buffer size when running on + a memory-constrained system. + + Examples: 14 => 16KB + 18 => 256KB + 20 => 1MB + + This buffer is exported to userspace using a misc device as + 'litmus/log'.
On a system with default udev rules, a corresponding + character device node should be created at /dev/litmus/log. The buffer + can be flushed using cat, e.g., 'cat /dev/litmus/log > my_log_file.txt'. + +config SCHED_DEBUG_TRACE_CALLER + bool "Include [function@file:line] tag in TRACE() log" + depends on SCHED_DEBUG_TRACE + default n + help + With this option enabled, TRACE() prepends + + "[@:]" + + to each message in the debug log. Enable this to aid in figuring out + what was called in which order. The downside is that it adds a lot of + clutter. + + If unsure, say No. + +endmenu + +endmenu diff --git a/litmus/Makefile b/litmus/Makefile new file mode 100644 index 000000000000..e86fad8c25ec --- /dev/null +++ b/litmus/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for LITMUS^RT +# + +obj-y = sched_plugin.o litmus.o \ + preempt.o \ + litmus_proc.o \ + budget.o \ + clustered.o \ + jobs.o \ + sync.o \ + rt_domain.o \ + edf_common.o \ + fp_common.o \ + fdso.o \ + locking.o \ + srp.o \ + bheap.o \ + ctrldev.o \ + sched_gsn_edf.o \ + sched_psn_edf.o \ + sched_pfp.o + +obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o +obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o + +obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o +obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o +obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o +obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o diff --git a/litmus/bheap.c b/litmus/bheap.c new file mode 100644 index 000000000000..528af97f18a6 --- /dev/null +++ b/litmus/bheap.c @@ -0,0 +1,314 @@ +#include "linux/kernel.h" +#include "litmus/bheap.h" + +void bheap_init(struct bheap* heap) +{ + heap->head = NULL; + heap->min = NULL; +} + +void bheap_node_init(struct bheap_node** _h, void* value) +{ + struct bheap_node* h = *_h; + h->parent = NULL; + h->next = NULL; + h->child = NULL; + h->degree = NOT_IN_HEAP; + h->value = value; + h->ref = _h; +} + + +/* make child a subtree of root */ +static void __bheap_link(struct bheap_node* root, + struct bheap_node* child) +{ + child->parent = root; + child->next = root->child; + root->child = child; + root->degree++; +} + +/* merge root lists */ +static struct bheap_node* __bheap_merge(struct bheap_node* a, + struct bheap_node* b) +{ + struct bheap_node* head = NULL; + struct bheap_node** pos = &head; + + while (a && b) { + if (a->degree < b->degree) { + *pos = a; + a = a->next; + } else { + *pos = b; + b = b->next; + } + pos = &(*pos)->next; + } + if (a) + *pos = a; + else + *pos = b; + return head; +} + +/* reverse a linked list of nodes. 
also clears parent pointer */ +static struct bheap_node* __bheap_reverse(struct bheap_node* h) +{ + struct bheap_node* tail = NULL; + struct bheap_node* next; + + if (!h) + return h; + + h->parent = NULL; + while (h->next) { + next = h->next; + h->next = tail; + tail = h; + h = next; + h->parent = NULL; + } + h->next = tail; + return h; +} + +static void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node** prev, struct bheap_node** node) +{ + struct bheap_node *_prev, *cur; + *prev = NULL; + + if (!heap->head) { + *node = NULL; + return; + } + + *node = heap->head; + _prev = heap->head; + cur = heap->head->next; + while (cur) { + if (higher_prio(cur, *node)) { + *node = cur; + *prev = _prev; + } + _prev = cur; + cur = cur->next; + } +} + +static void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node* h2) +{ + struct bheap_node* h1; + struct bheap_node *prev, *x, *next; + if (!h2) + return; + h1 = heap->head; + if (!h1) { + heap->head = h2; + return; + } + h1 = __bheap_merge(h1, h2); + prev = NULL; + x = h1; + next = x->next; + while (next) { + if (x->degree != next->degree || + (next->next && next->next->degree == x->degree)) { + /* nothing to do, advance */ + prev = x; + x = next; + } else if (higher_prio(x, next)) { + /* x becomes the root of next */ + x->next = next->next; + __bheap_link(x, next); + } else { + /* next becomes the root of x */ + if (prev) + prev->next = next; + else + h1 = next; + __bheap_link(next, x); + x = next; + } + next = x->next; + } + heap->head = h1; +} + +static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio, + struct bheap* heap) +{ + struct bheap_node *prev, *node; + __bheap_min(higher_prio, heap, &prev, &node); + if (!node) + return NULL; + if (prev) + prev->next = node->next; + else + heap->head = node->next; + __bheap_union(higher_prio, heap, __bheap_reverse(node->child)); + return node; +} + +/* insert (and reinitialize) a node into the heap */ +void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node* node) +{ + struct bheap_node *min; + node->child = NULL; + node->parent = NULL; + node->next = NULL; + node->degree = 0; + if (heap->min && higher_prio(node, heap->min)) { + /* swap min cache */ + min = heap->min; + min->child = NULL; + min->parent = NULL; + min->next = NULL; + min->degree = 0; + __bheap_union(higher_prio, heap, min); + heap->min = node; + } else + __bheap_union(higher_prio, heap, node); +} + +void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap) +{ + struct bheap_node* min; + if (heap->min) { + min = heap->min; + heap->min = NULL; + bheap_insert(higher_prio, heap, min); + } +} + +/* merge addition into target */ +void bheap_union(bheap_prio_t higher_prio, + struct bheap* target, struct bheap* addition) +{ + /* first insert any cached minima, if necessary */ + bheap_uncache_min(higher_prio, target); + bheap_uncache_min(higher_prio, addition); + __bheap_union(higher_prio, target, addition->head); + /* this is a destructive merge */ + addition->head = NULL; +} + +struct bheap_node* bheap_peek(bheap_prio_t higher_prio, + struct bheap* heap) +{ + if (!heap->min) + heap->min = __bheap_extract_min(higher_prio, heap); + return heap->min; +} + +struct bheap_node* bheap_take(bheap_prio_t higher_prio, + struct bheap* heap) +{ + struct bheap_node *node; + if (!heap->min) + heap->min = __bheap_extract_min(higher_prio, heap); + node = heap->min; + heap->min = NULL; + if (node) + node->degree = NOT_IN_HEAP; + return node; +} + +int 
bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node) +{ + struct bheap_node *parent; + struct bheap_node** tmp_ref; + void* tmp; + + /* bubble up */ + parent = node->parent; + while (parent && higher_prio(node, parent)) { + /* swap parent and node */ + tmp = parent->value; + parent->value = node->value; + node->value = tmp; + /* swap references */ + *(parent->ref) = node; + *(node->ref) = parent; + tmp_ref = parent->ref; + parent->ref = node->ref; + node->ref = tmp_ref; + /* step up */ + node = parent; + parent = node->parent; + } + + return parent != NULL; +} + +void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node* node) +{ + struct bheap_node *parent, *prev, *pos; + struct bheap_node** tmp_ref; + void* tmp; + + if (heap->min != node) { + /* bubble up */ + parent = node->parent; + while (parent) { + /* swap parent and node */ + tmp = parent->value; + parent->value = node->value; + node->value = tmp; + /* swap references */ + *(parent->ref) = node; + *(node->ref) = parent; + tmp_ref = parent->ref; + parent->ref = node->ref; + node->ref = tmp_ref; + /* step up */ + node = parent; + parent = node->parent; + } + /* now delete: + * first find prev */ + prev = NULL; + pos = heap->head; + while (pos != node) { + prev = pos; + pos = pos->next; + } + /* we have prev, now remove node */ + if (prev) + prev->next = node->next; + else + heap->head = node->next; + __bheap_union(higher_prio, heap, __bheap_reverse(node->child)); + } else + heap->min = NULL; + node->degree = NOT_IN_HEAP; +} + +/* allocate a heap node for value and insert into the heap */ +int bheap_add(bheap_prio_t higher_prio, struct bheap* heap, + void* value, int gfp_flags) +{ + struct bheap_node* hn = bheap_node_alloc(gfp_flags); + if (likely(hn)) { + bheap_node_init(&hn, value); + bheap_insert(higher_prio, heap, hn); + } + return hn != NULL; +} + +void* bheap_take_del(bheap_prio_t higher_prio, + struct bheap* heap) +{ + struct bheap_node* hn = bheap_take(higher_prio, heap); + void* ret = NULL; + if (hn) { + ret = hn->value; + bheap_node_free(hn); + } + return ret; +} diff --git a/litmus/budget.c b/litmus/budget.c new file mode 100644 index 000000000000..310e9a3d4172 --- /dev/null +++ b/litmus/budget.c @@ -0,0 +1,111 @@ +#include +#include +#include + +#include +#include + +struct enforcement_timer { + /* The enforcement timer is used to accurately police + * slice budgets. */ + struct hrtimer timer; + int armed; +}; + +DEFINE_PER_CPU(struct enforcement_timer, budget_timer); + +static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer) +{ + struct enforcement_timer* et = container_of(timer, + struct enforcement_timer, + timer); + unsigned long flags; + + local_irq_save(flags); + TRACE("enforcement timer fired.\n"); + et->armed = 0; + /* activate scheduler */ + litmus_reschedule_local(); + local_irq_restore(flags); + + return HRTIMER_NORESTART; +} + +/* assumes called with IRQs off */ +static void cancel_enforcement_timer(struct enforcement_timer* et) +{ + int ret; + + TRACE("cancelling enforcement timer.\n"); + + /* Since interrupts are disabled and et->armed is only + * modified locally, we do not need any locks. + */ + + if (et->armed) { + ret = hrtimer_try_to_cancel(&et->timer); + /* Should never be inactive. */ + BUG_ON(ret == 0); + /* Should never be running concurrently. 
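 * (hrtimer_try_to_cancel() returns 0 if the timer was not queued, 1 if it
 * was deactivated, and -1 if its callback is currently executing; since
 * the enforcement timer is armed, fired, and cancelled only on the local
 * CPU with interrupts off, 0 and -1 would both indicate a bug, hence the
 * BUG_ON()s below. Annotation, not part of the original comment.)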
*/ + BUG_ON(ret == -1); + + et->armed = 0; + } +} + +/* assumes called with IRQs off */ +static void arm_enforcement_timer(struct enforcement_timer* et, + struct task_struct* t) +{ + lt_t when_to_fire; + TRACE_TASK(t, "arming enforcement timer.\n"); + + /* Calling this when there is no budget left for the task + * makes no sense, unless the task is non-preemptive. */ + BUG_ON(budget_exhausted(t) && (!is_np(t))); + + /* __hrtimer_start_range_ns() cancels the timer + * anyway, so we don't have to check whether it is still armed */ + + if (likely(!is_np(t))) { + when_to_fire = litmus_clock() + budget_remaining(t); + __hrtimer_start_range_ns(&et->timer, + ns_to_ktime(when_to_fire), + 0 /* delta */, + HRTIMER_MODE_ABS_PINNED, + 0 /* no wakeup */); + et->armed = 1; + } +} + + +/* expects to be called with IRQs off */ +void update_enforcement_timer(struct task_struct* t) +{ + struct enforcement_timer* et = &__get_cpu_var(budget_timer); + + if (t && budget_precisely_enforced(t)) { + /* Make sure we call into the scheduler when this budget + * expires. */ + arm_enforcement_timer(et, t); + } else if (et->armed) { + /* Make sure we don't cause unnecessary interrupts. */ + cancel_enforcement_timer(et); + } +} + + +static int __init init_budget_enforcement(void) +{ + int cpu; + struct enforcement_timer* et; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + et = &per_cpu(budget_timer, cpu); + hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + et->timer.function = on_enforcement_timeout; + } + return 0; +} + +module_init(init_budget_enforcement); diff --git a/litmus/clustered.c b/litmus/clustered.c new file mode 100644 index 000000000000..6fe1b512f628 --- /dev/null +++ b/litmus/clustered.c @@ -0,0 +1,111 @@ +#include +#include +#include + +#include + +#ifndef CONFIG_X86 +/* fake get_shared_cpu_map() on non-x86 architectures */ + +int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index) +{ + if (index != 1) + return 1; + else { + /* Fake L1: CPU is all by itself. */ + cpumask_clear(mask); + cpumask_set_cpu(cpu, mask); + return 0; + } +} + +#endif + +int get_cluster_size(enum cache_level level) +{ + cpumask_var_t mask; + int ok; + int num_cpus; + + if (level == GLOBAL_CLUSTER) + return num_online_cpus(); + else { + if (!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + /* assumes CPU 0 is representative of all CPUs */ + ok = get_shared_cpu_map(mask, 0, level); + /* ok == 0 means we got the map; otherwise it's an invalid cache level */ + if (ok == 0) + num_cpus = cpumask_weight(mask); + free_cpumask_var(mask); + + if (ok == 0) + return num_cpus; + else + return -EINVAL; + } +} + +int assign_cpus_to_clusters(enum cache_level level, + struct scheduling_cluster* clusters[], + unsigned int num_clusters, + struct cluster_cpu* cpus[], + unsigned int num_cpus) +{ + cpumask_var_t mask; + unsigned int i, free_cluster = 0, low_cpu; + int err = 0; + + if (!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + + /* clear cluster pointers */ + for (i = 0; i < num_cpus; i++) { + cpus[i]->id = i; + cpus[i]->cluster = NULL; + } + + /* initialize clusters */ + for (i = 0; i < num_clusters; i++) { + clusters[i]->id = i; + INIT_LIST_HEAD(&clusters[i]->cpus); + } + + /* Assign each CPU. Two assumtions are made: + * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask). + * 2) All cpus that belong to some cluster are online. 
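 *
 * Example (annotation, not part of the original comment): on a
 * hypothetical eight-CPU machine with two sockets that each share an L3
 * cache, clustering at the L3 level yields two clusters: the
 * lowest-numbered CPU of each shared-cache mask (say, CPU 0 and CPU 4)
 * creates a cluster, and every other CPU joins the cluster of the
 * lowest-numbered CPU in its mask.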
+ */ + for_each_online_cpu(i) { + /* get lowest-id CPU in cluster */ + if (level != GLOBAL_CLUSTER) { + err = get_shared_cpu_map(mask, cpus[i]->id, level); + if (err != 0) { + /* ugh... wrong cache level? Either caller screwed up + * or the CPU topology is weird. */ + printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n", + level, err); + err = -EINVAL; + goto out; + } + low_cpu = cpumask_first(mask); + } else + low_cpu = 0; + if (low_cpu == i) { + /* caller must provide an appropriate number of clusters */ + BUG_ON(free_cluster >= num_clusters); + + /* create new cluster */ + cpus[i]->cluster = clusters[free_cluster++]; + } else { + /* low_cpu points to the right cluster + * Assumption: low_cpu is actually online and was processed earlier. */ + cpus[i]->cluster = cpus[low_cpu]->cluster; + } + /* enqueue in cpus list */ + list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus); + printk(KERN_INFO "Assigning CPU%u to cluster %u\n.", i, cpus[i]->cluster->id); + } +out: + free_cpumask_var(mask); + return err; +} diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c new file mode 100644 index 000000000000..6677a67cc945 --- /dev/null +++ b/litmus/ctrldev.c @@ -0,0 +1,150 @@ +#include +#include +#include +#include +#include + +#include + +/* only one page for now, but we might want to add a RO version at some point */ + +#define CTRL_NAME "litmus/ctrl" + +/* allocate t->rt_param.ctrl_page*/ +static int alloc_ctrl_page(struct task_struct *t) +{ + int err = 0; + + /* only allocate if the task doesn't have one yet */ + if (!tsk_rt(t)->ctrl_page) { + tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL); + if (!tsk_rt(t)->ctrl_page) + err = -ENOMEM; + /* will get de-allocated in task teardown */ + TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__, + tsk_rt(t)->ctrl_page); + } + return err; +} + +static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma) +{ + int err; + unsigned long pfn; + + struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page); + + /* Increase ref count. Is decreased when vma is destroyed. */ + get_page(ctrl); + + /* compute page frame number */ + pfn = page_to_pfn(ctrl); + + TRACE_CUR(CTRL_NAME + ": mapping %p (pfn:%lx, %lx) to 0x%lx (prot:%lx)\n", + tsk_rt(t)->ctrl_page, pfn, page_to_pfn(ctrl), vma->vm_start, + vma->vm_page_prot); + + /* Map it into the vma. Make sure to use PAGE_SHARED, otherwise + * userspace actually gets a copy-on-write page. */ + err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, PAGE_SHARED); + + if (err) + TRACE_CUR(CTRL_NAME ": remap_pfn_range() failed (%d)\n", err); + + return err; +} + +static void litmus_ctrl_vm_close(struct vm_area_struct* vma) +{ + TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__, + vma->vm_flags, vma->vm_page_prot); + + TRACE_CUR(CTRL_NAME + ": %p:%p vma:%p vma->vm_private_data:%p closed.\n", + (void*) vma->vm_start, (void*) vma->vm_end, vma, + vma->vm_private_data, current->comm, + current->pid); +} + +static int litmus_ctrl_vm_fault(struct vm_area_struct* vma, + struct vm_fault* vmf) +{ + /* This function should never be called, since + * all pages should have been mapped by mmap() + * already. 
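 * (Annotation, not part of the original comment: user space is expected
 * to map the page once, up front, by opening the litmus/ctrl misc device
 * and calling, e.g.,
 * mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
 * a fault here therefore means the control page was never mapped that
 * way.)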
*/ + TRACE_CUR("%s flags=0x%x\n", __FUNCTION__, vma->vm_flags); + + /* nope, you only get one page */ + return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct litmus_ctrl_vm_ops = { + .close = litmus_ctrl_vm_close, + .fault = litmus_ctrl_vm_fault, +}; + +static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma) +{ + int err = 0; + + /* first make sure mapper knows what he's doing */ + + /* you can only get one page */ + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + /* you can only map the "first" page */ + if (vma->vm_pgoff != 0) + return -EINVAL; + + /* you can't share it with anyone */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &litmus_ctrl_vm_ops; + /* this mapping should not be kept across forks, + * and cannot be expanded */ + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + err = alloc_ctrl_page(current); + if (!err) + err = map_ctrl_page(current, vma); + + TRACE_CUR("%s flags=0x%x prot=0x%lx\n", + __FUNCTION__, vma->vm_flags, vma->vm_page_prot); + + return err; +} + +static struct file_operations litmus_ctrl_fops = { + .owner = THIS_MODULE, + .mmap = litmus_ctrl_mmap, +}; + +static struct miscdevice litmus_ctrl_dev = { + .name = CTRL_NAME, + .minor = MISC_DYNAMIC_MINOR, + .fops = &litmus_ctrl_fops, +}; + +static int __init init_litmus_ctrl_dev(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE); + + printk("Initializing LITMUS^RT control device.\n"); + err = misc_register(&litmus_ctrl_dev); + if (err) + printk("Could not allocate %s device (%d).\n", CTRL_NAME, err); + return err; +} + +static void __exit exit_litmus_ctrl_dev(void) +{ + misc_deregister(&litmus_ctrl_dev); +} + +module_init(init_litmus_ctrl_dev); +module_exit(exit_litmus_ctrl_dev); diff --git a/litmus/edf_common.c b/litmus/edf_common.c new file mode 100644 index 000000000000..c7d02ec2e15b --- /dev/null +++ b/litmus/edf_common.c @@ -0,0 +1,143 @@ +/* + * kernel/edf_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + + +#ifdef CONFIG_LITMUS_LOCKING +int edf_higher_base_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + + return !is_realtime(second_task) || + earlier_deadline(first_task, second_task) || + (get_deadline(first_task) == get_deadline(second_task) && + first_task->pid < second_task->pid); +} + +int edf_pending_order(struct bheap_node* a, struct bheap_node* b) +{ + return edf_higher_base_prio(bheap2task(a), bheap2task(b)); +} + +#endif + +/* edf_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int edf_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* There is no point in comparing a task to itself. */ + if (first && first == second) { + TRACE_TASK(first, + "WARNING: pointless edf priority comparison.\n"); + return 0; + } + + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + +#ifdef CONFIG_LITMUS_LOCKING + + /* Check for inherited priorities. Change task + * used for comparison in such a case. 
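 * (Example, added as an annotation: if task A currently inherits from a
 * task with deadline 10 while task B inherits nothing and has deadline
 * 12, A is compared using deadline 10 and therefore has higher priority;
 * deadline ties are broken by the smaller PID further below.)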
+ */ + if (unlikely(first->rt_param.inh_task)) + first_task = first->rt_param.inh_task; + if (unlikely(second->rt_param.inh_task)) + second_task = second->rt_param.inh_task; + + /* Check for priority boosting. Tie-break by start of boosting. + */ + if (unlikely(is_priority_boosted(first_task))) { + /* first_task is boosted, how about second_task? */ + if (!is_priority_boosted(second_task) || + lt_before(get_boost_start(first_task), + get_boost_start(second_task))) + return 1; + else + return 0; + } else if (unlikely(is_priority_boosted(second_task))) + /* second_task is boosted, first is not*/ + return 0; + +#endif + + + return !is_realtime(second_task) || + + /* is the deadline of the first task earlier? + * Then it has higher priority. + */ + earlier_deadline(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_deadline(first_task) == get_deadline(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +} + +int edf_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return edf_higher_prio(bheap2task(a), bheap2task(b)); +} + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, edf_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! + */ +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for edf_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!__jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. + */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t); +} diff --git a/litmus/fdso.c b/litmus/fdso.c new file mode 100644 index 000000000000..2c629598e3c9 --- /dev/null +++ b/litmus/fdso.c @@ -0,0 +1,297 @@ +/* fdso.c - file descriptor attached shared objects + * + * (c) 2007 B. Brandenburg, LITMUS^RT project + * + * Notes: + * - objects descriptor (OD) tables are not cloned during a fork. + * - objects are created on-demand, and freed after the last reference + * is dropped. + * - for now, object types are hard coded. + * - As long as we have live objects, we keep a reference to the inode. 
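 * - (Annotation, not part of the original notes: the intended flow is
 *   that a task opens some file to name the object, calls
 *   sys_od_open(fd, type, id, config) to obtain an object descriptor --
 *   an index into its od_table -- passes that descriptor to the locking
 *   syscalls, and finally calls sys_od_close(od) to drop its reference.)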
+ */ + +#include +#include +#include +#include +#include + +#include + +extern struct fdso_ops generic_lock_ops; + +static const struct fdso_ops* fdso_ops[] = { + &generic_lock_ops, /* FMLP_SEM */ + &generic_lock_ops, /* SRP_SEM */ + &generic_lock_ops, /* MPCP_SEM */ + &generic_lock_ops, /* MPCP_VS_SEM */ + &generic_lock_ops, /* DPCP_SEM */ + &generic_lock_ops, /* OMLP_SEM */ +}; + +static int fdso_create(void** obj_ref, obj_type_t type, void* __user config) +{ + if (fdso_ops[type]->create) + return fdso_ops[type]->create(obj_ref, type, config); + else + return -EINVAL; +} + +static void fdso_destroy(obj_type_t type, void* obj) +{ + fdso_ops[type]->destroy(type, obj); +} + +static int fdso_open(struct od_table_entry* entry, void* __user config) +{ + if (fdso_ops[entry->obj->type]->open) + return fdso_ops[entry->obj->type]->open(entry, config); + else + return 0; +} + +static int fdso_close(struct od_table_entry* entry) +{ + if (fdso_ops[entry->obj->type]->close) + return fdso_ops[entry->obj->type]->close(entry); + else + return 0; +} + +/* inode must be locked already */ +static int alloc_inode_obj(struct inode_obj_id** obj_ref, + struct inode* inode, + obj_type_t type, + unsigned int id, + void* __user config) +{ + struct inode_obj_id* obj; + void* raw_obj; + int err; + + obj = kmalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) { + return -ENOMEM; + } + + err = fdso_create(&raw_obj, type, config); + if (err != 0) { + kfree(obj); + return err; + } + + INIT_LIST_HEAD(&obj->list); + atomic_set(&obj->count, 1); + obj->type = type; + obj->id = id; + obj->obj = raw_obj; + obj->inode = inode; + + list_add(&obj->list, &inode->i_obj_list); + atomic_inc(&inode->i_count); + + printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id); + + *obj_ref = obj; + return 0; +} + +/* inode must be locked already */ +static struct inode_obj_id* get_inode_obj(struct inode* inode, + obj_type_t type, + unsigned int id) +{ + struct list_head* pos; + struct inode_obj_id* obj = NULL; + + list_for_each(pos, &inode->i_obj_list) { + obj = list_entry(pos, struct inode_obj_id, list); + if (obj->id == id && obj->type == type) { + atomic_inc(&obj->count); + return obj; + } + } + printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id); + return NULL; +} + + +static void put_inode_obj(struct inode_obj_id* obj) +{ + struct inode* inode; + int let_go = 0; + + inode = obj->inode; + if (atomic_dec_and_test(&obj->count)) { + + mutex_lock(&inode->i_obj_mutex); + /* no new references can be obtained */ + if (!atomic_read(&obj->count)) { + list_del(&obj->list); + fdso_destroy(obj->type, obj->obj); + kfree(obj); + let_go = 1; + } + mutex_unlock(&inode->i_obj_mutex); + if (let_go) + iput(inode); + } +} + +static struct od_table_entry* get_od_entry(struct task_struct* t) +{ + struct od_table_entry* table; + int i; + + + table = t->od_table; + if (!table) { + table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS, + GFP_KERNEL); + t->od_table = table; + } + + for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++) + if (!table[i].used) { + table[i].used = 1; + return table + i; + } + return NULL; +} + +static int put_od_entry(struct od_table_entry* od) +{ + put_inode_obj(od->obj); + od->used = 0; + return 0; +} + +void exit_od_table(struct task_struct* t) +{ + int i; + + if (t->od_table) { + for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++) + if (t->od_table[i].used) + put_od_entry(t->od_table + i); + kfree(t->od_table); + t->od_table = NULL; + } +} + +static int do_sys_od_open(struct 
file* file, obj_type_t type, int id, + void* __user config) +{ + int idx = 0, err = 0; + struct inode* inode; + struct inode_obj_id* obj = NULL; + struct od_table_entry* entry; + + inode = file->f_dentry->d_inode; + + entry = get_od_entry(current); + if (!entry) + return -ENOMEM; + + mutex_lock(&inode->i_obj_mutex); + obj = get_inode_obj(inode, type, id); + if (!obj) + err = alloc_inode_obj(&obj, inode, type, id, config); + if (err != 0) { + obj = NULL; + idx = err; + entry->used = 0; + } else { + entry->obj = obj; + entry->class = fdso_ops[type]; + idx = entry - current->od_table; + } + + mutex_unlock(&inode->i_obj_mutex); + + /* open only if creation succeeded */ + if (!err) + err = fdso_open(entry, config); + if (err < 0) { + /* The class rejected the open call. + * We need to clean up and tell user space. + */ + if (obj) + put_od_entry(entry); + idx = err; + } + + return idx; +} + + +struct od_table_entry* get_entry_for_od(int od) +{ + struct task_struct *t = current; + + if (!t->od_table) + return NULL; + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return NULL; + if (!t->od_table[od].used) + return NULL; + return t->od_table + od; +} + + +asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config) +{ + int ret = 0; + struct file* file; + + /* + 1) get file from fd, get inode from file + 2) lock inode + 3) try to lookup object + 4) if not present create and enqueue object, inc inode refcnt + 5) increment refcnt of object + 6) alloc od_table_entry, setup ptrs + 7) unlock inode + 8) return offset in od_table as OD + */ + + if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) { + ret = -EINVAL; + goto out; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + goto out; + } + + ret = do_sys_od_open(file, type, obj_id, config); + + fput(file); + +out: + return ret; +} + + +asmlinkage long sys_od_close(int od) +{ + int ret = -EINVAL; + struct task_struct *t = current; + + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return ret; + + if (!t->od_table || !t->od_table[od].used) + return ret; + + + /* give the class a chance to reject the close + */ + ret = fdso_close(t->od_table + od); + if (ret == 0) + ret = put_od_entry(t->od_table + od); + + return ret; +} diff --git a/litmus/fp_common.c b/litmus/fp_common.c new file mode 100644 index 000000000000..31fc2db20adf --- /dev/null +++ b/litmus/fp_common.c @@ -0,0 +1,119 @@ +/* + * litmus/fp_common.c + * + * Common functions for fixed-priority scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +/* fp_higher_prio - returns true if first has a higher static priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int fp_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* There is no point in comparing a task to itself. */ + if (unlikely(first && first == second)) { + TRACE_TASK(first, + "WARNING: pointless FP priority comparison.\n"); + return 0; + } + + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + +#ifdef CONFIG_LITMUS_LOCKING + + /* Check for inherited priorities. Change task + * used for comparison in such a case. + */ + if (unlikely(first->rt_param.inh_task)) + first_task = first->rt_param.inh_task; + if (unlikely(second->rt_param.inh_task)) + second_task = second->rt_param.inh_task; + + /* Check for priority boosting. Tie-break by start of boosting. 
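 * (Example, added as an annotation: a task that is priority-boosted while
 * holding a lock beats any non-boosted task regardless of static
 * priority, and of two boosted tasks the one whose boosting started
 * earlier wins.)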
+ */ + if (unlikely(is_priority_boosted(first_task))) { + /* first_task is boosted, how about second_task? */ + if (!is_priority_boosted(second_task) || + lt_before(get_boost_start(first_task), + get_boost_start(second_task))) + return 1; + else + return 0; + } else if (unlikely(is_priority_boosted(second_task))) + /* second_task is boosted, first is not*/ + return 0; + +#endif + + + return !is_realtime(second_task) || + + get_priority(first_task) < get_priority(second_task) || + + /* Break by PID. + */ + (get_priority(first_task) == get_priority(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +} + +int fp_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return fp_higher_prio(bheap2task(a), bheap2task(b)); +} + +void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, fp_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + */ +int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t) +{ + struct task_struct *pending; + + pending = fp_prio_peek(q); + + if (!pending) + return 0; + if (!t) + return 1; + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || fp_higher_prio(pending, t); +} + +void fp_prio_queue_init(struct fp_prio_queue* q) +{ + int i; + + for (i = 0; i < FP_PRIO_BIT_WORDS; i++) + q->bitmask[i] = 0; + for (i = 0; i < LITMUS_MAX_PRIORITY; i++) + bheap_init(&q->queue[i]); +} diff --git a/litmus/ft_event.c b/litmus/ft_event.c new file mode 100644 index 000000000000..399a07becca5 --- /dev/null +++ b/litmus/ft_event.c @@ -0,0 +1,43 @@ +#include + +#include + +#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_DEBUG_RODATA) +/* provide dummy implementation */ + +int ft_events[MAX_EVENTS]; + +int ft_enable_event(unsigned long id) +{ + if (id < MAX_EVENTS) { + ft_events[id]++; + return 1; + } else + return 0; +} + +int ft_disable_event(unsigned long id) +{ + if (id < MAX_EVENTS && ft_events[id]) { + ft_events[id]--; + return 1; + } else + return 0; +} + +int ft_disable_all_events(void) +{ + int i; + + for (i = 0; i < MAX_EVENTS; i++) + ft_events[i] = 0; + + return MAX_EVENTS; +} + +int ft_is_event_enabled(unsigned long id) +{ + return id < MAX_EVENTS && ft_events[id]; +} + +#endif diff --git a/litmus/ftdev.c b/litmus/ftdev.c new file mode 100644 index 000000000000..99bc39ffbcef --- /dev/null +++ b/litmus/ftdev.c @@ -0,0 +1,446 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size) +{ + struct ft_buffer* buf; + size_t total = (size + 1) * count; + char* mem; + int order = 0, pages = 1; + + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return NULL; + + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + + mem = (char*) __get_free_pages(GFP_KERNEL, order); + if (!mem) { + kfree(buf); + return NULL; + } + + if (!init_ft_buffer(buf, count, size, + mem + (count * size), /* markers at the end */ + mem)) { /* buffer objects */ + free_pages((unsigned long) mem, order); + kfree(buf); + return NULL; + } + return buf; +} + +void free_ft_buffer(struct ft_buffer* buf) +{ + int order = 0, pages = 1; + size_t total; + + if (buf) { + total = (buf->slot_size + 1) * 
buf->slot_count; + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + free_pages((unsigned long) buf->buffer_mem, order); + kfree(buf); + } +} + +struct ftdev_event { + int id; + struct ftdev_event* next; +}; + +static int activate(struct ftdev_event** chain, int id) +{ + struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL); + if (ev) { + printk(KERN_INFO + "Enabling feather-trace event %d.\n", (int) id); + ft_enable_event(id); + ev->id = id; + ev->next = *chain; + *chain = ev; + } + return ev ? 0 : -ENOMEM; +} + +static void deactivate(struct ftdev_event** chain, int id) +{ + struct ftdev_event **cur = chain; + struct ftdev_event *nxt; + while (*cur) { + if ((*cur)->id == id) { + nxt = (*cur)->next; + kfree(*cur); + *cur = nxt; + printk(KERN_INFO + "Disabling feather-trace event %d.\n", (int) id); + ft_disable_event(id); + break; + } + cur = &(*cur)->next; + } +} + +static int ftdev_open(struct inode *in, struct file *filp) +{ + struct ftdev* ftdev; + struct ftdev_minor* ftdm; + unsigned int buf_idx = iminor(in); + int err = 0; + + ftdev = container_of(in->i_cdev, struct ftdev, cdev); + + if (buf_idx >= ftdev->minor_cnt) { + err = -ENODEV; + goto out; + } + if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx))) + goto out; + + ftdm = ftdev->minor + buf_idx; + ftdm->ftdev = ftdev; + filp->private_data = ftdm; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + if (!ftdm->readers && ftdev->alloc) + err = ftdev->alloc(ftdev, buf_idx); + if (0 == err) + ftdm->readers++; + + mutex_unlock(&ftdm->lock); +out: + return err; +} + +static int ftdev_release(struct inode *in, struct file *filp) +{ + struct ftdev* ftdev; + struct ftdev_minor* ftdm; + unsigned int buf_idx = iminor(in); + int err = 0; + + ftdev = container_of(in->i_cdev, struct ftdev, cdev); + + if (buf_idx >= ftdev->minor_cnt) { + err = -ENODEV; + goto out; + } + ftdm = ftdev->minor + buf_idx; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + if (ftdm->readers == 1) { + while (ftdm->events) + deactivate(&ftdm->events, ftdm->events->id); + + /* wait for any pending events to complete */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + + printk(KERN_ALERT "Failed trace writes: %u\n", + ftdm->buf->failed_writes); + + if (ftdev->free) + ftdev->free(ftdev, buf_idx); + } + + ftdm->readers--; + mutex_unlock(&ftdm->lock); +out: + return err; +} + +/* based on ft_buffer_read + * @returns < 0 : page fault + * = 0 : no data available + * = 1 : one slot copied + */ +static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest) +{ + unsigned int idx; + int err = 0; + if (buf->free_count != buf->slot_count) { + /* data available */ + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + err = copy_to_user(dest, ((char*) buf->buffer_mem) + + idx * buf->slot_size, + buf->slot_size); + if (err == 0) { + /* copy ok */ + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + err = 1; + } + } + } + return err; +} + +static ssize_t ftdev_read(struct file *filp, + char __user *to, size_t len, loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + + ssize_t err = 0; + size_t chunk; + int copied; + struct ftdev_minor* ftdm = filp->private_data; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + + chunk = ftdm->buf->slot_size; + while (len >= chunk) { + copied = 
ft_buffer_copy_to_user(ftdm->buf, to); + if (copied == 1) { + len -= chunk; + to += chunk; + err += chunk; + } else if (err == 0 && copied == 0 && ftdm->events) { + /* Only wait if there are any events enabled and only + * if we haven't copied some data yet. We cannot wait + * here with copied data because that data would get + * lost if the task is interrupted (e.g., killed). + */ + mutex_unlock(&ftdm->lock); + set_current_state(TASK_INTERRUPTIBLE); + + schedule_timeout(50); + + if (signal_pending(current)) { + if (err == 0) + /* nothing read yet, signal problem */ + err = -ERESTARTSYS; + goto out; + } + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + } else if (copied < 0) { + /* page fault */ + err = copied; + break; + } else + /* nothing left to get, return to user space */ + break; + } + mutex_unlock(&ftdm->lock); +out: + return err; +} + +static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + long err = -ENOIOCTLCMD; + struct ftdev_minor* ftdm = filp->private_data; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + /* FIXME: check id against list of acceptable events */ + + switch (cmd) { + case FTDEV_ENABLE_CMD: + if (activate(&ftdm->events, arg)) + err = -ENOMEM; + else + err = 0; + break; + + case FTDEV_DISABLE_CMD: + deactivate(&ftdm->events, arg); + err = 0; + break; + + default: + printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg); + }; + + mutex_unlock(&ftdm->lock); +out: + return err; +} + +static ssize_t ftdev_write(struct file *filp, const char __user *from, + size_t len, loff_t *f_pos) +{ + struct ftdev_minor* ftdm = filp->private_data; + ssize_t err = -EINVAL; + struct ftdev* ftdev = ftdm->ftdev; + + /* dispatch write to buffer-specific code, if available */ + if (ftdev->write) + err = ftdev->write(ftdm->buf, len, from); + + return err; +} + +struct file_operations ftdev_fops = { + .owner = THIS_MODULE, + .open = ftdev_open, + .release = ftdev_release, + .write = ftdev_write, + .read = ftdev_read, + .unlocked_ioctl = ftdev_ioctl, +}; + +int ftdev_init( struct ftdev* ftdev, struct module* owner, + const int minor_cnt, const char* name) +{ + int i, err; + + BUG_ON(minor_cnt < 1); + + cdev_init(&ftdev->cdev, &ftdev_fops); + ftdev->name = name; + ftdev->minor_cnt = minor_cnt; + ftdev->cdev.owner = owner; + ftdev->cdev.ops = &ftdev_fops; + ftdev->alloc = NULL; + ftdev->free = NULL; + ftdev->can_open = NULL; + ftdev->write = NULL; + + ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor), + GFP_KERNEL); + if (!ftdev->minor) { + printk(KERN_WARNING "ftdev(%s): Could not allocate memory\n", + ftdev->name); + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < ftdev->minor_cnt; i++) { + mutex_init(&ftdev->minor[i].lock); + ftdev->minor[i].readers = 0; + ftdev->minor[i].buf = NULL; + ftdev->minor[i].events = NULL; + } + + ftdev->class = class_create(owner, ftdev->name); + if (IS_ERR(ftdev->class)) { + err = PTR_ERR(ftdev->class); + printk(KERN_WARNING "ftdev(%s): " + "Could not create device class.\n", ftdev->name); + goto err_dealloc; + } + + return 0; + +err_dealloc: + kfree(ftdev->minor); +err_out: + return err; +} + +/* + * Destroy minor devices up to, but not including, up_to. + */ +static void ftdev_device_destroy(struct ftdev* ftdev, unsigned int up_to) +{ + dev_t minor_cntr; + + if (up_to < 1) + up_to = (ftdev->minor_cnt < 1) ? 
0 : ftdev->minor_cnt; + + for (minor_cntr = 0; minor_cntr < up_to; ++minor_cntr) + device_destroy(ftdev->class, MKDEV(ftdev->major, minor_cntr)); +} + +void ftdev_exit(struct ftdev* ftdev) +{ + printk("ftdev(%s): Exiting\n", ftdev->name); + ftdev_device_destroy(ftdev, -1); + cdev_del(&ftdev->cdev); + unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt); + class_destroy(ftdev->class); + kfree(ftdev->minor); +} + +int register_ftdev(struct ftdev* ftdev) +{ + struct device **device; + dev_t trace_dev_tmp, minor_cntr; + int err; + + err = alloc_chrdev_region(&trace_dev_tmp, 0, ftdev->minor_cnt, + ftdev->name); + if (err) { + printk(KERN_WARNING "ftdev(%s): " + "Could not allocate char. device region (%d minors)\n", + ftdev->name, ftdev->minor_cnt); + goto err_out; + } + + ftdev->major = MAJOR(trace_dev_tmp); + + err = cdev_add(&ftdev->cdev, trace_dev_tmp, ftdev->minor_cnt); + if (err) { + printk(KERN_WARNING "ftdev(%s): " + "Could not add cdev for major %u with %u minor(s).\n", + ftdev->name, ftdev->major, ftdev->minor_cnt); + goto err_unregister; + } + + /* create the minor device(s) */ + for (minor_cntr = 0; minor_cntr < ftdev->minor_cnt; ++minor_cntr) + { + trace_dev_tmp = MKDEV(ftdev->major, minor_cntr); + device = &ftdev->minor[minor_cntr].device; + + *device = device_create(ftdev->class, NULL, trace_dev_tmp, NULL, + "litmus/%s%d", ftdev->name, minor_cntr); + if (IS_ERR(*device)) { + err = PTR_ERR(*device); + printk(KERN_WARNING "ftdev(%s): " + "Could not create device major/minor number " + "%u/%u\n", ftdev->name, ftdev->major, + minor_cntr); + printk(KERN_WARNING "ftdev(%s): " + "will attempt deletion of allocated devices.\n", + ftdev->name); + goto err_minors; + } + } + + return 0; + +err_minors: + ftdev_device_destroy(ftdev, minor_cntr); + cdev_del(&ftdev->cdev); +err_unregister: + unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt); +err_out: + return err; +} diff --git a/litmus/jobs.c b/litmus/jobs.c new file mode 100644 index 000000000000..36e314625d86 --- /dev/null +++ b/litmus/jobs.c @@ -0,0 +1,43 @@ +/* litmus/jobs.c - common job control code + */ + +#include + +#include +#include + +void prepare_for_next_period(struct task_struct *t) +{ + BUG_ON(!t); + /* prepare next release */ + t->rt_param.job_params.release = t->rt_param.job_params.deadline; + t->rt_param.job_params.deadline += get_rt_period(t); + t->rt_param.job_params.exec_time = 0; + /* update job sequence number */ + t->rt_param.job_params.job_no++; + + /* don't confuse Linux */ + t->rt.time_slice = 1; +} + +void release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + set_rt_flags(t, RT_F_RUNNING); +} + + +/* + * Deactivate current task until the beginning of the next period. + */ +long complete_job(void) +{ + /* Mark that we do not excute anymore */ + set_rt_flags(current, RT_F_SLEEP); + /* call schedule, this will return when a new job arrives + * it also takes care of preparing for the next release + */ + schedule(); + return 0; +} diff --git a/litmus/litmus.c b/litmus/litmus.c new file mode 100644 index 000000000000..b22f84a02010 --- /dev/null +++ b/litmus/litmus.c @@ -0,0 +1,555 @@ +/* + * litmus.c -- Implementation of the LITMUS syscalls, + * the LITMUS intialization code, + * and the procfs interface.. 
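+ *
+ * Rough user-space usage sketch (hedged: only struct rt_task, its fields,
+ * and the syscalls implemented below are defined by this patch; the wrapper
+ * names set_rt_task_param(), task_mode(), sleep_next_period() and the
+ * LITMUS_RT_TASK constant follow the usual liblitmus conventions and are
+ * assumptions for illustration, as are have_work()/do_job()):
+ *
+ *     struct rt_task param = {
+ *             .exec_cost     =  10000000ULL, /* 10 ms WCET, in ns        */
+ *             .period        = 100000000ULL, /* 100 ms period, in ns     */
+ *             .cpu           = 0,            /* partition, if applicable */
+ *             .budget_policy = NO_ENFORCEMENT,
+ *             .priority      = 0,            /* < LITMUS_MAX_PRIORITY    */
+ *     };
+ *     set_rt_task_param(getpid(), &param);   /* -> sys_set_rt_task_param() */
+ *     task_mode(LITMUS_RT_TASK);             /* become a real-time task    */
+ *     while (have_work()) {
+ *             do_job();
+ *             sleep_next_period();           /* -> sys_complete_job()      */
+ *     }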
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Number of RT tasks that exist in the system */ +atomic_t rt_task_count = ATOMIC_INIT(0); +static DEFINE_RAW_SPINLOCK(task_transition_lock); +/* synchronize plugin switching */ +atomic_t cannot_use_plugin = ATOMIC_INIT(0); + +/* Give log messages sequential IDs. */ +atomic_t __log_seq_no = ATOMIC_INIT(0); + +#ifdef CONFIG_RELEASE_MASTER +/* current master CPU for handling timer IRQs */ +atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU); +#endif + +static struct kmem_cache * bheap_node_cache; +extern struct kmem_cache * release_heap_cache; + +struct bheap_node* bheap_node_alloc(int gfp_flags) +{ + return kmem_cache_alloc(bheap_node_cache, gfp_flags); +} + +void bheap_node_free(struct bheap_node* hn) +{ + kmem_cache_free(bheap_node_cache, hn); +} + +struct release_heap* release_heap_alloc(int gfp_flags); +void release_heap_free(struct release_heap* rh); + +/* + * sys_set_task_rt_param + * @pid: Pid of the task which scheduling parameters must be changed + * @param: New real-time extension parameters such as the execution cost and + * period + * Syscall for manipulating with task rt extension params + * Returns EFAULT if param is NULL. + * ESRCH if pid is not corrsponding + * to a valid task. + * EINVAL if either period or execution cost is <=0 + * EPERM if pid is a real-time task + * 0 if success + * + * Only non-real-time tasks may be configured with this system call + * to avoid races with the scheduler. In practice, this means that a + * task's parameters must be set _before_ calling sys_prepare_rt_task() + * + * find_task_by_vpid() assumes that we are in the same namespace of the + * target. + */ +asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + struct rt_task tp; + struct task_struct *target; + int retval = -EINVAL; + + printk("Setting up rt task parameters for process %d.\n", pid); + + if (pid < 0 || param == 0) { + goto out; + } + if (copy_from_user(&tp, param, sizeof(tp))) { + retval = -EFAULT; + goto out; + } + + /* Task search and manipulation must be protected */ + read_lock_irq(&tasklist_lock); + if (!(target = find_task_by_vpid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + + if (is_realtime(target)) { + /* The task is already a real-time task. + * We cannot not allow parameter changes at this point. 
+ */ + retval = -EBUSY; + goto out_unlock; + } + + if (tp.exec_cost <= 0) + goto out_unlock; + if (tp.period <= 0) + goto out_unlock; + if (!cpu_online(tp.cpu)) + goto out_unlock; + if (tp.period < tp.exec_cost) + { + printk(KERN_INFO "litmus: real-time task %d rejected " + "because wcet > period\n", pid); + goto out_unlock; + } + if (tp.budget_policy != NO_ENFORCEMENT && + tp.budget_policy != QUANTUM_ENFORCEMENT && + tp.budget_policy != PRECISE_ENFORCEMENT) + { + printk(KERN_INFO "litmus: real-time task %d rejected " + "because unsupported budget enforcement policy " + "specified (%d)\n", + pid, tp.budget_policy); + goto out_unlock; + } + + if (tp.priority >= LITMUS_MAX_PRIORITY) { + printk(KERN_INFO "litmus: invalid priority (%u); " + "task %s/%d rejected\n", + tp.priority, target->comm, target->pid); + goto out_unlock; + } + + target->rt_param.task_params = tp; + + retval = 0; + out_unlock: + read_unlock_irq(&tasklist_lock); + out: + return retval; +} + +/* + * Getter of task's RT params + * returns EINVAL if param or pid is NULL + * returns ESRCH if pid does not correspond to a valid task + * returns EFAULT if copying of parameters has failed. + * + * find_task_by_vpid() assumes that we are in the same namespace of the + * target. + */ +asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + int retval = -EINVAL; + struct task_struct *source; + struct rt_task lp; + if (param == 0 || pid < 0) + goto out; + read_lock(&tasklist_lock); + if (!(source = find_task_by_vpid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + lp = source->rt_param.task_params; + read_unlock(&tasklist_lock); + /* Do copying outside the lock */ + retval = + copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0; + return retval; + out_unlock: + read_unlock(&tasklist_lock); + out: + return retval; + +} + +/* + * This is the crucial function for periodic task implementation, + * It checks if a task is periodic, checks if such kind of sleep + * is permitted and calls plugin-specific sleep, which puts the + * task into a wait array. + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_complete_job(void) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + /* The plugin has to put the task into an + * appropriate queue and call schedule + */ + retval = litmus->complete_job(); + out: + return retval; +} + +/* This is an "improved" version of sys_complete_job that + * addresses the problem of unintentionally missing a job after + * an overrun. + * + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_wait_for_job_release(unsigned int job) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + + retval = 0; + + /* first wait until we have "reached" the desired job + * + * This implementation has at least two problems: + * + * 1) It doesn't gracefully handle the wrap around of + * job_no. Since LITMUS is a prototype, this is not much + * of a problem right now. 
+ * + * 2) It is theoretically racy if a job release occurs + * between checking job_no and calling sleep_next_period(). + * A proper solution would requiring adding another callback + * in the plugin structure and testing the condition with + * interrupts disabled. + * + * FIXME: At least problem 2 should be taken care of eventually. + */ + while (!retval && job > current->rt_param.job_params.job_no) + /* If the last job overran then job <= job_no and we + * don't send the task to sleep. + */ + retval = litmus->complete_job(); + out: + return retval; +} + +/* This is a helper syscall to query the current job sequence number. + * + * returns 0 on successful query + * returns EPERM if task is not a real-time task. + * returns EFAULT if &job is not a valid pointer. + */ +asmlinkage long sys_query_job_no(unsigned int __user *job) +{ + int retval = -EPERM; + if (is_realtime(current)) + retval = put_user(current->rt_param.job_params.job_no, job); + + return retval; +} + +/* sys_null_call() is only used for determining raw system call + * overheads (kernel entry, kernel exit). It has no useful side effects. + * If ts is non-NULL, then the current Feather-Trace time is recorded. + */ +asmlinkage long sys_null_call(cycles_t __user *ts) +{ + long ret = 0; + cycles_t now; + + if (ts) { + now = get_cycles(); + ret = put_user(now, ts); + } + + return ret; +} + +/* p is a real-time task. Re-init its state as a best-effort task. */ +static void reinit_litmus_state(struct task_struct* p, int restore) +{ + struct rt_task user_config = {}; + void* ctrl_page = NULL; + + if (restore) { + /* Safe user-space provided configuration data. + * and allocated page. */ + user_config = p->rt_param.task_params; + ctrl_page = p->rt_param.ctrl_page; + } + + /* We probably should not be inheriting any task's priority + * at this point in time. + */ + WARN_ON(p->rt_param.inh_task); + + /* Cleanup everything else. */ + memset(&p->rt_param, 0, sizeof(p->rt_param)); + + /* Restore preserved fields. 
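+	 * Only task_params (the user-supplied configuration) and ctrl_page
+	 * (the already-allocated control page) survive the memset() above;
+	 * every other rt_param field starts out zeroed again.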
*/ + if (restore) { + p->rt_param.task_params = user_config; + p->rt_param.ctrl_page = ctrl_page; + } +} + +long litmus_admit_task(struct task_struct* tsk) +{ + long retval = 0; + unsigned long flags; + + BUG_ON(is_realtime(tsk)); + + if (get_rt_period(tsk) == 0 || + get_exec_cost(tsk) > get_rt_period(tsk)) { + TRACE_TASK(tsk, "litmus admit: invalid task parameters " + "(%lu, %lu)\n", + get_exec_cost(tsk), get_rt_period(tsk)); + retval = -EINVAL; + goto out; + } + + if (!cpu_online(get_partition(tsk))) { + TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n", + get_partition(tsk)); + retval = -EINVAL; + goto out; + } + + INIT_LIST_HEAD(&tsk_rt(tsk)->list); + + /* avoid scheduler plugin changing underneath us */ + raw_spin_lock_irqsave(&task_transition_lock, flags); + + /* allocate heap node for this task */ + tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC); + tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC); + + if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) { + printk(KERN_WARNING "litmus: no more heap node memory!?\n"); + + bheap_node_free(tsk_rt(tsk)->heap_node); + release_heap_free(tsk_rt(tsk)->rel_heap); + + retval = -ENOMEM; + goto out_unlock; + } else { + bheap_node_init(&tsk_rt(tsk)->heap_node, tsk); + } + + retval = litmus->admit_task(tsk); + + if (!retval) { + sched_trace_task_name(tsk); + sched_trace_task_param(tsk); + atomic_inc(&rt_task_count); + } + +out_unlock: + raw_spin_unlock_irqrestore(&task_transition_lock, flags); +out: + return retval; +} + +void litmus_exit_task(struct task_struct* tsk) +{ + if (is_realtime(tsk)) { + sched_trace_task_completion(tsk, 1); + + litmus->task_exit(tsk); + + BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node)); + bheap_node_free(tsk_rt(tsk)->heap_node); + release_heap_free(tsk_rt(tsk)->rel_heap); + + atomic_dec(&rt_task_count); + reinit_litmus_state(tsk, 1); + } +} + +/* IPI callback to synchronize plugin switching */ +static void synch_on_plugin_switch(void* info) +{ + atomic_inc(&cannot_use_plugin); + while (atomic_read(&cannot_use_plugin) > 0) + cpu_relax(); +} + +/* Switching a plugin in use is tricky. + * We must watch out that no real-time tasks exists + * (and that none is created in parallel) and that the plugin is not + * currently in use on any processor (in theory). + */ +int switch_sched_plugin(struct sched_plugin* plugin) +{ + unsigned long flags; + int ret = 0; + + BUG_ON(!plugin); + + /* forbid other cpus to use the plugin */ + atomic_set(&cannot_use_plugin, 1); + /* send IPI to force other CPUs to synch with us */ + smp_call_function(synch_on_plugin_switch, NULL, 0); + + /* wait until all other CPUs have started synch */ + while (atomic_read(&cannot_use_plugin) < num_online_cpus()) + cpu_relax(); + + /* stop task transitions */ + raw_spin_lock_irqsave(&task_transition_lock, flags); + + /* don't switch if there are active real-time tasks */ + if (atomic_read(&rt_task_count) == 0) { + ret = litmus->deactivate_plugin(); + if (0 != ret) + goto out; + ret = plugin->activate_plugin(); + if (0 != ret) { + printk(KERN_INFO "Can't activate %s (%d).\n", + plugin->plugin_name, ret); + plugin = &linux_sched_plugin; + } + printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name); + litmus = plugin; + } else + ret = -EBUSY; +out: + raw_spin_unlock_irqrestore(&task_transition_lock, flags); + atomic_set(&cannot_use_plugin, 0); + return ret; +} + +/* Called upon fork. + * p is the newly forked task. 
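+ *
+ * A child forked from a real-time task does not stay real-time: its LITMUS
+ * state is wiped (nothing is preserved) and sched_reset_on_fork is set so
+ * that the child is not treated as a real-time task. If the child should
+ * run under LITMUS, it has to configure and admit itself explicitly via the
+ * LITMUS syscalls.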
+ */ +void litmus_fork(struct task_struct* p) +{ + if (is_realtime(p)) { + /* clean out any litmus related state, don't preserve anything */ + reinit_litmus_state(p, 0); + /* Don't let the child be a real-time task. */ + p->sched_reset_on_fork = 1; + } else + /* non-rt tasks might have ctrl_page set */ + tsk_rt(p)->ctrl_page = NULL; + + /* od tables are never inherited across a fork */ + p->od_table = NULL; +} + +/* Called upon execve(). + * current is doing the exec. + * Don't let address space specific stuff leak. + */ +void litmus_exec(void) +{ + struct task_struct* p = current; + + if (is_realtime(p)) { + WARN_ON(p->rt_param.inh_task); + if (tsk_rt(p)->ctrl_page) { + free_page((unsigned long) tsk_rt(p)->ctrl_page); + tsk_rt(p)->ctrl_page = NULL; + } + } +} + +void exit_litmus(struct task_struct *dead_tsk) +{ + /* We also allow non-RT tasks to + * allocate control pages to allow + * measurements with non-RT tasks. + * So check if we need to free the page + * in any case. + */ + if (tsk_rt(dead_tsk)->ctrl_page) { + TRACE_TASK(dead_tsk, + "freeing ctrl_page %p\n", + tsk_rt(dead_tsk)->ctrl_page); + free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page); + } + + /* main cleanup only for RT tasks */ + if (is_realtime(dead_tsk)) + litmus_exit_task(dead_tsk); +} + + +#ifdef CONFIG_MAGIC_SYSRQ +int sys_kill(int pid, int sig); + +static void sysrq_handle_kill_rt_tasks(int key) +{ + struct task_struct *t; + read_lock(&tasklist_lock); + for_each_process(t) { + if (is_realtime(t)) { + sys_kill(t->pid, SIGKILL); + } + } + read_unlock(&tasklist_lock); +} + +static struct sysrq_key_op sysrq_kill_rt_tasks_op = { + .handler = sysrq_handle_kill_rt_tasks, + .help_msg = "quit-rt-tasks(X)", + .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks", +}; +#endif + +extern struct sched_plugin linux_sched_plugin; + +static int __init _init_litmus(void) +{ + /* Common initializers, + * mode change lock is used to enforce single mode change + * operation. + */ + printk("Starting LITMUS^RT kernel\n"); + + BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t)); + + register_sched_plugin(&linux_sched_plugin); + + bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC); + release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC); + +#ifdef CONFIG_MAGIC_SYSRQ + /* offer some debugging help */ + if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op)) + printk("Registered kill rt tasks magic sysrq.\n"); + else + printk("Could not register kill rt tasks magic sysrq.\n"); +#endif + + init_litmus_proc(); + + return 0; +} + +static void _exit_litmus(void) +{ + exit_litmus_proc(); + kmem_cache_destroy(bheap_node_cache); + kmem_cache_destroy(release_heap_cache); +} + +module_init(_init_litmus); +module_exit(_exit_litmus); diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c new file mode 100644 index 000000000000..4bf725a36c9c --- /dev/null +++ b/litmus/litmus_proc.c @@ -0,0 +1,347 @@ +/* + * litmus_proc.c -- Implementation of the /proc/litmus directory tree. 
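+ *
+ * Layout created by init_litmus_proc() and the plugin helpers below:
+ *
+ *	/proc/litmus/active_plugin	(rw) name of the active plugin; writing
+ *					     a plugin name here invokes
+ *					     switch_sched_plugin()
+ *	/proc/litmus/stats		(ro) number of real-time tasks and of
+ *					     tasks waiting to be released
+ *	/proc/litmus/release_master	(rw) release-master CPU or "NO_CPU"
+ *					     (only with CONFIG_RELEASE_MASTER)
+ *	/proc/litmus/plugins/loaded	(ro) list of registered plugins
+ *	/proc/litmus/plugins/<plugin>/	     per-plugin directory; clustered
+ *					     plugins may add a "cluster" file
+ *					     here via create_cluster_file()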
+ */ + +#include +#include + +#include +#include + +#include + +/* in litmus/litmus.c */ +extern atomic_t rt_task_count; + +static struct proc_dir_entry *litmus_dir = NULL, + *curr_file = NULL, + *stat_file = NULL, + *plugs_dir = NULL, +#ifdef CONFIG_RELEASE_MASTER + *release_master_file = NULL, +#endif + *plugs_file = NULL; + +/* in litmus/sync.c */ +int count_tasks_waiting_for_release(void); + +static int proc_read_stats(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, + "real-time tasks = %d\n" + "ready for release = %d\n", + atomic_read(&rt_task_count), + count_tasks_waiting_for_release()); + return len; +} + +static int proc_read_plugins(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = print_sched_plugins(page, PAGE_SIZE); + return len; +} + +static int proc_read_curr(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name); + return len; +} + +/* in litmus/litmus.c */ +int switch_sched_plugin(struct sched_plugin*); + +static int proc_write_curr(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len, ret; + char name[65]; + struct sched_plugin* found; + + len = copy_and_chomp(name, sizeof(name), buffer, count); + if (len < 0) + return len; + + found = find_sched_plugin(name); + + if (found) { + ret = switch_sched_plugin(found); + if (ret != 0) + printk(KERN_INFO "Could not switch plugin: %d\n", ret); + } else + printk(KERN_INFO "Plugin '%s' is unknown.\n", name); + + return len; +} + +#ifdef CONFIG_RELEASE_MASTER +static int proc_read_release_master(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len, master; + master = atomic_read(&release_master_cpu); + if (master == NO_CPU) + len = snprintf(page, PAGE_SIZE, "NO_CPU\n"); + else + len = snprintf(page, PAGE_SIZE, "%d\n", master); + return len; +} + +static int proc_write_release_master(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int cpu, err, len, online = 0; + char msg[64]; + + len = copy_and_chomp(msg, sizeof(msg), buffer, count); + + if (len < 0) + return len; + + if (strcmp(msg, "NO_CPU") == 0) + atomic_set(&release_master_cpu, NO_CPU); + else { + err = sscanf(msg, "%d", &cpu); + if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) { + atomic_set(&release_master_cpu, cpu); + } else { + TRACE("invalid release master: '%s' " + "(err:%d cpu:%d online:%d)\n", + msg, err, cpu, online); + len = -EINVAL; + } + } + return len; +} +#endif + +int __init init_litmus_proc(void) +{ + litmus_dir = proc_mkdir("litmus", NULL); + if (!litmus_dir) { + printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n"); + return -ENOMEM; + } + + curr_file = create_proc_entry("active_plugin", + 0644, litmus_dir); + if (!curr_file) { + printk(KERN_ERR "Could not allocate active_plugin " + "procfs entry.\n"); + return -ENOMEM; + } + curr_file->read_proc = proc_read_curr; + curr_file->write_proc = proc_write_curr; + +#ifdef CONFIG_RELEASE_MASTER + release_master_file = create_proc_entry("release_master", + 0644, litmus_dir); + if (!release_master_file) { + printk(KERN_ERR "Could not allocate release_master " + "procfs entry.\n"); + return -ENOMEM; + } + release_master_file->read_proc = proc_read_release_master; + release_master_file->write_proc = proc_write_release_master; +#endif + + stat_file = create_proc_read_entry("stats", 
0444, litmus_dir, + proc_read_stats, NULL); + + plugs_dir = proc_mkdir("plugins", litmus_dir); + if (!plugs_dir){ + printk(KERN_ERR "Could not allocate plugins directory " + "procfs entry.\n"); + return -ENOMEM; + } + + plugs_file = create_proc_read_entry("loaded", 0444, plugs_dir, + proc_read_plugins, NULL); + + return 0; +} + +void exit_litmus_proc(void) +{ + if (plugs_file) + remove_proc_entry("loaded", plugs_dir); + if (plugs_dir) + remove_proc_entry("plugins", litmus_dir); + if (stat_file) + remove_proc_entry("stats", litmus_dir); + if (curr_file) + remove_proc_entry("active_plugin", litmus_dir); +#ifdef CONFIG_RELEASE_MASTER + if (release_master_file) + remove_proc_entry("release_master", litmus_dir); +#endif + if (litmus_dir) + remove_proc_entry("litmus", NULL); +} + +long make_plugin_proc_dir(struct sched_plugin* plugin, + struct proc_dir_entry** pde_in) +{ + struct proc_dir_entry *pde_new = NULL; + long rv; + + if (!plugin || !plugin->plugin_name){ + printk(KERN_ERR "Invalid plugin struct passed to %s.\n", + __func__); + rv = -EINVAL; + goto out_no_pde; + } + + if (!plugs_dir){ + printk(KERN_ERR "Could not make plugin sub-directory, because " + "/proc/litmus/plugins does not exist.\n"); + rv = -ENOENT; + goto out_no_pde; + } + + pde_new = proc_mkdir(plugin->plugin_name, plugs_dir); + if (!pde_new){ + printk(KERN_ERR "Could not make plugin sub-directory: " + "out of memory?.\n"); + rv = -ENOMEM; + goto out_no_pde; + } + + rv = 0; + *pde_in = pde_new; + goto out_ok; + +out_no_pde: + *pde_in = NULL; +out_ok: + return rv; +} + +void remove_plugin_proc_dir(struct sched_plugin* plugin) +{ + if (!plugin || !plugin->plugin_name){ + printk(KERN_ERR "Invalid plugin struct passed to %s.\n", + __func__); + return; + } + remove_proc_entry(plugin->plugin_name, plugs_dir); +} + + + +/* misc. 
I/O helper functions */ + +int copy_and_chomp(char *kbuf, unsigned long ksize, + __user const char* ubuf, unsigned long ulength) +{ + /* caller must provide buffer space */ + BUG_ON(!ksize); + + ksize--; /* leave space for null byte */ + + if (ksize > ulength) + ksize = ulength; + + if(copy_from_user(kbuf, ubuf, ksize)) + return -EFAULT; + + kbuf[ksize] = '\0'; + + /* chomp kbuf */ + if (ksize > 0 && kbuf[ksize - 1] == '\n') + kbuf[ksize - 1] = '\0'; + + return ksize; +} + +/* helper functions for clustered plugins */ +static const char* cache_level_names[] = { + "ALL", + "L1", + "L2", + "L3", +}; + +int parse_cache_level(const char *cache_name, enum cache_level *level) +{ + int err = -EINVAL; + int i; + /* do a quick and dirty comparison to find the cluster size */ + for (i = GLOBAL_CLUSTER; i <= L3_CLUSTER; i++) + if (!strcmp(cache_name, cache_level_names[i])) { + *level = (enum cache_level) i; + err = 0; + break; + } + return err; +} + +const char* cache_level_name(enum cache_level level) +{ + int idx = level; + + if (idx >= GLOBAL_CLUSTER && idx <= L3_CLUSTER) + return cache_level_names[idx]; + else + return "INVALID"; +} + + +/* proc file interface to configure the cluster size */ +static int proc_read_cluster_size(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + return snprintf(page, PAGE_SIZE, "%s\n", + cache_level_name(*((enum cache_level*) data)));; +} + +static int proc_write_cluster_size(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len; + char cache_name[8]; + + len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count); + + if (len > 0 && parse_cache_level(cache_name, (enum cache_level*) data)) + printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name); + + return len; +} + +struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent, + enum cache_level* level) +{ + struct proc_dir_entry* cluster_file; + + cluster_file = create_proc_entry("cluster", 0644, parent); + if (!cluster_file) { + printk(KERN_ERR "Could not allocate %s/cluster " + "procfs entry.\n", parent->name); + } else { + cluster_file->read_proc = proc_read_cluster_size; + cluster_file->write_proc = proc_write_cluster_size; + cluster_file->data = level; + } + return cluster_file; +} + diff --git a/litmus/locking.c b/litmus/locking.c new file mode 100644 index 000000000000..84a1d8309699 --- /dev/null +++ b/litmus/locking.c @@ -0,0 +1,186 @@ +#include +#include +#include + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include +#include + +static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg); +static int open_generic_lock(struct od_table_entry* entry, void* __user arg); +static int close_generic_lock(struct od_table_entry* entry); +static void destroy_generic_lock(obj_type_t type, void* sem); + +struct fdso_ops generic_lock_ops = { + .create = create_generic_lock, + .open = open_generic_lock, + .close = close_generic_lock, + .destroy = destroy_generic_lock +}; + +static inline bool is_lock(struct od_table_entry* entry) +{ + return entry->class == &generic_lock_ops; +} + +static inline struct litmus_lock* get_lock(struct od_table_entry* entry) +{ + BUG_ON(!is_lock(entry)); + return (struct litmus_lock*) entry->obj->obj; +} + +static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg) +{ + struct litmus_lock* lock; + int err; + + err = litmus->allocate_lock(&lock, type, arg); + if (err == 0) + *obj_ref = lock; + return err; +} + +static int open_generic_lock(struct 
od_table_entry* entry, void* __user arg) +{ + struct litmus_lock* lock = get_lock(entry); + if (lock->ops->open) + return lock->ops->open(lock, arg); + else + return 0; /* default: any task can open it */ +} + +static int close_generic_lock(struct od_table_entry* entry) +{ + struct litmus_lock* lock = get_lock(entry); + if (lock->ops->close) + return lock->ops->close(lock); + else + return 0; /* default: closing succeeds */ +} + +static void destroy_generic_lock(obj_type_t type, void* obj) +{ + struct litmus_lock* lock = (struct litmus_lock*) obj; + lock->ops->deallocate(lock); +} + +asmlinkage long sys_litmus_lock(int lock_od) +{ + long err = -EINVAL; + struct od_table_entry* entry; + struct litmus_lock* l; + + TS_SYSCALL_IN_START; + + TS_SYSCALL_IN_END; + + TS_LOCK_START; + + entry = get_entry_for_od(lock_od); + if (entry && is_lock(entry)) { + l = get_lock(entry); + TRACE_CUR("attempts to lock 0x%p\n", l); + err = l->ops->lock(l); + } + + /* Note: task my have been suspended or preempted in between! Take + * this into account when computing overheads. */ + TS_LOCK_END; + + TS_SYSCALL_OUT_START; + + return err; +} + +asmlinkage long sys_litmus_unlock(int lock_od) +{ + long err = -EINVAL; + struct od_table_entry* entry; + struct litmus_lock* l; + + TS_SYSCALL_IN_START; + + TS_SYSCALL_IN_END; + + TS_UNLOCK_START; + + entry = get_entry_for_od(lock_od); + if (entry && is_lock(entry)) { + l = get_lock(entry); + TRACE_CUR("attempts to unlock 0x%p\n", l); + err = l->ops->unlock(l); + } + + /* Note: task my have been preempted in between! Take this into + * account when computing overheads. */ + TS_UNLOCK_END; + + TS_SYSCALL_OUT_START; + + return err; +} + +struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq) +{ + wait_queue_t* q; + struct task_struct* t = NULL; + + if (waitqueue_active(wq)) { + q = list_entry(wq->task_list.next, + wait_queue_t, task_list); + t = (struct task_struct*) q->private; + __remove_wait_queue(wq, q); + } + return(t); +} + +unsigned int __add_wait_queue_prio_exclusive( + wait_queue_head_t* head, + prio_wait_queue_t *new) +{ + struct list_head *pos; + unsigned int passed = 0; + + new->wq.flags |= WQ_FLAG_EXCLUSIVE; + + /* find a spot where the new entry is less than the next */ + list_for_each(pos, &head->task_list) { + prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t, + wq.task_list); + + if (unlikely(lt_before(new->priority, queued->priority) || + (new->priority == queued->priority && + new->tie_breaker < queued->tie_breaker))) { + /* pos is not less than new, thus insert here */ + __list_add(&new->wq.task_list, pos->prev, pos); + goto out; + } + passed++; + } + + /* if we get to this point either the list is empty or every entry + * queued element is less than new. + * Let's add new to the end. */ + list_add_tail(&new->wq.task_list, &head->task_list); +out: + return passed; +} + + +#else + +struct fdso_ops generic_lock_ops = {}; + +asmlinkage long sys_litmus_lock(int sem_od) +{ + return -ENOSYS; +} + +asmlinkage long sys_litmus_unlock(int sem_od) +{ + return -ENOSYS; +} + +#endif diff --git a/litmus/preempt.c b/litmus/preempt.c new file mode 100644 index 000000000000..90e09d091e30 --- /dev/null +++ b/litmus/preempt.c @@ -0,0 +1,131 @@ +#include + +#include +#include + +/* The rescheduling state of each processor. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state); + +void sched_state_will_schedule(struct task_struct* tsk) +{ + /* Litmus hack: we only care about processor-local invocations of + * set_tsk_need_resched(). 
We can't reliably set the flag remotely + * since it might race with other updates to the scheduling state. We + * can't rely on the runqueue lock protecting updates to the sched + * state since processors do not acquire the runqueue locks for all + * updates to the sched state (to avoid acquiring two runqueue locks at + * the same time). Further, if tsk is residing on a remote processor, + * then that processor doesn't actually know yet that it is going to + * reschedule; it still must receive an IPI (unless a local invocation + * races). + */ + if (likely(task_cpu(tsk) == smp_processor_id())) { + VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE); + if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) + set_sched_state(PICKED_WRONG_TASK); + else + set_sched_state(WILL_SCHEDULE); + } else + /* Litmus tasks should never be subject to a remote + * set_tsk_need_resched(). */ + BUG_ON(is_realtime(tsk)); +// TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n", +// __builtin_return_address(0)); +} + +/* Called by the IPI handler after another CPU called smp_send_resched(). */ +void sched_state_ipi(void) +{ + /* If the IPI was slow, we might be in any state right now. The IPI is + * only meaningful if we are in SHOULD_SCHEDULE. */ + if (is_in_sched_state(SHOULD_SCHEDULE)) { + /* Cause scheduler to be invoked. + * This will cause a transition to WILL_SCHEDULE. */ + set_tsk_need_resched(current); + TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n", + current->comm, current->pid); + } else { + /* ignore */ + TRACE_STATE("ignoring IPI in state %x (%s)\n", + get_sched_state(), + sched_state_name(get_sched_state())); + } +} + +/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must + * hold the lock that is used to serialize scheduling decisions. */ +void litmus_reschedule(int cpu) +{ + int picked_transition_ok = 0; + int scheduled_transition_ok = 0; + + /* The (remote) CPU could be in any state. */ + + /* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU + * is not aware of the need to reschedule at this point. */ + + /* is a context switch in progress? */ + if (cpu_is_in_sched_state(cpu, TASK_PICKED)) + picked_transition_ok = sched_state_transition_on( + cpu, TASK_PICKED, PICKED_WRONG_TASK); + + if (!picked_transition_ok && + cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) { + /* We either raced with the end of the context switch, or the + * CPU was in TASK_SCHEDULED anyway. */ + scheduled_transition_ok = sched_state_transition_on( + cpu, TASK_SCHEDULED, SHOULD_SCHEDULE); + } + + /* If the CPU was in state TASK_SCHEDULED, then we need to cause the + * scheduler to be invoked. */ + if (scheduled_transition_ok) { + if (smp_processor_id() == cpu) + set_tsk_need_resched(current); + else + smp_send_reschedule(cpu); + } + + TRACE_STATE("%s picked-ok:%d sched-ok:%d\n", + __FUNCTION__, + picked_transition_ok, + scheduled_transition_ok); +} + +void litmus_reschedule_local(void) +{ + if (is_in_sched_state(TASK_PICKED)) + set_sched_state(PICKED_WRONG_TASK); + else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) { + set_sched_state(WILL_SCHEDULE); + set_tsk_need_resched(current); + } +} + +#ifdef CONFIG_DEBUG_KERNEL + +void sched_state_plugin_check(void) +{ + if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) { + TRACE("!!!! plugin did not call sched_state_task_picked()!" 
+ "Calling sched_state_task_picked() is mandatory---fix this.\n"); + set_sched_state(TASK_PICKED); + } +} + +#define NAME_CHECK(x) case x: return #x +const char* sched_state_name(int s) +{ + switch (s) { + NAME_CHECK(TASK_SCHEDULED); + NAME_CHECK(SHOULD_SCHEDULE); + NAME_CHECK(WILL_SCHEDULE); + NAME_CHECK(TASK_PICKED); + NAME_CHECK(PICKED_WRONG_TASK); + default: + return "UNKNOWN"; + }; +} + +#endif diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c new file mode 100644 index 000000000000..d405854cd39c --- /dev/null +++ b/litmus/rt_domain.c @@ -0,0 +1,357 @@ +/* + * litmus/rt_domain.c + * + * LITMUS real-time infrastructure. This file contains the + * functions that manipulate RT domains. RT domains are an abstraction + * of a ready queue and a release queue. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +/* Uncomment when debugging timer races... */ +#if 0 +#define VTRACE_TASK TRACE_TASK +#define VTRACE TRACE +#else +#define VTRACE_TASK(t, fmt, args...) /* shut up */ +#define VTRACE(fmt, args...) /* be quiet already */ +#endif + +static int dummy_resched(rt_domain_t *rt) +{ + return 0; +} + +static int dummy_order(struct bheap_node* a, struct bheap_node* b) +{ + return 0; +} + +/* default implementation: use default lock */ +static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + merge_ready(rt, tasks); +} + +static unsigned int time2slot(lt_t time) +{ + return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS; +} + +static enum hrtimer_restart on_release_timer(struct hrtimer *timer) +{ + unsigned long flags; + struct release_heap* rh; + rh = container_of(timer, struct release_heap, timer); + + TS_RELEASE_LATENCY(rh->release_time); + + VTRACE("on_release_timer(0x%p) starts.\n", timer); + + TS_RELEASE_START; + + + raw_spin_lock_irqsave(&rh->dom->release_lock, flags); + VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock); + /* remove from release queue */ + list_del(&rh->list); + raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags); + VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock); + + /* call release callback */ + rh->dom->release_jobs(rh->dom, &rh->heap); + /* WARNING: rh can be referenced from other CPUs from now on. */ + + TS_RELEASE_END; + + VTRACE("on_release_timer(0x%p) ends.\n", timer); + + return HRTIMER_NORESTART; +} + +/* allocated in litmus.c */ +struct kmem_cache * release_heap_cache; + +struct release_heap* release_heap_alloc(int gfp_flags) +{ + struct release_heap* rh; + rh= kmem_cache_alloc(release_heap_cache, gfp_flags); + if (rh) { + /* initialize timer */ + hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + rh->timer.function = on_release_timer; + } + return rh; +} + +void release_heap_free(struct release_heap* rh) +{ + /* make sure timer is no longer in use */ + hrtimer_cancel(&rh->timer); + kmem_cache_free(release_heap_cache, rh); +} + +/* Caller must hold release lock. + * Will return heap for given time. If no such heap exists prior to + * the invocation it will be created. 
+ */ +static struct release_heap* get_release_heap(rt_domain_t *rt, + struct task_struct* t, + int use_task_heap) +{ + struct list_head* pos; + struct release_heap* heap = NULL; + struct release_heap* rh; + lt_t release_time = get_release(t); + unsigned int slot = time2slot(release_time); + + /* initialize pos for the case that the list is empty */ + pos = rt->release_queue.slot[slot].next; + list_for_each(pos, &rt->release_queue.slot[slot]) { + rh = list_entry(pos, struct release_heap, list); + if (release_time == rh->release_time) { + /* perfect match -- this happens on hyperperiod + * boundaries + */ + heap = rh; + break; + } else if (lt_before(release_time, rh->release_time)) { + /* we need to insert a new node since rh is + * already in the future + */ + break; + } + } + if (!heap && use_task_heap) { + /* use pre-allocated release heap */ + rh = tsk_rt(t)->rel_heap; + + rh->dom = rt; + rh->release_time = release_time; + + /* add to release queue */ + list_add(&rh->list, pos->prev); + heap = rh; + } + return heap; +} + +static void reinit_release_heap(struct task_struct* t) +{ + struct release_heap* rh; + + /* use pre-allocated release heap */ + rh = tsk_rt(t)->rel_heap; + + /* Make sure it is safe to use. The timer callback could still + * be executing on another CPU; hrtimer_cancel() will wait + * until the timer callback has completed. However, under no + * circumstances should the timer be active (= yet to be + * triggered). + * + * WARNING: If the CPU still holds the release_lock at this point, + * deadlock may occur! + */ + BUG_ON(hrtimer_cancel(&rh->timer)); + + /* initialize */ + bheap_init(&rh->heap); +#ifdef CONFIG_RELEASE_MASTER + atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE); +#endif +} +/* arm_release_timer() - start local release timer or trigger + * remote timer (pull timer) + * + * Called by add_release() with: + * - tobe_lock taken + * - IRQ disabled + */ +#ifdef CONFIG_RELEASE_MASTER +#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU) +static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu) +#else +static void arm_release_timer(rt_domain_t *_rt) +#endif +{ + rt_domain_t *rt = _rt; + struct list_head list; + struct list_head *pos, *safe; + struct task_struct* t; + struct release_heap* rh; + + VTRACE("arm_release_timer() at %llu\n", litmus_clock()); + list_replace_init(&rt->tobe_released, &list); + + list_for_each_safe(pos, safe, &list) { + /* pick task of work list */ + t = list_entry(pos, struct task_struct, rt_param.list); + sched_trace_task_release(t); + list_del(pos); + + /* put into release heap while holding release_lock */ + raw_spin_lock(&rt->release_lock); + VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock); + + rh = get_release_heap(rt, t, 0); + if (!rh) { + /* need to use our own, but drop lock first */ + raw_spin_unlock(&rt->release_lock); + VTRACE_TASK(t, "Dropped release_lock 0x%p\n", + &rt->release_lock); + + reinit_release_heap(t); + VTRACE_TASK(t, "release_heap ready\n"); + + raw_spin_lock(&rt->release_lock); + VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n", + &rt->release_lock); + + rh = get_release_heap(rt, t, 1); + } + bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node); + VTRACE_TASK(t, "arm_release_timer(): added to release heap\n"); + + raw_spin_unlock(&rt->release_lock); + VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock); + + /* To avoid arming the timer multiple times, we only let the + * owner do the arming (which is the "first" task to reference + * this 
release_heap anyway). + */ + if (rh == tsk_rt(t)->rel_heap) { + VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer); + /* we cannot arm the timer using hrtimer_start() + * as it may deadlock on rq->lock + * + * PINNED mode is ok on both local and remote CPU + */ +#ifdef CONFIG_RELEASE_MASTER + if (rt->release_master == NO_CPU && + target_cpu == NO_CPU) +#endif + __hrtimer_start_range_ns(&rh->timer, + ns_to_ktime(rh->release_time), + 0, HRTIMER_MODE_ABS_PINNED, 0); +#ifdef CONFIG_RELEASE_MASTER + else + hrtimer_start_on( + /* target_cpu overrides release master */ + (target_cpu != NO_CPU ? + target_cpu : rt->release_master), + &rh->info, &rh->timer, + ns_to_ktime(rh->release_time), + HRTIMER_MODE_ABS_PINNED); +#endif + } else + VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer); + } +} + +void rt_domain_init(rt_domain_t *rt, + bheap_prio_t order, + check_resched_needed_t check, + release_jobs_t release + ) +{ + int i; + + BUG_ON(!rt); + if (!check) + check = dummy_resched; + if (!release) + release = default_release_jobs; + if (!order) + order = dummy_order; + +#ifdef CONFIG_RELEASE_MASTER + rt->release_master = NO_CPU; +#endif + + bheap_init(&rt->ready_queue); + INIT_LIST_HEAD(&rt->tobe_released); + for (i = 0; i < RELEASE_QUEUE_SLOTS; i++) + INIT_LIST_HEAD(&rt->release_queue.slot[i]); + + raw_spin_lock_init(&rt->ready_lock); + raw_spin_lock_init(&rt->release_lock); + raw_spin_lock_init(&rt->tobe_lock); + + rt->check_resched = check; + rt->release_jobs = release; + rt->order = order; +} + +/* add_ready - add a real-time task to the rt ready queue. It must be runnable. + * @new: the newly released task + */ +void __add_ready(rt_domain_t* rt, struct task_struct *new) +{ + TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n", + new->comm, new->pid, get_exec_cost(new), get_rt_period(new), + get_release(new), litmus_clock()); + + BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node)); + + bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node); + rt->check_resched(rt); +} + +/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable. + * @tasks - the newly released tasks + */ +void __merge_ready(rt_domain_t* rt, struct bheap* tasks) +{ + bheap_union(rt->order, &rt->ready_queue, tasks); + rt->check_resched(rt); +} + + +#ifdef CONFIG_RELEASE_MASTER +void __add_release_on(rt_domain_t* rt, struct task_struct *task, + int target_cpu) +{ + TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n", + get_release(task), target_cpu); + list_add(&tsk_rt(task)->list, &rt->tobe_released); + task->rt_param.domain = rt; + + /* start release timer */ + TS_SCHED2_START(task); + + arm_release_timer_on(rt, target_cpu); + + TS_SCHED2_END(task); +} +#endif + +/* add_release - add a real-time task to the rt release queue. + * @task: the sleeping task + */ +void __add_release(rt_domain_t* rt, struct task_struct *task) +{ + TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task)); + list_add(&tsk_rt(task)->list, &rt->tobe_released); + task->rt_param.domain = rt; + + /* start release timer */ + TS_SCHED2_START(task); + + arm_release_timer(rt); + + TS_SCHED2_END(task); +} + diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c new file mode 100644 index 000000000000..4f5bb26b339b --- /dev/null +++ b/litmus/sched_cedf.c @@ -0,0 +1,1526 @@ +/* + * litmus/sched_cedf.c + * + * Implementation of the C-EDF scheduling algorithm. + * + * This implementation is based on G-EDF: + * - CPUs are clustered around L2 or L3 caches. 
+ * - Clusters topology is automatically detected (this is arch dependent + * and is working only on x86 at the moment --- and only with modern + * cpus that exports cpuid4 information) + * - The plugins _does not_ attempt to put tasks in the right cluster i.e. + * the programmer needs to be aware of the topology to place tasks + * in the desired cluster + * - default clustering is around L2 cache (cache index = 2) + * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all + * online_cpus are placed in a single cluster). + * + * For details on functions, take a look at sched_gsn_edf.c + * + * Currently, we do not support changes in the number of online cpus. + * If the num_online_cpus() dynamically changes, the plugin is broken. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* to configure the cluster size */ +#include +#include + +/* Reference configuration variable. Determines which cache level is used to + * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that + * all CPUs form a single cluster (just like GSN-EDF). + */ +static enum cache_level cluster_config = GLOBAL_CLUSTER; + +struct clusterdomain; + +/* cpu_entry_t - maintain the linked and scheduled state + * + * A cpu also contains a pointer to the cedf_domain_t cluster + * that owns it (struct clusterdomain*) + */ +typedef struct { + int cpu; + struct clusterdomain* cluster; /* owning cluster */ + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + atomic_t will_schedule; /* prevent unneeded IPIs */ + struct bheap_node* hn; +#ifdef CONFIG_LITMUS_LOCKING + struct bheap_node* pending_hn; + struct task_struct* pending; +#endif +} cpu_entry_t; + +/* one cpu_entry_t per CPU */ +DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries); + + +static struct bheap_node cpu_nodes[NR_CPUS]; +#ifdef CONFIG_LITMUS_LOCKING +static struct bheap_node pending_nodes[NR_CPUS]; +#endif + +/* + * In C-EDF there is a cedf domain _per_ cluster + * The number of clusters is dynamically determined accordingly to the + * total cpu number and the cluster size + */ +typedef struct clusterdomain { + /* rt_domain for this cluster */ + rt_domain_t domain; + /* map of this cluster cpus */ + cpumask_var_t cpu_map; + unsigned int num_cpus; + /* the cpus queue themselves according to priority in here */ + struct bheap cpu_heap; +#ifdef CONFIG_LITMUS_LOCKING + struct bheap pending_jobs; + struct bheap pending_cpus; +#endif + /* lock for this cluster */ +#define cluster_lock domain.ready_lock +} cedf_domain_t; + +/* a cedf_domain per cluster; allocation is done at init/activation time */ +cedf_domain_t *cedf; + +#define remote_cpu(cpu) (&per_cpu(cedf_cpu_entries, cpu)) +#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster) +#define task_cpu_cluster(task) remote_cluster(get_partition(task)) + +/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling + * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose + * information during the initialization of the plugin (e.g., topology) +#define WANT_ALL_SCHED_EVENTS + */ +#define VERBOSE_INIT + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t 
*a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold cedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + cedf_domain_t *cluster = entry->cluster; + + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, + &cluster->cpu_heap, + entry->hn); + + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn); +} + +/* caller must hold cedf lock */ +static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(cedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold cedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. 
+ * + * in C-EDF case is should be somewhere in the queue for + * its domain, therefore and we can get the domain using + * task_cpu_cluster + */ + remove(&(task_cpu_cluster(t))->domain, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +#ifdef CONFIG_LITMUS_LOCKING +static int update_pending_job(cedf_domain_t* cluster, struct task_struct* t); +static void priodon_become_eligible(void); +static void priodon_complete_request(void); + +static inline int in_pending_heap(struct task_struct* t) +{ + return bheap_node_in_heap(tsk_rt(t)->pending_node); +} + +/* has this task already been processed for pending */ +static inline int is_pending(struct task_struct* t) +{ + return tsk_rt(t)->pending_on != NO_CPU || + in_pending_heap(t); +} + +#endif + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold cedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + cedf_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) { +#ifdef CONFIG_LITMUS_LOCKING + if (!is_pending(task)) + update_pending_job(cluster, task); +#endif + __add_ready(&cluster->domain, task); + } else { + /* it has got to wait */ + add_release(&cluster->domain, task); + } +} + +/* check for any necessary preemptions */ +static void check_for_preemptions(cedf_domain_t *cluster) +{ + struct task_struct *task; + cpu_entry_t* last; + + for(last = lowest_prio_cpu(cluster); + edf_preemption_needed(&cluster->domain, last->linked); + last = lowest_prio_cpu(cluster)) { + /* preemption necessary */ + +#ifdef CONFIG_LITMUS_LOCKING + task = __peek_ready(&cluster->domain); + if (update_pending_job(cluster, task)) { + /* Something changed, re-evaluate priorites to + * see if we still need to preempt. + * */ + TRACE_TASK(task, "hitting continue\n"); + continue; + } +#endif + task = __take_ready(&cluster->domain); + TRACE_TASK(task, "attempting to link task to P%d\n", + last->cpu); + if (last->linked) + requeue(last->linked); + link_task_to_cpu(task, last); + preempt(last); + } +} + +#ifdef CONFIG_LITMUS_LOCKING + +static int pending_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_base_prio(b->pending, a->pending); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold cedf lock. + */ +static void update_pending_position(cpu_entry_t *entry) +{ + cedf_domain_t *cluster = entry->cluster; + + if (likely(bheap_node_in_heap(entry->pending_hn))) + bheap_delete(pending_lower_prio, + &cluster->pending_cpus, + entry->pending_hn); + + bheap_insert(pending_lower_prio, &cluster->pending_cpus, entry->pending_hn); +} + +/* caller must hold cedf lock */ +static cpu_entry_t* lowest_pending_cpu(cedf_domain_t *cluster) +{ + struct bheap_node* hn; + hn = bheap_peek(pending_lower_prio, &cluster->pending_cpus); + return hn->value; +} + +static void priority_raised(struct task_struct* t) +{ + cedf_domain_t *cluster = task_cpu_cluster(t); + int linked_on; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. 
*/ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &cluster->cpu_heap, + remote_cpu(linked_on)->hn); + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, + remote_cpu(linked_on)->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&cluster->domain.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + bheap_decrease(edf_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&cluster->domain.release_lock); + } +} + +static void priority_lowered(struct task_struct* t) +{ + /* assumption: t is not in a release heap */ + if (is_queued(t) || tsk_rt(t)->linked_on != NO_CPU) { + unlink(t); + requeue(t); + } +} + +static void donate_priority(struct task_struct* recipient, struct task_struct* donor) +{ + cedf_domain_t *cluster = task_cpu_cluster(donor); + + BUG_ON(task_cpu_cluster(recipient) != task_cpu_cluster(donor)); + BUG_ON(tsk_rt(donor)->is_donor); + BUG_ON(tsk_rt(recipient)->is_donor); + BUG_ON(tsk_rt(donor)->inh_task); + BUG_ON(tsk_rt(recipient)->inh_task); + + TRACE_TASK(donor, "priodon: becomes priority donor for %s/%d\n", + recipient->comm, recipient->pid); + + /* swap priorities */ + tsk_rt(recipient)->inh_task = donor; + tsk_rt(donor)->inh_task = recipient; + tsk_rt(donor)->is_donor = 1; + + priority_lowered(donor); + priority_raised(recipient); + + bheap_uncache_min(edf_ready_order, + &cluster->domain.ready_queue); +} + +/* assumption: new_donor has a higher priority than old_donor */ +static void switch_donor(struct task_struct* recipient, + struct task_struct* old_donor, + struct task_struct* new_donor) +{ + TRACE_TASK(new_donor, "becomes donor for %s/%d instead of %s/%d\n", + recipient->comm, recipient->pid, old_donor->comm, old_donor->pid); + + BUG_ON(tsk_rt(recipient)->inh_task != old_donor); + BUG_ON(tsk_rt(old_donor)->inh_task != recipient); + BUG_ON(tsk_rt(new_donor)->inh_task != NULL); + BUG_ON(tsk_rt(new_donor)->is_donor); + + tsk_rt(old_donor)->inh_task = NULL; + tsk_rt(old_donor)->is_donor = 0; + + tsk_rt(recipient)->inh_task = new_donor; + tsk_rt(new_donor)->inh_task = recipient; + tsk_rt(new_donor)->is_donor = 1; + + priority_raised(recipient); + priority_raised(old_donor); + priority_lowered(new_donor); +} + +static void undonate_priority(struct task_struct* recipient, struct task_struct* donor) +{ + cedf_domain_t *cluster = task_cpu_cluster(donor); + + BUG_ON(tsk_rt(recipient)->inh_task != donor); + BUG_ON(tsk_rt(donor)->inh_task != recipient); + + TRACE_TASK(donor, "priodon: is no longer priority donor of %s/%d\n", + recipient->comm, recipient->pid); + + tsk_rt(recipient)->inh_task = NULL; + tsk_rt(donor)->inh_task = NULL; + tsk_rt(donor)->is_donor = 0; + + priority_lowered(recipient); + priority_raised(donor); + + bheap_uncache_min(edf_ready_order, + &cluster->domain.ready_queue); +} + +static inline void add_to_pending(cedf_domain_t* cluster, struct task_struct* t) +{ + TRACE_TASK(t, "priodon: adding to pending heap wait:%u donor:%u 
req:%u pend:%d\n", + tsk_rt(t)->waiting_eligible, + tsk_rt(t)->is_donor, tsk_rt(t)->request_incomplete, + tsk_rt(t)->pending_on); + bheap_insert(edf_pending_order, + &cluster->pending_jobs, + tsk_rt(t)->pending_node); +} + +static inline struct task_struct* take_pending(cedf_domain_t* cluster) +{ + struct bheap_node* node; + node = bheap_take(edf_pending_order, &cluster->pending_jobs); + return node ? (struct task_struct*) node->value : NULL; +} + +static inline struct task_struct* peek_pending(cedf_domain_t* cluster) +{ + struct bheap_node* node; + node = bheap_peek(edf_pending_order, &cluster->pending_jobs); + return node ? (struct task_struct*) node->value : NULL; +} + +static inline int fake_resume(struct task_struct* t) +{ + TRACE_TASK(t, "priodon: fake resume wait:%u donor:%u\n", + tsk_rt(t)->waiting_eligible, tsk_rt(t)->is_donor); + /* Fake suspended. Let's resume it. */ + if (tsk_rt(t)->waiting_eligible) { + tsk_rt(t)->waiting_eligible = 0; + if (tsk_rt(t)->scheduled_on == NO_CPU) { + /* it was removed from the queue */ + requeue(t); + return 1; + } + } + return 0; +} + + +/* Lazily update set of highest-priority pending jobs. + * Returns 1 if priority recheck is required. + */ +static int update_pending_job(cedf_domain_t* cluster, + struct task_struct* to_be_linked) +{ + cpu_entry_t* entry; + struct task_struct* lowest_hp; /* lowest-priority high-priority task */ + struct task_struct* highest_lp; /* highest-priority low-priority task */ + int reeval = 0; + + entry = lowest_pending_cpu(cluster); + lowest_hp = entry->pending; + + if (to_be_linked && !is_pending(to_be_linked)) + /* not yet accounted for, stick in heap */ + add_to_pending(cluster, to_be_linked); + + highest_lp = peek_pending(cluster); + if (edf_higher_base_prio(highest_lp, lowest_hp)) { + /* yep, should be become of the c highest-prior pending jobs */ + + TRACE_TASK(highest_lp, + "priodon: became one of the %u highest-prio tasks (P%d, req:%u) X\n", + cluster->num_cpus, + entry->cpu, + tsk_rt(highest_lp)->request_incomplete); + + /* get it out of the heap */ + highest_lp = take_pending(cluster); + + BUG_ON(highest_lp == lowest_hp); + + /* it should never be a priority donor at this point */ + BUG_ON(tsk_rt(highest_lp)->is_donor); + + entry->pending = highest_lp; + update_pending_position(entry); + tsk_rt(highest_lp)->pending_on = entry->cpu; + + /* things that could happen: + * + * 1) lowest_hp has no donor, but is in a request => highest_lp becomes donor + * 2) lowest_hp is donor => highest_lp becomes new donor, old donor is resumed if suspended + * 3) lowest_hp is not in a request, and highest_lp is waiting => highest_lp is resumed + * 4) lowest_hp is not in a request, and highest_lp is not waiting => nothing to do + * 5) highest_lp has a priority donor => resume its donor + */ + + /* do we need to put it back? 
*/ + if (lowest_hp) { + TRACE_TASK(lowest_hp, + "priodon: no longer among %u highest-prio tasks req:%u\n", + cluster->num_cpus, + tsk_rt(lowest_hp)->request_incomplete); + tsk_rt(lowest_hp)->pending_on = NO_CPU; + add_to_pending(cluster, lowest_hp); + + + if (tsk_rt(lowest_hp)->request_incomplete) { + /* case 1) */ + donate_priority(lowest_hp, highest_lp); + reeval = 1; + } else if (tsk_rt(lowest_hp)->inh_task) { + /* case 2) */ + switch_donor(tsk_rt(lowest_hp)->inh_task, + lowest_hp, highest_lp); + fake_resume(lowest_hp); + reeval = 1; + } + } + + + if (!tsk_rt(highest_lp)->is_donor) { + if (tsk_rt(highest_lp)->waiting_eligible) { + /* case 3) */ + reeval = fake_resume(highest_lp); + BUG_ON(tsk_rt(highest_lp)->inh_task); + } else if (tsk_rt(highest_lp)->inh_task) { + /* case 5 */ + struct task_struct* donor = tsk_rt(highest_lp)->inh_task; + undonate_priority(highest_lp, donor); + reeval = fake_resume(donor); + } + } + } + + return reeval; +} + +/* job has exited => no longer pending */ + +static void job_pending_exit(struct task_struct* t) +{ + cedf_domain_t *cluster; + cpu_entry_t* entry; + + TRACE_TASK(t, "priodon: is no longer pending (pending_on:%d, queued:%d)\n", + tsk_rt(t)->pending_on, in_pending_heap(t)); + + cluster = task_cpu_cluster(t); + + if (tsk_rt(t)->pending_on != NO_CPU) { + entry = &per_cpu(cedf_cpu_entries, tsk_rt(t)->pending_on); + tsk_rt(t)->pending_on = NO_CPU; + entry->pending = NULL; + update_pending_position(entry); + + /* let's see if anything changed */ + update_pending_job(cluster, NULL); + } else if (in_pending_heap(t)) { + bheap_delete(edf_pending_order, &cluster->pending_jobs, + tsk_rt(t)->pending_node); + } +} + +#endif + + +/* cedf_job_arrival: task is either resumed or released */ +static noinline void cedf_job_arrival(struct task_struct* task) +{ + cedf_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + + requeue(task); + check_for_preemptions(cluster); +} + + +static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain); + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + __merge_ready(&cluster->domain, tasks); + check_for_preemptions(cluster); + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); +} + +/* caller holds cedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + +#ifdef CONFIG_LITMUS_LOCKING + job_pending_exit(t); +#endif + + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + set_rt_flags(t, RT_F_RUNNING); + if (is_running(t)) + cedf_job_arrival(t); +} + +/* cedf_tick - this function is called for every local timer + * interrupt. 
+ * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void cedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("cedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("cedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* cedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries); + cedf_domain_t *cluster = entry->cluster; + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + +#ifdef CONFIG_LITMUS_LOCKING + int priodon; +#else +#define priodon 0 +#endif + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. 
+ */ + if (cluster->domain.release_master == entry->cpu) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&cluster->cluster_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef CONFIG_LITMUS_LOCKING + priodon = exists && (tsk_rt(entry->scheduled)->waiting_eligible || + /* can't allow job to exit until request is over */ + (tsk_rt(entry->scheduled)->is_donor && sleep)); + + /* this should never happend together (at least we don't handle it atm) */ + BUG_ON(priodon && blocks); +#endif + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked cedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d priodon:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev), priodon); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks || priodon) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Do not unlink since entry->scheduled is currently in the ready queue. + * We don't process out_of_time and sleep until the job is preemptive again. + */ + if (np && (out_of_time || preempt || sleep)) { + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt + && !priodon) + /* note: priority donation prevents job completion */ + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + + if (!entry->linked) { +#ifdef CONFIG_LITMUS_LOCKING + struct task_struct *pulled; + int reeval; + do { + pulled = __take_ready(&cluster->domain); + reeval = 0; + if (pulled && !is_pending(pulled)) { + /* Pulled an un-processed task from the ready queue. */ + TRACE_TASK(pulled, "pulled unprocessed\n"); + reeval = update_pending_job(cluster, pulled); + if (reeval) + /* priority may have changed --- try again */ + requeue(pulled); + } + } while (reeval); + link_task_to_cpu(pulled, entry); +#else + link_task_to_cpu(__take_ready(&cluster->domain), entry); +#endif + } + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks || priodon) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? 
*/ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + sched_state_task_picked(); + raw_spin_unlock(&cluster->cluster_lock); + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("cedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void cedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + + +/* Prepare a task for running in RT mode + */ +static void cedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + cedf_domain_t* cluster; + + TRACE("gsn edf: task new %d\n", t->pid); + + /* the cluster doesn't change even if t is running */ + cluster = task_cpu_cluster(t); + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + /* setup job params */ + release_at(t, litmus_clock()); + +#ifdef CONFIG_LITMUS_LOCKING + tsk_rt(t)->pending_node = bheap_node_alloc(GFP_ATOMIC | __GFP_NOFAIL); + bheap_node_init(&tsk_rt(t)->pending_node, t); + tsk_rt(t)->pending_on = NO_CPU; + add_to_pending(cluster, t); +#endif + + if (running) { + entry = &per_cpu(cedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != cluster->domain.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + cedf_job_arrival(t); + raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags); +} + +static void cedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + cedf_domain_t *cluster; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(task); + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + cedf_job_arrival(task); + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); +} + +static void cedf_task_block(struct task_struct *t) +{ + unsigned long flags; + cedf_domain_t *cluster; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + BUG_ON(!is_realtime(t)); +} + +#ifdef CONFIG_LITMUS_LOCKING +static void cedf_pre_setsched(struct task_struct *t, int policy) +{ + + unsigned long flags; + cedf_domain_t *cluster = task_cpu_cluster(t); + + int delay_donor_exit = 0; + + while (1) { + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + TRACE_CUR("cedf_pre_setsched wait:%u pend:%d donor:%u req:%u\n", + tsk_rt(t)->waiting_eligible, + tsk_rt(t)->pending_on, tsk_rt(t)->is_donor, + tsk_rt(t)->request_incomplete); + + delay_donor_exit = tsk_rt(current)->is_donor; + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + if (!delay_donor_exit) + break; + + TRACE_CUR("donor exit delay\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } +} +#endif + +static void cedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + cedf_domain_t *cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + unlink(t); + +#ifdef CONFIG_LITMUS_LOCKING + /* make sure it's not pending anymore */ + job_pending_exit(t); + bheap_node_free(tsk_rt(t)->pending_node); +#endif + + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cpu_entry_t *cpu; + cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on); + cpu->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include + +/* NOTE: we use fake suspensions because we must wake the task from within the + * scheduler */ + +/* suspend until the current task becomes eligible to issue a lock request */ +static void priodon_become_eligible(void) +{ + struct task_struct* t = current; + unsigned long flags; + cedf_domain_t *cluster; + + cluster = task_cpu_cluster(t); + + do { + TRACE_CUR("priodon: checking whether request may be issued\n"); + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + if (tsk_rt(t)->pending_on == NO_CPU || + tsk_rt(t)->is_donor) { + /* nope, gotta wait */ + tsk_rt(t)->waiting_eligible = 1; + TRACE_CUR("priodon: not eligible pend:%u donor:%u\n", + tsk_rt(t)->pending_on, tsk_rt(t)->is_donor); + } else { + /* alright! we are good to go! 
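+ * (t is currently pending on a CPU and is not acting as a priority
+ * donor, so it may enter its critical section. Setting
+ * request_incomplete below records the outstanding request: should t
+ * later lose its slot among the c highest-priority pending jobs,
+ * update_pending_job() will give it a priority donor until
+ * priodon_complete_request() clears the flag.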
*/ + tsk_rt(t)->request_incomplete = 1; + TRACE_CUR("priodon: request issued\n"); + } + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + if (tsk_rt(t)->waiting_eligible) { + TRACE_CUR("priodon: fake suspending\n"); + TS_LOCK_SUSPEND; + schedule(); + TS_LOCK_RESUME; + } + + } while (!tsk_rt(t)->request_incomplete); +} + +/* current task has completed its request */ +static void priodon_complete_request(void) +{ + struct task_struct* t = current; + struct task_struct* donor; + unsigned long flags; + cedf_domain_t *cluster; + + cluster = task_cpu_cluster(t); + + preempt_disable(); + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + TRACE_CUR("priodon: completing request\n"); + + if (tsk_rt(t)->inh_task) { + /* we have a donor job --- see if we need to wake it */ + donor = tsk_rt(t)->inh_task; + undonate_priority(t, donor); + + if (fake_resume(donor)) + check_for_preemptions(cluster); + } + + tsk_rt(t)->request_incomplete = 0; + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + preempt_enable(); +} + +/* struct for semaphore with priority inheritance */ +struct omlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t fifo_wait; +}; + +static inline struct omlp_semaphore* omlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct omlp_semaphore, litmus_lock); +} + +static int cedf_omlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + priodon_become_eligible(); + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->fifo_wait, &wait); + + TS_LOCK_SUSPEND; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + schedule(); + + TS_LOCK_RESUME; + + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + } + + return 0; +} + +static int cedf_omlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->fifo_wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + priodon_complete_request(); + +out: + return err; +} + +static int cedf_omlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + if (owner) + cedf_omlp_unlock(l); + + return 0; +} + +static void cedf_omlp_free(struct litmus_lock* lock) +{ + kfree(omlp_from_lock(lock)); +} + 
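+/* Illustrative summary (not part of the algorithm itself): a job's critical
+ * section under this OMLP variant with priority donation proceeds roughly as
+ * follows.
+ *
+ *   1. cedf_omlp_lock(): priodon_become_eligible() fake-suspends the job
+ *      until it is one of the cluster's c highest-priority pending jobs and
+ *      not itself a donor; only then is the request issued.
+ *   2. If the semaphore is already held, the job waits in FIFO order on
+ *      fifo_wait; otherwise it becomes the owner immediately.
+ *   3. cedf_omlp_unlock(): ownership is passed to the first FIFO waiter (if
+ *      any), then priodon_complete_request() returns any donated priority
+ *      and wakes a fake-suspended donor.
+ *
+ * From user space this would be driven through the usual liblitmus object
+ * calls; the helper names below are assumptions given for illustration only:
+ *
+ *   int od = od_open(fd, OMLP_SEM, 0);   // assumed liblitmus helper
+ *   litmus_lock(od);                     // ends up in cedf_omlp_lock()
+ *   ... critical section ...
+ *   litmus_unlock(od);                   // ends up in cedf_omlp_unlock()
+ */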
+static struct litmus_lock_ops cedf_omlp_lock_ops = { + .close = cedf_omlp_close, + .lock = cedf_omlp_lock, + .unlock = cedf_omlp_unlock, + .deallocate = cedf_omlp_free, +}; + +static struct litmus_lock* cedf_new_omlp(void) +{ + struct omlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->fifo_wait); + sem->litmus_lock.ops = &cedf_omlp_lock_ops; + + return &sem->litmus_lock; +} + +static long cedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + switch (type) { + + case OMLP_SEM: + /* O(m) Multiprocessor Locking Protocol */ + *lock = cedf_new_omlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + + +#endif + +static long cedf_admit_task(struct task_struct* tsk) +{ + if (task_cpu(tsk) == tsk->rt_param.task_params.cpu) { +#ifdef CONFIG_LITMUS_LOCKING + +#endif + return 0; + } + else + return -EINVAL; +} + +/* total number of cluster */ +static int num_clusters; +/* we do not support cluster of different sizes */ +static unsigned int cluster_size; + +#ifdef VERBOSE_INIT +static void print_cluster_topology(cpumask_var_t mask, int cpu) +{ + int chk; + char buf[255]; + + chk = cpulist_scnprintf(buf, 254, mask); + buf[chk] = '\0'; + printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf); + +} +#endif + +static int clusters_allocated = 0; + +static void cleanup_cedf(void) +{ + int i; + + if (clusters_allocated) { + for (i = 0; i < num_clusters; i++) { + free_cpumask_var(cedf[i].cpu_map); + } + + kfree(cedf); + } +} + +static long cedf_activate_plugin(void) +{ + int i, j, cpu, ccpu, cpu_count; + cpu_entry_t *entry; + + cpumask_var_t mask; + int chk = 0; + + /* de-allocate old clusters, if any */ + cleanup_cedf(); + + printk(KERN_INFO "C-EDF: Activate Plugin, cluster configuration = %d\n", + cluster_config); + + /* need to get cluster_size first */ + if(!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + + if (unlikely(cluster_config == GLOBAL_CLUSTER)) { + cluster_size = num_online_cpus(); + } else { + chk = get_shared_cpu_map(mask, 0, cluster_config); + if (chk) { + /* if chk != 0 then it is the max allowed index */ + printk(KERN_INFO "C-EDF: Cluster configuration = %d " + "is not supported on this hardware.\n", + cluster_config); + /* User should notice that the configuration failed, so + * let's bail out. 
*/ + return -EINVAL; + } + + cluster_size = cpumask_weight(mask); + } + + if ((num_online_cpus() % cluster_size) != 0) { + /* this can't be right, some cpus are left out */ + printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n", + num_online_cpus(), cluster_size); + return -1; + } + + num_clusters = num_online_cpus() / cluster_size; + printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n", + num_clusters, cluster_size); + + /* initialize clusters */ + cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC); + for (i = 0; i < num_clusters; i++) { + bheap_init(&(cedf[i].cpu_heap)); +#ifdef CONFIG_LITMUS_LOCKING + bheap_init(&(cedf[i].pending_jobs)); + bheap_init(&(cedf[i].pending_cpus)); +#endif + edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs); + + if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) + return -ENOMEM; +#ifdef CONFIG_RELEASE_MASTER + cedf[i].domain.release_master = atomic_read(&release_master_cpu); +#endif + } + + /* cycle through cluster and add cpus to them */ + for (i = 0; i < num_clusters; i++) { + + for_each_online_cpu(cpu) { + /* check if the cpu is already in a cluster */ + for (j = 0; j < num_clusters; j++) + if (cpumask_test_cpu(cpu, cedf[j].cpu_map)) + break; + /* if it is in a cluster go to next cpu */ + if (j < num_clusters && + cpumask_test_cpu(cpu, cedf[j].cpu_map)) + continue; + + /* this cpu isn't in any cluster */ + /* get the shared cpus */ + if (unlikely(cluster_config == GLOBAL_CLUSTER)) + cpumask_copy(mask, cpu_online_mask); + else + get_shared_cpu_map(mask, cpu, cluster_config); + + cpumask_copy(cedf[i].cpu_map, mask); +#ifdef VERBOSE_INIT + print_cluster_topology(mask, cpu); +#endif + /* add cpus to current cluster and init cpu_entry_t */ + cpu_count = 0; + cedf[i].num_cpus = 0; + for_each_cpu(ccpu, cedf[i].cpu_map) { + + entry = &per_cpu(cedf_cpu_entries, ccpu); + atomic_set(&entry->will_schedule, 0); + entry->cpu = ccpu; + entry->cluster = &cedf[i]; + entry->hn = cpu_nodes + ccpu; + bheap_node_init(&entry->hn, entry); + +#ifdef CONFIG_LITMUS_LOCKING + entry->pending_hn = pending_nodes + ccpu; + bheap_node_init(&entry->pending_hn, entry); + entry->pending = NULL; +#endif + + cpu_count++; + + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + /* only add CPUs that should schedule jobs */ + if (entry->cpu != entry->cluster->domain.release_master) +#endif + { + cedf[i].num_cpus++; + update_cpu_position(entry); +#ifdef CONFIG_LITMUS_LOCKING + update_pending_position(entry); +#endif + } + } + /* done with this cluster */ + break; + } + } + + free_cpumask_var(mask); + clusters_allocated = 1; + return 0; +} + +/* Plugin object */ +static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "C-EDF", + .finish_switch = cedf_finish_switch, + .tick = cedf_tick, + .task_new = cedf_task_new, + .complete_job = complete_job, + .task_exit = cedf_task_exit, + .schedule = cedf_schedule, + .task_wake_up = cedf_task_wake_up, + .task_block = cedf_task_block, + .admit_task = cedf_admit_task, + .activate_plugin = cedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = cedf_allocate_lock, + .pre_setsched = cedf_pre_setsched, +#endif +}; + +static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL; + +static int __init init_cedf(void) +{ + int err, fs; + + err = register_sched_plugin(&cedf_plugin); + if (!err) { + fs = make_plugin_proc_dir(&cedf_plugin, &cedf_dir); + if (!fs) + cluster_file = create_cluster_file(cedf_dir, &cluster_config); + else + printk(KERN_ERR 
"Could not allocate C-EDF procfs dir.\n"); + } + return err; +} + +static void clean_cedf(void) +{ + cleanup_cedf(); + if (cluster_file) + remove_proc_entry("cluster", cedf_dir); + if (cedf_dir) + remove_plugin_proc_dir(&cedf_plugin); +} + +module_init(init_cedf); +module_exit(clean_cedf); diff --git a/litmus/sched_cedf.c.rej b/litmus/sched_cedf.c.rej new file mode 100644 index 000000000000..ec74da6c4a64 --- /dev/null +++ b/litmus/sched_cedf.c.rej @@ -0,0 +1,53 @@ +--- litmus/sched_cedf.c ++++ litmus/sched_cedf.c +@@ -739,6 +1100,12 @@ + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + ++#ifdef CONFIG_LITMUS_LOCKING ++ int priodon; ++#else ++#define priodon 0 ++#endif ++ + #ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. +@@ -750,7 +1117,6 @@ + #endif + + raw_spin_lock(&cluster->cluster_lock); +- clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); +@@ -1032,7 +1466,15 @@ + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); ++ + unlink(t); ++ ++#ifdef CONFIG_LITMUS_LOCKING ++ /* make sure it's not pending anymore */ ++ job_pending_exit(t); ++ bheap_node_free(tsk_rt(t)->pending_node); ++#endif ++ + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cpu_entry_t *cpu; + cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on); +@@ -1446,7 +2140,13 @@ + /* only add CPUs that should schedule jobs */ + if (entry->cpu != entry->cluster->domain.release_master) + #endif ++ { ++ cedf[i].num_cpus++; + update_cpu_position(entry); ++#ifdef CONFIG_LITMUS_LOCKING ++ update_pending_position(entry); ++#endif ++ } + } + /* done with this cluster */ + break; diff --git a/litmus/sched_gfl_split_namechange.c b/litmus/sched_gfl_split_namechange.c new file mode 100644 index 000000000000..c154b115a00e --- /dev/null +++ b/litmus/sched_gfl_split_namechange.c @@ -0,0 +1,1149 @@ +/* + * litmus/sched_gfl_split.c + * + * Implementation of the G-FL with job splitting. See the Erickson/Anderson + * paper at ECRTS 2012 for a description of G-FL. + * + * This plugin is a modified version of the prior GSN-EDF-split plugin in + * litmus/sched_gsn_edf_split.c. Job splitting works the same way as in that + * plugin. The subjob "deadlines" (really priorities) are computed according + * to G-FL with respect to the post-split (smaller) jobs. 
+ * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#include + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct bheap_node* hn; + struct hrtimer split_timer; + int timer_armed; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +cpu_entry_t* gsnedf_cpus[NR_CPUS]; + +/* the cpus queue themselves according to priority in here */ +static struct bheap_node gsnedf_heap_node[NR_CPUS]; +static struct bheap gsnedf_cpu_heap; + +static rt_domain_t gsnedf; +#define gsnedf_lock (gsnedf.ready_lock) + +inline static int get_slice_num(struct task_struct* t) +{ + int basic = ((t->rt_param.job_params.exec_time * + t->rt_param.task_params.split) / + t->rt_param.task_params.exec_cost) + 1; + if (basic <= t->rt_param.task_params.split){ + return basic; + } + else{ + /*Since we don't police budget, just leave where it's at.*/ + return t->rt_param.task_params.split; + } +} + +/* Returns the appropriate subjob deadline.*/ +inline static lt_t get_proper_deadline(struct task_struct* t) +{ + unsigned int num_cpus = num_online_cpus(); + return t->rt_param.job_params.release + + ((t->rt_param.task_params.period * get_slice_num(t)) + / t->rt_param.task_params.split) + /* G-FL correction */ + - (((num_cpus - 1) * t->rt_param.task_params.exec_cost) + / (num_cpus * t->rt_param.task_params.split)); +} + +/* Tells us if the current deadline is too small.*/ +inline static int needs_deadline_move(struct task_struct* t) +{ + BUG_ON(get_proper_deadline(t) < t->rt_param.job_params.subjob_deadline); +#ifdef CONFIG_LITMUS_LOCKING + return !is_in_crit_section(t) && + (get_proper_deadline(t) != + tsk_rt(t)->job_params.subjob_deadline); +#else + return get_proper_deadline(t) != tsk_rt(t)->job_params.subjob_deadline; +#endif +} + +/*Returns execution time until the next deadline move. + * 0 means the task has no more deadline moves + */ +inline static lt_t time_to_next_move(struct task_struct* t) +{ + if (get_slice_num(t) == t->rt_param.task_params.split){ + return 0; + } + /* +1 upper bounds ceiling, since integer division is floor*/ + return ((get_slice_num(t) * t->rt_param.task_params.exec_cost) + / t->rt_param.task_params.split) + 1 + - t->rt_param.job_params.exec_time; +} + +/* Timer stuff - similar to budget.c. */ +static enum hrtimer_restart on_split_timeout(struct hrtimer *timer) +{ + cpu_entry_t* st = container_of(timer, + cpu_entry_t, + split_timer); + + unsigned long flags; + + local_irq_save(flags); + TRACE("split timer fired.\n"); + st->timer_armed = 0; + /* Activate scheduler */ + litmus_reschedule_local(); + local_irq_restore(flags); + + return HRTIMER_NORESTART; +} + +static void cancel_split_timer(cpu_entry_t* ce) +{ + int ret; + + TRACE("cancelling split time.\n"); + + /* Since interrupts are disabled and et->timer_armed is only + * modified locally, we do not need any locks. + */ + + if (ce->timer_armed) { + ret = hrtimer_try_to_cancel(&ce->split_timer); + /* Should never be inactive. 
*/ + BUG_ON(ret == 0); + /* Should never be running concurrently.*/ + BUG_ON(ret == -1); + + ce->timer_armed = 0; + } +} + +/* assumes called with IRQs off */ +static void arm_split_timer(cpu_entry_t *ce, + struct task_struct* t) +{ + lt_t when_to_fire; + lt_t time_to_move; + TRACE_TASK(t, "arming split timer.\n"); + + /* __hrtimer_start_range_ns() cancels the timer + * anyway, so we don't have to check whether it is still armed */ + + /*We won't do any new deadline moves if the budget has been exhausted*/ + if (likely(!is_np(t) && (time_to_move = time_to_next_move(t)))) { + when_to_fire = litmus_clock() + time_to_move; + TRACE_TASK(t, "actually arming for %llu into the future\n", + time_to_move); + __hrtimer_start_range_ns(&ce->split_timer, + ns_to_ktime(when_to_fire), + 0 /* delta */, + HRTIMER_MODE_ABS_PINNED, + 0 /* no wakeup */); + ce->timer_armed = 1; + } +} + +/* Uncomment this if you want to see all scheduling decisions in the + * TRACE() log. +#define WANT_ALL_SCHED_EVENTS + */ + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_split_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); +} + +/* caller must hold gsnedf lock */ +static cpu_entry_t* lowest_prio_cpu(void) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. 
Must hold gsnedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(&gsnedf, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + add_release(&gsnedf, task); + } +} + +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start) +{ + cpu_entry_t *affinity; + + get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries, +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master +#else + NO_CPU +#endif + ); + + return(affinity); +} +#endif + +/* check for any necessary preemptions */ +static void check_for_preemptions(void) +{ + struct task_struct *task; + cpu_entry_t *last; + + for (last = lowest_prio_cpu(); + edf_split_preemption_needed(&gsnedf, last->linked); + last = lowest_prio_cpu()) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t *affinity = + gsnedf_get_nearest_available_cpu( + &per_cpu(gsnedf_cpu_entries, + task_cpu(task))); + if (affinity) + last = affinity; + else if (last->linked) + requeue(last->linked); + } +#else + if (last->linked) + requeue(last->linked); +#endif + + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + BUG_ON(!task); + + requeue(task); + check_for_preemptions(); +} + +static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + __merge_ready(rt, tasks); + check_for_preemptions(); + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + /* prepare_for_next_period assumes implicit deadlines and no splitting, + * so we call it with the job deadline it expects. + */ + t->rt_param.job_params.deadline = t->rt_param.job_params.release + + t->rt_param.task_params.period; + prepare_for_next_period(t); + /* We now set the subjob deadline to what it should be for scheduling + * priority. 
+ */ + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + gsnedf_job_arrival(t); +} + +static void move_deadline(struct task_struct *t) +{ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + /* Check if rescheduling needed with lower priority. */ + unlink(t); + gsnedf_job_arrival(t); +} + +/* gsnedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* gsnedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks, needs_move; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. 
+ */ + if (unlikely(gsnedf.release_master == entry->cpu)) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&gsnedf_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + needs_move = exists && needs_deadline_move(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d needs_move:%d np:%d" + " sleep:%d preempt:%d state:%d sig:%d\n", + blocks, out_of_time, needs_move, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + * + * Job deadline moves handled similarly + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + else if (np && needs_move) { + move_deadline(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + else if (!np && needs_move && !blocks && !preempt) { + move_deadline(entry->scheduled); + } + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id()); + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + sched_state_task_picked(); + + raw_spin_unlock(&gsnedf_lock); + + if (next) { + arm_split_timer(entry, next); + } + else if (entry->timer_armed) { + cancel_split_timer(entry); + } + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("gsnedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + +static void gsnedf_release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + set_rt_flags(t, RT_F_RUNNING); +} + +/* Prepare a task for running in RT mode + */ +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + + TRACE("gsn edf: task new %d\n", t->pid); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + /* setup job params */ + gsnedf_release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != gsnedf.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + gsnedf_job_arrival(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + gsnedf_release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + gsnedf_job_arrival(task); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void gsnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + + +static long gsnedf_admit_task(struct task_struct* tsk) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + raw_spin_lock(&gsnedf_lock); + + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&gsnedf.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = + !bheap_decrease(edf_split_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&gsnedf.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. 
*/ + bheap_uncache_min(edf_split_ready_order, + &gsnedf.ready_queue); + check_for_preemptions(); + } + } + + raw_spin_unlock(&gsnedf_lock); +} + +/* called with IRQs off */ +static void update_unlocked_priority(struct task_struct* t) +{ + raw_spin_lock(&gsnedf_lock); + + /* A job only stops inheriting a priority when it releases a + * resource. Thus we can make the following assumption.*/ + BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU); + + /* Clear priority inheritance */ + TRACE_TASK(t, "priority restored\n"); + tsk_rt(t)->inh_task = NULL; + + /* Update splitting deadline */ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + gsnedf_job_arrival(t); + + raw_spin_unlock(&gsnedf_lock); +} + + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} + +/* caller is responsible for locking */ +static struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &sem->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_split_higher_prio(queued, found)) + found = queued; + } + return found; +} + +int gsnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + cpu_entry_t* entry; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + entry = &__get_cpu_var(gsnedf_cpu_entries); + + tsk_rt(t)->in_crit_section = 1; + if (entry->timer_armed) { + cancel_split_timer(entry); + } + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_split_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_split_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int gsnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + set_priority_inheritance(next, sem->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* We are no longer in the critical section */ + tsk_rt(t)->in_crit_section = 0; + + /* we lose the benefit of priority inheritance (if any) and may need + * to move the deadline. In either case, may need to reschedule + * due to reduced priority. */ + if (tsk_rt(t)->inh_task || needs_deadline_move(t)) + update_unlocked_priority(t); + /* TODO: Check that schedule() gets called - it needs to arm the + * enforcement timer. Otherwise we should do it here or in + * update_unlocked_priority. */ + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return err; +} + +int gsnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + gsnedf_fmlp_unlock(l); + + return 0; +} + +void gsnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_fmlp_lock_ops = { + .close = gsnedf_fmlp_close, + .lock = gsnedf_fmlp_lock, + .unlock = gsnedf_fmlp_unlock, + .deallocate = gsnedf_fmlp_free, +}; + +static struct litmus_lock* gsnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + +/* **** lock constructor **** */ + + +static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + /* GSN-EDF currently only supports the FMLP for global resources. 
*/ + switch (type) { + + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = gsnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + +#endif + + +static long gsnedf_activate_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master = atomic_read(&release_master_cpu); +#endif + + for_each_online_cpu(cpu) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + bheap_node_init(&entry->hn, entry); + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + if (cpu != gsnedf.release_master) { +#endif + TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu); + update_cpu_position(entry); +#ifdef CONFIG_RELEASE_MASTER + } else { + TRACE("GSN-EDF: CPU %d is release master.\n", cpu); + } +#endif + } + return 0; +} + +/* Plugin object */ +static struct sched_plugin gfl_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .finish_switch = gsnedf_finish_switch, + .tick = gsnedf_tick, + .task_new = gsnedf_task_new, + .complete_job = complete_job, + .task_exit = gsnedf_task_exit, + .schedule = gsnedf_schedule, + .release_at = gsnedf_release_at, + .task_wake_up = gsnedf_task_wake_up, + .task_block = gsnedf_task_block, + .admit_task = gsnedf_admit_task, + .activate_plugin = gsnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = gsnedf_allocate_lock, +#endif +}; + + +static int __init init_gfl(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + gsnedf_cpus[cpu] = entry; + entry->cpu = cpu; + entry->hn = &gsnedf_heap_node[cpu]; + hrtimer_init(&entry->split_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + entry->split_timer.function = on_split_timeout; + bheap_node_init(&entry->hn, entry); + } + edf_split_domain_init(&gsnedf, NULL, gsnedf_release_jobs); + return register_sched_plugin(&gfl_plugin); +} + + +module_init(init_gfl); diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c new file mode 100644 index 000000000000..9debea981419 --- /dev/null +++ b/litmus/sched_gsn_edf.c @@ -0,0 +1,1286 @@ +/* + * litmus/sched_gsn_edf.c + * + * Implementation of the GSN-EDF scheduling algorithm. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +/* Overview of GSN-EDF operations. + * + * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This + * description only covers how the individual operations are implemented in + * LITMUS. + * + * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage + * structure (NOT the actually scheduled + * task). If there is another linked task To + * already it will set To->linked_on = NO_CPU + * (thereby removing its association with this + * CPU). However, it will not requeue the + * previously linked task (if any). It will set + * T's state to RT_F_RUNNING and check whether + * it is already running somewhere else. If T + * is scheduled somewhere else it will link + * it to that CPU instead (and pull the linked + * task to cpu). T may be NULL. + * + * unlink(T) - Unlink removes T from all scheduler data + * structures. 
If it is linked to some CPU it + * will link NULL to that CPU. If it is + * currently queued in the gsnedf queue it will + * be removed from the rt_domain. It is safe to + * call unlink(T) if T is not linked. T may not + * be NULL. + * + * requeue(T) - Requeue will insert T into the appropriate + * queue. If the system is in real-time mode and + * the T is released already, it will go into the + * ready queue. If the system is not in + * real-time mode is T, then T will go into the + * release queue. If T's release time is in the + * future, it will go into the release + * queue. That means that T's release time/job + * no/etc. has to be updated before requeu(T) is + * called. It is not safe to call requeue(T) + * when T is already queued. T may not be NULL. + * + * gsnedf_job_arrival(T) - This is the catch all function when T enters + * the system after either a suspension or at a + * job release. It will queue T (which means it + * is not safe to call gsnedf_job_arrival(T) if + * T is already queued) and then check whether a + * preemption is necessary. If a preemption is + * necessary it will update the linkage + * accordingly and cause scheduled to be called + * (either with an IPI or need_resched). It is + * safe to call gsnedf_job_arrival(T) if T's + * next job has not been actually released yet + * (releast time in the future). T will be put + * on the release queue in that case. + * + * job_completion(T) - Take care of everything that needs to be done + * to prepare T for its next release and place + * it in the right queue with + * gsnedf_job_arrival(). + * + * + * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is + * equivalent to unlink(T). Note that if you unlink a task from a CPU none of + * the functions will automatically propagate pending task from the ready queue + * to a linked task. This is the job of the calling function ( by means of + * __take_ready). + */ + + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct bheap_node* hn; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +cpu_entry_t* gsnedf_cpus[NR_CPUS]; + +/* the cpus queue themselves according to priority in here */ +static struct bheap_node gsnedf_heap_node[NR_CPUS]; +static struct bheap gsnedf_cpu_heap; + +static rt_domain_t gsnedf; +#define gsnedf_lock (gsnedf.ready_lock) + + +/* Uncomment this if you want to see all scheduling decisions in the + * TRACE() log. +#define WANT_ALL_SCHED_EVENTS + */ + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); +} + +/* caller must hold gsnedf lock */ +static cpu_entry_t* lowest_prio_cpu(void) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. 
+ * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold gsnedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(&gsnedf, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. 
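/* Illustration (not part of the patch): a stand-alone replay of the swap in
 * link_task_to_cpu().  If the task we want to link to some CPU is still
 * scheduled on a different CPU, it is linked there instead and that CPU's
 * previously linked task is taken over.  The structs below are simplified
 * user-space stand-ins; RT_F_RUNNING and the heap update are omitted.
 */
#include <stdio.h>

#define NO_CPU (-1)

struct task {
	const char *name;
	int linked_on;		/* CPU this task is linked to, or NO_CPU */
	int scheduled_on;	/* CPU this task is running on, or NO_CPU */
};

struct cpu {
	int id;
	struct task *linked;
};

static void link_to(struct task *t, struct cpu *entry, struct cpu cpus[])
{
	if (entry->linked)
		entry->linked->linked_on = NO_CPU;

	if (t && t->scheduled_on != NO_CPU && t->scheduled_on != entry->id) {
		/* t is still running elsewhere: link it there and take
		 * over that CPU's old linked task (may be NULL). */
		struct cpu *sched = &cpus[t->scheduled_on];
		struct task *tmp = sched->linked;

		t->linked_on = sched->id;
		sched->linked = t;
		t = tmp;
	}
	if (t)
		t->linked_on = entry->id;
	entry->linked = t;
}

int main(void)
{
	struct cpu cpus[2] = { { 0, NULL }, { 1, NULL } };
	/* A was preempted on CPU 1: still scheduled there, no longer linked. */
	struct task a = { "A", NO_CPU, 1 };
	/* B preempted A and is currently linked to CPU 1. */
	struct task b = { "B", 1, NO_CPU };

	cpus[1].linked = &b;

	/* check_for_preemptions() now wants to link A to CPU 0.  The swap
	 * keeps A on the CPU it is still running on and moves B to CPU 0. */
	link_to(&a, &cpus[0], cpus);
	printf("CPU0 linked: %s\n", cpus[0].linked->name);
	printf("CPU1 linked: %s (A linked_on=%d, B linked_on=%d)\n",
	       cpus[1].linked->name, a.linked_on, b.linked_on);
	return 0;
}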
+ */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + add_release(&gsnedf, task); + } +} + +/* check for any necessary preemptions */ +static void check_for_preemptions(void) +{ + struct task_struct *task; + cpu_entry_t* last; + + for(last = lowest_prio_cpu(); + edf_preemption_needed(&gsnedf, last->linked); + last = lowest_prio_cpu()) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE_TASK(task, "attempting to link to P%d\n", + last->cpu); + if (last->linked) + requeue(last->linked); + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + BUG_ON(!task); + + requeue(task); + check_for_preemptions(); +} + +static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + __merge_ready(rt, tasks); + check_for_preemptions(); + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + gsnedf_job_arrival(t); +} + +/* gsnedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. 
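/* Illustration (not part of the patch): the check_for_preemptions() loop in
 * stand-alone form, with plain arrays standing in for the bheap-based CPU
 * heap and ready queue.  Smaller deadline = higher priority (EDF).  No
 * locking, and the preempted job is not requeued here; purely illustrative.
 */
#include <stdio.h>
#include <limits.h>

#define NCPUS 2

/* Linked job per CPU, expressed as its absolute deadline; 0 = idle.
 * An idle CPU counts as lowest priority, i.e. it is always preemptable. */
static unsigned long long linked[NCPUS];

/* Ready queue as a sorted array of deadlines (head = earliest). */
static unsigned long long ready[] = { 10, 25, 40 };
static int ready_head;

static int lowest_prio_cpu(void)
{
	int worst = 0;

	for (int c = 1; c < NCPUS; c++) {
		unsigned long long a = linked[c] ? linked[c] : ULLONG_MAX;
		unsigned long long w = linked[worst] ? linked[worst] : ULLONG_MAX;

		if (a > w)	/* later deadline (or idle) = lower priority */
			worst = c;
	}
	return worst;
}

static int preemption_needed(int cpu)
{
	if (ready_head >= (int)(sizeof(ready) / sizeof(ready[0])))
		return 0;			/* nothing ready */
	if (!linked[cpu])
		return 1;			/* idle CPU: always take work */
	return ready[ready_head] < linked[cpu];	/* earlier deadline wins */
}

/* Mirrors check_for_preemptions(): keep handing the earliest-deadline ready
 * job to the currently lowest-priority CPU until no preemption is needed. */
static void check_preemptions(void)
{
	for (int cpu = lowest_prio_cpu(); preemption_needed(cpu);
	     cpu = lowest_prio_cpu()) {
		printf("link job d=%llu to CPU %d (was d=%llu)\n",
		       ready[ready_head], cpu, linked[cpu]);
		linked[cpu] = ready[ready_head++];
		/* the real code would requeue the previously linked job */
	}
}

int main(void)
{
	linked[0] = 30;		/* CPU 0 runs a job with deadline 30 */
	linked[1] = 0;		/* CPU 1 is idle */
	check_preemptions();
	return 0;
}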
+ */ +static struct task_struct* gsnedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. + */ + if (gsnedf.release_master == entry->cpu) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&gsnedf_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt && !np) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Do not unlink since entry->scheduled is currently in the ready queue. + * We don't process out_of_time and sleep until the job is preemptive again. + */ + if (np && (out_of_time || preempt || sleep)) { + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id()); + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + sched_state_task_picked(); + + raw_spin_unlock(&gsnedf_lock); + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("gsnedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + + +/* Prepare a task for running in RT mode + */ +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + + TRACE("gsn edf: task new %d\n", t->pid); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + /* setup job params */ + release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != gsnedf.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + gsnedf_job_arrival(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
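/* Illustration (not part of the patch): the wake-up policy above in a
 * stand-alone form.  A job that resumes after blocking on a semaphore keeps
 * its current parameters; a job that wakes up past its deadline is treated
 * as a fresh sporadic release.  The flag and field names are simplified
 * stand-ins for the rt_param machinery.
 */
#include <stdio.h>

enum rt_flag { RT_RUNNING, RT_SLEEP, RT_EXIT_SEM };

struct job {
	enum rt_flag flag;
	unsigned long long release;	/* current job release time */
	unsigned long long deadline;	/* current job absolute deadline */
	unsigned long long period;
};

static void wake_up(struct job *j, unsigned long long now)
{
	if (j->flag == RT_EXIT_SEM) {
		/* woke up because a lock was handed over: same job continues */
		j->flag = RT_RUNNING;
	} else if (now >= j->deadline) {
		/* tardy wake-up: new sporadic release anchored at "now" */
		j->release = now;
		j->deadline = now + j->period;
		j->flag = RT_RUNNING;
	} else {
		/* came back before the deadline: keep the current job */
		j->flag = RT_RUNNING;
	}
}

int main(void)
{
	struct job j = { RT_SLEEP, 0, 10, 10 };

	wake_up(&j, 25);	/* tardy: re-released at 25, deadline 35 */
	printf("release=%llu deadline=%llu\n", j.release, j.deadline);
	return 0;
}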
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + gsnedf_job_arrival(task); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void gsnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + + +static long gsnedf_admit_task(struct task_struct* tsk) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include + + + +/* called with IRQs off */ +static void __set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&gsnedf.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if + * budget enforcement is used and this job overran. */ + check_preempt = + !bheap_decrease(edf_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&gsnedf.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. 
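/* Illustration (not part of the patch): the asymmetry discussed in the
 * comments above.  Raising a job's priority (a smaller key) only needs a
 * sift-up, which is what bheap_decrease() provides; lowering it is not
 * supported by the heap, so the task is unlinked and re-inserted instead.
 * Plain array-backed min-heap, purely for illustration.
 */
#include <stdio.h>

static unsigned long long heap[16];
static int size;

static void swap(int i, int j)
{
	unsigned long long t = heap[i]; heap[i] = heap[j]; heap[j] = t;
}

static void sift_up(int i)
{
	while (i > 0 && heap[i] < heap[(i - 1) / 2]) {
		swap(i, (i - 1) / 2);
		i = (i - 1) / 2;
	}
}

static void sift_down(int i)
{
	for (;;) {
		int l = 2 * i + 1, r = 2 * i + 2, m = i;

		if (l < size && heap[l] < heap[m]) m = l;
		if (r < size && heap[r] < heap[m]) m = r;
		if (m == i) break;
		swap(i, m);
		i = m;
	}
}

static void insert(unsigned long long key)
{
	heap[size++] = key;
	sift_up(size - 1);
}

/* "priority raised": new key is smaller, a sift-up suffices (bheap_decrease) */
static void decrease_key(int i, unsigned long long key)
{
	heap[i] = key;
	sift_up(i);
}

/* "priority lowered": remove the element and re-insert it with its new key,
 * which is what unlink() + gsnedf_job_arrival() accomplish above. */
static void remove_and_reinsert(int i, unsigned long long key)
{
	heap[i] = heap[--size];
	sift_down(i);
	sift_up(i);
	insert(key);
}

int main(void)
{
	insert(30); insert(50); insert(40);
	decrease_key(2, 10);		/* inherit a much earlier deadline */
	printf("min after inheritance: %llu\n", heap[0]);
	remove_and_reinsert(0, 60);	/* inheritance cleared, deadline 60 */
	printf("min after restore: %llu\n", heap[0]);
	return 0;
}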
*/ + bheap_uncache_min(edf_ready_order, + &gsnedf.ready_queue); + check_for_preemptions(); + } + } +} + +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + raw_spin_lock(&gsnedf_lock); + __set_priority_inheritance(t, prio_inh); + raw_spin_unlock(&gsnedf_lock); +} + +static void __clear_priority_inheritance(struct task_struct* t) +{ + /* A job only stops inheriting a priority when it releases a + * resource. Thus we can make the following assumption.*/ + BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU); + + TRACE_TASK(t, "priority restored\n"); + tsk_rt(t)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + gsnedf_job_arrival(t); +} + +/* set and clear at the same time to avoid having to + * acquire the runqueue lock twice */ +static void update_priority_inheritance( + struct task_struct* deprived, + struct task_struct* blocker, + struct task_struct* blocked) +{ + /* things to do: + * 1) deprived no longer inherits anything. + * 2) blocker gets blocked's priority. + */ + + raw_spin_lock(&gsnedf_lock); + + if (tsk_rt(deprived)->inh_task) + __clear_priority_inheritance(deprived); + + if (blocked) + __set_priority_inheritance(blocker, blocked); + + raw_spin_unlock(&gsnedf_lock); +} + + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} + +/* caller is responsible for locking */ +struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &sem->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + return found; +} + +int gsnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. 
We can thus check it without acquiring the spin + * lock. */ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int gsnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next, *blocked = NULL; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + blocked = sem->hp_waiter; + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task || blocked) + update_priority_inheritance(t, next, blocked); + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return err; +} + +int gsnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + gsnedf_fmlp_unlock(l); + + return 0; +} + +void gsnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_fmlp_lock_ops = { + .close = gsnedf_fmlp_close, + .lock = gsnedf_fmlp_lock, + .unlock = gsnedf_fmlp_unlock, + .deallocate = gsnedf_fmlp_free, +}; + +static struct litmus_lock* gsnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + + +/* ******************** OMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct omlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t fifo_wait; + /* Priority queue of waiting tasks */ + wait_queue_head_t prio_wait; + + /* How many slots remaining in FIFO queue? 
*/ + unsigned int num_free; +}; + +static inline struct omlp_semaphore* omlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct omlp_semaphore, litmus_lock); +} + +/* already locked */ +static void omlp_enqueue(struct omlp_semaphore *sem, prio_wait_queue_t* wait) +{ + if (sem->num_free) { + /* there is space in the FIFO queue */ + sem->num_free--; + __add_wait_queue_tail_exclusive(&sem->fifo_wait, &wait->wq); + } else { + /* nope, gotta go to the priority queue */ + __add_wait_queue_prio_exclusive(&sem->prio_wait, wait); + } +} + +/* already locked */ +static int omlp_move(struct omlp_semaphore *sem) +{ + struct list_head* first; + + if (waitqueue_active(&sem->prio_wait)) { + first = sem->prio_wait.task_list.next; + list_move_tail(first, &sem->fifo_wait.task_list); + return 1; + } + else + return 0; +} + +static struct task_struct* omlp_dequeue(struct omlp_semaphore *sem) +{ + struct task_struct* first = __waitqueue_remove_first(&sem->fifo_wait); + + if (first && !omlp_move(sem)) + sem->num_free++; + + return first; +} + +/* caller is responsible for locking */ +static struct task_struct* omlp_find_hp_waiter(struct omlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + /* check FIFO queue first */ + list_for_each(pos, &sem->fifo_wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + + /* check priority queue next */ + if (waitqueue_active(&sem->prio_wait)) { + /* first has highest priority */ + pos = sem->prio_wait.task_list.next; + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + if (edf_higher_prio(queued, found)) + found = queued; + } + + return found; +} + +int gsnedf_omlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + prio_wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_prio_waitqueue_entry(&wait, t, get_deadline(t)); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + omlp_enqueue(sem, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
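/* Illustration (not part of the patch): a stand-alone model of the OMLP
 * queueing discipline implemented by omlp_enqueue()/omlp_move()/
 * omlp_dequeue() above.  At most "num_free" additional waiters sit in a
 * FIFO (the owner already occupies one of the m slots, hence m - 1 below);
 * everyone else waits in a priority queue, ordered by deadline, and is
 * promoted into the FIFO as slots open up.  Arrays stand in for the wait
 * queues; all names are invented for the example.
 */
#include <stdio.h>
#include <string.h>

#define MAX 8

struct q { unsigned long long dl[MAX]; int n; };

static struct q fifo, prio;
static int num_free;			/* free FIFO slots */

static void prio_insert(unsigned long long d)
{
	int i = prio.n++;

	while (i > 0 && prio.dl[i - 1] > d) {	/* keep earliest deadline first */
		prio.dl[i] = prio.dl[i - 1];
		i--;
	}
	prio.dl[i] = d;
}

static void enqueue(unsigned long long d)
{
	if (num_free) {				/* omlp_enqueue(): FIFO slot left */
		num_free--;
		fifo.dl[fifo.n++] = d;
	} else {				/* overflow into the priority queue */
		prio_insert(d);
	}
}

/* omlp_dequeue(): pop the FIFO head; if the priority queue is non-empty,
 * promote its head into the FIFO (omlp_move()), else a slot becomes free. */
static unsigned long long dequeue(void)
{
	unsigned long long head = fifo.dl[0];

	memmove(fifo.dl, fifo.dl + 1, (--fifo.n) * sizeof(fifo.dl[0]));
	if (prio.n) {
		fifo.dl[fifo.n++] = prio.dl[0];
		memmove(prio.dl, prio.dl + 1, (--prio.n) * sizeof(prio.dl[0]));
	} else {
		num_free++;
	}
	return head;
}

int main(void)
{
	num_free = 3 - 1;	/* e.g. m = 3 CPUs: owner + 2 FIFO slots */
	enqueue(40); enqueue(10); enqueue(5); enqueue(20);

	/* FIFO keeps arrival order (40, 10); 5 and 20 wait by priority
	 * and are promoted in deadline order. */
	while (fifo.n)
		printf("granted to job with deadline %llu\n", dequeue());
	return 0;
}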
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + } + + return 0; +} + +static int gsnedf_omlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next, *blocked = NULL; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = omlp_dequeue(sem); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = omlp_find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + blocked = sem->hp_waiter; + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task || blocked) + update_priority_inheritance(t, next, blocked); + +out: + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + return err; +} + +static int gsnedf_omlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + if (owner) + gsnedf_omlp_unlock(l); + + return 0; +} + +static void gsnedf_omlp_free(struct litmus_lock* lock) +{ + kfree(omlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_omlp_lock_ops = { + .close = gsnedf_omlp_close, + .lock = gsnedf_omlp_lock, + .unlock = gsnedf_omlp_unlock, + .deallocate = gsnedf_omlp_free, +}; + +static struct litmus_lock* gsnedf_new_omlp(void) +{ + struct omlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->fifo_wait); + init_waitqueue_head(&sem->prio_wait); + sem->litmus_lock.ops = &gsnedf_omlp_lock_ops; + /* free = cpus -1 since ->owner is the head and also counted */ + sem->num_free = num_online_cpus() - 1; + +#ifdef CONFIG_RELEASE_MASTER + /* If we use dedicated interrupt handling, then there are actually + * only m - 1 CPUs around. */ + if (gsnedf.release_master != NO_CPU) + sem->num_free -= 1; +#endif + + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + + +static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + /* GSN-EDF currently only supports the FMLP for global resources. 
*/ + switch (type) { + + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = gsnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case OMLP_SEM: + /* O(m) Multiprocessor Locking Protocol */ + *lock = gsnedf_new_omlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + +#endif + + +static long gsnedf_activate_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master = atomic_read(&release_master_cpu); +#endif + + for_each_online_cpu(cpu) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + bheap_node_init(&entry->hn, entry); + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + if (cpu != gsnedf.release_master) { +#endif + TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu); + update_cpu_position(entry); +#ifdef CONFIG_RELEASE_MASTER + } else { + TRACE("GSN-EDF: CPU %d is release master.\n", cpu); + } +#endif + } + return 0; +} + +/* Plugin object */ +static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .finish_switch = gsnedf_finish_switch, + .tick = gsnedf_tick, + .task_new = gsnedf_task_new, + .complete_job = complete_job, + .task_exit = gsnedf_task_exit, + .schedule = gsnedf_schedule, + .task_wake_up = gsnedf_task_wake_up, + .task_block = gsnedf_task_block, + .admit_task = gsnedf_admit_task, + .activate_plugin = gsnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = gsnedf_allocate_lock, +#endif +}; + + +static int __init init_gsn_edf(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + gsnedf_cpus[cpu] = entry; + entry->cpu = cpu; + entry->hn = &gsnedf_heap_node[cpu]; + bheap_node_init(&entry->hn, entry); + } + edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs); + return register_sched_plugin(&gsn_edf_plugin); +} + + +module_init(init_gsn_edf); diff --git a/litmus/sched_gsn_edf_split_namechange.c b/litmus/sched_gsn_edf_split_namechange.c new file mode 100644 index 000000000000..6839ae642b3a --- /dev/null +++ b/litmus/sched_gsn_edf_split_namechange.c @@ -0,0 +1,1165 @@ +/* + * litmus/sched_gsn_edf.c + * + * Implementation of the GSN-EDF scheduling algorithm with job splitting, i.e. + * GSN-EDF. + * + * This plugin is a modified version of the prior GSN-EDF plugin in + * litmus/sched_gsn_edf.c + * + * Splitting an implicit-deadline job simply means splitting each job into an + * integral number of subjobs. For example, a task with a period of 10 ms and + * a runtime of 4 ms could be re-organized as a task with a period of 5 ms and + * a runtime of 2 ms, with analytical benefit for bounded tardiness (ignoring + * overheads and assuming no critical sections). This would have a "splitting + * factor" of 2. + * + * Because our analysis works with early releasing, we actually only release + * each job once, but move the subjob deadline back when the appropriate amount + * of execution has been completed. (In the example above, a job released at + * time 0 would intially have a subjob deadline at time 5, but this deadline + * would be moved to time 10 as soon as 2 ms of execution had completed.) 
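/* Illustration (not part of the patch): the splitting arithmetic described
 * above, runnable in user space.  get_slice_num() maps the execution time
 * consumed so far to a subjob index (1..split), and get_proper_deadline()
 * places that subjob's deadline at release + slice * period / split.  The
 * values are in milliseconds here purely for readability; the field names
 * are simplified stand-ins for rt_param.
 */
#include <stdio.h>

struct split_task {
	unsigned long long exec_cost;	/* total budget per job */
	unsigned long long period;	/* = relative deadline (implicit) */
	unsigned long long split;	/* number of subjobs per job */
};

static unsigned long long slice_num(const struct split_task *t,
				    unsigned long long exec_time)
{
	unsigned long long basic = exec_time * t->split / t->exec_cost + 1;

	/* budget is not policed, so never go past the last subjob */
	return basic <= t->split ? basic : t->split;
}

static unsigned long long proper_deadline(const struct split_task *t,
					  unsigned long long release,
					  unsigned long long exec_time)
{
	return release + t->period * slice_num(t, exec_time) / t->split;
}

int main(void)
{
	/* the example from the comment above: period 10 ms, cost 4 ms, split 2 */
	struct split_task t = { 4, 10, 2 };
	unsigned long long release = 0;

	for (unsigned long long done = 0; done <= 4; done++)
		printf("exec=%llu ms -> subjob %llu, deadline %llu ms\n",
		       done, slice_num(&t, done),
		       proper_deadline(&t, release, done));
	/* prints deadline 5 until 2 ms have executed, then deadline 10 */
	return 0;
}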
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#include + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct bheap_node* hn; + struct hrtimer split_timer; + int timer_armed; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +cpu_entry_t* gsnedf_cpus[NR_CPUS]; + +/* the cpus queue themselves according to priority in here */ +static struct bheap_node gsnedf_heap_node[NR_CPUS]; +static struct bheap gsnedf_cpu_heap; + +static rt_domain_t gsnedf; +#define gsnedf_lock (gsnedf.ready_lock) + +inline static int get_slice_num(struct task_struct* t) +{ + int basic = ((t->rt_param.job_params.exec_time * + t->rt_param.task_params.split) / + t->rt_param.task_params.exec_cost) + 1; + if (basic <= t->rt_param.task_params.split){ + return basic; + } + else{ + /*Since we don't police budget, just leave where it's at.*/ + return t->rt_param.task_params.split; + } +} + +/* Returns the appropriate subjob deadline.*/ +inline static lt_t get_proper_deadline(struct task_struct* t) +{ + return t->rt_param.job_params.release + + ((t->rt_param.task_params.period * get_slice_num(t)) + / t->rt_param.task_params.split); +} + +/* Tells us if the current deadline is too small.*/ +inline static int needs_deadline_move(struct task_struct* t) +{ + BUG_ON(get_proper_deadline(t) < t->rt_param.job_params.subjob_deadline); +#ifdef CONFIG_LITMUS_LOCKING + return !is_in_crit_section(t) && + (get_proper_deadline(t) != + tsk_rt(t)->job_params.subjob_deadline); +#else + return get_proper_deadline(t) != tsk_rt(t)->job_params.subjob_deadline; +#endif +} + +/*Returns execution time until the next deadline move. + * 0 means the task has no more deadline moves + */ +inline static lt_t time_to_next_move(struct task_struct* t) +{ + if (get_slice_num(t) == t->rt_param.task_params.split){ + return 0; + } + /* +1 upper bounds ceiling, since integer division is floor*/ + return ((get_slice_num(t) * t->rt_param.task_params.exec_cost) + / t->rt_param.task_params.split) + 1 + - t->rt_param.job_params.exec_time; +} + +/* Timer stuff - similar to budget.c. */ +static enum hrtimer_restart on_split_timeout(struct hrtimer *timer) +{ + cpu_entry_t* st = container_of(timer, + cpu_entry_t, + split_timer); + + unsigned long flags; + + local_irq_save(flags); + TRACE("split timer fired.\n"); + st->timer_armed = 0; + /* Activate scheduler */ + litmus_reschedule_local(); + local_irq_restore(flags); + + return HRTIMER_NORESTART; +} + +static void cancel_split_timer(cpu_entry_t* ce) +{ + int ret; + + TRACE("cancelling split time.\n"); + + /* Since interrupts are disabled and et->timer_armed is only + * modified locally, we do not need any locks. + */ + + if (ce->timer_armed) { + ret = hrtimer_try_to_cancel(&ce->split_timer); + /* Should never be inactive. 
*/ + BUG_ON(ret == 0); + /* Should never be running concurrently.*/ + BUG_ON(ret == -1); + + ce->timer_armed = 0; + } +} + +/* assumes called with IRQs off */ +static void arm_split_timer(cpu_entry_t *ce, + struct task_struct* t) +{ + lt_t when_to_fire; + lt_t time_to_move; + TRACE_TASK(t, "arming split timer.\n"); + + /* __hrtimer_start_range_ns() cancels the timer + * anyway, so we don't have to check whether it is still armed */ + + /*We won't do any new deadline moves if the budget has been exhausted*/ + if (likely(!is_np(t) && (time_to_move = time_to_next_move(t)))) { + when_to_fire = litmus_clock() + time_to_move; + TRACE_TASK(t, "actually arming for %llu into the future\n", + time_to_move); + __hrtimer_start_range_ns(&ce->split_timer, + ns_to_ktime(when_to_fire), + 0 /* delta */, + HRTIMER_MODE_ABS_PINNED, + 0 /* no wakeup */); + ce->timer_armed = 1; + } +} + +/* Uncomment this if you want to see all scheduling decisions in the + * TRACE() log. +#define WANT_ALL_SCHED_EVENTS + */ + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); +} + +/* caller must hold gsnedf lock */ +static cpu_entry_t* lowest_prio_cpu(void) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold gsnedf_lock. 
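/* Illustration (not part of the patch): the inversion in cpu_lower_prio()
 * above, shown on its own.  The comparator swaps its arguments so that the
 * heap root is the CPU whose linked job has the *lowest* priority, i.e. the
 * natural preemption victim; an idle CPU (NULL linked) beats everything.
 * A linear scan stands in for the bheap; names are invented for the example.
 */
#include <stdio.h>

struct job { unsigned long long deadline; };

struct cpu { int id; const struct job *linked; };

/* EDF: a beats b if a exists and has the earlier deadline (NULL loses). */
static int higher_prio(const struct job *a, const struct job *b)
{
	if (!a)
		return 0;
	if (!b)
		return 1;
	return a->deadline < b->deadline;
}

/* cpu_lower_prio(a, b): note the swapped arguments -- a orders before b
 * exactly when b's linked job has higher priority than a's. */
static int cpu_lower_prio(const struct cpu *a, const struct cpu *b)
{
	return higher_prio(b->linked, a->linked);
}

static const struct cpu *victim(const struct cpu *cpus, int n)
{
	const struct cpu *worst = &cpus[0];

	for (int i = 1; i < n; i++)
		if (cpu_lower_prio(&cpus[i], worst))
			worst = &cpus[i];
	return worst;
}

int main(void)
{
	struct job j1 = { 100 }, j2 = { 50 };
	struct cpu cpus[3] = { { 0, &j1 }, { 1, &j2 }, { 2, NULL } };

	/* CPU 2 is idle, so it is the first preemption victim. */
	printf("victim: CPU %d\n", victim(cpus, 3)->id);
	return 0;
}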
+ */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(&gsnedf, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + add_release(&gsnedf, task); + } +} + +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start) +{ + cpu_entry_t *affinity; + + get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries, +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master +#else + NO_CPU +#endif + ); + + return(affinity); +} +#endif + +/* check for any necessary preemptions */ +static void check_for_preemptions(void) +{ + struct task_struct *task; + cpu_entry_t *last; + + for (last = lowest_prio_cpu(); + edf_preemption_needed(&gsnedf, last->linked); + last = lowest_prio_cpu()) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t *affinity = + gsnedf_get_nearest_available_cpu( + &per_cpu(gsnedf_cpu_entries, + task_cpu(task))); + if (affinity) + last = affinity; + else if (last->linked) + requeue(last->linked); + } +#else + if (last->linked) + requeue(last->linked); +#endif + + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + BUG_ON(!task); + + requeue(task); + check_for_preemptions(); +} + +static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + __merge_ready(rt, tasks); + check_for_preemptions(); + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + /* prepare_for_next_period assumes implicit deadlines and no splitting, + * so we call it with the job deadline it expects. + */ + t->rt_param.job_params.deadline = t->rt_param.job_params.release + + t->rt_param.task_params.period; + prepare_for_next_period(t); + /* We now set the subjob deadline to what it should be for scheduling + * priority. + */ + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. 
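/* Illustration (not part of the patch): the two-step deadline handling in
 * job_completion() above, in isolation.  prepare_for_next_period() only
 * understands whole implicit-deadline jobs, so the real job deadline is
 * restored before calling it, and the subjob deadline of the new job is
 * recomputed afterwards.  Types and names are simplified stand-ins.
 */
#include <stdio.h>

struct tsk {
	unsigned long long period, split;
	unsigned long long release, deadline, subjob_deadline;
};

/* stand-in for prepare_for_next_period(): advance by one whole period */
static void prepare_next(struct tsk *t)
{
	t->release = t->deadline;	/* next job released at old deadline */
	t->deadline = t->release + t->period;
}

static void job_completion(struct tsk *t)
{
	/* 1) give prepare_next() the implicit deadline it expects */
	t->deadline = t->release + t->period;
	prepare_next(t);
	/* 2) first subjob of the new job: deadline one split-length in */
	t->subjob_deadline = t->release + t->period / t->split;
}

int main(void)
{
	struct tsk t = { 10, 2, 0, 10, 10 };	/* just finished its last subjob */

	job_completion(&t);
	printf("new release=%llu deadline=%llu subjob deadline=%llu\n",
	       t.release, t.deadline, t.subjob_deadline);
	return 0;
}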
*/ + if (is_running(t)) + gsnedf_job_arrival(t); +} + +static void move_deadline(struct task_struct *t) +{ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + TRACE_TASK(t, "move_deadline called\nRelease: %llu\nPeriod: %llu" + "\nRelease + Period: %llu\nDeadline: %llu" + "\nDeadline - Release: %llu\n", + t->rt_param.job_params.release, + t->rt_param.task_params.period, + t->rt_param.job_params.release + + t->rt_param.task_params.period, + t->rt_param.job_params.subjob_deadline, + t->rt_param.job_params.subjob_deadline + - t->rt_param.job_params.release); + /* Check if rescheduling needed with lower priority. */ + unlink(t); + gsnedf_job_arrival(t); +} + +/* gsnedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* gsnedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks, needs_move; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. 
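/* Illustration (not part of the patch): the decision order of
 * gsnedf_schedule() above, reduced to the boolean flags it derives.  This is
 * only a trace aid that prints which of the documented actions would be
 * taken for a given state; the flag names match the local variables in the
 * function, everything else is invented for the example.
 */
#include <stdio.h>

struct state {
	int blocks, out_of_time, needs_move, np, sleep, preempt;
};

static void decide(struct state s)
{
	if (s.blocks)
		printf("unlink the blocked job\n");

	if (s.np && (s.out_of_time || s.preempt || s.sleep))
		printf("non-preemptive: unlink + request_exit_np()\n");
	else if (s.np && s.needs_move)
		printf("non-preemptive: move the subjob deadline\n");

	if (!s.np && (s.out_of_time || s.sleep) && !s.blocks && !s.preempt)
		printf("job_completion(forced=%d)\n", !s.sleep);
	else if (!s.np && s.needs_move && !s.blocks && !s.preempt)
		printf("move the subjob deadline\n");

	printf("then: link a pending job if unlinked; switch if linked != scheduled\n");
}

int main(void)
{
	/* a preemptable job that has used up the current subjob's share of
	 * the budget but not the whole job: only the deadline moves */
	struct state s = { .needs_move = 1 };

	decide(s);
	return 0;
}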
+ */ + if (unlikely(gsnedf.release_master == entry->cpu)) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&gsnedf_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + needs_move = exists && needs_deadline_move(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d needs_move:%d np:%d" + " sleep:%d preempt:%d state:%d sig:%d\n", + blocks, out_of_time, needs_move, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + * + * Job deadline moves handled similarly + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + else if (np && needs_move) { + move_deadline(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + else if (!np && needs_move && !blocks && !preempt) { + move_deadline(entry->scheduled); + } + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id()); + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + sched_state_task_picked(); + + raw_spin_unlock(&gsnedf_lock); + + if (next) { + arm_split_timer(entry, next); + } + else if (entry->timer_armed) { + cancel_split_timer(entry); + } + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("gsnedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + +static void gsnedf_release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + set_rt_flags(t, RT_F_RUNNING); +} + +/* Prepare a task for running in RT mode + */ +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + + TRACE("gsn edf: task new %d\n", t->pid); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + /* setup job params */ + gsnedf_release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != gsnedf.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + gsnedf_job_arrival(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + gsnedf_release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + gsnedf_job_arrival(task); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void gsnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + + +static long gsnedf_admit_task(struct task_struct* tsk) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + raw_spin_lock(&gsnedf_lock); + + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&gsnedf.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = + !bheap_decrease(edf_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&gsnedf.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. 
*/ + bheap_uncache_min(edf_ready_order, + &gsnedf.ready_queue); + check_for_preemptions(); + } + } + + raw_spin_unlock(&gsnedf_lock); +} + +/* called with IRQs off */ +static void update_unlocked_priority(struct task_struct* t) +{ + raw_spin_lock(&gsnedf_lock); + + /* A job only stops inheriting a priority when it releases a + * resource. Thus we can make the following assumption.*/ + BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU); + + /* Clear priority inheritance */ + TRACE_TASK(t, "priority restored\n"); + tsk_rt(t)->inh_task = NULL; + + /* Update splitting deadline */ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + gsnedf_job_arrival(t); + + raw_spin_unlock(&gsnedf_lock); +} + + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} + +/* caller is responsible for locking */ +static struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &sem->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + return found; +} + +int gsnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + cpu_entry_t* entry; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + entry = &__get_cpu_var(gsnedf_cpu_entries); + + tsk_rt(t)->in_crit_section = 1; + if (entry->timer_armed) { + cancel_split_timer(entry); + } + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int gsnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + set_priority_inheritance(next, sem->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* We are no longer in a critical section */ + tsk_rt(t)->in_crit_section = 0; + + /* we lose the benefit of priority inheritance (if any) and may need + * to move the deadline. In either case, may need to reschedule + * due to reduced priority. */ + if (tsk_rt(t)->inh_task || needs_deadline_move(t)) + update_unlocked_priority(t); + /* TODO: Check that schedule() gets called - it needs to arm the + * enforcement timer. Otherwise we should do it here or in + * update_unlocked_priority. */ + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return err; +} + +int gsnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + gsnedf_fmlp_unlock(l); + + return 0; +} + +void gsnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_fmlp_lock_ops = { + .close = gsnedf_fmlp_close, + .lock = gsnedf_fmlp_lock, + .unlock = gsnedf_fmlp_unlock, + .deallocate = gsnedf_fmlp_free, +}; + +static struct litmus_lock* gsnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + +/* **** lock constructor **** */ + + +static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + /* GSN-EDF currently only supports the FMLP for global resources. 
*/ + switch (type) { + + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = gsnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + +#endif + + +static long gsnedf_activate_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master = atomic_read(&release_master_cpu); +#endif + + for_each_online_cpu(cpu) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + bheap_node_init(&entry->hn, entry); + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + if (cpu != gsnedf.release_master) { +#endif + TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu); + update_cpu_position(entry); +#ifdef CONFIG_RELEASE_MASTER + } else { + TRACE("GSN-EDF: CPU %d is release master.\n", cpu); + } +#endif + } + return 0; +} + +/* Plugin object */ +static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .finish_switch = gsnedf_finish_switch, + .tick = gsnedf_tick, + .task_new = gsnedf_task_new, + .complete_job = complete_job, + .task_exit = gsnedf_task_exit, + .schedule = gsnedf_schedule, + .release_at = gsnedf_release_at, + .task_wake_up = gsnedf_task_wake_up, + .task_block = gsnedf_task_block, + .admit_task = gsnedf_admit_task, + .activate_plugin = gsnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = gsnedf_allocate_lock, +#endif +}; + + +static int __init init_gsn_edf(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + gsnedf_cpus[cpu] = entry; + entry->cpu = cpu; + entry->hn = &gsnedf_heap_node[cpu]; + hrtimer_init(&entry->split_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + entry->split_timer.function = on_split_timeout; + bheap_node_init(&entry->hn, entry); + } + edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs); + return register_sched_plugin(&gsn_edf_plugin); +} + + +module_init(init_gsn_edf); diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c new file mode 100644 index 000000000000..5877307a996b --- /dev/null +++ b/litmus/sched_litmus.c @@ -0,0 +1,328 @@ +/* This file is included from kernel/sched.c */ + +#include +#include +#include +#include + +static void update_time_litmus(struct rq *rq, struct task_struct *p) +{ + u64 delta = rq->clock - p->se.exec_start; + if (unlikely((s64)delta < 0)) + delta = 0; + /* per job counter */ + p->rt_param.job_params.exec_time += delta; + /* task counter */ + p->se.sum_exec_runtime += delta; + /* sched_clock() */ + p->se.exec_start = rq->clock; + cpuacct_charge(p, delta); +} + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); +static void double_rq_unlock(struct rq *rq1, struct rq *rq2); + +/* + * litmus_tick gets called by scheduler_tick() with HZ freq + * Interrupts are disabled + */ +static void litmus_tick(struct rq *rq, struct task_struct *p) +{ + TS_PLUGIN_TICK_START; + + if (is_realtime(p)) + update_time_litmus(rq, p); + + /* plugin tick */ + litmus->tick(p); + + TS_PLUGIN_TICK_END; + + return; +} + +static struct task_struct * +litmus_schedule(struct rq *rq, struct task_struct *prev) +{ + struct rq* other_rq; + struct task_struct *next; + + long was_running; + lt_t _maybe_deadlock = 0; + + /* let the plugin schedule */ + next = litmus->schedule(prev); + + sched_state_plugin_check(); + + /* check if a global plugin pulled a task from a different RQ */ + if (next && task_rq(next) 
!= rq) { + /* we need to migrate the task */ + other_rq = task_rq(next); + TRACE_TASK(next, "migrate from %d\n", other_rq->cpu); + + /* while we drop the lock, the prev task could change its + * state + */ + was_running = is_running(prev); + mb(); + raw_spin_unlock(&rq->lock); + + /* Don't race with a concurrent switch. This could deadlock in + * the case of cross or circular migrations. It's the job of + * the plugin to make sure that doesn't happen. + */ + TRACE_TASK(next, "stack_in_use=%d\n", + next->rt_param.stack_in_use); + if (next->rt_param.stack_in_use != NO_CPU) { + TRACE_TASK(next, "waiting to deschedule\n"); + _maybe_deadlock = litmus_clock(); + } + while (next->rt_param.stack_in_use != NO_CPU) { + cpu_relax(); + mb(); + if (next->rt_param.stack_in_use == NO_CPU) + TRACE_TASK(next,"descheduled. Proceeding.\n"); + + if (lt_before(_maybe_deadlock + 10000000, + litmus_clock())) { + /* We've been spinning for 10ms. + * Something can't be right! + * Let's abandon the task and bail out; at least + * we will have debug info instead of a hard + * deadlock. + */ + TRACE_TASK(next,"stack too long in use. " + "Deadlock?\n"); + next = NULL; + + /* bail out */ + raw_spin_lock(&rq->lock); + return next; + } + } +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + if (next->oncpu) + TRACE_TASK(next, "waiting for !oncpu"); + while (next->oncpu) { + cpu_relax(); + mb(); + } +#endif + double_rq_lock(rq, other_rq); + mb(); + if (is_realtime(prev) && is_running(prev) != was_running) { + TRACE_TASK(prev, + "state changed while we dropped" + " the lock: is_running=%d, was_running=%d\n", + is_running(prev), was_running); + if (is_running(prev) && !was_running) { + /* prev task became unblocked + * we need to simulate normal sequence of events + * to scheduler plugins. + */ + litmus->task_block(prev); + litmus->task_wake_up(prev); + } + } + + set_task_cpu(next, smp_processor_id()); + + /* DEBUG: now that we have the lock we need to make sure a + * couple of things still hold: + * - it is still a real-time task + * - it is still runnable (could have been stopped) + * If either is violated, then the active plugin is + * doing something wrong. + */ + if (!is_realtime(next) || !is_running(next)) { + /* BAD BAD BAD */ + TRACE_TASK(next,"BAD: migration invariant FAILED: " + "rt=%d running=%d\n", + is_realtime(next), + is_running(next)); + /* drop the task */ + next = NULL; + } + /* release the other CPU's runqueue, but keep ours */ + raw_spin_unlock(&other_rq->lock); + } + if (next) { + next->rt_param.stack_in_use = rq->cpu; + next->se.exec_start = rq->clock; + } + + update_enforcement_timer(next); + return next; +} + +static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, + int flags) +{ + if (flags & ENQUEUE_WAKEUP) { + sched_trace_task_resume(p); + tsk_rt(p)->present = 1; + /* LITMUS^RT plugins need to update the state + * _before_ making it available in global structures. + * Linux gets away with being lazy about the task state + * update. We can't do that, hence we update the task + * state already here. + * + * WARNING: this needs to be re-evaluated when porting + * to newer kernel versions. 
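+ *
+ * (A plausible reason why the early update matters here: the
+ * plugin's task_wake_up() callback below may immediately link
+ * the task to another CPU, and the remote scheduling path checks
+ * is_running(), i.e. p->state, through these global structures,
+ * so the state must already read as TASK_RUNNING at that point.)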
+ */ + p->state = TASK_RUNNING; + litmus->task_wake_up(p); + + rq->litmus.nr_running++; + } else + TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n"); +} + +static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, + int flags) +{ + if (flags & DEQUEUE_SLEEP) { + litmus->task_block(p); + tsk_rt(p)->present = 0; + sched_trace_task_block(p); + + rq->litmus.nr_running--; + } else + TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n"); +} + +static void yield_task_litmus(struct rq *rq) +{ + TS_SYSCALL_IN_START; + + TS_SYSCALL_IN_END; + + TRACE_CUR("yields\n"); + + BUG_ON(rq->curr != current); + /* sched_yield() is called to trigger delayed preemptions. + * Thus, mark the current task as needing to be rescheduled. + * This will cause the scheduler plugin to be invoked, which can + * then determine if a preemption is still required. + */ + clear_exit_np(current); + litmus_reschedule_local(); + + TS_SYSCALL_OUT_START; +} + +/* Plugins are responsible for this. + */ +static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void put_prev_task_litmus(struct rq *rq, struct task_struct *p) +{ +} + +static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev) +{ + update_time_litmus(rq, prev); + if (!is_running(prev)) + tsk_rt(prev)->present = 0; +} + +/* pick_next_task_litmus() - litmus_schedule() function + * + * return the next task to be scheduled + */ +static struct task_struct *pick_next_task_litmus(struct rq *rq) +{ + /* get the to-be-switched-out task (prev) */ + struct task_struct *prev = rq->litmus.prev; + struct task_struct *next; + + /* if not called from schedule() but from somewhere + * else (e.g., migration), return now! + */ + if(!rq->litmus.prev) + return NULL; + + rq->litmus.prev = NULL; + + TS_PLUGIN_SCHED_START; + next = litmus_schedule(rq, prev); + TS_PLUGIN_SCHED_END; + + return next; +} + +static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued) +{ + /* nothing to do; tick related tasks are done by litmus_tick() */ + return; +} + +static void switched_to_litmus(struct rq *rq, struct task_struct *p, int running) +{ +} + +static void prio_changed_litmus(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ +} + +unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p) +{ + /* return infinity */ + return 0; +} + +/* This is called when a task became a real-time task, either due to a SCHED_* + * class transition or due to PI mutex inheritance. We don't handle Linux PI + * mutex inheritance yet (and probably never will). Use LITMUS provided + * synchronization primitives instead. + */ +static void set_curr_task_litmus(struct rq *rq) +{ + rq->curr->se.exec_start = rq->clock; +} + + +#ifdef CONFIG_SMP +/* execve tries to rebalance task in this scheduling domain. + * We don't care about the scheduling domain; can gets called from + * exec, fork, wakeup. + */ +static int select_task_rq_litmus(struct rq *rq, struct task_struct *p, + int sd_flag, int flags) +{ + /* preemption is already disabled. 
+ * We don't want to change cpu here + */ + return task_cpu(p); +} +#endif + +static const struct sched_class litmus_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_litmus, + .dequeue_task = dequeue_task_litmus, + .yield_task = yield_task_litmus, + + .check_preempt_curr = check_preempt_curr_litmus, + + .pick_next_task = pick_next_task_litmus, + .put_prev_task = put_prev_task_litmus, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_litmus, + + .pre_schedule = pre_schedule_litmus, +#endif + + .set_curr_task = set_curr_task_litmus, + .task_tick = task_tick_litmus, + + .get_rr_interval = get_rr_interval_litmus, + + .prio_changed = prio_changed_litmus, + .switched_to = switched_to_litmus, +}; diff --git a/litmus/sched_litmus.c.rej b/litmus/sched_litmus.c.rej new file mode 100644 index 000000000000..e0750ecbe7a2 --- /dev/null +++ b/litmus/sched_litmus.c.rej @@ -0,0 +1,11 @@ +--- litmus/sched_litmus.c ++++ litmus/sched_litmus.c +@@ -196,7 +196,7 @@ + { + TS_SYSCALL_IN_START; + +- TS_SYSCALL_OUT_END; ++ TS_SYSCALL_IN_END; + + TRACE_CUR("yields\n"); + diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c new file mode 100644 index 000000000000..c95bde87b5d7 --- /dev/null +++ b/litmus/sched_pfair.c @@ -0,0 +1,1056 @@ +/* + * kernel/sched_pfair.c + * + * Implementation of the PD^2 pfair scheduling algorithm. This + * implementation realizes "early releasing," i.e., it is work-conserving. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* to configure the cluster size */ +#include + +#include + +static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER; + +struct subtask { + /* measured in quanta relative to job release */ + quanta_t release; + quanta_t deadline; + quanta_t overlap; /* called "b bit" by PD^2 */ + quanta_t group_deadline; +}; + +struct pfair_param { + quanta_t quanta; /* number of subtasks */ + quanta_t cur; /* index of current subtask */ + + quanta_t release; /* in quanta */ + quanta_t period; /* in quanta */ + + quanta_t last_quantum; /* when scheduled last */ + int last_cpu; /* where scheduled last */ + + struct pfair_cluster* cluster; /* where this task is scheduled */ + + struct subtask subtasks[0]; /* allocate together with pfair_param */ +}; + +#define tsk_pfair(tsk) ((tsk)->rt_param.pfair) + +struct pfair_state { + struct cluster_cpu topology; + + volatile quanta_t cur_tick; /* updated by the CPU that is advancing + * the time */ + volatile quanta_t local_tick; /* What tick is the local CPU currently + * executing? Updated only by the local + * CPU. In QEMU, this may lag behind the + * current tick. In a real system, with + * proper timers and aligned quanta, + * that should only be the case for a + * very short time after the time + * advanced. With staggered quanta, it + * will lag for the duration of the + * offset. + */ + + struct task_struct* linked; /* the task that should be executing */ + struct task_struct* local; /* the local copy of linked */ + struct task_struct* scheduled; /* what is actually scheduled */ + + lt_t offset; /* stagger offset */ + unsigned int missed_updates; + unsigned int missed_quanta; +}; + +struct pfair_cluster { + struct scheduling_cluster topology; + + /* The "global" time in this cluster. */ + quanta_t pfair_time; /* the "official" PFAIR clock */ + + /* The ready queue for this cluster. 
*/ + rt_domain_t pfair; + + /* The set of jobs that should have their release enacted at the next + * quantum boundary. + */ + struct bheap release_queue; + raw_spinlock_t release_lock; +}; + +static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state) +{ + return container_of(state->topology.cluster, struct pfair_cluster, topology); +} + +static inline int cpu_id(struct pfair_state* state) +{ + return state->topology.id; +} + +static inline struct pfair_state* from_cluster_list(struct list_head* pos) +{ + return list_entry(pos, struct pfair_state, topology.cluster_list); +} + +static inline struct pfair_cluster* from_domain(rt_domain_t* rt) +{ + return container_of(rt, struct pfair_cluster, pfair); +} + +static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster) +{ + /* The ready_lock is used to serialize all scheduling events. */ + return &cluster->pfair.ready_lock; +} + +static inline raw_spinlock_t* cpu_lock(struct pfair_state* state) +{ + return cluster_lock(cpu_cluster(state)); +} + +DEFINE_PER_CPU(struct pfair_state, pfair_state); +struct pfair_state* *pstate; /* short cut */ + +static struct pfair_cluster* pfair_clusters; +static int num_pfair_clusters; + +/* Enable for lots of trace info. + * #define PFAIR_DEBUG + */ + +#ifdef PFAIR_DEBUG +#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args) +#define PTRACE(f, args...) TRACE(f, ## args) +#else +#define PTRACE_TASK(t, f, args...) +#define PTRACE(f, args...) +#endif + +/* gcc will inline all of these accessor functions... */ +static struct subtask* cur_subtask(struct task_struct* t) +{ + return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur; +} + +static quanta_t cur_deadline(struct task_struct* t) +{ + return cur_subtask(t)->deadline + tsk_pfair(t)->release; +} + +static quanta_t cur_release(struct task_struct* t) +{ + /* This is early releasing: only the release of the first subtask + * counts. */ + return tsk_pfair(t)->release; +} + +static quanta_t cur_overlap(struct task_struct* t) +{ + return cur_subtask(t)->overlap; +} + +static quanta_t cur_group_deadline(struct task_struct* t) +{ + quanta_t gdl = cur_subtask(t)->group_deadline; + if (gdl) + return gdl + tsk_pfair(t)->release; + else + return gdl; +} + + +static int pfair_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + return /* first task must exist */ + first && ( + /* Does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + !second || !is_realtime(second) || + + /* Is the (subtask) deadline of the first task earlier? + * Then it has higher priority. + */ + time_before(cur_deadline(first), cur_deadline(second)) || + + /* Do we have a deadline tie? + * Then break by B-bit. + */ + (cur_deadline(first) == cur_deadline(second) && + (cur_overlap(first) > cur_overlap(second) || + + /* Do we have a B-bit tie? + * Then break by group deadline. + */ + (cur_overlap(first) == cur_overlap(second) && + (time_after(cur_group_deadline(first), + cur_group_deadline(second)) || + + /* Do we have a group deadline tie? + * Then break by PID, which are unique. 
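+ *
+ * To summarize the tie-breaking chain implemented by this
+ * expression: the earlier subtask deadline wins; on a deadline
+ * tie, the larger b-bit wins; on a b-bit tie, the *later* group
+ * deadline wins (note the time_after() above); only if all three
+ * tie does the lower PID decide.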
+ */ + (cur_group_deadline(first) == + cur_group_deadline(second) && + first->pid < second->pid)))))); +} + +int pfair_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return pfair_higher_prio(bheap2task(a), bheap2task(b)); +} + +static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + struct pfair_cluster* cluster = from_domain(rt); + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->release_lock, flags); + + bheap_union(pfair_ready_order, &cluster->release_queue, tasks); + + raw_spin_unlock_irqrestore(&cluster->release_lock, flags); +} + +static void prepare_release(struct task_struct* t, quanta_t at) +{ + tsk_pfair(t)->release = at; + tsk_pfair(t)->cur = 0; +} + +/* pull released tasks from the release queue */ +static void poll_releases(struct pfair_cluster* cluster) +{ + raw_spin_lock(&cluster->release_lock); + __merge_ready(&cluster->pfair, &cluster->release_queue); + raw_spin_unlock(&cluster->release_lock); +} + +static void check_preempt(struct task_struct* t) +{ + int cpu = NO_CPU; + if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on && + tsk_rt(t)->present) { + /* the task can be scheduled and + * is not scheduled where it ought to be scheduled + */ + cpu = tsk_rt(t)->linked_on != NO_CPU ? + tsk_rt(t)->linked_on : + tsk_rt(t)->scheduled_on; + PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n", + tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on); + /* preempt */ + litmus_reschedule(cpu); + } +} + +/* caller must hold pfair.ready_lock */ +static void drop_all_references(struct task_struct *t) +{ + int cpu; + struct pfair_state* s; + struct pfair_cluster* cluster; + if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { + /* It must be in the ready queue; drop references isn't called + * when the job is in a release queue. */ + cluster = tsk_pfair(t)->cluster; + bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue, + tsk_rt(t)->heap_node); + } + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + s = &per_cpu(pfair_state, cpu); + if (s->linked == t) + s->linked = NULL; + if (s->local == t) + s->local = NULL; + if (s->scheduled == t) + s->scheduled = NULL; + } +} + +static void pfair_prepare_next_period(struct task_struct* t) +{ + struct pfair_param* p = tsk_pfair(t); + + prepare_for_next_period(t); + get_rt_flags(t) = RT_F_RUNNING; + p->release += p->period; +} + +/* returns 1 if the task needs to go the release queue */ +static int advance_subtask(quanta_t time, struct task_struct* t, int cpu) +{ + struct pfair_param* p = tsk_pfair(t); + int to_relq; + p->cur = (p->cur + 1) % p->quanta; + if (!p->cur) { + if (tsk_rt(t)->present) { + /* The job overran; we start a new budget allocation. 
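+ * In other words, all quanta of the current job have been
+ * consumed, so, in line with the early-releasing/work-conserving
+ * design noted at the top of this file, the next job's
+ * parameters are set up right away instead of waiting for the
+ * period boundary.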
*/ + pfair_prepare_next_period(t); + } else { + /* remove task from system until it wakes */ + drop_all_references(t); + TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n", + cpu, p->cur); + return 0; + } + } + to_relq = time_after(cur_release(t), time); + TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n", + cpu, p->cur, to_relq, cur_release(t), time); + return to_relq; +} + +static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time) +{ + struct task_struct* l; + struct pfair_param* p; + struct list_head* pos; + struct pfair_state* cpu; + + list_for_each(pos, &cluster->topology.cpus) { + cpu = from_cluster_list(pos); + l = cpu->linked; + cpu->missed_updates += cpu->linked != cpu->local; + if (l) { + p = tsk_pfair(l); + p->last_quantum = time; + p->last_cpu = cpu_id(cpu); + if (advance_subtask(time, l, cpu_id(cpu))) { + //cpu->linked = NULL; + PTRACE_TASK(l, "should go to release queue. " + "scheduled_on=%d present=%d\n", + tsk_rt(l)->scheduled_on, + tsk_rt(l)->present); + } + } + } +} + +static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu) +{ + int cpu; + if (tsk_rt(t)->scheduled_on != NO_CPU) { + /* always observe scheduled_on linkage */ + default_cpu = tsk_rt(t)->scheduled_on; + } else if (tsk_pfair(t)->last_quantum == time - 1) { + /* back2back quanta */ + /* Only observe last_quantum if no scheduled_on is in the way. + * This should only kick in if a CPU missed quanta, and that + * *should* only happen in QEMU. + */ + cpu = tsk_pfair(t)->last_cpu; + if (!pstate[cpu]->linked || + tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) { + default_cpu = cpu; + } + } + return default_cpu; +} + +/* returns one if linking was redirected */ +static int pfair_link(quanta_t time, int cpu, + struct task_struct* t) +{ + int target = target_cpu(time, t, cpu); + struct task_struct* prev = pstate[cpu]->linked; + struct task_struct* other; + struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]); + + if (target != cpu) { + BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster); + other = pstate[target]->linked; + pstate[target]->linked = t; + tsk_rt(t)->linked_on = target; + if (!other) + /* linked ok, but reschedule this CPU */ + return 1; + if (target < cpu) { + /* link other to cpu instead */ + tsk_rt(other)->linked_on = cpu; + pstate[cpu]->linked = other; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&cluster->pfair, prev); + } + /* we are done with this cpu */ + return 0; + } else { + /* re-add other, it's original CPU was not considered yet */ + tsk_rt(other)->linked_on = NO_CPU; + __add_ready(&cluster->pfair, other); + /* reschedule this CPU */ + return 1; + } + } else { + pstate[cpu]->linked = t; + tsk_rt(t)->linked_on = cpu; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&cluster->pfair, prev); + } + /* we are done with this CPU */ + return 0; + } +} + +static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time) +{ + int retry; + struct list_head *pos; + struct pfair_state *cpu_state; + + list_for_each(pos, &cluster->topology.cpus) { + cpu_state = from_cluster_list(pos); + retry = 1; +#ifdef CONFIG_RELEASE_MASTER + /* skip release master */ + if (cluster->pfair.release_master == cpu_id(cpu_state)) + continue; +#endif + while (retry) { + if (pfair_higher_prio(__peek_ready(&cluster->pfair), + cpu_state->linked)) + retry = pfair_link(time, 
cpu_id(cpu_state), + __take_ready(&cluster->pfair)); + else + retry = 0; + } + } +} + +static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time) +{ + struct pfair_state *cpu; + struct list_head* pos; + + /* called with interrupts disabled */ + PTRACE("--- Q %lu at %llu PRE-SPIN\n", + time, litmus_clock()); + raw_spin_lock(cluster_lock(cluster)); + PTRACE("<<< Q %lu at %llu\n", + time, litmus_clock()); + + sched_trace_quantum_boundary(); + + advance_subtasks(cluster, time); + poll_releases(cluster); + schedule_subtasks(cluster, time); + + list_for_each(pos, &cluster->topology.cpus) { + cpu = from_cluster_list(pos); + if (cpu->linked) + PTRACE_TASK(cpu->linked, + " linked on %d.\n", cpu_id(cpu)); + else + PTRACE("(null) linked on %d.\n", cpu_id(cpu)); + } + /* We are done. Advance time. */ + mb(); + list_for_each(pos, &cluster->topology.cpus) { + cpu = from_cluster_list(pos); + if (cpu->local_tick != cpu->cur_tick) { + TRACE("BAD Quantum not acked on %d " + "(l:%lu c:%lu p:%lu)\n", + cpu_id(cpu), + cpu->local_tick, + cpu->cur_tick, + cluster->pfair_time); + cpu->missed_quanta++; + } + cpu->cur_tick = time; + } + PTRACE(">>> Q %lu at %llu\n", + time, litmus_clock()); + raw_spin_unlock(cluster_lock(cluster)); +} + +static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state) +{ + quanta_t loc; + + goto first; /* skip mb() on first iteration */ + do { + cpu_relax(); + mb(); + first: loc = state->cur_tick; + /* FIXME: what if loc > cur? */ + } while (time_before(loc, q)); + PTRACE("observed cur_tick:%lu >= q:%lu\n", + loc, q); +} + +static quanta_t current_quantum(struct pfair_state* state) +{ + lt_t t = litmus_clock() - state->offset; + return time2quanta(t, FLOOR); +} + +static void catchup_quanta(quanta_t from, quanta_t target, + struct pfair_state* state) +{ + quanta_t cur = from, time; + TRACE("+++< BAD catching up quanta from %lu to %lu\n", + from, target); + while (time_before(cur, target)) { + wait_for_quantum(cur, state); + cur++; + time = cmpxchg(&cpu_cluster(state)->pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) + schedule_next_quantum(cpu_cluster(state), cur); + } + TRACE("+++> catching up done\n"); +} + +/* pfair_tick - this function is called for every local timer + * interrupt. + */ +static void pfair_tick(struct task_struct* t) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + quanta_t time, cur; + int retry = 10; + + do { + cur = current_quantum(state); + PTRACE("q %lu at %llu\n", cur, litmus_clock()); + + /* Attempt to advance time. First CPU to get here + * will prepare the next quantum. + */ + time = cmpxchg(&cpu_cluster(state)->pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) { + /* exchange succeeded */ + wait_for_quantum(cur - 1, state); + schedule_next_quantum(cpu_cluster(state), cur); + retry = 0; + } else if (time_before(time, cur - 1)) { + /* the whole system missed a tick !? */ + catchup_quanta(time, cur, state); + retry--; + } else if (time_after(time, cur)) { + /* our timer lagging behind!? */ + TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur); + retry--; + } else { + /* Some other CPU already started scheduling + * this quantum. Let it do its job and then update. + */ + retry = 0; + } + } while (retry); + + /* Spin locally until time advances. */ + wait_for_quantum(cur, state); + + /* copy assignment */ + /* FIXME: what if we race with a future update? Corrupted state? 
*/ + state->local = state->linked; + /* signal that we are done */ + mb(); + state->local_tick = state->cur_tick; + + if (state->local != current + && (is_realtime(current) || is_present(state->local))) + litmus_reschedule_local(); +} + +static int safe_to_schedule(struct task_struct* t, int cpu) +{ + int where = tsk_rt(t)->scheduled_on; + if (where != NO_CPU && where != cpu) { + TRACE_TASK(t, "BAD: can't be scheduled on %d, " + "scheduled already on %d.\n", cpu, where); + return 0; + } else + return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING; +} + +static struct task_struct* pfair_schedule(struct task_struct * prev) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + struct pfair_cluster* cluster = cpu_cluster(state); + int blocks, completion, out_of_time; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. + */ + if (unlikely(cluster->pfair.release_master == cpu_id(state))) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(cpu_lock(state)); + + blocks = is_realtime(prev) && !is_running(prev); + completion = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP; + out_of_time = is_realtime(prev) && time_after(cur_release(prev), + state->local_tick); + + if (is_realtime(prev)) + PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n", + blocks, completion, out_of_time); + + if (completion) { + sched_trace_task_completion(prev, 0); + pfair_prepare_next_period(prev); + prepare_release(prev, cur_release(prev)); + } + + if (!blocks && (completion || out_of_time)) { + drop_all_references(prev); + sched_trace_task_release(prev); + add_release(&cluster->pfair, prev); + } + + if (state->local && safe_to_schedule(state->local, cpu_id(state))) + next = state->local; + + if (prev != next) { + tsk_rt(prev)->scheduled_on = NO_CPU; + if (next) + tsk_rt(next)->scheduled_on = cpu_id(state); + } + sched_state_task_picked(); + raw_spin_unlock(cpu_lock(state)); + + if (next) + TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n", + tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock()); + else if (is_realtime(prev)) + TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock()); + + return next; +} + +static void pfair_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + struct pfair_cluster* cluster; + + TRACE("pfair: task new %d state:%d\n", t->pid, t->state); + + cluster = tsk_pfair(t)->cluster; + + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + + prepare_release(t, cluster->pfair_time + 1); + + t->rt_param.scheduled_on = NO_CPU; + + if (running) { +#ifdef CONFIG_RELEASE_MASTER + if (task_cpu(t) != cluster->pfair.release_master) +#endif + t->rt_param.scheduled_on = task_cpu(t); + __add_ready(&cluster->pfair, t); + } + + check_preempt(t); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); +} + +static void pfair_task_wake_up(struct task_struct *t) +{ + unsigned long flags; + lt_t now; + struct pfair_cluster* cluster; + + cluster = tsk_pfair(t)->cluster; + + TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n", + litmus_clock(), cur_release(t), cluster->pfair_time); + + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + + /* If a task blocks and wakes before its next job release, + * then it may resume if it is currently linked somewhere + * (as if it never blocked at all). Otherwise, we have a + * new sporadic job release. 
+ */ + now = litmus_clock(); + if (lt_before(get_deadline(t), now)) { + release_at(t, now); + prepare_release(t, time2quanta(now, CEIL)); + sched_trace_task_release(t); + } + + /* only add to ready queue if the task isn't still linked somewhere */ + if (tsk_rt(t)->linked_on == NO_CPU) + __add_ready(&cluster->pfair, t); + + check_preempt(t); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); + TRACE_TASK(t, "wake up done at %llu\n", litmus_clock()); +} + +static void pfair_task_block(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "blocks at %llu, state:%d\n", + litmus_clock(), t->state); +} + +static void pfair_task_exit(struct task_struct * t) +{ + unsigned long flags; + struct pfair_cluster *cluster; + + BUG_ON(!is_realtime(t)); + + cluster = tsk_pfair(t)->cluster; + + /* Remote task from release or ready queue, and ensure + * that it is not the scheduled task for ANY CPU. We + * do this blanket check because occassionally when + * tasks exit while blocked, the task_cpu of the task + * might not be the same as the CPU that the PFAIR scheduler + * has chosen for it. + */ + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + + TRACE_TASK(t, "RIP, state:%d\n", t->state); + drop_all_references(t); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); + + kfree(t->rt_param.pfair); + t->rt_param.pfair = NULL; +} + + +static void pfair_release_at(struct task_struct* task, lt_t start) +{ + unsigned long flags; + quanta_t release; + + struct pfair_cluster *cluster; + + cluster = tsk_pfair(task)->cluster; + + BUG_ON(!is_realtime(task)); + + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + release_at(task, start); + release = time2quanta(start, CEIL); + + TRACE_TASK(task, "sys release at %lu\n", release); + + drop_all_references(task); + prepare_release(task, release); + add_release(&cluster->pfair, task); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); +} + +static void init_subtask(struct subtask* sub, unsigned long i, + lt_t quanta, lt_t period) +{ + /* since i is zero-based, the formulas are shifted by one */ + lt_t tmp; + + /* release */ + tmp = period * i; + do_div(tmp, quanta); /* floor */ + sub->release = (quanta_t) tmp; + + /* deadline */ + tmp = period * (i + 1); + if (do_div(tmp, quanta)) /* ceil */ + tmp++; + sub->deadline = (quanta_t) tmp; + + /* next release */ + tmp = period * (i + 1); + do_div(tmp, quanta); /* floor */ + sub->overlap = sub->deadline - (quanta_t) tmp; + + /* Group deadline. + * Based on the formula given in Uma's thesis. 
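+ *
+ * Worked example (hypothetical task, not taken from the original
+ * code): for quanta = 3 and period = 5 -- a "heavy" task, since
+ * 2 * 3 >= 5 -- the formulas in this function yield, for
+ * i = 0, 1, 2:
+ *   release         = 0, 1, 3
+ *   deadline        = 2, 4, 5
+ *   overlap (b-bit) = 1, 1, 0
+ *   group_deadline  = 3, 5, 5
+ * (all values in quanta, relative to the job release).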
+ */ + if (2 * quanta >= period) { + /* heavy */ + tmp = (sub->deadline - (i + 1)) * period; + if (period > quanta && + do_div(tmp, (period - quanta))) /* ceil */ + tmp++; + sub->group_deadline = (quanta_t) tmp; + } else + sub->group_deadline = 0; +} + +static void dump_subtasks(struct task_struct* t) +{ + unsigned long i; + for (i = 0; i < t->rt_param.pfair->quanta; i++) + TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n", + i + 1, + t->rt_param.pfair->subtasks[i].release, + t->rt_param.pfair->subtasks[i].deadline, + t->rt_param.pfair->subtasks[i].overlap, + t->rt_param.pfair->subtasks[i].group_deadline); +} + +static long pfair_admit_task(struct task_struct* t) +{ + lt_t quanta; + lt_t period; + s64 quantum_length = ktime_to_ns(tick_period); + struct pfair_param* param; + unsigned long i; + + /* first check that the task is in the right cluster */ + if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) != + cpu_cluster(pstate[task_cpu(t)])) + return -EINVAL; + + /* Pfair is a tick-based method, so the time + * of interest is jiffies. Calculate tick-based + * times for everything. + * (Ceiling of exec cost, floor of period.) + */ + + quanta = get_exec_cost(t); + period = get_rt_period(t); + + quanta = time2quanta(get_exec_cost(t), CEIL); + + if (do_div(period, quantum_length)) + printk(KERN_WARNING + "The period of %s/%d is not a multiple of %llu.\n", + t->comm, t->pid, (unsigned long long) quantum_length); + + if (quanta == period) { + /* special case: task has weight 1.0 */ + printk(KERN_INFO + "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n", + t->comm, t->pid, quanta, period); + quanta = 1; + period = 1; + } + + param = kmalloc(sizeof(*param) + + quanta * sizeof(struct subtask), GFP_ATOMIC); + + if (!param) + return -ENOMEM; + + param->quanta = quanta; + param->cur = 0; + param->release = 0; + param->period = period; + + param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]); + + for (i = 0; i < quanta; i++) + init_subtask(param->subtasks + i, i, quanta, period); + + if (t->rt_param.pfair) + /* get rid of stale allocation */ + kfree(t->rt_param.pfair); + + t->rt_param.pfair = param; + + /* spew out some debug info */ + dump_subtasks(t); + + return 0; +} + +static void pfair_init_cluster(struct pfair_cluster* cluster) +{ + rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs); + bheap_init(&cluster->release_queue); + raw_spin_lock_init(&cluster->release_lock); + INIT_LIST_HEAD(&cluster->topology.cpus); +} + +static void cleanup_clusters(void) +{ + int i; + + if (num_pfair_clusters) + kfree(pfair_clusters); + pfair_clusters = NULL; + num_pfair_clusters = 0; + + /* avoid stale pointers */ + for (i = 0; i < num_online_cpus(); i++) { + pstate[i]->topology.cluster = NULL; + printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]), + pstate[i]->missed_updates, pstate[i]->missed_quanta); + } +} + +static long pfair_activate_plugin(void) +{ + int err, i; + struct pfair_state* state; + struct pfair_cluster* cluster ; + quanta_t now; + int cluster_size; + struct cluster_cpu* cpus[NR_CPUS]; + struct scheduling_cluster* clust[NR_CPUS]; + + cluster_size = get_cluster_size(pfair_cluster_level); + + if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0) + return -EINVAL; + + num_pfair_clusters = num_online_cpus() / cluster_size; + + pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC); + if (!pfair_clusters) { + num_pfair_clusters = 0; + printk(KERN_ERR "Could not allocate Pfair clusters!\n"); + 
return -ENOMEM; + } + + state = &__get_cpu_var(pfair_state); + now = current_quantum(state); + TRACE("Activating PFAIR at q=%lu\n", now); + + for (i = 0; i < num_pfair_clusters; i++) { + cluster = &pfair_clusters[i]; + pfair_init_cluster(cluster); + cluster->pfair_time = now; + clust[i] = &cluster->topology; +#ifdef CONFIG_RELEASE_MASTER + cluster->pfair.release_master = atomic_read(&release_master_cpu); +#endif + } + + for (i = 0; i < num_online_cpus(); i++) { + state = &per_cpu(pfair_state, i); + state->cur_tick = now; + state->local_tick = now; + state->missed_quanta = 0; + state->missed_updates = 0; + state->offset = cpu_stagger_offset(i); + printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus()); + cpus[i] = &state->topology; + } + + err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters, + cpus, num_online_cpus()); + + if (err < 0) + cleanup_clusters(); + + return err; +} + +static long pfair_deactivate_plugin(void) +{ + cleanup_clusters(); + return 0; +} + +/* Plugin object */ +static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PFAIR", + .tick = pfair_tick, + .task_new = pfair_task_new, + .task_exit = pfair_task_exit, + .schedule = pfair_schedule, + .task_wake_up = pfair_task_wake_up, + .task_block = pfair_task_block, + .admit_task = pfair_admit_task, + .release_at = pfair_release_at, + .complete_job = complete_job, + .activate_plugin = pfair_activate_plugin, + .deactivate_plugin = pfair_deactivate_plugin, +}; + + +static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL; + +static int __init init_pfair(void) +{ + int cpu, err, fs; + struct pfair_state *state; + + /* + * initialize short_cut for per-cpu pfair state; + * there may be a problem here if someone removes a cpu + * while we are doing this initialization... and if cpus + * are added / removed later... but we don't support CPU hotplug atm anyway. + */ + pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL); + + /* initialize CPU state */ + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + state = &per_cpu(pfair_state, cpu); + state->topology.id = cpu; + state->cur_tick = 0; + state->local_tick = 0; + state->linked = NULL; + state->local = NULL; + state->scheduled = NULL; + state->missed_quanta = 0; + state->offset = cpu_stagger_offset(cpu); + pstate[cpu] = state; + } + + pfair_clusters = NULL; + num_pfair_clusters = 0; + + err = register_sched_plugin(&pfair_plugin); + if (!err) { + fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir); + if (!fs) + cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level); + else + printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n"); + } + + return err; +} + +static void __exit clean_pfair(void) +{ + kfree(pstate); + + if (cluster_file) + remove_proc_entry("cluster", pfair_dir); + if (pfair_dir) + remove_plugin_proc_dir(&pfair_plugin); +} + +module_init(init_pfair); +module_exit(clean_pfair); diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c new file mode 100644 index 000000000000..74a77e7a4959 --- /dev/null +++ b/litmus/sched_pfp.c @@ -0,0 +1,1542 @@ +/* + * litmus/sched_pfp.c + * + * Implementation of partitioned fixed-priority scheduling. + * Based on PSN-EDF. 
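+ *
+ * Besides the scheduler itself, this file also contains the
+ * locking support implemented further below: FMLP and MPCP
+ * semaphores, priority boosting, and the priority query used
+ * for SRP (pfp_get_srp_prio()).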
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +typedef struct { + rt_domain_t domain; + struct fp_prio_queue ready_queue; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ +/* + * scheduling lock slock + * protects the domain and serializes scheduling decisions + */ +#define slock domain.ready_lock + +} pfp_domain_t; + +DEFINE_PER_CPU(pfp_domain_t, pfp_domains); + +pfp_domain_t* pfp_doms[NR_CPUS]; + +#define local_pfp (&__get_cpu_var(pfp_domains)) +#define remote_dom(cpu) (&per_cpu(pfp_domains, cpu).domain) +#define remote_pfp(cpu) (&per_cpu(pfp_domains, cpu)) +#define task_dom(task) remote_dom(get_partition(task)) +#define task_pfp(task) remote_pfp(get_partition(task)) + +/* we assume the lock is being held */ +static void preempt(pfp_domain_t *pfp) +{ + preempt_if_preemptable(pfp->scheduled, pfp->cpu); +} + +static unsigned int priority_index(struct task_struct* t) +{ +#ifdef CONFIG_LOCKING + if (unlikely(t->rt_param.inh_task)) + /* use effective priority */ + t = t->rt_param.inh_task; + + if (is_priority_boosted(t)) { + /* zero is reserved for priority-boosted tasks */ + return 0; + } else +#endif + return get_priority(t); +} + + +static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + pfp_domain_t *pfp = container_of(rt, pfp_domain_t, domain); + unsigned long flags; + struct task_struct* t; + struct bheap_node* hn; + + raw_spin_lock_irqsave(&pfp->slock, flags); + + while (!bheap_empty(tasks)) { + hn = bheap_take(fp_ready_order, tasks); + t = bheap2task(hn); + TRACE_TASK(t, "released (part:%d prio:%d)\n", + get_partition(t), get_priority(t)); + fp_prio_add(&pfp->ready_queue, t, priority_index(t)); + } + + /* do we need to preempt? */ + if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) { + TRACE_CUR("preempted by new release\n"); + preempt(pfp); + } + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +static void pfp_domain_init(pfp_domain_t* pfp, + int cpu) +{ + fp_domain_init(&pfp->domain, NULL, pfp_release_jobs); + pfp->cpu = cpu; + pfp->scheduled = NULL; + fp_prio_queue_init(&pfp->ready_queue); +} + +static void requeue(struct task_struct* t, pfp_domain_t *pfp) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + fp_prio_add(&pfp->ready_queue, t, priority_index(t)); + else + add_release(&pfp->domain, t); /* it has got to wait */ +} + +static void job_completion(struct task_struct* t, int forced) +{ + sched_trace_task_completion(t,forced); + TRACE_TASK(t, "job_completion().\n"); + + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static void pfp_tick(struct task_struct *t) +{ + pfp_domain_t *pfp = local_pfp; + + /* Check for inconsistency. 
We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pfp->scheduled); + + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + litmus_reschedule_local(); + TRACE("pfp_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("pfp_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +static struct task_struct* pfp_schedule(struct task_struct * prev) +{ + pfp_domain_t* pfp = local_pfp; + struct task_struct* next; + + int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate; + + raw_spin_lock(&pfp->slock); + + /* sanity checking + * differently from gedf, when a task exits (dead) + * pfp->schedule may be null and prev _is_ realtime + */ + BUG_ON(pfp->scheduled && pfp->scheduled != prev); + BUG_ON(pfp->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = pfp->scheduled != NULL; + blocks = exists && !is_running(pfp->scheduled); + out_of_time = exists && + budget_enforced(pfp->scheduled) && + budget_exhausted(pfp->scheduled); + np = exists && is_np(pfp->scheduled); + sleep = exists && get_rt_flags(pfp->scheduled) == RT_F_SLEEP; + migrate = exists && get_partition(pfp->scheduled) != pfp->cpu; + preempt = migrate || fp_preemption_needed(&pfp->ready_queue, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Multiple calls to request_exit_np() don't hurt. + */ + if (np && (out_of_time || preempt || sleep)) + request_exit_np(pfp->scheduled); + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep) && !blocks && !migrate) { + job_completion(pfp->scheduled, !sleep); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + next = NULL; + if ((!np || blocks) && (resched || !exists)) { + /* When preempting a task that does not block, then + * re-insert it into either the ready queue or the + * release queue (if it completed). requeue() picks + * the appropriate queue. + */ + if (pfp->scheduled && !blocks && !migrate) + requeue(pfp->scheduled, pfp); + next = fp_prio_take(&pfp->ready_queue); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + if (next) { + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + set_rt_flags(next, RT_F_RUNNING); + } else { + TRACE("becoming idle at %llu\n", litmus_clock()); + } + + pfp->scheduled = next; + sched_state_task_picked(); + raw_spin_unlock(&pfp->slock); + + return next; +} + +#ifdef CONFIG_LITMUS_LOCKING + +/* prev is no longer scheduled --- see if it needs to migrate */ +static void pfp_finish_switch(struct task_struct *prev) +{ + pfp_domain_t *to; + + if (is_realtime(prev) && + is_running(prev) && + get_partition(prev) != smp_processor_id()) { + TRACE_TASK(prev, "needs to migrate from P%d to P%d\n", + smp_processor_id(), get_partition(prev)); + + to = task_pfp(prev); + + raw_spin_lock(&to->slock); + + TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu); + requeue(prev, to); + if (fp_preemption_needed(&to->ready_queue, to->scheduled)) + preempt(to); + + raw_spin_unlock(&to->slock); + + } +} + +#endif + +/* Prepare a task for running in RT mode + */ +static void pfp_task_new(struct task_struct * t, int on_rq, int running) +{ + pfp_domain_t* pfp = task_pfp(t); + unsigned long flags; + + TRACE_TASK(t, "P-FP: task new, cpu = %d\n", + t->rt_param.task_params.cpu); + + /* setup job parameters */ + release_at(t, litmus_clock()); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + raw_spin_lock_irqsave(&pfp->slock, flags); + if (running) { + /* there shouldn't be anything else running at the time */ + BUG_ON(pfp->scheduled); + pfp->scheduled = t; + } else { + requeue(t, pfp); + /* maybe we have to reschedule */ + preempt(pfp); + } + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +static void pfp_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(task); + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + raw_spin_lock_irqsave(&pfp->slock, flags); + +#ifdef CONFIG_LITMUS_LOCKING + /* Should only be queued when processing a fake-wake up due to a + * migration-related state change. */ + if (unlikely(is_queued(task))) { + TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n"); + goto out_unlock; + } +#else + BUG_ON(is_queued(task)); +#endif + now = litmus_clock(); + if (is_tardy(task, now) +#ifdef CONFIG_LITMUS_LOCKING + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + && !is_priority_boosted(task) +#endif + ) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + + /* Only add to ready queue if it is not the currently-scheduled + * task. This could be the case if a task was woken up concurrently + * on a remote CPU before the executing CPU got around to actually + * de-scheduling the task, i.e., wake_up() raced with schedule() + * and won. Also, don't requeue if it is still queued, which can + * happen under the DPCP due wake-ups racing with migrations. + */ + if (pfp->scheduled != task) + requeue(task, pfp); + +out_unlock: + raw_spin_unlock_irqrestore(&pfp->slock, flags); + TRACE_TASK(task, "wake up done\n"); +} + +static void pfp_task_block(struct task_struct *t) +{ + /* only running tasks can block, thus t is in no queue */ + TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state); + + BUG_ON(!is_realtime(t)); + + /* If this task blocked normally, it shouldn't be queued. 
The exception is + * if this is a simulated block()/wakeup() pair from the pull-migration code path. + * This should only happen if the DPCP is being used. + */ +#ifdef CONFIG_LITMUS_LOCKING + if (unlikely(is_queued(t))) + TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n"); +#else + BUG_ON(is_queued(t)); +#endif +} + +static void pfp_task_exit(struct task_struct * t) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(t); + rt_domain_t* dom; + + raw_spin_lock_irqsave(&pfp->slock, flags); + if (is_queued(t)) { + BUG(); /* This currently doesn't work. */ + /* dequeue */ + dom = task_dom(t); + remove(dom, t); + } + if (pfp->scheduled == t) { + pfp->scheduled = NULL; + preempt(pfp); + } + TRACE_TASK(t, "RIP, now reschedule\n"); + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include + +static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t) +{ + BUG_ON(pfp->scheduled == t && is_queued(t)); + if (is_queued(t)) + fp_prio_remove(&pfp->ready_queue, t, priority_index(t)); +} + +static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t, + struct task_struct* prio_inh) +{ + int requeue; + + if (!t || t->rt_param.inh_task == prio_inh) { + /* no update required */ + if (t) + TRACE_TASK(t, "no prio-inh update required\n"); + return; + } + + requeue = is_queued(t); + TRACE_TASK(t, "prio-inh: is_queued:%d\n", requeue); + + if (requeue) + /* first remove */ + fp_dequeue(pfp, t); + + t->rt_param.inh_task = prio_inh; + + if (requeue) + /* add again to the right queue */ + fp_prio_add(&pfp->ready_queue, t, priority_index(t)); +} + +static int effective_agent_priority(int prio) +{ + /* make sure agents have higher priority */ + return prio - LITMUS_MAX_PRIORITY; +} + +static lt_t prio_point(int eprio) +{ + /* make sure we have non-negative prio points */ + return eprio + LITMUS_MAX_PRIORITY; +} + +static int prio_from_point(lt_t prio_point) +{ + return ((int) prio_point) - LITMUS_MAX_PRIORITY; +} + +static void boost_priority(struct task_struct* t, lt_t priority_point) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(t); + + raw_spin_lock_irqsave(&pfp->slock, flags); + + + TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock()); + + tsk_rt(t)->priority_boosted = 1; + /* tie-break by protocol-specific priority point */ + tsk_rt(t)->boost_start_time = priority_point; + + if (pfp->scheduled != t) { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&pfp->domain.release_lock); + if (is_queued(t) && + /* If it is queued, then we need to re-order. */ + bheap_decrease(fp_ready_order, tsk_rt(t)->heap_node) && + /* If we bubbled to the top, then we need to check for preemptions. 
*/ + fp_preemption_needed(&pfp->ready_queue, pfp->scheduled)) + preempt(pfp); + raw_spin_unlock(&pfp->domain.release_lock); + } /* else: nothing to do since the job is not queued while scheduled */ + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +static void unboost_priority(struct task_struct* t) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(t); + lt_t now; + + raw_spin_lock_irqsave(&pfp->slock, flags); + now = litmus_clock(); + + /* assumption: this only happens when the job is scheduled */ + BUG_ON(pfp->scheduled != t); + + TRACE_TASK(t, "priority restored at %llu\n", now); + + /* priority boosted jobs must be scheduled */ + BUG_ON(pfp->scheduled != t); + + tsk_rt(t)->priority_boosted = 0; + tsk_rt(t)->boost_start_time = 0; + + /* check if this changes anything */ + if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled)) + preempt(pfp); + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +/* ******************** SRP support ************************ */ + +static unsigned int pfp_get_srp_prio(struct task_struct* t) +{ + return get_priority(t); +} + +/* ******************** FMLP support ********************** */ + +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} +int pfp_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + lt_t time_of_request; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + + /* tie-break by this point in time */ + time_of_request = litmus_clock(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. */ + boost_priority(t, time_of_request); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. */ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int pfp_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* Wake up next. The waiting job is already priority-boosted. 
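 *
 * ("Already priority-boosted" because pfp_fmlp_lock() above boosts the
 * requester with boost_priority(t, time_of_request) *before* it suspends.
 * The request timestamp serves as the priority point, so -- assuming, as
 * the tie-break comment in boost_priority() suggests, that a numerically
 * smaller priority point wins among boosted jobs -- local jobs holding or
 * waiting for FMLP resources are effectively served in request order.)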
*/ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + return err; +} + +int pfp_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + pfp_fmlp_unlock(l); + + return 0; +} + +void pfp_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops pfp_fmlp_lock_ops = { + .close = pfp_fmlp_close, + .lock = pfp_fmlp_lock, + .unlock = pfp_fmlp_unlock, + .deallocate = pfp_fmlp_free, +}; + +static struct litmus_lock* pfp_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &pfp_fmlp_lock_ops; + + return &sem->litmus_lock; +} + +/* ******************** MPCP support ********************** */ + +struct mpcp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* priority queue of waiting tasks */ + wait_queue_head_t wait; + + /* priority ceiling per cpu */ + unsigned int prio_ceiling[NR_CPUS]; + + /* should jobs spin "virtually" for this resource? */ + int vspin; +}; + +#define OMEGA_CEILING UINT_MAX + +/* Since jobs spin "virtually" while waiting to acquire a lock, + * they first must aquire a local per-cpu resource. + */ +static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait); +static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin); + +/* called with preemptions off <=> no local modifications */ +static void mpcp_vspin_enter(void) +{ + struct task_struct* t = current; + + while (1) { + if (__get_cpu_var(mpcpvs_vspin) == NULL) { + /* good, we get to issue our request */ + __get_cpu_var(mpcpvs_vspin) = t; + break; + } else { + /* some job is spinning => enqueue in request queue */ + prio_wait_queue_t wait; + wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait); + unsigned long flags; + + /* ordered by regular priority */ + init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t))); + + spin_lock_irqsave(&vspin->lock, flags); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_prio_exclusive(vspin, &wait); + + spin_unlock_irqrestore(&vspin->lock, flags); + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + /* Recheck if we got it --- some higher-priority process might + * have swooped in. 
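 *
 * A small scenario for why the loop must re-check (editorial
 * illustration, not additional code):
 *
 *      1. The current token holder finishes and mpcp_vspin_exit() sets
 *         mpcpvs_vspin = NULL, then wakes the highest-priority waiter T_b.
 *      2. Before T_b gets to run, another local job T_c calls
 *         mpcp_vspin_enter(), finds the token free, and takes it.
 *      3. When T_b finally runs, mpcpvs_vspin != NULL again, so it
 *         re-enqueues itself and suspends once more -- hence the
 *         while(1) loop.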
*/ + } + } + /* ok, now it is ours */ +} + +/* called with preemptions off */ +static void mpcp_vspin_exit(void) +{ + struct task_struct* t = current, *next; + unsigned long flags; + wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait); + + BUG_ON(__get_cpu_var(mpcpvs_vspin) != t); + + /* no spinning job */ + __get_cpu_var(mpcpvs_vspin) = NULL; + + /* see if anyone is waiting for us to stop "spinning" */ + spin_lock_irqsave(&vspin->lock, flags); + next = __waitqueue_remove_first(vspin); + + if (next) + wake_up_process(next); + + spin_unlock_irqrestore(&vspin->lock, flags); +} + +static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct mpcp_semaphore, litmus_lock); +} + +int pfp_mpcp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + prio_wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + if (sem->vspin) + mpcp_vspin_enter(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. Use the priority + * ceiling for the local partition. */ + boost_priority(t, sem->prio_ceiling[get_partition(t)]); + + spin_lock_irqsave(&sem->wait.lock, flags); + + preempt_enable_no_resched(); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + /* ordered by regular priority */ + init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t))); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_prio_exclusive(&sem->wait, &wait); + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. */ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int pfp_mpcp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* Wake up next. The waiting job is already priority-boosted. 
*/ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (sem->vspin && err == 0) { + preempt_disable(); + mpcp_vspin_exit(); + preempt_enable(); + } + + return err; +} + +int pfp_mpcp_open(struct litmus_lock* l, void* config) +{ + struct task_struct *t = current; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + int cpu, local_cpu; + unsigned long flags; + + if (!is_realtime(t)) + /* we need to know the real-time priority */ + return -EPERM; + + local_cpu = get_partition(t); + + spin_lock_irqsave(&sem->wait.lock, flags); + + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu != local_cpu) + { + sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu], + get_priority(t)); + TRACE_CUR("priority ceiling for sem %p is now %d on cpu %d\n", + sem, sem->prio_ceiling[cpu], cpu); + } + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return 0; +} + +int pfp_mpcp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + pfp_mpcp_unlock(l); + + return 0; +} + +void pfp_mpcp_free(struct litmus_lock* lock) +{ + kfree(mpcp_from_lock(lock)); +} + +static struct litmus_lock_ops pfp_mpcp_lock_ops = { + .close = pfp_mpcp_close, + .lock = pfp_mpcp_lock, + .open = pfp_mpcp_open, + .unlock = pfp_mpcp_unlock, + .deallocate = pfp_mpcp_free, +}; + +static struct litmus_lock* pfp_new_mpcp(int vspin) +{ + struct mpcp_semaphore* sem; + int cpu; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &pfp_mpcp_lock_ops; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + sem->prio_ceiling[cpu] = OMEGA_CEILING; + + /* mark as virtual spinning */ + sem->vspin = vspin; + + return &sem->litmus_lock; +} + + +/* ******************** PCP support ********************** */ + + +struct pcp_semaphore { + struct list_head ceiling; + + /* current resource holder */ + struct task_struct *owner; + + /* priority ceiling --- can be negative due to DPCP support */ + int prio_ceiling; + + /* on which processor is this PCP semaphore allocated? 
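 *
 * A worked example of the per-CPU ceiling rule enforced by
 * pcp_exceeds_ceiling() below (editorial illustration; priorities are
 * made up, and as elsewhere in this file a numerically smaller value
 * means higher priority):
 *
 *      On this CPU:  S1 (prio_ceiling 2) is held by T3 (prio 6),
 *                    S2 (prio_ceiling 5) is free.
 *      The system ceiling is S1, the held semaphore with the smallest
 *      (= highest) ceiling.
 *
 *      T2 (prio 4) requests S2:  2 > 4 is false and T2 does not own S1,
 *          so T2 ceiling-blocks even though S2 itself is free.
 *      T1 (prio 1) requests S2:  2 > 1 holds, so T1 gets S2 immediately.
 *      T3 requests S2 while holding S1:  allowed, since T3 owns the
 *          semaphore that defines the current ceiling.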
*/ + int on_cpu; +}; + +struct pcp_state { + struct list_head system_ceiling; + + /* highest-priority waiting task */ + struct task_struct* hp_waiter; + + /* list of jobs waiting to get past the system ceiling */ + wait_queue_head_t ceiling_blocked; +}; + +static void pcp_init_state(struct pcp_state* s) +{ + INIT_LIST_HEAD(&s->system_ceiling); + s->hp_waiter = NULL; + init_waitqueue_head(&s->ceiling_blocked); +} + +static DEFINE_PER_CPU(struct pcp_state, pcp_state); + +/* assumes preemptions are off */ +static struct pcp_semaphore* pcp_get_ceiling(void) +{ + struct list_head* top = __get_cpu_var(pcp_state).system_ceiling.next; + + if (top) + return list_entry(top, struct pcp_semaphore, ceiling); + else + return NULL; +} + +/* assumes preempt off */ +static void pcp_add_ceiling(struct pcp_semaphore* sem) +{ + struct list_head *pos; + struct list_head *in_use = &__get_cpu_var(pcp_state).system_ceiling; + struct pcp_semaphore* held; + + BUG_ON(sem->on_cpu != smp_processor_id()); + BUG_ON(in_list(&sem->ceiling)); + + list_for_each(pos, in_use) { + held = list_entry(pos, struct pcp_semaphore, ceiling); + if (held->prio_ceiling >= sem->prio_ceiling) { + __list_add(&sem->ceiling, pos->prev, pos); + return; + } + } + + /* we hit the end of the list */ + + list_add_tail(&sem->ceiling, in_use); +} + +/* assumes preempt off */ +static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling, + struct task_struct* task, + int effective_prio) +{ + return ceiling == NULL || + ceiling->prio_ceiling > effective_prio || + ceiling->owner == task; +} + +/* assumes preempt off */ +static void pcp_priority_inheritance(void) +{ + unsigned long flags; + pfp_domain_t* pfp = local_pfp; + + struct pcp_semaphore* ceiling = pcp_get_ceiling(); + struct task_struct *blocker, *blocked; + + blocker = ceiling ? ceiling->owner : NULL; + blocked = __get_cpu_var(pcp_state).hp_waiter; + + raw_spin_lock_irqsave(&pfp->slock, flags); + + /* Current is no longer inheriting anything by default. This should be + * the currently scheduled job, and hence not currently queued. */ + BUG_ON(current != pfp->scheduled); + + fp_set_prio_inh(pfp, current, NULL); + fp_set_prio_inh(pfp, blocked, NULL); + fp_set_prio_inh(pfp, blocker, NULL); + + + /* Let blocking job inherit priority of blocked job, if required. 
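 *
 * For instance (editorial illustration): if T3 (prio 6) owns the
 * semaphore that currently defines the system ceiling and T1 (prio 1) is
 * the recorded hp_waiter, then the call below executes
 * fp_set_prio_inh(pfp, T3, T1), i.e. T1 is stored in T3's
 * rt_param.inh_task and T3 is re-queued (if queued) so that it competes
 * with T1's priority, until pcp_lower_ceiling() triggers the next
 * pcp_priority_inheritance() and the inheritance is cleared above.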
*/ + if (blocker && blocked && + fp_higher_prio(blocked, blocker)) { + TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n", + blocked->comm, blocked->pid, + get_priority(blocker), get_priority(blocked)); + fp_set_prio_inh(pfp, blocker, blocked); + } + + /* check if anything changed */ + if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) + preempt(pfp); + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +/* called with preemptions off */ +static void pcp_raise_ceiling(struct pcp_semaphore* sem, + int effective_prio) +{ + struct task_struct* t = current; + struct pcp_semaphore* ceiling; + prio_wait_queue_t wait; + unsigned int waiting_higher_prio; + + do { + ceiling = pcp_get_ceiling(); + if (pcp_exceeds_ceiling(ceiling, t, effective_prio)) + break; + + TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n", + sem, ceiling->owner->comm, ceiling->owner->pid); + + /* we need to wait until the ceiling is lowered */ + + /* enqueue in priority order */ + init_prio_waitqueue_entry(&wait, t, prio_point(effective_prio)); + set_task_state(t, TASK_UNINTERRUPTIBLE); + waiting_higher_prio = add_wait_queue_prio_exclusive( + &__get_cpu_var(pcp_state).ceiling_blocked, &wait); + + if (waiting_higher_prio == 0) { + TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n"); + + /* we are the new highest-priority waiting job + * => update inheritance */ + __get_cpu_var(pcp_state).hp_waiter = t; + pcp_priority_inheritance(); + } + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + + /* pcp_resume_unblocked() removed us from wait queue */ + + TS_LOCK_RESUME; + } while(1); + + TRACE_CUR("PCP got the ceiling and sem %p\n", sem); + + /* We are good to go. The semaphore should be available. */ + BUG_ON(sem->owner != NULL); + + sem->owner = t; + + pcp_add_ceiling(sem); +} + +static void pcp_resume_unblocked(void) +{ + wait_queue_head_t *blocked = &__get_cpu_var(pcp_state).ceiling_blocked; + unsigned long flags; + prio_wait_queue_t* q; + struct task_struct* t = NULL; + + struct pcp_semaphore* ceiling = pcp_get_ceiling(); + + spin_lock_irqsave(&blocked->lock, flags); + + while (waitqueue_active(blocked)) { + /* check first == highest-priority waiting job */ + q = list_entry(blocked->task_list.next, + prio_wait_queue_t, wq.task_list); + t = (struct task_struct*) q->wq.private; + + /* can it proceed now? => let it go */ + if (pcp_exceeds_ceiling(ceiling, t, + prio_from_point(q->priority))) { + __remove_wait_queue(blocked, &q->wq); + wake_up_process(t); + } else { + /* We are done. Update highest-priority waiter. */ + __get_cpu_var(pcp_state).hp_waiter = t; + goto out; + } + } + /* If we get here, then there are no more waiting + * jobs. */ + __get_cpu_var(pcp_state).hp_waiter = NULL; +out: + spin_unlock_irqrestore(&blocked->lock, flags); +} + +/* assumes preempt off */ +static void pcp_lower_ceiling(struct pcp_semaphore* sem) +{ + BUG_ON(!in_list(&sem->ceiling)); + BUG_ON(sem->owner != current); + BUG_ON(sem->on_cpu != smp_processor_id()); + + /* remove from ceiling list */ + list_del(&sem->ceiling); + + /* release */ + sem->owner = NULL; + + TRACE_CUR("PCP released sem %p\n", sem); + + /* Wake up all ceiling-blocked jobs that now pass the ceiling. */ + pcp_resume_unblocked(); + + pcp_priority_inheritance(); +} + +static void pcp_update_prio_ceiling(struct pcp_semaphore* sem, + int effective_prio) +{ + /* This needs to be synchronized on something. + * Might as well use waitqueue lock for the processor. 
+ * We assume this happens only before the task set starts execution, + * (i.e., during initialization), but it may happen on multiple processors + * at the same time. + */ + unsigned long flags; + + struct pcp_state* s = &per_cpu(pcp_state, sem->on_cpu); + + spin_lock_irqsave(&s->ceiling_blocked.lock, flags); + + sem->prio_ceiling = min(sem->prio_ceiling, effective_prio); + + spin_unlock_irqrestore(&s->ceiling_blocked.lock, flags); +} + +static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu) +{ + sem->owner = NULL; + INIT_LIST_HEAD(&sem->ceiling); + sem->prio_ceiling = INT_MAX; + sem->on_cpu = cpu; +} + + +/* ******************** DPCP support ********************** */ + +struct dpcp_semaphore { + struct litmus_lock litmus_lock; + struct pcp_semaphore pcp; + int owner_cpu; +}; + +static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct dpcp_semaphore, litmus_lock); +} + +/* called with preemptions disabled */ +static void pfp_migrate_to(int target_cpu) +{ + struct task_struct* t = current; + pfp_domain_t *from; + + if (get_partition(t) == target_cpu) + return; + + /* make sure target_cpu makes sense */ + BUG_ON(!cpu_online(target_cpu)); + + local_irq_disable(); + + /* scheduled task should not be in any ready or release queue */ + BUG_ON(is_queued(t)); + + /* lock both pfp domains in order of address */ + from = task_pfp(t); + + raw_spin_lock(&from->slock); + + /* switch partitions */ + tsk_rt(t)->task_params.cpu = target_cpu; + + raw_spin_unlock(&from->slock); + + /* Don't trace scheduler costs as part of + * locking overhead. Scheduling costs are accounted for + * explicitly. */ + TS_LOCK_SUSPEND; + + local_irq_enable(); + preempt_enable_no_resched(); + + /* deschedule to be migrated */ + schedule(); + + /* we are now on the target processor */ + preempt_disable(); + + /* start recording costs again */ + TS_LOCK_RESUME; + + BUG_ON(smp_processor_id() != target_cpu); +} + +int pfp_dpcp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int eprio = effective_agent_priority(get_priority(t)); + int from = get_partition(t); + int to = sem->pcp.on_cpu; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. 
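 *
 * In outline, the complete DPCP request below and its matching
 * pfp_dpcp_unlock() perform (editorial summary of the code in this file):
 *
 *      1. boost_priority(t, get_priority(t))    on the home partition
 *      2. pfp_migrate_to(sem->pcp.on_cpu)       move to the resource CPU
 *      3. pcp_raise_ceiling(&sem->pcp, eprio)   acquire under the local PCP
 *      4. ... critical section on the resource CPU ...
 *      5. pcp_lower_ceiling(&sem->pcp)          release
 *      6. unboost_priority(t)
 *      7. pfp_migrate_to(owner_cpu)             return to the home partition
 *
 * Here eprio = effective_agent_priority(get_priority(t)) =
 * get_priority(t) - LITMUS_MAX_PRIORITY, so -- assuming regular
 * priorities lie in 1..LITMUS_MAX_PRIORITY -- agents always rank above
 * local non-agent jobs; prio_point()/prio_from_point() merely shift such
 * values back into a non-negative range for storage in the wait queues.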
*/ + + boost_priority(t, get_priority(t)); + + pfp_migrate_to(to); + + pcp_raise_ceiling(&sem->pcp, eprio); + + /* yep, we got it => execute request */ + sem->owner_cpu = from; + + preempt_enable(); + + return 0; +} + +int pfp_dpcp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int err = 0; + int home; + + preempt_disable(); + + if (sem->pcp.on_cpu != smp_processor_id() || sem->pcp.owner != t) { + err = -EINVAL; + goto out; + } + + home = sem->owner_cpu; + + /* give it back */ + pcp_lower_ceiling(&sem->pcp); + + /* we lose the benefit of priority boosting */ + unboost_priority(t); + + pfp_migrate_to(home); + +out: + preempt_enable(); + + return err; +} + +int pfp_dpcp_open(struct litmus_lock* l, void* __user config) +{ + struct task_struct *t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int cpu, eprio; + + if (!is_realtime(t)) + /* we need to know the real-time priority */ + return -EPERM; + + if (get_user(cpu, (int*) config)) + return -EFAULT; + + /* make sure the resource location matches */ + if (cpu != sem->pcp.on_cpu) + return -EINVAL; + + eprio = effective_agent_priority(get_priority(t)); + + pcp_update_prio_ceiling(&sem->pcp, eprio); + + return 0; +} + +int pfp_dpcp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int owner = 0; + + preempt_disable(); + + if (sem->pcp.on_cpu == smp_processor_id()) + owner = sem->pcp.owner == t; + + preempt_enable(); + + if (owner) + pfp_dpcp_unlock(l); + + return 0; +} + +void pfp_dpcp_free(struct litmus_lock* lock) +{ + kfree(dpcp_from_lock(lock)); +} + +static struct litmus_lock_ops pfp_dpcp_lock_ops = { + .close = pfp_dpcp_close, + .lock = pfp_dpcp_lock, + .open = pfp_dpcp_open, + .unlock = pfp_dpcp_unlock, + .deallocate = pfp_dpcp_free, +}; + +static struct litmus_lock* pfp_new_dpcp(int on_cpu) +{ + struct dpcp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->litmus_lock.ops = &pfp_dpcp_lock_ops; + sem->owner_cpu = NO_CPU; + pcp_init_semaphore(&sem->pcp, on_cpu); + + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + + +static long pfp_allocate_lock(struct litmus_lock **lock, int type, + void* __user config) +{ + int err = -ENXIO, cpu; + struct srp_semaphore* srp; + + /* P-FP currently supports the SRP for local resources and the FMLP + * for global resources. 
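 *
 * (The switch below in fact accepts FMLP_SEM, MPCP_SEM, MPCP_VS_SEM,
 * DPCP_SEM and SRP_SEM, i.e. more than the two protocols named above.)
 *
 * A minimal user-space sketch of allocating and using a DPCP semaphore,
 * assuming the liblitmus helpers od_openx(), od_close(), litmus_lock()
 * and litmus_unlock() -- these names are liblitmus conventions and are
 * not defined in this patch; the cpu argument is what pfp_dpcp_open()
 * reads via get_user():
 *
 *      int cpu = 2;                                // CPU hosting the resource
 *      int od  = od_openx(fd, DPCP_SEM, 0, &cpu);  // 0 = shared object id
 *      if (od < 0)
 *              exit(1);                            // allocation failed
 *      litmus_lock(od);                  // boost, migrate, raise ceiling
 *      // ... critical section, executed on CPU 2 ...
 *      litmus_unlock(od);                // lower ceiling, return home
 *      od_close(od);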
*/ + switch (type) { + case FMLP_SEM: + /* FIFO Mutex Locking Protocol */ + *lock = pfp_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case MPCP_SEM: + /* Multiprocesor Priority Ceiling Protocol */ + *lock = pfp_new_mpcp(0); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case MPCP_VS_SEM: + /* Multiprocesor Priority Ceiling Protocol with virtual spinning */ + *lock = pfp_new_mpcp(1); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case DPCP_SEM: + /* Distributed Priority Ceiling Protocol */ + if (get_user(cpu, (int*) config)) + return -EFAULT; + + if (!cpu_online(cpu)) + return -EINVAL; + + *lock = pfp_new_dpcp(cpu); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case SRP_SEM: + /* Baker's Stack Resource Policy */ + srp = allocate_srp_semaphore(); + if (srp) { + *lock = &srp->litmus_lock; + err = 0; + } else + err = -ENOMEM; + break; + }; + + return err; +} + +#endif + +static long pfp_admit_task(struct task_struct* tsk) +{ + if (task_cpu(tsk) == tsk->rt_param.task_params.cpu && +#ifdef CONFIG_RELEASE_MASTER + /* don't allow tasks on release master CPU */ + task_cpu(tsk) != remote_dom(task_cpu(tsk))->release_master && +#endif + get_priority(tsk) > 0) + return 0; + else + return -EINVAL; +} + +static long pfp_activate_plugin(void) +{ +#ifdef CONFIG_RELEASE_MASTER + int cpu; + + for_each_online_cpu(cpu) { + remote_dom(cpu)->release_master = atomic_read(&release_master_cpu); + } +#endif + +#ifdef CONFIG_LITMUS_LOCKING + get_srp_prio = pfp_get_srp_prio; + + for_each_online_cpu(cpu) { + init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, cpu)); + per_cpu(mpcpvs_vspin, cpu) = NULL; + + pcp_init_state(&per_cpu(pcp_state, cpu)); + pfp_doms[cpu] = remote_pfp(cpu); + } + +#endif + + return 0; +} + + +/* Plugin object */ +static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = { + .plugin_name = "P-FP", + .tick = pfp_tick, + .task_new = pfp_task_new, + .complete_job = complete_job, + .task_exit = pfp_task_exit, + .schedule = pfp_schedule, + .task_wake_up = pfp_task_wake_up, + .task_block = pfp_task_block, + .admit_task = pfp_admit_task, + .activate_plugin = pfp_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = pfp_allocate_lock, + .finish_switch = pfp_finish_switch, +#endif +}; + + +static int __init init_pfp(void) +{ + int i; + + /* We do not really want to support cpu hotplug, do we? ;) + * However, if we are so crazy to do so, + * we cannot use num_online_cpu() + */ + for (i = 0; i < num_online_cpus(); i++) { + pfp_domain_init(remote_pfp(i), i); + } + return register_sched_plugin(&pfp_plugin); +} + +module_init(init_pfp); + diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c new file mode 100644 index 000000000000..950fe5e6a1ab --- /dev/null +++ b/litmus/sched_plugin.c @@ -0,0 +1,233 @@ +/* sched_plugin.c -- core infrastructure for the scheduler plugin system + * + * This file includes the initialization of the plugin system, the no-op Linux + * scheduler plugin, some dummy functions, and some helper functions. + */ + +#include +#include +#include + +#include +#include +#include +#include + +/* + * Generic function to trigger preemption on either local or remote cpu + * from scheduler plugins. The key feature is that this function is + * non-preemptive section aware and does not invoke the scheduler / send + * IPIs if the to-be-preempted task is actually non-preemptive. 
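 *
 * A typical caller is the per-domain preempt() helper of the partitioned
 * plugins in this patch, invoked with the domain lock held, e.g. in
 * sched_psn_edf.c:
 *
 *      static void preempt(psnedf_domain_t *pedf)
 *      {
 *              preempt_if_preemptable(pedf->scheduled, pedf->cpu);
 *      }
 *
 * Three cases are handled below: with no real-time task on the CPU the
 * reschedule is always requested; on the local CPU a user-space
 * non-preemptive section is asked to exit via request_exit_np(), and a
 * reschedule happens only outside kernel non-preemptive sections; for a
 * remote CPU the reschedule IPI is sent only if the task is neither in a
 * kernel non-preemptive section nor had its exit-request flag set by
 * request_exit_np_atomic().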
+ */ +void preempt_if_preemptable(struct task_struct* t, int cpu) +{ + /* t is the real-time task executing on CPU on_cpu If t is NULL, then + * on_cpu is currently scheduling background work. + */ + + int reschedule = 0; + + if (!t) + /* move non-real-time task out of the way */ + reschedule = 1; + else { + if (smp_processor_id() == cpu) { + /* local CPU case */ + /* check if we need to poke userspace */ + if (is_user_np(t)) + /* Yes, poke it. This doesn't have to be atomic since + * the task is definitely not executing. */ + request_exit_np(t); + else if (!is_kernel_np(t)) + /* only if we are allowed to preempt the + * currently-executing task */ + reschedule = 1; + } else { + /* Remote CPU case. Only notify if it's not a kernel + * NP section and if we didn't set the userspace + * flag. */ + reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t)); + } + } + if (likely(reschedule)) + litmus_reschedule(cpu); +} + + +/************************************************************* + * Dummy plugin functions * + *************************************************************/ + +static void litmus_dummy_finish_switch(struct task_struct * prev) +{ +} + +static struct task_struct* litmus_dummy_schedule(struct task_struct * prev) +{ + sched_state_task_picked(); + return NULL; +} + +static void litmus_dummy_tick(struct task_struct* tsk) +{ +} + +static long litmus_dummy_admit_task(struct task_struct* tsk) +{ + printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n", + tsk->comm, tsk->pid); + return -EINVAL; +} + +static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running) +{ +} + +static void litmus_dummy_task_wake_up(struct task_struct *task) +{ +} + +static void litmus_dummy_task_block(struct task_struct *task) +{ +} + +static void litmus_dummy_task_exit(struct task_struct *task) +{ +} + +static void litmus_dummy_pre_setsched(struct task_struct *task, int policy) +{ +} + + +static long litmus_dummy_complete_job(void) +{ + return -ENOSYS; +} + +static long litmus_dummy_activate_plugin(void) +{ + return 0; +} + +static long litmus_dummy_deactivate_plugin(void) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type, + void* __user config) +{ + return -ENXIO; +} + +#endif + + +/* The default scheduler plugin. It doesn't do anything and lets Linux do its + * job. + */ +struct sched_plugin linux_sched_plugin = { + .plugin_name = "Linux", + .tick = litmus_dummy_tick, + .task_new = litmus_dummy_task_new, + .task_exit = litmus_dummy_task_exit, + .task_wake_up = litmus_dummy_task_wake_up, + .task_block = litmus_dummy_task_block, + .complete_job = litmus_dummy_complete_job, + .schedule = litmus_dummy_schedule, + .finish_switch = litmus_dummy_finish_switch, + .activate_plugin = litmus_dummy_activate_plugin, + .deactivate_plugin = litmus_dummy_deactivate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = litmus_dummy_allocate_lock, +#endif + .admit_task = litmus_dummy_admit_task +}; + +/* + * The reference to current plugin that is used to schedule tasks within + * the system. 
It stores references to actual function implementations + * Should be initialized by calling "init_***_plugin()" + */ +struct sched_plugin *litmus = &linux_sched_plugin; + +/* the list of registered scheduling plugins */ +static LIST_HEAD(sched_plugins); +static DEFINE_RAW_SPINLOCK(sched_plugins_lock); + +#define CHECK(func) {\ + if (!plugin->func) \ + plugin->func = litmus_dummy_ ## func;} + +/* FIXME: get reference to module */ +int register_sched_plugin(struct sched_plugin* plugin) +{ + printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n", + plugin->plugin_name); + + /* make sure we don't trip over null pointers later */ + CHECK(finish_switch); + CHECK(schedule); + CHECK(tick); + CHECK(task_wake_up); + CHECK(task_exit); + CHECK(task_block); + CHECK(task_new); + CHECK(complete_job); + CHECK(activate_plugin); + CHECK(deactivate_plugin); +#ifdef CONFIG_LITMUS_LOCKING + CHECK(allocate_lock); +#endif + CHECK(admit_task); + CHECK(pre_setsched); + + if (!plugin->release_at) + plugin->release_at = release_at; + + raw_spin_lock(&sched_plugins_lock); + list_add(&plugin->list, &sched_plugins); + raw_spin_unlock(&sched_plugins_lock); + + return 0; +} + + +/* FIXME: reference counting, etc. */ +struct sched_plugin* find_sched_plugin(const char* name) +{ + struct list_head *pos; + struct sched_plugin *plugin; + + raw_spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + if (!strcmp(plugin->plugin_name, name)) + goto out_unlock; + } + plugin = NULL; + +out_unlock: + raw_spin_unlock(&sched_plugins_lock); + return plugin; +} + +int print_sched_plugins(char* buf, int max) +{ + int count = 0; + struct list_head *pos; + struct sched_plugin *plugin; + + raw_spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name); + if (max - count <= 0) + break; + } + raw_spin_unlock(&sched_plugins_lock); + return count; +} diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c new file mode 100644 index 000000000000..7b12689ab61a --- /dev/null +++ b/litmus/sched_psn_edf.c @@ -0,0 +1,917 @@ +/* + * kernel/sched_psn_edf.c + * + * Implementation of the PSN-EDF scheduler plugin. + * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c. + * + * Suspensions and non-preemptable sections are supported. + * Priority inheritance is not supported. 
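 *
 * Non-preemptable sections are driven from user space.  A rough usage
 * sketch, assuming the liblitmus helpers enter_np() and exit_np() (the
 * helpers and the control-page flag they toggle are not defined in this
 * file):
 *
 *      enter_np();        // is_np() / is_user_np() become true
 *      // ... short critical section, must not suspend ...
 *      exit_np();         // if the scheduler called request_exit_np()
 *                         // in the meantime, this yields the CPU
 *
 * psnedf_tick() and psnedf_schedule() below only request the end of such
 * a section; the actual preemption is deferred until exit_np().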
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct { + rt_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ +/* + * scheduling lock slock + * protects the domain and serializes scheduling decisions + */ +#define slock domain.ready_lock + +} psnedf_domain_t; + +DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains); + +#define local_edf (&__get_cpu_var(psnedf_domains).domain) +#define local_pedf (&__get_cpu_var(psnedf_domains)) +#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain) +#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu)) +#define task_edf(task) remote_edf(get_partition(task)) +#define task_pedf(task) remote_pedf(get_partition(task)) + + +static void psnedf_domain_init(psnedf_domain_t* pedf, + check_resched_needed_t check, + release_jobs_t release, + int cpu) +{ + edf_domain_init(&pedf->domain, check, release); + pedf->cpu = cpu; + pedf->scheduled = NULL; +} + +static void requeue(struct task_struct* t, rt_domain_t *edf) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + __add_ready(edf, t); + else + add_release(edf, t); /* it has got to wait */ +} + +/* we assume the lock is being held */ +static void preempt(psnedf_domain_t *pedf) +{ + preempt_if_preemptable(pedf->scheduled, pedf->cpu); +} + +#ifdef CONFIG_LITMUS_LOCKING + +static void boost_priority(struct task_struct* t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + lt_t now; + + raw_spin_lock_irqsave(&pedf->slock, flags); + now = litmus_clock(); + + TRACE_TASK(t, "priority boosted at %llu\n", now); + + tsk_rt(t)->priority_boosted = 1; + tsk_rt(t)->boost_start_time = now; + + if (pedf->scheduled != t) { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&pedf->domain.release_lock); + if (is_queued(t) && + /* If it is queued, then we need to re-order. */ + bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) && + /* If we bubbled to the top, then we need to check for preemptions. */ + edf_preemption_needed(&pedf->domain, pedf->scheduled)) + preempt(pedf); + raw_spin_unlock(&pedf->domain.release_lock); + } /* else: nothing to do since the job is not queued while scheduled */ + + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +static void unboost_priority(struct task_struct* t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + lt_t now; + + raw_spin_lock_irqsave(&pedf->slock, flags); + now = litmus_clock(); + + /* assumption: this only happens when the job is scheduled */ + BUG_ON(pedf->scheduled != t); + + TRACE_TASK(t, "priority restored at %llu\n", now); + + /* priority boosted jobs must be scheduled */ + BUG_ON(pedf->scheduled != t); + + tsk_rt(t)->priority_boosted = 0; + tsk_rt(t)->boost_start_time = 0; + + /* check if this changes anything */ + if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) + preempt(pedf); + + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +#endif + +/* This check is trivial in partioned systems as we only have to consider + * the CPU of the partition. 
+ */ +static int psnedf_check_resched(rt_domain_t *edf) +{ + psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain); + + /* because this is a callback from rt_domain_t we already hold + * the necessary lock for the ready queue + */ + if (edf_preemption_needed(edf, pedf->scheduled)) { + preempt(pedf); + return 1; + } else + return 0; +} + +static void job_completion(struct task_struct* t, int forced) +{ + sched_trace_task_completion(t,forced); + TRACE_TASK(t, "job_completion().\n"); + + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static void psnedf_tick(struct task_struct *t) +{ + psnedf_domain_t *pedf = local_pedf; + + /* Check for inconsistency. We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pedf->scheduled); + + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + litmus_reschedule_local(); + TRACE("psnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("psnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +static struct task_struct* psnedf_schedule(struct task_struct * prev) +{ + psnedf_domain_t* pedf = local_pedf; + rt_domain_t* edf = &pedf->domain; + struct task_struct* next; + + int out_of_time, sleep, preempt, + np, exists, blocks, resched; + + raw_spin_lock(&pedf->slock); + + /* sanity checking + * differently from gedf, when a task exits (dead) + * pedf->schedule may be null and prev _is_ realtime + */ + BUG_ON(pedf->scheduled && pedf->scheduled != prev); + BUG_ON(pedf->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = pedf->scheduled != NULL; + blocks = exists && !is_running(pedf->scheduled); + out_of_time = exists && + budget_enforced(pedf->scheduled) && + budget_exhausted(pedf->scheduled); + np = exists && is_np(pedf->scheduled); + sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP; + preempt = edf_preemption_needed(edf, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Multiple calls to request_exit_np() don't hurt. + */ + if (np && (out_of_time || preempt || sleep)) + request_exit_np(pedf->scheduled); + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep) && !blocks) { + job_completion(pedf->scheduled, !sleep); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + next = NULL; + if ((!np || blocks) && (resched || !exists)) { + /* When preempting a task that does not block, then + * re-insert it into either the ready queue or the + * release queue (if it completed). requeue() picks + * the appropriate queue. + */ + if (pedf->scheduled && !blocks) + requeue(pedf->scheduled, edf); + next = __take_ready(edf); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
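 *
 * Recap of the decision logic above (editorial summary, using the flags
 * computed at the top of this function):
 *
 *      blocks                        -> prev is not requeued; pick
 *                                       __take_ready()
 *      !np && (out_of_time || sleep) && !blocks
 *                                    -> job completed: requeue prev,
 *                                       pick __take_ready()
 *      preempt && !np && !blocks     -> requeue prev, pick the
 *                                       higher-priority ready task
 *      np && (out_of_time || sleep || preempt)
 *                                    -> keep prev; request_exit_np()
 *                                       was called above
 *      otherwise                     -> keep prev (the branch right
 *                                       below)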
+ */ + if (exists) + next = prev; + + if (next) { + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + set_rt_flags(next, RT_F_RUNNING); + } else { + TRACE("becoming idle at %llu\n", litmus_clock()); + } + + pedf->scheduled = next; + sched_state_task_picked(); + raw_spin_unlock(&pedf->slock); + + return next; +} + + +/* Prepare a task for running in RT mode + */ +static void psnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + rt_domain_t* edf = task_edf(t); + psnedf_domain_t* pedf = task_pedf(t); + unsigned long flags; + + TRACE_TASK(t, "psn edf: task new, cpu = %d\n", + t->rt_param.task_params.cpu); + + /* setup job parameters */ + release_at(t, litmus_clock()); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + raw_spin_lock_irqsave(&pedf->slock, flags); + if (running) { + /* there shouldn't be anything else running at the time */ + BUG_ON(pedf->scheduled); + pedf->scheduled = t; + } else { + requeue(t, edf); + /* maybe we have to reschedule */ + preempt(pedf); + } + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +static void psnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(task); + rt_domain_t* edf = task_edf(task); + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + raw_spin_lock_irqsave(&pedf->slock, flags); + BUG_ON(is_queued(task)); + now = litmus_clock(); + if (is_tardy(task, now) +#ifdef CONFIG_LITMUS_LOCKING + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + && !is_priority_boosted(task) +#endif + ) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + + /* Only add to ready queue if it is not the currently-scheduled + * task. This could be the case if a task was woken up concurrently + * on a remote CPU before the executing CPU got around to actually + * de-scheduling the task, i.e., wake_up() raced with schedule() + * and won. 
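 *
 * Concretely (editorial illustration of the race described above):
 *
 *      partition CPU:  T blocks and enters psnedf_schedule(), but has not
 *                      yet taken pedf->slock.
 *      remote CPU:     wakes T; psnedf_task_wake_up() wins the race for
 *                      pedf->slock and finds pedf->scheduled == T, so it
 *                      does not requeue T here.
 *      partition CPU:  psnedf_schedule() then sees T as running again
 *                      (blocks == 0) and keeps or requeues it itself.
 *
 * Requeueing T in the second step would have placed it on the ready
 * queue while it is still pedf->scheduled, violating the invariant that
 * the scheduled job is never queued.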
+ */ + if (pedf->scheduled != task) + requeue(task, edf); + + raw_spin_unlock_irqrestore(&pedf->slock, flags); + TRACE_TASK(task, "wake up done\n"); +} + +static void psnedf_task_block(struct task_struct *t) +{ + /* only running tasks can block, thus t is in no queue */ + TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state); + + BUG_ON(!is_realtime(t)); + BUG_ON(is_queued(t)); +} + +static void psnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + rt_domain_t* edf; + + raw_spin_lock_irqsave(&pedf->slock, flags); + if (is_queued(t)) { + /* dequeue */ + edf = task_edf(t); + remove(edf, t); + } + if (pedf->scheduled == t) + pedf->scheduled = NULL; + + TRACE_TASK(t, "RIP, now reschedule\n"); + + preempt(pedf); + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include + +/* ******************** SRP support ************************ */ + +static unsigned int psnedf_get_srp_prio(struct task_struct* t) +{ + /* assumes implicit deadlines */ + return get_rt_period(t); +} + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} +int psnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + TRACE_CUR("want FMLP sem %p\n", sem); + + boost_priority(t); + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + TRACE_CUR("blocking on FMLP sem %p\n", sem); + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + TRACE_CUR("got FMLP sem %p\n", sem); + + preempt_enable(); + + return 0; +} + +int psnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + TRACE_CUR("releasing FMLP sem %p\n", sem); + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* wake up next */ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + return err; +} + +int psnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + psnedf_fmlp_unlock(l); + + return 0; +} + +void psnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops psnedf_fmlp_lock_ops = { + .close = psnedf_fmlp_close, + .lock = psnedf_fmlp_lock, + .unlock = psnedf_fmlp_unlock, + .deallocate = psnedf_fmlp_free, +}; + +static struct litmus_lock* psnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &psnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + + + +/* ******************** OMLP support **********************/ + +/* Since jobs spin "virtually" while waiting to acquire a lock, + * they first must aquire a local per-cpu resource. 
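 *
 * Acquisition is therefore two-staged (editorial outline of the code
 * below):
 *
 *      psnedf_omlp_lock():
 *              omlp_grab_token();    // per-CPU token; waiters ordered by
 *                                    // deadline, PID breaks ties
 *              boost_priority(t);
 *              // enqueue FIFO on sem->wait if sem->owner != NULL
 *
 *      psnedf_omlp_unlock():
 *              unboost_priority(t);
 *              // hand sem->owner to the first FIFO waiter, if any
 *              omlp_release_token();
 *
 * At most one job per CPU thus contends in the global FIFO queue of each
 * semaphore at any point in time.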
+ */ +static DEFINE_PER_CPU(wait_queue_head_t, omlp_token_wait); +static DEFINE_PER_CPU(struct task_struct*, omlp_token); + +/* called with preemptions off <=> no local modifications */ +static void omlp_grab_token(void) +{ + struct task_struct* t = current; + + while (1) { + if (__get_cpu_var(omlp_token) == NULL) { + /* take it */ + __get_cpu_var(omlp_token) = t; + break; + } else { + /* some job is spinning => enqueue in request queue */ + prio_wait_queue_t wait; + wait_queue_head_t* token_waiters = &__get_cpu_var(omlp_token_wait); + unsigned long flags; + + /* ordered by regular priority; break by lower PID */ + init_prio_waitqueue_entry_tie(&wait, t, get_deadline(t), t->pid); + + spin_lock_irqsave(&token_waiters->lock, flags); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_prio_exclusive(token_waiters, &wait); + + TRACE_CUR("waiting for OMLP token\n"); + + spin_unlock_irqrestore(&token_waiters->lock, flags); + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + /* Recheck if we got it */ + } + } + /* ok, now it is ours */ + TRACE_CUR("got OMLP token\n"); +} + +/* called with preemptions off */ +static void omlp_release_token(void) +{ + struct task_struct* t = current, *next; + unsigned long flags; + wait_queue_head_t* token_waiters = &__get_cpu_var(omlp_token_wait); + + BUG_ON(__get_cpu_var(omlp_token) != t); + + __get_cpu_var(omlp_token) = NULL; + + TRACE_CUR("released OMLP token\n"); + + spin_lock_irqsave(&token_waiters->lock, flags); + next = __waitqueue_remove_first(token_waiters); + + if (next) + wake_up_process(next); + + spin_unlock_irqrestore(&token_waiters->lock, flags); +} + + +struct omlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct omlp_semaphore* omlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct omlp_semaphore, litmus_lock); +} +int psnedf_omlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + omlp_grab_token(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. */ + boost_priority(t); + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + preempt_enable(); + + return 0; +} + +int psnedf_omlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + int err = 0; + + preempt_disable(); + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + spin_unlock_irqrestore(&sem->wait.lock, flags); + goto out; + } + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* Wake up next. The waiting job is already priority-boosted. */ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + omlp_release_token(); + +out: + preempt_enable(); + return err; +} + +int psnedf_omlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + psnedf_omlp_unlock(l); + + return 0; +} + +void psnedf_omlp_free(struct litmus_lock* lock) +{ + kfree(omlp_from_lock(lock)); +} + +static struct litmus_lock_ops psnedf_omlp_lock_ops = { + .close = psnedf_omlp_close, + .lock = psnedf_omlp_lock, + .unlock = psnedf_omlp_unlock, + .deallocate = psnedf_omlp_free, +}; + +static struct litmus_lock* psnedf_new_omlp(void) +{ + struct omlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &psnedf_omlp_lock_ops; + + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + + +static long psnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + struct srp_semaphore* srp; + + /* PSN-EDF currently supports the SRP for local resources and the FMLP + * for global resources. 
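 *
 * (The switch below also accepts OMLP_SEM; unlike the P-FP plugin above,
 * the configuration argument is unused here.)  From user space these
 * locks are obtained and used just like in the sketch given for
 * pfp_allocate_lock(), except that plain od_open() without a config
 * pointer suffices -- again assuming the liblitmus helper names, which
 * are not defined in this patch.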
*/ + switch (type) { + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = psnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case OMLP_SEM: + /* O(m) Locking Protocol */ + *lock = psnedf_new_omlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case SRP_SEM: + /* Baker's Stack Resource Policy */ + srp = allocate_srp_semaphore(); + if (srp) { + *lock = &srp->litmus_lock; + err = 0; + } else + err = -ENOMEM; + break; + }; + + return err; +} + +#endif + + +static long psnedf_activate_plugin(void) +{ + + int cpu; + + for_each_online_cpu(cpu) { +#ifdef CONFIG_RELEASE_MASTER + remote_edf(cpu)->release_master = atomic_read(&release_master_cpu); +#endif +#ifdef CONFIG_LITMUS_LOCKING + init_waitqueue_head(&per_cpu(omlp_token_wait, cpu)); + per_cpu(omlp_token, cpu) = NULL; +#endif + } + + +#ifdef CONFIG_LITMUS_LOCKING + get_srp_prio = psnedf_get_srp_prio; +#endif + + return 0; +} + +static long psnedf_admit_task(struct task_struct* tsk) +{ + if (task_cpu(tsk) == tsk->rt_param.task_params.cpu +#ifdef CONFIG_RELEASE_MASTER + /* don't allow tasks on release master CPU */ + && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master +#endif + ) + return 0; + else + return -EINVAL; +} + +/* Plugin object */ +static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PSN-EDF", + .tick = psnedf_tick, + .task_new = psnedf_task_new, + .complete_job = complete_job, + .task_exit = psnedf_task_exit, + .schedule = psnedf_schedule, + .task_wake_up = psnedf_task_wake_up, + .task_block = psnedf_task_block, + .admit_task = psnedf_admit_task, + .activate_plugin = psnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = psnedf_allocate_lock, +#endif +}; + + +static int __init init_psn_edf(void) +{ + int i; + + /* We do not really want to support cpu hotplug, do we? ;) + * However, if we are so crazy to do so, + * we cannot use num_online_cpu() + */ + for (i = 0; i < num_online_cpus(); i++) { + psnedf_domain_init(remote_pedf(i), + psnedf_check_resched, + NULL, i); + } + return register_sched_plugin(&psn_edf_plugin); +} + +module_init(init_psn_edf); + diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c new file mode 100644 index 000000000000..5ef8d09ab41f --- /dev/null +++ b/litmus/sched_task_trace.c @@ -0,0 +1,241 @@ +/* + * sched_task_trace.c -- record scheduling events to a byte stream + */ + +#define NO_TASK_TRACE_DECLS + +#include +#include +#include + +#include +#include + +#include +#include +#include + + +#define NO_EVENTS (1 << CONFIG_SCHED_TASK_TRACE_SHIFT) + +#define now() litmus_clock() + +struct local_buffer { + struct st_event_record record[NO_EVENTS]; + char flag[NO_EVENTS]; + struct ft_buffer ftbuf; +}; + +DEFINE_PER_CPU(struct local_buffer, st_event_buffer); + +static struct ftdev st_dev; + +static int st_dev_can_open(struct ftdev *dev, unsigned int cpu) +{ + return cpu_online(cpu) ? 
0 : -ENODEV; +} + +static int __init init_sched_task_trace(void) +{ + struct local_buffer* buf; + int i, ok = 0, err; + printk("Allocated %u sched_trace_xxx() events per CPU " + "(buffer size: %d bytes)\n", + NO_EVENTS, (int) sizeof(struct local_buffer)); + + err = ftdev_init(&st_dev, THIS_MODULE, + num_online_cpus(), "sched_trace"); + if (err) + goto err_out; + + for (i = 0; i < st_dev.minor_cnt; i++) { + buf = &per_cpu(st_event_buffer, i); + ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS, + sizeof(struct st_event_record), + buf->flag, + buf->record); + st_dev.minor[i].buf = &buf->ftbuf; + } + if (ok == st_dev.minor_cnt) { + st_dev.can_open = st_dev_can_open; + err = register_ftdev(&st_dev); + if (err) + goto err_dealloc; + } else { + err = -EINVAL; + goto err_dealloc; + } + + return 0; + +err_dealloc: + ftdev_exit(&st_dev); +err_out: + printk(KERN_WARNING "Could not register sched_trace module\n"); + return err; +} + +static void __exit exit_sched_task_trace(void) +{ + ftdev_exit(&st_dev); +} + +module_init(init_sched_task_trace); +module_exit(exit_sched_task_trace); + + +static inline struct st_event_record* get_record(u8 type, struct task_struct* t) +{ + struct st_event_record* rec = NULL; + struct local_buffer* buf; + + buf = &get_cpu_var(st_event_buffer); + if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) { + rec->hdr.type = type; + rec->hdr.cpu = smp_processor_id(); + rec->hdr.pid = t ? t->pid : 0; + rec->hdr.job = t ? t->rt_param.job_params.job_no : 0; + } else { + put_cpu_var(st_event_buffer); + } + /* rec will be NULL if it failed */ + return rec; +} + +static inline void put_record(struct st_event_record* rec) +{ + struct local_buffer* buf; + buf = &__get_cpu_var(st_event_buffer); + ft_buffer_finish_write(&buf->ftbuf, rec); + put_cpu_var(st_event_buffer); +} + +feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_NAME, t); + int i; + if (rec) { + for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++) + rec->data.name.cmd[i] = t->comm[i]; + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_PARAM, t); + if (rec) { + rec->data.param.wcet = get_exec_cost(t); + rec->data.param.period = get_rt_period(t); + rec->data.param.phase = get_rt_phase(t); + rec->data.param.partition = get_partition(t); + rec->data.param.class = get_class(t); + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_RELEASE, t); + if (rec) { + rec->data.release.release = get_release(t); + rec->data.release.deadline = get_deadline(t); + put_record(rec); + } +} + +/* skipped: st_assigned_data, we don't use it atm */ + +feather_callback void do_sched_trace_task_switch_to(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec; + if (is_realtime(t)) { + rec = get_record(ST_SWITCH_TO, t); + if (rec) { + rec->data.switch_to.when = now(); + rec->data.switch_to.exec_time = get_exec_time(t); + put_record(rec); + } + } +} + +feather_callback void do_sched_trace_task_switch_away(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct 
st_event_record* rec; + if (is_realtime(t)) { + rec = get_record(ST_SWITCH_AWAY, t); + if (rec) { + rec->data.switch_away.when = now(); + rec->data.switch_away.exec_time = get_exec_time(t); + put_record(rec); + } + } +} + +feather_callback void do_sched_trace_task_completion(unsigned long id, + unsigned long _task, + unsigned long forced) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_COMPLETION, t); + if (rec) { + rec->data.completion.when = now(); + rec->data.completion.forced = forced; + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_block(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_BLOCK, t); + if (rec) { + rec->data.block.when = now(); + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_resume(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_RESUME, t); + if (rec) { + rec->data.resume.when = now(); + put_record(rec); + } +} + +feather_callback void do_sched_trace_sys_release(unsigned long id, + unsigned long _start) +{ + lt_t *start = (lt_t*) _start; + struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL); + if (rec) { + rec->data.sys_release.when = now(); + rec->data.sys_release.release = *start; + put_record(rec); + } +} + +feather_callback void do_sched_trace_action(unsigned long id, + unsigned long _task, + unsigned long action) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_ACTION, t); + + if (rec) { + rec->data.action.when = now(); + rec->data.action.action = action; + put_record(rec); + } +} diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c new file mode 100644 index 000000000000..f4171fddbbb1 --- /dev/null +++ b/litmus/sched_trace.c @@ -0,0 +1,252 @@ +/* + * sched_trace.c -- record scheduling events to a byte stream. + */ +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define SCHED_TRACE_NAME "litmus/log" + +/* Compute size of TRACE() buffer */ +#define LITMUS_TRACE_BUF_SIZE (1 << CONFIG_SCHED_DEBUG_TRACE_SHIFT) + +/* Max length of one read from the buffer */ +#define MAX_READ_LEN (64 * 1024) + +/* Max length for one write --- by TRACE() --- to the buffer. This is used to + * allocate a per-cpu buffer for printf() formatting. */ +#define MSG_SIZE 255 + + +static DEFINE_MUTEX(reader_mutex); +static atomic_t reader_cnt = ATOMIC_INIT(0); +static DEFINE_KFIFO(debug_buffer, char, LITMUS_TRACE_BUF_SIZE); + + +static DEFINE_RAW_SPINLOCK(log_buffer_lock); +static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer); + +/* + * sched_trace_log_message - Write to the trace buffer (log_buffer) + * + * This is the only function accessing the log_buffer from inside the + * kernel for writing. + * Concurrent access to sched_trace_log_message must be serialized using + * log_buffer_lock + * The maximum length of a formatted message is 255 + */ +void sched_trace_log_message(const char* fmt, ...) 
+{ + unsigned long flags; + va_list args; + size_t len; + char* buf; + + if (!atomic_read(&reader_cnt)) + /* early exit if nobody is listening */ + return; + + va_start(args, fmt); + local_irq_save(flags); + + /* format message */ + buf = __get_cpu_var(fmt_buffer); + len = vscnprintf(buf, MSG_SIZE, fmt, args); + + raw_spin_lock(&log_buffer_lock); + /* Don't copy the trailing null byte, we don't want null bytes in a + * text file. + */ + kfifo_in(&debug_buffer, buf, len); + raw_spin_unlock(&log_buffer_lock); + + local_irq_restore(flags); + va_end(args); +} + + +/* + * log_read - Read the trace buffer + * + * This function is called as a file operation from userspace. + * Readers can sleep. Access is serialized through reader_mutex + */ +static ssize_t log_read(struct file *filp, + char __user *to, size_t len, + loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + + ssize_t error = -EINVAL; + char* mem; + + if (mutex_lock_interruptible(&reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + if (len > MAX_READ_LEN) + len = MAX_READ_LEN; + + mem = kmalloc(len, GFP_KERNEL); + if (!mem) { + error = -ENOMEM; + goto out_unlock; + } + + error = kfifo_out(&debug_buffer, mem, len); + while (!error) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(110); + if (signal_pending(current)) + error = -ERESTARTSYS; + else + error = kfifo_out(&debug_buffer, mem, len); + } + + if (error > 0 && copy_to_user(to, mem, error)) + error = -EFAULT; + + kfree(mem); + out_unlock: + mutex_unlock(&reader_mutex); + out: + return error; +} + +/* + * Enable redirection of printk() messages to the trace buffer. + * Defined in kernel/printk.c + */ +extern int trace_override; +extern int trace_recurse; + +/* + * log_open - open the global log message ring buffer. + */ +static int log_open(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + + if (mutex_lock_interruptible(&reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + atomic_inc(&reader_cnt); + error = 0; + + printk(KERN_DEBUG + "sched_trace kfifo with buffer starting at: 0x%p\n", + debug_buffer.buf); + + /* override printk() */ + trace_override++; + + mutex_unlock(&reader_mutex); + out: + return error; +} + +static int log_release(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + + if (mutex_lock_interruptible(&reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + atomic_dec(&reader_cnt); + + /* release printk() overriding */ + trace_override--; + + printk(KERN_DEBUG "sched_trace kfifo released\n"); + + mutex_unlock(&reader_mutex); + out: + return error; +} + +/* + * log_fops - The file operations for accessing the global LITMUS log message + * buffer. + * + * Except for opening the device file it uses the same operations as trace_fops. + */ +static struct file_operations log_fops = { + .owner = THIS_MODULE, + .open = log_open, + .release = log_release, + .read = log_read, +}; + +static struct miscdevice litmus_log_dev = { + .name = SCHED_TRACE_NAME, + .minor = MISC_DYNAMIC_MINOR, + .fops = &log_fops, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +void dump_trace_buffer(int max) +{ + char line[80]; + int len; + int count = 0; + + /* potential, but very unlikely, race... 
*/ + trace_recurse = 1; + while ((max == 0 || count++ < max) && + (len = kfifo_out(&debug_buffer, line, sizeof(line) - 1)) > 0) { + line[len] = '\0'; + printk("%s", line); + } + trace_recurse = 0; +} + +static void sysrq_dump_trace_buffer(int key) +{ + dump_trace_buffer(100); +} + +static struct sysrq_key_op sysrq_dump_trace_buffer_op = { + .handler = sysrq_dump_trace_buffer, + .help_msg = "dump-trace-buffer(Y)", + .action_msg = "writing content of TRACE() buffer", +}; +#endif + +static int __init init_sched_trace(void) +{ + printk("Initializing TRACE() device\n"); + +#ifdef CONFIG_MAGIC_SYSRQ + /* offer some debugging help */ + if (!register_sysrq_key('y', &sysrq_dump_trace_buffer_op)) + printk("Registered dump-trace-buffer(Y) magic sysrq.\n"); + else + printk("Could not register dump-trace-buffer(Y) magic sysrq.\n"); +#endif + + return misc_register(&litmus_log_dev); +} + +static void __exit exit_sched_trace(void) +{ + misc_deregister(&litmus_log_dev); +} + +module_init(init_sched_trace); +module_exit(exit_sched_trace); diff --git a/litmus/srp.c b/litmus/srp.c new file mode 100644 index 000000000000..2ed4ec12a9d3 --- /dev/null +++ b/litmus/srp.c @@ -0,0 +1,295 @@ +/* ************************************************************************** */ +/* STACK RESOURCE POLICY */ +/* ************************************************************************** */ + +#include +#include +#include + +#include +#include +#include +#include + + +#ifdef CONFIG_LITMUS_LOCKING + +#include + +srp_prioritization_t get_srp_prio; + +struct srp { + struct list_head ceiling; + wait_queue_head_t ceiling_blocked; +}; +#define system_ceiling(srp) list2prio(srp->ceiling.next) +#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling) + +#define UNDEF_SEM -2 + +atomic_t srp_objects_in_use = ATOMIC_INIT(0); + +DEFINE_PER_CPU(struct srp, srp); + +/* Initialize SRP semaphores at boot time. */ +static int __init srp_init(void) +{ + int i; + + printk("Initializing SRP per-CPU ceilings..."); + for (i = 0; i < NR_CPUS; i++) { + init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked); + INIT_LIST_HEAD(&per_cpu(srp, i).ceiling); + } + printk(" done!\n"); + + return 0; +} +module_init(srp_init); + +/* SRP task priority comparison function. Smaller numeric values have higher + * priority, tie-break is PID. Special case: priority == 0 <=> no priority + */ +static int srp_higher_prio(struct srp_priority* first, + struct srp_priority* second) +{ + if (!first->priority) + return 0; + else + return !second->priority || + first->priority < second->priority || ( + first->priority == second->priority && + first->pid < second->pid); +} + + +static int srp_exceeds_ceiling(struct task_struct* first, + struct srp* srp) +{ + struct srp_priority prio; + + if (list_empty(&srp->ceiling)) + return 1; + else { + prio.pid = first->pid; + prio.priority = get_srp_prio(first); + return srp_higher_prio(&prio, system_ceiling(srp)) || + ceiling2sem(system_ceiling(srp))->owner == first; + } +} + +static void srp_add_prio(struct srp* srp, struct srp_priority* prio) +{ + struct list_head *pos; + if (in_list(&prio->list)) { + printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in " + "ceiling list!
cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio)); + return; + } + list_for_each(pos, &srp->ceiling) + if (unlikely(srp_higher_prio(prio, list2prio(pos)))) { + __list_add(&prio->list, pos->prev, pos); + return; + } + + list_add_tail(&prio->list, &srp->ceiling); +} + + +static int lock_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + + if (!is_realtime(current)) + return -EPERM; + + preempt_disable(); + + /* Update ceiling. */ + srp_add_prio(&__get_cpu_var(srp), &sem->ceiling); + + /* SRP invariant: all resources available */ + BUG_ON(sem->owner != NULL); + + sem->owner = current; + TRACE_CUR("acquired srp 0x%p\n", sem); + + preempt_enable(); + + return 0; +} + +static int unlock_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + int err = 0; + + preempt_disable(); + + if (sem->owner != current) { + err = -EINVAL; + } else { + /* Determine new system priority ceiling for this CPU. */ + BUG_ON(!in_list(&sem->ceiling.list)); + + list_del(&sem->ceiling.list); + sem->owner = NULL; + + /* Wake tasks on this CPU, if they exceed current ceiling. */ + TRACE_CUR("released srp 0x%p\n", sem); + wake_up_all(&__get_cpu_var(srp).ceiling_blocked); + } + + preempt_enable(); + return err; +} + +static int open_srp_semaphore(struct litmus_lock* l, void* __user arg) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + int err = 0; + struct task_struct* t = current; + struct srp_priority t_prio; + + if (!is_realtime(t)) + return -EPERM; + + TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu); + + preempt_disable(); + + if (sem->owner != NULL) + err = -EBUSY; + + if (err == 0) { + if (sem->cpu == UNDEF_SEM) + sem->cpu = get_partition(t); + else if (sem->cpu != get_partition(t)) + err = -EPERM; + } + + if (err == 0) { + t_prio.priority = get_srp_prio(t); + t_prio.pid = t->pid; + if (srp_higher_prio(&t_prio, &sem->ceiling)) { + sem->ceiling.priority = t_prio.priority; + sem->ceiling.pid = t_prio.pid; + } + } + + preempt_enable(); + + return err; +} + +static int close_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + int err = 0; + + preempt_disable(); + + if (sem->owner == current) + unlock_srp_semaphore(l); + + preempt_enable(); + + return err; +} + +static void deallocate_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + atomic_dec(&srp_objects_in_use); + kfree(sem); +} + +static struct litmus_lock_ops srp_lock_ops = { + .open = open_srp_semaphore, + .close = close_srp_semaphore, + .lock = lock_srp_semaphore, + .unlock = unlock_srp_semaphore, + .deallocate = deallocate_srp_semaphore, +}; + +struct srp_semaphore* allocate_srp_semaphore(void) +{ + struct srp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + INIT_LIST_HEAD(&sem->ceiling.list); + sem->ceiling.priority = 0; + sem->cpu = UNDEF_SEM; + sem->owner = NULL; + + sem->litmus_lock.ops = &srp_lock_ops; + + atomic_inc(&srp_objects_in_use); + return sem; +} + +static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + int cpu = smp_processor_id(); + struct task_struct *tsk = wait->private; + if (cpu != get_partition(tsk)) + TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\b", + get_partition(tsk)); + else if (srp_exceeds_ceiling(tsk, 
&__get_cpu_var(srp))) + return default_wake_function(wait, mode, sync, key); + return 0; +} + +static void do_ceiling_block(struct task_struct *tsk) +{ + wait_queue_t wait = { + .private = tsk, + .func = srp_wake_up, + .task_list = {NULL, NULL} + }; + + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); + tsk->rt_param.srp_non_recurse = 1; + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + tsk->rt_param.srp_non_recurse = 0; + remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); +} + +/* Wait for current task priority to exceed system-wide priority ceiling. + * FIXME: the hotpath should be inline. + */ +void srp_ceiling_block(void) +{ + struct task_struct *tsk = current; + + /* Only applies to real-time tasks, but optimize for RT tasks. */ + if (unlikely(!is_realtime(tsk))) + return; + + /* Avoid recursive ceiling blocking. */ + if (unlikely(tsk->rt_param.srp_non_recurse)) + return; + + /* Bail out early if there aren't any SRP resources around. */ + if (likely(!atomic_read(&srp_objects_in_use))) + return; + + preempt_disable(); + if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) { + TRACE_CUR("is priority ceiling blocked.\n"); + while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) + do_ceiling_block(tsk); + TRACE_CUR("finally exceeds system ceiling.\n"); + } else + TRACE_CUR("is not priority ceiling blocked\n"); + preempt_enable(); +} + +#endif diff --git a/litmus/sync.c b/litmus/sync.c new file mode 100644 index 000000000000..bf75fde5450b --- /dev/null +++ b/litmus/sync.c @@ -0,0 +1,104 @@ +/* litmus/sync.c - Support for synchronous and asynchronous task system releases. + * + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +static DECLARE_COMPLETION(ts_release); + +static long do_wait_for_ts_release(void) +{ + long ret = 0; + + /* If the interruption races with a release, the completion object + * may have a non-zero counter. To avoid this problem, this should + * be replaced by wait_for_completion(). + * + * For debugging purposes, this is interruptible for now. + */ + ret = wait_for_completion_interruptible(&ts_release); + + return ret; +} + +int count_tasks_waiting_for_release(void) +{ + unsigned long flags; + int task_count = 0; + struct list_head *pos; + + spin_lock_irqsave(&ts_release.wait.lock, flags); + list_for_each(pos, &ts_release.wait.task_list) { + task_count++; + } + spin_unlock_irqrestore(&ts_release.wait.lock, flags); + + return task_count; +} + +static long do_release_ts(lt_t start) +{ + int task_count = 0; + unsigned long flags; + struct list_head *pos; + struct task_struct *t; + + + spin_lock_irqsave(&ts_release.wait.lock, flags); + TRACE("<<<<<< synchronous task system release >>>>>>\n"); + + sched_trace_sys_release(&start); + list_for_each(pos, &ts_release.wait.task_list) { + t = (struct task_struct*) list_entry(pos, + struct __wait_queue, + task_list)->private; + task_count++; + litmus->release_at(t, start + t->rt_param.task_params.phase); + sched_trace_task_release(t); + } + + spin_unlock_irqrestore(&ts_release.wait.lock, flags); + + complete_n(&ts_release, task_count); + + return task_count; +} + + +asmlinkage long sys_wait_for_ts_release(void) +{ + long ret = -EPERM; + struct task_struct *t = current; + + if (is_realtime(t)) + ret = do_wait_for_ts_release(); + + return ret; +} + + +asmlinkage long sys_release_ts(lt_t __user *__delay) +{ + long ret; + lt_t delay; + + /* FIXME: check capabilities... 
*/ + + ret = copy_from_user(&delay, __delay, sizeof(delay)); + if (ret == 0) + ret = do_release_ts(litmus_clock() + delay); + + return ret; +} diff --git a/litmus/trace.c b/litmus/trace.c new file mode 100644 index 000000000000..39200c8ff74e --- /dev/null +++ b/litmus/trace.c @@ -0,0 +1,213 @@ +#include +#include +#include + +#include +#include +#include + +/******************************************************************************/ +/* Allocation */ +/******************************************************************************/ + +static struct ftdev overhead_dev; + +#define trace_ts_buf overhead_dev.minor[0].buf + +static unsigned int ts_seq_no = 0; + +static inline void __save_timestamp_cpu(unsigned long event, + uint8_t type, uint8_t cpu) +{ + unsigned int seq_no; + struct timestamp *ts; + seq_no = fetch_and_inc((int *) &ts_seq_no); + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = ft_timestamp(); + ts->seq_no = seq_no; + ts->cpu = cpu; + ts->task_type = type; + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +static void __add_timestamp_user(struct timestamp *pre_recorded) +{ + unsigned int seq_no; + struct timestamp *ts; + seq_no = fetch_and_inc((int *) &ts_seq_no); + + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + *ts = *pre_recorded; + ts->seq_no = seq_no; + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +static inline void __save_timestamp(unsigned long event, + uint8_t type) +{ + __save_timestamp_cpu(event, type, raw_smp_processor_id()); +} + +/* hack: fake timestamp to user-reported time, and record parts of the PID */ +feather_callback void save_timestamp_time(unsigned long event, unsigned long ptr) +{ + uint64_t* time = (uint64_t*) ptr; + unsigned int seq_no; + struct timestamp *ts; + seq_no = fetch_and_inc((int *) &ts_seq_no); + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = *time; + ts->seq_no = seq_no; + /* type takes lowest byte of PID */ + ts->task_type = (uint8_t) current->pid; + /* cpu takes second-lowest byte of PID*/ + ts->cpu = (uint8_t) (current->pid >> 8); + + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +feather_callback void save_timestamp_pid(unsigned long event) +{ + /* Abuse existing fields to partially export PID. */ + __save_timestamp_cpu(event, + /* type takes lowest byte of PID */ + (uint8_t) current->pid, + /* cpu takes second-lowest byte of PID*/ + (uint8_t) (current->pid >> 8)); +} + +feather_callback void save_timestamp(unsigned long event) +{ + __save_timestamp(event, TSK_UNKNOWN); +} + +feather_callback void save_timestamp_def(unsigned long event, + unsigned long type) +{ + __save_timestamp(event, (uint8_t) type); +} + +feather_callback void save_timestamp_task(unsigned long event, + unsigned long t_ptr) +{ + int rt = is_realtime((struct task_struct *) t_ptr); + __save_timestamp(event, rt ? 
TSK_RT : TSK_BE); +} + +feather_callback void save_timestamp_cpu(unsigned long event, + unsigned long cpu) +{ + __save_timestamp_cpu(event, TSK_UNKNOWN, cpu); +} + +feather_callback void save_task_latency(unsigned long event, + unsigned long when_ptr) +{ + lt_t now = litmus_clock(); + lt_t *when = (lt_t*) when_ptr; + unsigned int seq_no; + int cpu = raw_smp_processor_id(); + struct timestamp *ts; + + seq_no = fetch_and_inc((int *) &ts_seq_no); + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = now - *when; + ts->seq_no = seq_no; + ts->cpu = cpu; + ts->task_type = TSK_RT; + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +/******************************************************************************/ +/* DEVICE FILE DRIVER */ +/******************************************************************************/ + +/* + * should be 8M; it is the max we can ask to buddy system allocator (MAX_ORDER) + * and we might not get as much + */ +#define NO_TIMESTAMPS (2 << 16) + +static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) +{ + unsigned int count = NO_TIMESTAMPS; + while (count && !trace_ts_buf) { + printk("time stamp buffer: trying to allocate %u time stamps.\n", count); + ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp)); + count /= 2; + } + return ftdev->minor[idx].buf ? 0 : -ENOMEM; +} + +static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) +{ + free_ft_buffer(ftdev->minor[idx].buf); + ftdev->minor[idx].buf = NULL; +} + +static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len, + const char __user *from) +{ + ssize_t consumed = 0; + struct timestamp ts; + + /* don't give us partial timestamps */ + if (len % sizeof(ts)) + return -EINVAL; + + while (len >= sizeof(ts)) { + if (copy_from_user(&ts, from, sizeof(ts))) { + consumed = -EFAULT; + goto out; + } + len -= sizeof(ts); + from += sizeof(ts); + consumed += sizeof(ts); + + __add_timestamp_user(&ts); + } + +out: + return consumed; +} + +static int __init init_ft_overhead_trace(void) +{ + int err; + + printk("Initializing Feather-Trace overhead tracing device.\n"); + err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace"); + if (err) + goto err_out; + + overhead_dev.alloc = alloc_timestamp_buffer; + overhead_dev.free = free_timestamp_buffer; + overhead_dev.write = write_timestamp_from_user; + + err = register_ftdev(&overhead_dev); + if (err) + goto err_dealloc; + + return 0; + +err_dealloc: + ftdev_exit(&overhead_dev); +err_out: + printk(KERN_WARNING "Could not register ft_trace module.\n"); + return err; +} + +static void __exit exit_ft_overhead_trace(void) +{ + ftdev_exit(&overhead_dev); +} + +module_init(init_ft_overhead_trace); +module_exit(exit_ft_overhead_trace); -- cgit v1.2.2