From 3d5537c160c1484e8d562b9828baf679cc53f67a Mon Sep 17 00:00:00 2001 From: Glenn Elliott Date: Thu, 2 Jun 2011 16:06:05 -0400 Subject: Full patch for klitirqd with Nvidia GPU support. --- arch/x86/kernel/irq.c | 14 + arch/x86/kernel/syscall_table_32.S | 1 + include/linux/completion.h | 1 + include/linux/interrupt.h | 9 +- include/linux/mutex.h | 10 + include/linux/semaphore.h | 9 + include/linux/workqueue.h | 18 + include/litmus/affinity.h | 78 ++ include/litmus/fdso.h | 6 +- include/litmus/litmus.h | 1 + include/litmus/litmus_softirq.h | 199 +++++ include/litmus/nvidia_info.h | 37 + include/litmus/preempt.h | 1 + include/litmus/rt_param.h | 44 +- include/litmus/sched_plugin.h | 22 +- include/litmus/sched_trace.h | 174 +++- include/litmus/sched_trace_external.h | 42 + include/litmus/unistd_32.h | 3 +- include/litmus/unistd_64.h | 5 +- kernel/lockdep.c | 3 +- kernel/mutex.c | 141 +++ kernel/sched.c | 23 +- kernel/semaphore.c | 13 +- kernel/softirq.c | 278 +++++- kernel/workqueue.c | 70 +- litmus/Kconfig | 89 ++ litmus/Makefile | 4 + litmus/affinity.c | 49 + litmus/edf_common.c | 6 + litmus/fdso.c | 1 + litmus/litmus.c | 82 +- litmus/litmus_proc.c | 17 + litmus/litmus_softirq.c | 1579 +++++++++++++++++++++++++++++++++ litmus/locking.c | 1 - litmus/nvidia_info.c | 526 +++++++++++ litmus/preempt.c | 7 + litmus/sched_cedf.c | 852 +++++++++++++++++- litmus/sched_gsn_edf.c | 756 +++++++++++++++- litmus/sched_litmus.c | 2 + litmus/sched_plugin.c | 29 + litmus/sched_task_trace.c | 216 ++++- litmus/sched_trace_external.c | 45 + 42 files changed, 5325 insertions(+), 138 deletions(-) create mode 100644 include/litmus/affinity.h create mode 100644 include/litmus/litmus_softirq.h create mode 100644 include/litmus/nvidia_info.h create mode 100644 include/litmus/sched_trace_external.h create mode 100644 litmus/affinity.c create mode 100644 litmus/litmus_softirq.c create mode 100644 litmus/nvidia_info.c create mode 100644 litmus/sched_trace_external.c diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18a..50abbc6b7429 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -8,6 +8,10 @@ #include #include +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif + #include #include #include @@ -244,7 +248,17 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) __func__, smp_processor_id(), vector, irq); } +//#ifndef CONFIG_LITMUS_NVIDIA irq_exit(); +//#else + /* skip softirqs if we're tracing an interrupt top-half */ + /* comment out if-statement if we want to trace with bh on. 
*/ + //if(!is_interrupt_tracing_active()) +// irq_exit(); + + +// sched_trace_nv_interrupt_end(); +//#endif set_irq_regs(old_regs); return 1; diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 37702905f658..b5ddae40cee2 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -352,3 +352,4 @@ ENTRY(sys_call_table) .long sys_wait_for_ts_release .long sys_release_ts .long sys_null_call + .long sys_register_nv_device diff --git a/include/linux/completion.h b/include/linux/completion.h index c63950e8a863..3ce20dd3086e 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -76,6 +76,7 @@ static inline void init_completion(struct completion *x) init_waitqueue_head(&x->wait); } +extern void __wait_for_completion_locked(struct completion *); extern void wait_for_completion(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a0384a4d1e6f..5d22f5342376 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -459,6 +459,10 @@ struct tasklet_struct atomic_t count; void (*func)(unsigned long); unsigned long data; + +#ifdef CONFIG_LITMUS_SOFTIRQD + struct task_struct *owner; +#endif }; #define DECLARE_TASKLET(name, func, data) \ @@ -496,6 +500,7 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t) #define tasklet_unlock(t) do { } while (0) #endif +extern void ___tasklet_schedule(struct tasklet_struct *t); extern void __tasklet_schedule(struct tasklet_struct *t); static inline void tasklet_schedule(struct tasklet_struct *t) @@ -504,6 +509,7 @@ static inline void tasklet_schedule(struct tasklet_struct *t) __tasklet_schedule(t); } +extern void ___tasklet_hi_schedule(struct tasklet_struct *t); extern void __tasklet_hi_schedule(struct tasklet_struct *t); static inline void tasklet_hi_schedule(struct tasklet_struct *t) @@ -512,6 +518,7 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t) __tasklet_hi_schedule(t); } +extern void ___tasklet_hi_schedule_first(struct tasklet_struct *t); extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); /* @@ -541,7 +548,7 @@ static inline void tasklet_disable(struct tasklet_struct *t) } static inline void tasklet_enable(struct tasklet_struct *t) -{ +{ smp_mb__before_atomic_dec(); atomic_dec(&t->count); } diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f363bc8fdc74..9f3199571994 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -126,6 +126,15 @@ static inline int mutex_is_locked(struct mutex *lock) return atomic_read(&lock->count) != 1; } +/* return non-zero to abort. only pre-side-effects may abort */ +typedef int (*side_effect_t)(unsigned long); +extern void mutex_lock_sfx(struct mutex *lock, + side_effect_t pre, unsigned long pre_arg, + side_effect_t post, unsigned long post_arg); +extern void mutex_unlock_sfx(struct mutex *lock, + side_effect_t pre, unsigned long pre_arg, + side_effect_t post, unsigned long post_arg); + /* * See kernel/mutex.c for detailed documentation of these APIs. * Also see Documentation/mutex-design.txt. 
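As a rough illustration of the side-effect lock API declared above (the demo_* identifiers below are placeholders for this sketch, not names introduced by this patch), a caller can use the pre/post hooks to update bookkeeping atomically with the lock transition. Both hooks run under the mutex's internal wait_lock, so they must be short and non-blocking; per the typedef comment, only the pre hook of mutex_lock_sfx may abort by returning non-zero.

	#include <linux/mutex.h>
	#include <linux/sched.h>

	static DEFINE_MUTEX(demo_mutex);
	static struct task_struct *demo_holder; /* only touched under demo_mutex's wait_lock */

	/* side_effect_t callback: record or clear the holder while wait_lock is held */
	static int demo_set_holder(unsigned long arg)
	{
		demo_holder = (struct task_struct *) arg;
		return 0; /* never abort */
	}

	static void demo_critical_section(void)
	{
		/* no pre hook; the post hook records the new holder atomically with acquisition */
		mutex_lock_sfx(&demo_mutex, NULL, 0,
		               demo_set_holder, (unsigned long) current);
		/* ... critical section ... */
		/* the pre hook clears the holder before any waiter is woken */
		mutex_unlock_sfx(&demo_mutex, demo_set_holder, 0,
		                 NULL, 0);
	}
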
@@ -145,6 +154,7 @@ extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); extern int __must_check mutex_lock_killable(struct mutex *lock); + # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 5310d27abd2a..69e3f57661ec 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -49,4 +49,13 @@ extern int __must_check down_trylock(struct semaphore *sem); extern int __must_check down_timeout(struct semaphore *sem, long jiffies); extern void up(struct semaphore *sem); +extern void __down(struct semaphore *sem); +extern void __up(struct semaphore *sem); + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + int up; +}; + #endif /* __LINUX_SEMAPHORE_H */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 25e02c941bac..5fecfb375eeb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -83,6 +83,9 @@ struct work_struct { #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif +#ifdef CONFIG_LITMUS_SOFTIRQD + struct task_struct *owner; +#endif }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_NO_CPU) @@ -115,11 +118,25 @@ struct execute_work { #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif +#ifdef CONFIG_LITMUS_SOFTIRQD +#define __WORK_INIT_OWNER() \ + .owner = NULL, + +#define PREPARE_OWNER(_work, _owner) \ + do { \ + (_work)->owner = (_owner); \ + } while(0) +#else +#define __WORK_INIT_OWNER() +#define PREPARE_OWNER(_work, _owner) +#endif + #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ + __WORK_INIT_OWNER() \ } #define __DELAYED_WORK_INITIALIZER(n, f) { \ @@ -327,6 +344,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); extern void flush_delayed_work(struct delayed_work *work); +extern int __schedule_work(struct work_struct *work); extern int schedule_work(struct work_struct *work); extern int schedule_work_on(int cpu, struct work_struct *work); extern int schedule_delayed_work(struct delayed_work *work, unsigned long delay); diff --git a/include/litmus/affinity.h b/include/litmus/affinity.h new file mode 100644 index 000000000000..877b4099c6e2 --- /dev/null +++ b/include/litmus/affinity.h @@ -0,0 +1,78 @@ +#ifndef __LITMUS_AFFINITY_H +#define __LITMUS_AFFINITY_H + +#include + +/* + L1 (instr) = depth 0 + L1 (data) = depth 1 + L2 = depth 2 + L3 = depth 3 + */ +#define NUM_CACHE_LEVELS 4 + +struct neighborhood +{ + unsigned int size[NUM_CACHE_LEVELS]; + cpumask_var_t neighbors[NUM_CACHE_LEVELS]; +}; + +/* topology info is stored redundently in a big array for fast lookups */ +extern struct neighborhood neigh_info[NR_CPUS]; + +void init_topology(void); /* called by Litmus module's _init_litmus() */ + +/* Works like: +void get_nearest_available_cpu(cpu_entry_t* nearest, cpu_entry_t* start, cpu_entry_t* entries, int release_master) + +Set release_master = -1 for no RM. 
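A hypothetical call site (the names 'this_entry' and 'my_plugin_cpu_entries' are placeholders, not identifiers defined by this patch):

	cpu_entry_t* target;
	get_nearest_available_cpu(target, this_entry, my_plugin_cpu_entries, -1);
	if (target)
		prefer target->cpu when linking the job;
	else
		fall back to any available CPU;

Note that 'my_plugin_cpu_entries' must be the plugin's per-CPU cpu_entry_t variable, since the macro resolves neighbors through per_cpu().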
+ */ +#define get_nearest_available_cpu(nearest, start, entries, release_master) \ +{ \ + (nearest) = NULL; \ + if(!(start)->linked) \ + { \ + (nearest) = (start); \ + } \ + else \ + { \ + int __level; \ + int __cpu; \ + struct neighborhood* __neighbors = &neigh_info[(start)->cpu]; \ + \ + for(__level = 0; (__level < NUM_CACHE_LEVELS) && !(nearest); ++__level) \ + { \ + if(__neighbors->size[__level] > 1) \ + { \ + for_each_cpu(__cpu, __neighbors->neighbors[__level]) \ + { \ + if(__cpu != (release_master)) \ + { \ + cpu_entry_t* __entry = &per_cpu((entries), __cpu); \ + if(!__entry->linked) \ + { \ + (nearest) = __entry; \ + break; \ + } \ + } \ + } \ + } \ + else if(__neighbors->size[__level] == 0) \ + { \ + break; \ + } \ + } \ + } \ + \ + if((nearest)) \ + { \ + TRACE("P%d is closest available CPU to P%d\n", (nearest)->cpu, (start)->cpu); \ + } \ + else \ + { \ + TRACE("Could not find an available CPU close to P%d\n", \ + (start)->cpu); \ + } \ +} + +#endif diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h index caf2a1e6918c..c740e8fc3e88 100644 --- a/include/litmus/fdso.h +++ b/include/litmus/fdso.h @@ -18,9 +18,10 @@ typedef enum { MIN_OBJ_TYPE = 0, FMLP_SEM = 0, - SRP_SEM = 1, + KFMLP_SEM = 1, + SRP_SEM = 2, - MAX_OBJ_TYPE = 1 + MAX_OBJ_TYPE = SRP_SEM } obj_type_t; struct inode_obj_id { @@ -64,6 +65,7 @@ static inline void* od_lookup(int od, obj_type_t type) } #define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM)) +#define lookup_kfmlp_sem(od)((struct pi_semaphore*) od_lookup(od, KFMLP_SEM)) #define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM)) #define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID)) diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h index e7769ca36ec0..3df242bf272f 100644 --- a/include/litmus/litmus.h +++ b/include/litmus/litmus.h @@ -26,6 +26,7 @@ static inline int in_list(struct list_head* list) ); } + struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq); #define NO_CPU 0xffffffff diff --git a/include/litmus/litmus_softirq.h b/include/litmus/litmus_softirq.h new file mode 100644 index 000000000000..34287f3cbb8d --- /dev/null +++ b/include/litmus/litmus_softirq.h @@ -0,0 +1,199 @@ +#ifndef __LITMUS_SOFTIRQ_H +#define __LITMUS_SOFTIRQ_H + +#include +#include + +/* + Threaded tasklet handling for Litmus. Tasklets + are scheduled with the priority of the tasklet's + owner---that is, the RT task on behalf the tasklet + runs. + + Tasklets are current scheduled in FIFO order with + NO priority inheritance for "blocked" tasklets. + + klitirqd assumes the priority of the owner of the + tasklet when the tasklet is next to execute. + + Currently, hi-tasklets are scheduled before + low-tasklets, regardless of priority of low-tasklets. + And likewise, low-tasklets are scheduled before work + queue objects. This priority inversion probably needs + to be fixed, though it is not an issue if our work with + GPUs as GPUs are owned (and associated klitirqds) for + exclusive time periods, thus no inversions can + occur. + */ + + + +#define NR_LITMUS_SOFTIRQD CONFIG_NR_LITMUS_SOFTIRQD + +/* Spawns NR_LITMUS_SOFTIRQD klitirqd daemons. + Actual launch of threads is deffered to kworker's + workqueue, so daemons will likely not be immediately + running when this function returns, though the required + data will be initialized. + + @affinity_set: an array expressing the processor affinity + for each of the NR_LITMUS_SOFTIRQD daemons. May be set + to NULL for global scheduling. 
+ + - Examples - + 8-CPU system with two CPU clusters: + affinity[] = {0, 0, 0, 0, 3, 3, 3, 3} + NOTE: Daemons not actually bound to specified CPU, but rather + cluster in which the CPU resides. + + 8-CPU system, partitioned: + affinity[] = {0, 1, 2, 3, 4, 5, 6, 7} + + FIXME: change array to a CPU topology or array of cpumasks + + */ +void spawn_klitirqd(int* affinity); + + +/* Raises a flag to tell klitirqds to terminate. + Termination is async, so some threads may be running + after function return. */ +void kill_klitirqd(void); + + +/* Returns 1 if all NR_LITMUS_SOFTIRQD klitirqs are ready + to handle tasklets. 0, otherwise.*/ +int klitirqd_is_ready(void); + +/* Returns 1 if no NR_LITMUS_SOFTIRQD klitirqs are ready + to handle tasklets. 0, otherwise.*/ +int klitirqd_is_dead(void); + +/* Flushes all pending work out to the OS for regular + * tasklet/work processing of the specified 'owner' + * + * PRECOND: klitirqd_thread must have a clear entry + * in the GPU registry, otherwise this call will become + * a no-op as work will loop back to the klitirqd_thread. + * + * Pass NULL for owner to flush ALL pending items. + */ +void flush_pending(struct task_struct* klitirqd_thread, + struct task_struct* owner); + +struct task_struct* get_klitirqd(unsigned int k_id); + + +extern int __litmus_tasklet_schedule( + struct tasklet_struct *t, + unsigned int k_id); + +/* schedule a tasklet on klitirqd #k_id */ +static inline int litmus_tasklet_schedule( + struct tasklet_struct *t, + unsigned int k_id) +{ + int ret = 0; + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + ret = __litmus_tasklet_schedule(t, k_id); + return(ret); +} + +/* for use by __tasklet_schedule() */ +static inline int _litmus_tasklet_schedule( + struct tasklet_struct *t, + unsigned int k_id) +{ + return(__litmus_tasklet_schedule(t, k_id)); +} + + + + +extern int __litmus_tasklet_hi_schedule(struct tasklet_struct *t, + unsigned int k_id); + +/* schedule a hi tasklet on klitirqd #k_id */ +static inline int litmus_tasklet_hi_schedule(struct tasklet_struct *t, + unsigned int k_id) +{ + int ret = 0; + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + ret = __litmus_tasklet_hi_schedule(t, k_id); + return(ret); +} + +/* for use by __tasklet_hi_schedule() */ +static inline int _litmus_tasklet_hi_schedule(struct tasklet_struct *t, + unsigned int k_id) +{ + return(__litmus_tasklet_hi_schedule(t, k_id)); +} + + + + + +extern int __litmus_tasklet_hi_schedule_first( + struct tasklet_struct *t, + unsigned int k_id); + +/* schedule a hi tasklet on klitirqd #k_id on next go-around */ +/* PRECONDITION: Interrupts must be disabled. */ +static inline int litmus_tasklet_hi_schedule_first( + struct tasklet_struct *t, + unsigned int k_id) +{ + int ret = 0; + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + ret = __litmus_tasklet_hi_schedule_first(t, k_id); + return(ret); +} + +/* for use by __tasklet_hi_schedule_first() */ +static inline int _litmus_tasklet_hi_schedule_first( + struct tasklet_struct *t, + unsigned int k_id) +{ + return(__litmus_tasklet_hi_schedule_first(t, k_id)); +} + + + +////////////// + +extern int __litmus_schedule_work( + struct work_struct* w, + unsigned int k_id); + +static inline int litmus_schedule_work( + struct work_struct* w, + unsigned int k_id) +{ + return(__litmus_schedule_work(w, k_id)); +} + + + +///////////// mutex operations for client threads. 
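/* Hypothetical usage sketch for the tasklet dispatch API declared above
 * (illustration only: 'my_tasklet', 'my_handler', 'owner', and 'k' are
 * assumed names, not identifiers introduced by this patch):
 *
 *	DECLARE_TASKLET(my_tasklet, my_handler, 0);
 *	...
 *	my_tasklet.owner = owner;                // RT task the bottom half serves
 *	litmus_tasklet_schedule(&my_tasklet, k); // queue on klitirqd thread #k
 *
 * As with tasklet_schedule(), nothing is queued if the tasklet is already
 * pending; otherwise the handler later runs in klitirqd #k, which assumes
 * 'owner's priority instead of executing in softirq context.
 */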
+ +void down_and_set_stat(struct task_struct* t, + enum klitirqd_sem_status to_set, + struct mutex* sem); + +void __down_and_reset_and_set_stat(struct task_struct* t, + enum klitirqd_sem_status to_reset, + enum klitirqd_sem_status to_set, + struct mutex* sem); + +void up_and_set_stat(struct task_struct* t, + enum klitirqd_sem_status to_set, + struct mutex* sem); + + + +void release_klitirqd_lock(struct task_struct* t); + +int reacquire_klitirqd_lock(struct task_struct* t); + +#endif diff --git a/include/litmus/nvidia_info.h b/include/litmus/nvidia_info.h new file mode 100644 index 000000000000..579301d77cf5 --- /dev/null +++ b/include/litmus/nvidia_info.h @@ -0,0 +1,37 @@ +#ifndef __LITMUS_NVIDIA_H +#define __LITMUS_NVIDIA_H + +#include + + +#include + + +#define NV_DEVICE_NUM NR_LITMUS_SOFTIRQD + +int init_nvidia_info(void); + +int is_nvidia_func(void* func_addr); + +void dump_nvidia_info(const struct tasklet_struct *t); + + +// Returns the Nvidia device # associated with provided tasklet and work_struct. +u32 get_tasklet_nv_device_num(const struct tasklet_struct *t); +u32 get_work_nv_device_num(const struct work_struct *t); + + +int init_nv_device_reg(void); +//int get_nv_device_id(struct task_struct* owner); + + +int reg_nv_device(int reg_device_id, int register_device); + +struct task_struct* get_nv_device_owner(u32 target_device_id); + +void lock_nv_registry(u32 reg_device_id, unsigned long* flags); +void unlock_nv_registry(u32 reg_device_id, unsigned long* flags); + +void increment_nv_int_count(u32 device); + +#endif diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h index 260c6fe17986..244924f93c48 100644 --- a/include/litmus/preempt.h +++ b/include/litmus/preempt.h @@ -26,6 +26,7 @@ const char* sched_state_name(int s); (x), #x, __FUNCTION__); \ } while (0); +//#define TRACE_SCHED_STATE_CHANGE(x, y, cpu) /* ignore */ #define TRACE_SCHED_STATE_CHANGE(x, y, cpu) \ TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n", \ cpu, (x), sched_state_name(x), \ diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h index 5de422c742f6..53af3ce1d955 100644 --- a/include/litmus/rt_param.h +++ b/include/litmus/rt_param.h @@ -69,6 +69,8 @@ struct control_page { /* don't export internal data structures to user space (liblitmus) */ #ifdef __KERNEL__ +#include + struct _rt_domain; struct bheap_node; struct release_heap; @@ -94,6 +96,14 @@ struct rt_job { struct pfair_param; +enum klitirqd_sem_status +{ + NEED_TO_REACQUIRE, + REACQUIRING, + NOT_HELD, + HELD +}; + /* RT task parameters for scheduling extensions * These parameters are inherited during clone and therefore must * be explicitly set up before the task set is launched. @@ -108,6 +118,38 @@ struct rt_param { /* is the task present? (true if it can be scheduled) */ unsigned int present:1; +#ifdef CONFIG_LITMUS_SOFTIRQD + /* proxy threads have minimum priority by default */ + unsigned int is_proxy_thread:1; + + /* pointer to klitirqd currently working on this + task_struct's behalf. only set by the task pointed + to by klitirqd. + + ptr only valid if is_proxy_thread == 0 + */ + struct task_struct* cur_klitirqd; + + /* Used to implement mutual execution exclusion between + * job and klitirqd execution. Job must always hold + * it's klitirqd_sem to execute. klitirqd instance + * must hold the semaphore before executing on behalf + * of a job. 
+ */ + //struct semaphore klitirqd_sem; + struct mutex klitirqd_sem; + + /* status of held klitirqd_sem, even if the held klitirqd_sem is from + another task (only proxy threads do this though). + */ + atomic_t klitirqd_sem_stat; +#endif + +#ifdef CONFIG_LITMUS_NVIDIA + /* number of top-half interrupts handled on behalf of current job */ + atomic_t nv_int_count; +#endif + #ifdef CONFIG_LITMUS_LOCKING /* Is the task being priority-boosted by a locking protocol? */ unsigned int priority_boosted:1; @@ -128,7 +170,7 @@ struct rt_param { * an increased task priority. */ struct task_struct* inh_task; - + #ifdef CONFIG_NP_SECTION /* For the FMLP under PSN-EDF, it is required to make the task * non-preemptive from kernel space. In order not to interfere with diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h index 6e7cabdddae8..df50930d14a0 100644 --- a/include/litmus/sched_plugin.h +++ b/include/litmus/sched_plugin.h @@ -29,7 +29,6 @@ typedef struct task_struct* (*schedule_t)(struct task_struct * prev); */ typedef void (*finish_switch_t)(struct task_struct *prev); - /********************* task state changes ********************/ /* Called to setup a new real-time task. @@ -58,6 +57,17 @@ typedef void (*task_exit_t) (struct task_struct *); typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type, void* __user config); +/* Called to change inheritance levels of given task */ +typedef void (*set_prio_inh_t)(struct task_struct* t, + struct task_struct* prio_inh); +typedef void (*clear_prio_inh_t)(struct task_struct* t); + + +typedef void (*set_prio_inh_klitirq_t)(struct task_struct* klitirqd, + struct task_struct* old_owner, + struct task_struct* new_owner); +typedef void (*clear_prio_inh_klitirqd_t)(struct task_struct* klitirqd, + struct task_struct* old_owner); /********************* sys call backends ********************/ /* This function causes the caller to sleep until the next release */ @@ -88,7 +98,7 @@ struct sched_plugin { /* task state changes */ admit_task_t admit_task; - task_new_t task_new; + task_new_t task_new; task_wake_up_t task_wake_up; task_block_t task_block; task_exit_t task_exit; @@ -96,6 +106,14 @@ struct sched_plugin { #ifdef CONFIG_LITMUS_LOCKING /* locking protocols */ allocate_lock_t allocate_lock; + + set_prio_inh_t set_prio_inh; + clear_prio_inh_t clear_prio_inh; +#endif + +#ifdef CONFIG_LITMUS_SOFTIRQD + set_prio_inh_klitirq_t set_prio_inh_klitirqd; + clear_prio_inh_klitirqd_t clear_prio_inh_klitirqd; #endif } __attribute__ ((__aligned__(SMP_CACHE_BYTES))); diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h index 7ca34cb13881..1486c778aff8 100644 --- a/include/litmus/sched_trace.h +++ b/include/litmus/sched_trace.h @@ -11,12 +11,12 @@ struct st_trace_header { u8 cpu; /* On which CPU was it recorded? */ u16 pid; /* PID of the task. */ u32 job; /* The job sequence number. */ -}; +} __attribute__((packed)); #define ST_NAME_LEN 16 struct st_name_data { char cmd[ST_NAME_LEN];/* The name of the executable of this process. */ -}; +} __attribute__((packed)); struct st_param_data { /* regular params */ u32 wcet; @@ -25,30 +25,29 @@ struct st_param_data { /* regular params */ u8 partition; u8 class; u8 __unused[2]; -}; +} __attribute__((packed)); struct st_release_data { /* A job is was/is going to be released. */ u64 release; /* What's the release time? */ u64 deadline; /* By when must it finish? */ -}; +} __attribute__((packed)); struct st_assigned_data { /* A job was asigned to a CPU. 
*/ u64 when; u8 target; /* Where should it execute? */ u8 __unused[7]; -}; +} __attribute__((packed)); struct st_switch_to_data { /* A process was switched to on a given CPU. */ u64 when; /* When did this occur? */ u32 exec_time; /* Time the current job has executed. */ u8 __unused[4]; - -}; +} __attribute__((packed)); struct st_switch_away_data { /* A process was switched away from on a given CPU. */ u64 when; u64 exec_time; -}; +} __attribute__((packed)); struct st_completion_data { /* A job completed. */ u64 when; @@ -56,35 +55,92 @@ struct st_completion_data { /* A job completed. */ * next task automatically; set to 0 otherwise. */ u8 __uflags:7; - u8 __unused[7]; -}; + u16 nv_int_count; + u8 __unused[5]; +} __attribute__((packed)); struct st_block_data { /* A task blocks. */ u64 when; u64 __unused; -}; +} __attribute__((packed)); struct st_resume_data { /* A task resumes. */ u64 when; u64 __unused; -}; +} __attribute__((packed)); struct st_action_data { u64 when; u8 action; u8 __unused[7]; -}; +} __attribute__((packed)); struct st_sys_release_data { u64 when; u64 release; -}; +} __attribute__((packed)); + + +struct st_tasklet_release_data { + u64 when; + u64 __unused; +} __attribute__((packed)); + +struct st_tasklet_begin_data { + u64 when; + u16 exe_pid; + u8 __unused[6]; +} __attribute__((packed)); + +struct st_tasklet_end_data { + u64 when; + u16 exe_pid; + u8 flushed; + u8 __unused[5]; +} __attribute__((packed)); + + +struct st_work_release_data { + u64 when; + u64 __unused; +} __attribute__((packed)); + +struct st_work_begin_data { + u64 when; + u16 exe_pid; + u8 __unused[6]; +} __attribute__((packed)); + +struct st_work_end_data { + u64 when; + u16 exe_pid; + u8 flushed; + u8 __unused[5]; +} __attribute__((packed)); + +struct st_effective_priority_change_data { + u64 when; + u16 inh_pid; + u8 __unused[6]; +} __attribute__((packed)); + +struct st_nv_interrupt_begin_data { + u64 when; + u32 device; + u8 __unused[4]; +} __attribute__((packed)); + +struct st_nv_interrupt_end_data { + u64 when; + u32 device; + u8 __unused[4]; +} __attribute__((packed)); #define DATA(x) struct st_ ## x ## _data x; typedef enum { - ST_NAME = 1, /* Start at one, so that we can spot - * uninitialized records. */ + ST_NAME = 1, /* Start at one, so that we can spot + * uninitialized records. 
*/ ST_PARAM, ST_RELEASE, ST_ASSIGNED, @@ -94,7 +150,16 @@ typedef enum { ST_BLOCK, ST_RESUME, ST_ACTION, - ST_SYS_RELEASE + ST_SYS_RELEASE, + ST_TASKLET_RELEASE, + ST_TASKLET_BEGIN, + ST_TASKLET_END, + ST_WORK_RELEASE, + ST_WORK_BEGIN, + ST_WORK_END, + ST_EFF_PRIO_CHANGE, + ST_NV_INTERRUPT_BEGIN, + ST_NV_INTERRUPT_END, } st_event_record_type_t; struct st_event_record { @@ -113,8 +178,17 @@ struct st_event_record { DATA(resume); DATA(action); DATA(sys_release); + DATA(tasklet_release); + DATA(tasklet_begin); + DATA(tasklet_end); + DATA(work_release); + DATA(work_begin); + DATA(work_end); + DATA(effective_priority_change); + DATA(nv_interrupt_begin); + DATA(nv_interrupt_end); } data; -}; +} __attribute__((packed)); #undef DATA @@ -129,6 +203,8 @@ struct st_event_record { ft_event1(id, callback, task) #define SCHED_TRACE2(id, callback, task, xtra) \ ft_event2(id, callback, task, xtra) +#define SCHED_TRACE3(id, callback, task, xtra1, xtra2) \ + ft_event3(id, callback, task, xtra1, xtra2) /* provide prototypes; needed on sparc64 */ #ifndef NO_TASK_TRACE_DECLS @@ -155,12 +231,45 @@ feather_callback void do_sched_trace_action(unsigned long id, feather_callback void do_sched_trace_sys_release(unsigned long id, lt_t* start); + +feather_callback void do_sched_trace_tasklet_release(unsigned long id, + struct task_struct* owner); +feather_callback void do_sched_trace_tasklet_begin(unsigned long id, + struct task_struct* owner); +feather_callback void do_sched_trace_tasklet_end(unsigned long id, + struct task_struct* owner, + unsigned long flushed); + +feather_callback void do_sched_trace_work_release(unsigned long id, + struct task_struct* owner); +feather_callback void do_sched_trace_work_begin(unsigned long id, + struct task_struct* owner, + struct task_struct* exe); +feather_callback void do_sched_trace_work_end(unsigned long id, + struct task_struct* owner, + struct task_struct* exe, + unsigned long flushed); + +feather_callback void do_sched_trace_eff_prio_change(unsigned long id, + struct task_struct* task, + struct task_struct* inh); + +feather_callback void do_sched_trace_nv_interrupt_begin(unsigned long id, + u32 device); +feather_callback void do_sched_trace_nv_interrupt_end(unsigned long id, + unsigned long unused); + + +/* returns true if we're tracing an interrupt on current CPU */ +/* int is_interrupt_tracing_active(void); */ + #endif #else #define SCHED_TRACE(id, callback, task) /* no tracing */ #define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */ +#define SCHED_TRACE3(id, callback, task, xtra1, xtra2) #endif @@ -193,6 +302,35 @@ feather_callback void do_sched_trace_sys_release(unsigned long id, SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, do_sched_trace_sys_release, when) +#define sched_trace_tasklet_release(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 11, do_sched_trace_tasklet_release, t) + +#define sched_trace_tasklet_begin(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 12, do_sched_trace_tasklet_begin, t) + +#define sched_trace_tasklet_end(t, flushed) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 13, do_sched_trace_tasklet_end, t, flushed) + + +#define sched_trace_work_release(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 14, do_sched_trace_work_release, t) + +#define sched_trace_work_begin(t, e) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 15, do_sched_trace_work_begin, t, e) + +#define sched_trace_work_end(t, e, flushed) \ + SCHED_TRACE3(SCHED_TRACE_BASE_ID + 16, do_sched_trace_work_end, t, e, flushed) + + +#define sched_trace_eff_prio_change(t, inh) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 17, 
do_sched_trace_eff_prio_change, t, inh) + + +#define sched_trace_nv_interrupt_begin(d) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 18, do_sched_trace_nv_interrupt_begin, d) +#define sched_trace_nv_interrupt_end() \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 19, do_sched_trace_nv_interrupt_end, 0ul) + #define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */ #endif /* __KERNEL__ */ diff --git a/include/litmus/sched_trace_external.h b/include/litmus/sched_trace_external.h new file mode 100644 index 000000000000..c2c872639880 --- /dev/null +++ b/include/litmus/sched_trace_external.h @@ -0,0 +1,42 @@ +/* + * sched_trace.h -- record scheduler events to a byte stream for offline analysis. + */ +#ifndef _LINUX_SCHED_TRACE_EXTERNAL_H_ +#define _LINUX_SCHED_TRACE_EXTERNAL_H_ + +extern void __sched_trace_tasklet_begin_external(struct task_struct* t); +static inline void sched_trace_tasklet_begin_external(struct task_struct* t) +{ + __sched_trace_tasklet_begin_external(t); +} + +extern void __sched_trace_tasklet_end_external(struct task_struct* t, unsigned long flushed); +static inline void sched_trace_tasklet_end_external(struct task_struct* t, unsigned long flushed) +{ + __sched_trace_tasklet_end_external(t, flushed); +} + +extern void __sched_trace_work_begin_external(struct task_struct* t, struct task_struct* e); +static inline void sched_trace_work_begin_external(struct task_struct* t, struct task_struct* e) +{ + __sched_trace_work_begin_external(t, e); +} + +extern void __sched_trace_work_end_external(struct task_struct* t, struct task_struct* e, unsigned long f); +static inline void sched_trace_work_end_external(struct task_struct* t, struct task_struct* e, unsigned long f) +{ + __sched_trace_work_end_external(t, e, f); +} + +extern void __sched_trace_nv_interrupt_begin_external(u32 device); +static inline void sched_trace_nv_interrupt_begin_external(u32 device) +{ + __sched_trace_nv_interrupt_begin_external(device); +} + +extern void __sched_trace_nv_interrupt_end_external(void); +static inline void sched_trace_nv_interrupt_end_external(void) +{ + __sched_trace_nv_interrupt_end_external(); +} +#endif diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h index 94264c27d9ac..c6efc4c40af2 100644 --- a/include/litmus/unistd_32.h +++ b/include/litmus/unistd_32.h @@ -17,5 +17,6 @@ #define __NR_wait_for_ts_release __LSC(9) #define __NR_release_ts __LSC(10) #define __NR_null_call __LSC(11) +#define __NR_register_nv_device __LSC(12) -#define NR_litmus_syscalls 12 +#define NR_litmus_syscalls 13 diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h index d5ced0d2642c..b44a7c33bdf8 100644 --- a/include/litmus/unistd_64.h +++ b/include/litmus/unistd_64.h @@ -29,5 +29,8 @@ __SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release) __SYSCALL(__NR_release_ts, sys_release_ts) #define __NR_null_call __LSC(11) __SYSCALL(__NR_null_call, sys_null_call) +#define __NR_register_nv_device __LSC(12) +__SYSCALL(__NR_register_nv_device, sys_register_nv_device) -#define NR_litmus_syscalls 12 + +#define NR_litmus_syscalls 13 diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..ebff2cf715c5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -530,7 +530,7 @@ static void print_lock(struct held_lock *hlock) print_ip_sym(hlock->acquire_ip); } -static void lockdep_print_held_locks(struct task_struct *curr) +void lockdep_print_held_locks(struct task_struct *curr) { int i, depth = curr->lockdep_depth; @@ -546,6 +546,7 @@ static void lockdep_print_held_locks(struct task_struct *curr) 
print_lock(curr->held_locks + i); } } +EXPORT_SYMBOL(lockdep_print_held_locks); static void print_kernel_version(void) { diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502f..435685ecd068 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -496,3 +496,144 @@ int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) return 1; } EXPORT_SYMBOL(atomic_dec_and_mutex_lock); + + + + + + + + +//__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); + +void mutex_lock_sfx(struct mutex *lock, + side_effect_t pre, unsigned long pre_arg, + side_effect_t post, unsigned long post_arg) +{ + long state = TASK_UNINTERRUPTIBLE; + unsigned int subclass = 0; + unsigned long ip = _RET_IP_; + + + struct task_struct *task = current; + struct mutex_waiter waiter; + unsigned long flags; + + preempt_disable(); + mutex_acquire(&lock->dep_map, subclass, 0, ip); + + spin_lock_mutex(&lock->wait_lock, flags); + + if(pre) + { + if(unlikely(pre(pre_arg))) + { + // this will fuck with lockdep's CONFIG_PROVE_LOCKING... + spin_unlock_mutex(&lock->wait_lock, flags); + preempt_enable(); + return; + } + } + + debug_mutex_lock_common(lock, &waiter); + debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + waiter.task = task; + + if (atomic_xchg(&lock->count, -1) == 1) + goto done; + + lock_contended(&lock->dep_map, ip); + + for (;;) { + /* + * Lets try to take the lock again - this is needed even if + * we get here for the first time (shortly after failing to + * acquire the lock), to make sure that we get a wakeup once + * it's unlocked. Later on, if we sleep, this is the + * operation that gives us the lock. We xchg it to -1, so + * that when we release the lock, we properly wake up the + * other waiters: + */ + if (atomic_xchg(&lock->count, -1) == 1) + break; + + __set_task_state(task, state); + + /* didnt get the lock, go to sleep: */ + spin_unlock_mutex(&lock->wait_lock, flags); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + spin_lock_mutex(&lock->wait_lock, flags); + } + +done: + lock_acquired(&lock->dep_map, ip); + /* got the lock - rejoice! */ + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); + + /* set it to 0 if there are no waiters left: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + + if(post) + post(post_arg); + + spin_unlock_mutex(&lock->wait_lock, flags); + + debug_mutex_free_waiter(&waiter); + preempt_enable(); + + //return 0; +} +EXPORT_SYMBOL(mutex_lock_sfx); + + + +//__mutex_unlock_common_slowpath(lock_count, 1); + +void mutex_unlock_sfx(struct mutex *lock, + side_effect_t pre, unsigned long pre_arg, + side_effect_t post, unsigned long post_arg) +{ + //struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; + + spin_lock_mutex(&lock->wait_lock, flags); + + if(pre) + pre(pre_arg); + + //mutex_release(&lock->dep_map, nested, _RET_IP_); + mutex_release(&lock->dep_map, 1, _RET_IP_); + debug_mutex_unlock(lock); + + /* + * some architectures leave the lock unlocked in the fastpath failure + * case, others need to leave it locked. 
In the later case we have to + * unlock it here + */ + if (__mutex_slowpath_needs_to_unlock()) + atomic_set(&lock->count, 1); + + if (!list_empty(&lock->wait_list)) { + /* get the first entry from the wait-list: */ + struct mutex_waiter *waiter = + list_entry(lock->wait_list.next, + struct mutex_waiter, list); + + debug_mutex_wake_waiter(lock, waiter); + + wake_up_process(waiter->task); + } + + if(post) + post(post_arg); + + spin_unlock_mutex(&lock->wait_lock, flags); +} +EXPORT_SYMBOL(mutex_unlock_sfx); diff --git a/kernel/sched.c b/kernel/sched.c index c5d775079027..3162605ffc91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -82,6 +82,10 @@ #include #include +#ifdef CONFIG_LITMUS_SOFTIRQD +#include +#endif + static void litmus_tick(struct rq*, struct task_struct*); #define CREATE_TRACE_POINTS @@ -3789,6 +3793,7 @@ pick_next_task(struct rq *rq) } } + /* * schedule() is the main scheduler function. */ @@ -3807,6 +3812,10 @@ need_resched: rcu_note_context_switch(cpu); prev = rq->curr; +#ifdef CONFIG_LITMUS_SOFTIRQD + release_klitirqd_lock(prev); +#endif + release_kernel_lock(prev); need_resched_nonpreemptible: TS_SCHED_START; @@ -3882,15 +3891,20 @@ need_resched_nonpreemptible: if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - + preempt_enable_no_resched(); + if (need_resched()) goto need_resched; + reacquire_klitirqd_lock(prev); + srp_ceiling_block(); } EXPORT_SYMBOL(schedule); + + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Look out! "owner" is an entirely speculative pointer @@ -4051,6 +4065,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, } } + /** * __wake_up - wake up threads blocked on a waitqueue. * @q: the waitqueue @@ -4236,6 +4251,12 @@ void __sched wait_for_completion(struct completion *x) } EXPORT_SYMBOL(wait_for_completion); +void __sched __wait_for_completion_locked(struct completion *x) +{ + do_wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__wait_for_completion_locked); + /** * wait_for_completion_timeout: - waits for completion of a task (w/timeout) * @x: holds the state of this particular completion diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0d4ade..c947a046a6d7 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -33,11 +33,11 @@ #include #include -static noinline void __down(struct semaphore *sem); +noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long jiffies); -static noinline void __up(struct semaphore *sem); +noinline void __up(struct semaphore *sem); /** * down - acquire the semaphore @@ -190,11 +190,13 @@ EXPORT_SYMBOL(up); /* Functions for the contended case */ +/* struct semaphore_waiter { struct list_head list; struct task_struct *task; int up; }; + */ /* * Because this function is inlined, the 'state' parameter will be @@ -233,10 +235,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state, return -EINTR; } -static noinline void __sched __down(struct semaphore *sem) +noinline void __sched __down(struct semaphore *sem) { __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } +EXPORT_SYMBOL(__down); + static noinline int __sched __down_interruptible(struct semaphore *sem) { @@ -253,7 +257,7 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) return __down_common(sem, 
TASK_UNINTERRUPTIBLE, jiffies); } -static noinline void __sched __up(struct semaphore *sem) +noinline void __sched __up(struct semaphore *sem) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); @@ -261,3 +265,4 @@ static noinline void __sched __up(struct semaphore *sem) waiter->up = 1; wake_up_process(waiter->task); } +EXPORT_SYMBOL(__up); \ No newline at end of file diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..be4b8fab3637 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -29,6 +29,14 @@ #include #include + +#include +#include + +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif + /* - No shared variables, all the data are CPU local. - If a softirq needs serialization, let it serialize itself @@ -54,7 +62,7 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +static DEFINE_PER_CPU(struct task_struct *, ksoftirqd) = NULL; char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", @@ -177,6 +185,7 @@ void local_bh_enable_ip(unsigned long ip) } EXPORT_SYMBOL(local_bh_enable_ip); + /* * We restart softirq processing MAX_SOFTIRQ_RESTART times, * and we fall back to softirqd after that. @@ -187,34 +196,30 @@ EXPORT_SYMBOL(local_bh_enable_ip); * should not be able to lock up the box. */ #define MAX_SOFTIRQ_RESTART 10 - -asmlinkage void __do_softirq(void) +static void ____do_softirq(void) { - struct softirq_action *h; __u32 pending; - int max_restart = MAX_SOFTIRQ_RESTART; + + struct softirq_action *h; int cpu; - + pending = local_softirq_pending(); + account_system_vtime(current); - - __local_bh_disable((unsigned long)__builtin_return_address(0)); - lockdep_softirq_enter(); - + cpu = smp_processor_id(); -restart: - /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); + set_softirq_pending(0); + local_irq_enable(); - + h = softirq_vec; - + do { if (pending & 1) { int prev_count = preempt_count(); kstat_incr_softirqs_this_cpu(h - softirq_vec); - + trace_softirq_entry(h, softirq_vec); h->action(h); trace_softirq_exit(h, softirq_vec); @@ -226,26 +231,70 @@ restart: h->action, prev_count, preempt_count()); preempt_count() = prev_count; } - + rcu_bh_qs(cpu); } h++; pending >>= 1; } while (pending); - + local_irq_disable(); +} + +static void ___do_softirq(void) +{ + __u32 pending; + + //struct softirq_action *h; + int max_restart = MAX_SOFTIRQ_RESTART; + //int cpu; + + pending = local_softirq_pending(); + +restart: + ____do_softirq(); pending = local_softirq_pending(); if (pending && --max_restart) goto restart; if (pending) + { wakeup_softirqd(); + } +} +asmlinkage void __do_softirq(void) +{ +#ifdef LITMUS_THREAD_ALL_SOFTIRQ + /* Skip straight to wakeup_softirqd() if we're using + LITMUS_THREAD_ALL_SOFTIRQ (unless there's really high prio-stuff waiting.). 
*/ + struct task_struct *tsk = __get_cpu_var(ksoftirqd); + + if(tsk) + { + __u32 pending = local_softirq_pending(); + const __u32 high_prio_softirq = (1<func)) + { + u32 nvidia_device = get_tasklet_nv_device_num(t); + // TRACE("%s: Handling NVIDIA tasklet for device\t%u\tat\t%llu\n", + // __FUNCTION__, nvidia_device,litmus_clock()); + + unsigned long flags; + struct task_struct* device_owner; + + lock_nv_registry(nvidia_device, &flags); + + device_owner = get_nv_device_owner(nvidia_device); + + if(device_owner==NULL) + { + t->owner = NULL; + } + else + { + if(is_realtime(device_owner)) + { + TRACE("%s: Handling NVIDIA tasklet for device %u at %llu\n", + __FUNCTION__, nvidia_device,litmus_clock()); + TRACE("%s: the owner task %d of NVIDIA Device %u is RT-task\n", + __FUNCTION__,device_owner->pid,nvidia_device); + + t->owner = device_owner; + sched_trace_tasklet_release(t->owner); + if(likely(_litmus_tasklet_schedule(t,nvidia_device))) + { + unlock_nv_registry(nvidia_device, &flags); + return; + } + else + { + t->owner = NULL; /* fall through to normal scheduling */ + } + } + else + { + t->owner = NULL; + } + } + unlock_nv_registry(nvidia_device, &flags); + } +#endif + + ___tasklet_schedule(t); +} +EXPORT_SYMBOL(__tasklet_schedule); + + +void ___tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; @@ -368,10 +473,64 @@ void __tasklet_schedule(struct tasklet_struct *t) raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } +EXPORT_SYMBOL(___tasklet_schedule); -EXPORT_SYMBOL(__tasklet_schedule); void __tasklet_hi_schedule(struct tasklet_struct *t) +{ +#ifdef CONFIG_LITMUS_NVIDIA + if(is_nvidia_func(t->func)) + { + u32 nvidia_device = get_tasklet_nv_device_num(t); + // TRACE("%s: Handling NVIDIA tasklet for device\t%u\tat\t%llu\n", + // __FUNCTION__, nvidia_device,litmus_clock()); + + unsigned long flags; + struct task_struct* device_owner; + + lock_nv_registry(nvidia_device, &flags); + + device_owner = get_nv_device_owner(nvidia_device); + + if(device_owner==NULL) + { + t->owner = NULL; + } + else + { + if( is_realtime(device_owner)) + { + TRACE("%s: Handling NVIDIA tasklet for device %u\tat %llu\n", + __FUNCTION__, nvidia_device,litmus_clock()); + TRACE("%s: the owner task %d of NVIDIA Device %u is RT-task\n", + __FUNCTION__,device_owner->pid,nvidia_device); + + t->owner = device_owner; + sched_trace_tasklet_release(t->owner); + if(likely(_litmus_tasklet_hi_schedule(t,nvidia_device))) + { + unlock_nv_registry(nvidia_device, &flags); + return; + } + else + { + t->owner = NULL; /* fall through to normal scheduling */ + } + } + else + { + t->owner = NULL; + } + } + unlock_nv_registry(nvidia_device, &flags); + } +#endif + + ___tasklet_hi_schedule(t); +} +EXPORT_SYMBOL(__tasklet_hi_schedule); + +void ___tasklet_hi_schedule(struct tasklet_struct* t) { unsigned long flags; @@ -382,10 +541,64 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } - -EXPORT_SYMBOL(__tasklet_hi_schedule); +EXPORT_SYMBOL(___tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) +{ + BUG_ON(!irqs_disabled()); +#ifdef CONFIG_LITMUS_NVIDIA + if(is_nvidia_func(t->func)) + { + u32 nvidia_device = get_tasklet_nv_device_num(t); + // TRACE("%s: Handling NVIDIA tasklet for device\t%u\tat\t%llu\n", + // __FUNCTION__, nvidia_device,litmus_clock()); + unsigned long flags; + struct task_struct* device_owner; + + lock_nv_registry(nvidia_device, &flags); + + device_owner = get_nv_device_owner(nvidia_device); + + 
if(device_owner==NULL) + { + t->owner = NULL; + } + else + { + if(is_realtime(device_owner)) + { + TRACE("%s: Handling NVIDIA tasklet for device %u at %llu\n", + __FUNCTION__, nvidia_device,litmus_clock()); + + TRACE("%s: the owner task %d of NVIDIA Device %u is RT-task\n", + __FUNCTION__,device_owner->pid,nvidia_device); + + t->owner = device_owner; + sched_trace_tasklet_release(t->owner); + if(likely(_litmus_tasklet_hi_schedule_first(t,nvidia_device))) + { + unlock_nv_registry(nvidia_device, &flags); + return; + } + else + { + t->owner = NULL; /* fall through to normal scheduling */ + } + } + else + { + t->owner = NULL; + } + } + unlock_nv_registry(nvidia_device, &flags); + } +#endif + + ___tasklet_hi_schedule_first(t); +} +EXPORT_SYMBOL(__tasklet_hi_schedule_first); + +void ___tasklet_hi_schedule_first(struct tasklet_struct* t) { BUG_ON(!irqs_disabled()); @@ -393,8 +606,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) __get_cpu_var(tasklet_hi_vec).head = t; __raise_softirq_irqoff(HI_SOFTIRQ); } - -EXPORT_SYMBOL(__tasklet_hi_schedule_first); +EXPORT_SYMBOL(___tasklet_hi_schedule_first); static void tasklet_action(struct softirq_action *a) { @@ -450,6 +662,7 @@ static void tasklet_hi_action(struct softirq_action *a) if (!atomic_read(&t->count)) { if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); + t->func(t->data); tasklet_unlock(t); continue; @@ -473,8 +686,13 @@ void tasklet_init(struct tasklet_struct *t, t->next = NULL; t->state = 0; atomic_set(&t->count, 0); + t->func = func; t->data = data; + +#ifdef CONFIG_LITMUS_SOFTIRQD + t->owner = NULL; +#endif } EXPORT_SYMBOL(tasklet_init); @@ -489,6 +707,7 @@ void tasklet_kill(struct tasklet_struct *t) yield(); } while (test_bit(TASKLET_STATE_SCHED, &t->state)); } + tasklet_unlock_wait(t); clear_bit(TASKLET_STATE_SCHED, &t->state); } @@ -694,6 +913,8 @@ void __init softirq_init(void) static int run_ksoftirqd(void * __bind_cpu) { + unsigned long flags; + set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -712,7 +933,11 @@ static int run_ksoftirqd(void * __bind_cpu) don't process */ if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); + + local_irq_save(flags); + ____do_softirq(); + local_irq_restore(flags); + preempt_enable_no_resched(); cond_resched(); preempt_disable(); @@ -760,6 +985,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { if (*i == t) { *i = t->next; + /* If this was the tail element, move the tail ptr */ if (*i == NULL) per_cpu(tasklet_vec, cpu).tail = i; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f77afd939229..8139208eaee1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -47,6 +47,13 @@ #include "workqueue_sched.h" +#ifdef CONFIG_LITMUS_NVIDIA +#include +#include +#include +#endif + + enum { /* global_cwq flags */ GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ @@ -1010,9 +1017,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; } - insert_work(cwq, work, worklist, work_flags); - spin_unlock_irqrestore(&gcwq->lock, flags); } @@ -2526,10 +2531,69 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); */ int schedule_work(struct work_struct *work) { - return queue_work(system_wq, work); +#ifdef CONFIG_LITMUS_NVIDIA + if(is_nvidia_func(work->func)) + { + u32 nvidiaDevice = get_work_nv_device_num(work); + + //1) Ask Litmus which task owns GPU . 
(API to be defined.) + unsigned long flags; + struct task_struct* device_owner; + + lock_nv_registry(nvidiaDevice, &flags); + + device_owner = get_nv_device_owner(nvidiaDevice); + + //2) If there is an owner, set work->owner to the owner's task struct. + if(device_owner==NULL) + { + work->owner = NULL; + //TRACE("%s: the owner task of NVIDIA Device %u is NULL\n",__FUNCTION__,nvidiaDevice); + } + else + { + if( is_realtime(device_owner)) + { + TRACE("%s: Handling NVIDIA work for device\t%u\tat\t%llu\n", + __FUNCTION__, nvidiaDevice,litmus_clock()); + TRACE("%s: the owner task %d of NVIDIA Device %u is RT-task\n", + __FUNCTION__, + device_owner->pid, + nvidiaDevice); + + //3) Call litmus_schedule_work() and return (don't execute the rest + // of schedule_schedule()). + work->owner = device_owner; + sched_trace_work_release(work->owner); + if(likely(litmus_schedule_work(work, nvidiaDevice))) + { + unlock_nv_registry(nvidiaDevice, &flags); + return 1; + } + else + { + work->owner = NULL; /* fall through to normal work scheduling */ + } + } + else + { + work->owner = NULL; + } + } + unlock_nv_registry(nvidiaDevice, &flags); + } +#endif + + return(__schedule_work(work)); } EXPORT_SYMBOL(schedule_work); +int __schedule_work(struct work_struct* work) +{ + return queue_work(system_wq, work); +} +EXPORT_SYMBOL(__schedule_work); + /* * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on diff --git a/litmus/Kconfig b/litmus/Kconfig index ad8dc8308cf0..7e865d4dd703 100644 --- a/litmus/Kconfig +++ b/litmus/Kconfig @@ -62,6 +62,25 @@ config LITMUS_LOCKING endmenu +menu "Performance Enhancements" + +config SCHED_CPU_AFFINITY + bool "Local Migration Affinity" + default y + help + Rescheduled tasks prefer CPUs near to their previously used CPU. This + may improve performance through possible preservation of cache affinity. + + Warning: May make bugs ahrder to find since tasks may migrate less often. + + NOTES: + * Pfair/PD^2 does not support this option. + * Only x86 currently supported. + + Say Yes if unsure. + +endmenu + menu "Tracing" config FEATHER_TRACE @@ -182,4 +201,74 @@ config SCHED_DEBUG_TRACE_CALLER endmenu +menu "Interrupt Handling" + +config LITMUS_THREAD_ALL_SOFTIRQ + bool "Process all softirqs in ksoftirqd threads." + default n + help + (Experimental) Thread all softirqs to ksoftirqd + daemon threads, similar to PREEMPT_RT. I/O + throughput will will drop with this enabled, but + latencies due to interrupts will be reduced. + + WARNING: Timer responsiveness will likely be + decreased as timer callbacks are also threaded. + This is unlike PREEEMPT_RTs hardirqs. + + If unsure, say No. + +config LITMUS_SOFTIRQD + bool "Spawn klitirqd interrupt handling threads." + depends on LITMUS_LOCKING + default n + help + Create klitirqd interrupt handling threads. Work must be + specifically dispatched to these workers. (Softirqs for + Litmus tasks are not magically redirected to klitirqd.) + + G-EDF ONLY for now! + + If unsure, say No. + +config NR_LITMUS_SOFTIRQD + int "Number of klitirqd." + depends on LITMUS_SOFTIRQD + range 1 4096 + default "1" + help + Should be <= to the number of CPUs in your system. + +config LITMUS_NVIDIA + bool "Litmus handling of NVIDIA interrupts." + depends on LITMUS_SOFTIRQD + default n + help + Direct tasklets from NVIDIA devices to Litmus's klitirqd. + + If unsure, say No. + +choice + prompt "CUDA/Driver Version Support" + default CUDA_4_0 + depends on LITMUS_NVIDIA + help + Select the version of CUDA/driver to support. 
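	  As a rough illustration (not a requirement of this patch), a kernel
	  configuration that routes NVIDIA bottom halves through klitirqd could
	  combine LITMUS_LOCKING with the options added in this menu as follows;
	  the thread count of 4 is only an example value:

	  CONFIG_LITMUS_LOCKING=y
	  CONFIG_LITMUS_SOFTIRQD=y
	  CONFIG_NR_LITMUS_SOFTIRQD=4
	  CONFIG_LITMUS_NVIDIA=y
	  CONFIG_CUDA_4_0=y
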
+ +config CUDA_4_0 + bool "CUDA 4.0" + depends on LITMUS_NVIDIA + help + Support CUDA 4.0 RC2 (dev. driver version: x86_64-270.40) + +config CUDA_3_2 + bool "CUDA 3.2" + depends on LITMUS_NVIDIA + help + Support CUDA 3.2 (dev. driver version: x86_64-260.24) + +endchoice + +endmenu + endmenu diff --git a/litmus/Makefile b/litmus/Makefile index ad9936e07b83..892e01c2e1b3 100644 --- a/litmus/Makefile +++ b/litmus/Makefile @@ -21,8 +21,12 @@ obj-y = sched_plugin.o litmus.o \ obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o +obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o + +obj-$(CONFIG_LITMUS_SOFTIRQD) += litmus_softirq.o +obj-$(CONFIG_LITMUS_NVIDIA) += nvidia_info.o sched_trace_external.o diff --git a/litmus/affinity.c b/litmus/affinity.c new file mode 100644 index 000000000000..3b430d18885b --- /dev/null +++ b/litmus/affinity.c @@ -0,0 +1,49 @@ +#include + +#include + +struct neighborhood neigh_info[NR_CPUS]; + +/* called by _init_litmus() */ +void init_topology(void) +{ + int cpu; + int i; + int chk; + int depth = num_cache_leaves; + + if(depth > NUM_CACHE_LEVELS) + depth = NUM_CACHE_LEVELS; + + for_each_online_cpu(cpu) + { + for(i = 0; i < depth; ++i) + { + long unsigned int firstbits; + + chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i); + if(chk) /* failed */ + { + neigh_info[cpu].size[i] = 0; + } + else + { + /* size = num bits in mask */ + neigh_info[cpu].size[i] = cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]); + } + firstbits = *neigh_info[cpu].neighbors[i]->bits; + printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n", + cpu, neigh_info[cpu].size[i], i, firstbits); + } + + /* set data for non-existent levels */ + for(; i < NUM_CACHE_LEVELS; ++i) + { + neigh_info[cpu].size[i] = 0; + + printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n", + cpu, neigh_info[cpu].size[i], i, 0lu); + } + } +} + diff --git a/litmus/edf_common.c b/litmus/edf_common.c index 9b44dc2d8d1e..fbd67ab5f467 100644 --- a/litmus/edf_common.c +++ b/litmus/edf_common.c @@ -65,6 +65,12 @@ int edf_higher_prio(struct task_struct* first, return !is_realtime(second_task) || + +#ifdef CONFIG_LITMUS_SOFTIRQD + /* proxy threads always lose w/o inheritance. */ + (first_task->rt_param.is_proxy_thread < + second_task->rt_param.is_proxy_thread) || +#endif /* is the deadline of the first task earlier? * Then it has higher priority. 
diff --git a/litmus/fdso.c b/litmus/fdso.c index aa7b384264e3..2b7f9ba85857 100644 --- a/litmus/fdso.c +++ b/litmus/fdso.c @@ -22,6 +22,7 @@ extern struct fdso_ops generic_lock_ops; static const struct fdso_ops* fdso_ops[] = { &generic_lock_ops, /* FMLP_SEM */ + &generic_lock_ops, /* KFMLP_SEM */ &generic_lock_ops, /* SRP_SEM */ }; diff --git a/litmus/litmus.c b/litmus/litmus.c index 26938acacafc..29363c6ad565 100644 --- a/litmus/litmus.c +++ b/litmus/litmus.c @@ -17,6 +17,14 @@ #include #include +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif + /* Number of RT tasks that exist in the system */ atomic_t rt_task_count = ATOMIC_INIT(0); static DEFINE_RAW_SPINLOCK(task_transition_lock); @@ -47,6 +55,28 @@ void bheap_node_free(struct bheap_node* hn) struct release_heap* release_heap_alloc(int gfp_flags); void release_heap_free(struct release_heap* rh); +#ifdef CONFIG_LITMUS_NVIDIA +/* + * sys_register_nv_device + * @nv_device_id: The Nvidia device id that the task want to register + * @reg_action: set to '1' to register the specified device. zero otherwise. + * Syscall for register task's designated nvidia device into NV_DEVICE_REG array + * Returns EFAULT if nv_device_id is out of range. + * 0 if success + */ +asmlinkage long sys_register_nv_device(int nv_device_id, int reg_action) +{ + /* register the device to caller (aka 'current') */ + return(reg_nv_device(nv_device_id, reg_action)); +} +#else +asmlinkage long sys_register_nv_device(int nv_device_id, int reg_action) +{ + return(-EINVAL); +} +#endif + + /* * sys_set_task_rt_param * @pid: Pid of the task which scheduling parameters must be changed @@ -115,7 +145,7 @@ asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) tp.cls != RT_CLASS_BEST_EFFORT) { printk(KERN_INFO "litmus: real-time task %d rejected " - "because its class is invalid\n"); + "because its class is invalid\n", pid); goto out_unlock; } if (tp.budget_policy != NO_ENFORCEMENT && @@ -131,6 +161,22 @@ asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) target->rt_param.task_params = tp; +#ifdef CONFIG_LITMUS_SOFTIRQD + /* proxy thread off by default */ + target->rt_param.is_proxy_thread = 0; + target->rt_param.cur_klitirqd = NULL; + //init_MUTEX(&target->rt_param.klitirqd_sem); + mutex_init(&target->rt_param.klitirqd_sem); + //init_completion(&target->rt_param.klitirqd_sem); + //target->rt_param.klitirqd_sem_stat = NOT_HELD; + atomic_set(&target->rt_param.klitirqd_sem_stat, NOT_HELD); +#endif + +#ifdef CONFIG_LITMUS_NVIDIA + atomic_set(&target->rt_param.nv_int_count, 0); +#endif + + retval = 0; out_unlock: read_unlock_irq(&tasklist_lock); @@ -265,6 +311,7 @@ asmlinkage long sys_query_job_no(unsigned int __user *job) return retval; } + /* sys_null_call() is only used for determining raw system call * overheads (kernel entry, kernel exit). It has no useful side effects. * If ts is non-NULL, then the current Feather-Trace time is recorded. @@ -278,7 +325,7 @@ asmlinkage long sys_null_call(cycles_t __user *ts) now = get_cycles(); ret = put_user(now, ts); } - + return ret; } @@ -299,6 +346,20 @@ static void reinit_litmus_state(struct task_struct* p, int restore) * at this point in time. */ WARN_ON(p->rt_param.inh_task); + +#ifdef CONFIG_LITMUS_SOFTIRQD + /* We probably should not have any tasklets executing for + * us at this time. 
+ */ + WARN_ON(p->rt_param.cur_klitirqd); + WARN_ON(atomic_read(&p->rt_param.klitirqd_sem_stat) == HELD); + + if(p->rt_param.cur_klitirqd) + flush_pending(p->rt_param.cur_klitirqd, p); + + if(atomic_read(&p->rt_param.klitirqd_sem_stat) == HELD) + up_and_set_stat(p, NOT_HELD, &p->rt_param.klitirqd_sem); +#endif /* Cleanup everything else. */ memset(&p->rt_param, 0, sizeof(p->rt_param)); @@ -399,7 +460,7 @@ static void synch_on_plugin_switch(void* info) */ int switch_sched_plugin(struct sched_plugin* plugin) { - unsigned long flags; + //unsigned long flags; int ret = 0; BUG_ON(!plugin); @@ -413,8 +474,15 @@ int switch_sched_plugin(struct sched_plugin* plugin) while (atomic_read(&cannot_use_plugin) < num_online_cpus()) cpu_relax(); +#ifdef CONFIG_LITMUS_SOFTIRQD + if(!klitirqd_is_dead()) + { + kill_klitirqd(); + } +#endif + /* stop task transitions */ - raw_spin_lock_irqsave(&task_transition_lock, flags); + //raw_spin_lock_irqsave(&task_transition_lock, flags); /* don't switch if there are active real-time tasks */ if (atomic_read(&rt_task_count) == 0) { @@ -432,7 +500,7 @@ int switch_sched_plugin(struct sched_plugin* plugin) } else ret = -EBUSY; out: - raw_spin_unlock_irqrestore(&task_transition_lock, flags); + //raw_spin_unlock_irqrestore(&task_transition_lock, flags); atomic_set(&cannot_use_plugin, 0); return ret; } @@ -540,6 +608,10 @@ static int __init _init_litmus(void) init_litmus_proc(); +#ifdef CONFIG_SCHED_CPU_AFFINITY + init_topology(); +#endif + return 0; } diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c index 4bf725a36c9c..381513366c7a 100644 --- a/litmus/litmus_proc.c +++ b/litmus/litmus_proc.c @@ -19,12 +19,19 @@ static struct proc_dir_entry *litmus_dir = NULL, *plugs_dir = NULL, #ifdef CONFIG_RELEASE_MASTER *release_master_file = NULL, +#endif +#ifdef CONFIG_LITMUS_SOFTIRQD + *klitirqd_file = NULL, #endif *plugs_file = NULL; /* in litmus/sync.c */ int count_tasks_waiting_for_release(void); +extern int proc_read_klitirqd_stats(char *page, char **start, + off_t off, int count, + int *eof, void *data); + static int proc_read_stats(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -161,6 +168,12 @@ int __init init_litmus_proc(void) release_master_file->write_proc = proc_write_release_master; #endif +#ifdef CONFIG_LITMUS_SOFTIRQD + klitirqd_file = + create_proc_read_entry("klitirqd_stats", 0444, litmus_dir, + proc_read_klitirqd_stats, NULL); +#endif + stat_file = create_proc_read_entry("stats", 0444, litmus_dir, proc_read_stats, NULL); @@ -187,6 +200,10 @@ void exit_litmus_proc(void) remove_proc_entry("stats", litmus_dir); if (curr_file) remove_proc_entry("active_plugin", litmus_dir); +#ifdef CONFIG_LITMUS_SOFTIRQD + if (klitirqd_file) + remove_proc_entry("klitirqd_stats", litmus_dir); +#endif #ifdef CONFIG_RELEASE_MASTER if (release_master_file) remove_proc_entry("release_master", litmus_dir); diff --git a/litmus/litmus_softirq.c b/litmus/litmus_softirq.c new file mode 100644 index 000000000000..271e770dbaea --- /dev/null +++ b/litmus/litmus_softirq.c @@ -0,0 +1,1579 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +/* TODO: Remove unneeded mb() and other barriers. */ + + +/* counts number of daemons ready to handle litmus irqs. 
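+ * Each daemon increments this in run_klitirqd() once it has become a
+ * real-time proxy task and decrements it on exit; klitirqd_is_ready()
+ * and klitirqd_is_dead() test this count.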
*/ +static atomic_t num_ready_klitirqds = ATOMIC_INIT(0); + +enum pending_flags +{ + LIT_TASKLET_LOW = 0x1, + LIT_TASKLET_HI = LIT_TASKLET_LOW<<1, + LIT_WORK = LIT_TASKLET_HI<<1 +}; + +/* only support tasklet processing for now. */ +struct tasklet_head +{ + struct tasklet_struct *head; + struct tasklet_struct **tail; +}; + +struct klitirqd_info +{ + struct task_struct* klitirqd; + struct task_struct* current_owner; + int terminating; + + + raw_spinlock_t lock; + + u32 pending; + atomic_t num_hi_pending; + atomic_t num_low_pending; + atomic_t num_work_pending; + + /* in order of priority */ + struct tasklet_head pending_tasklets_hi; + struct tasklet_head pending_tasklets; + struct list_head worklist; +}; + +/* one list for each klitirqd */ +static struct klitirqd_info klitirqds[NR_LITMUS_SOFTIRQD]; + + + + + +int proc_read_klitirqd_stats(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len = snprintf(page, PAGE_SIZE, + "num ready klitirqds: %d\n\n", + atomic_read(&num_ready_klitirqds)); + + if(klitirqd_is_ready()) + { + int i; + for(i = 0; i < NR_LITMUS_SOFTIRQD; ++i) + { + len += + snprintf(page + len - 1, PAGE_SIZE, /* -1 to strip off \0 */ + "klitirqd_th%d: %s/%d\n" + "\tcurrent_owner: %s/%d\n" + "\tpending: %x\n" + "\tnum hi: %d\n" + "\tnum low: %d\n" + "\tnum work: %d\n\n", + i, + klitirqds[i].klitirqd->comm, klitirqds[i].klitirqd->pid, + (klitirqds[i].current_owner != NULL) ? + klitirqds[i].current_owner->comm : "(null)", + (klitirqds[i].current_owner != NULL) ? + klitirqds[i].current_owner->pid : 0, + klitirqds[i].pending, + atomic_read(&klitirqds[i].num_hi_pending), + atomic_read(&klitirqds[i].num_low_pending), + atomic_read(&klitirqds[i].num_work_pending)); + } + } + + return(len); +} + + + + + +#if 0 +static atomic_t dump_id = ATOMIC_INIT(0); + +static void __dump_state(struct klitirqd_info* which, const char* caller) +{ + struct tasklet_struct* list; + + int id = atomic_inc_return(&dump_id); + + //if(in_interrupt()) + { + if(which->current_owner) + { + TRACE("(id: %d caller: %s)\n" + "klitirqd: %s/%d\n" + "current owner: %s/%d\n" + "pending: %x\n", + id, caller, + which->klitirqd->comm, which->klitirqd->pid, + which->current_owner->comm, which->current_owner->pid, + which->pending); + } + else + { + TRACE("(id: %d caller: %s)\n" + "klitirqd: %s/%d\n" + "current owner: %p\n" + "pending: %x\n", + id, caller, + which->klitirqd->comm, which->klitirqd->pid, + NULL, + which->pending); + } + + list = which->pending_tasklets.head; + while(list) + { + struct tasklet_struct *t = list; + list = list->next; /* advance */ + if(t->owner) + TRACE("(id: %d caller: %s) Tasklet: %x, Owner = %s/%d\n", id, caller, t, t->owner->comm, t->owner->pid); + else + TRACE("(id: %d caller: %s) Tasklet: %x, Owner = %p\n", id, caller, t, NULL); + } + } +} + +static void dump_state(struct klitirqd_info* which, const char* caller) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&which->lock, flags); + __dump_state(which, caller); + raw_spin_unlock_irqrestore(&which->lock, flags); +} +#endif + + +/* forward declarations */ +static void ___litmus_tasklet_schedule(struct tasklet_struct *t, + struct klitirqd_info *which, + int wakeup); +static void ___litmus_tasklet_hi_schedule(struct tasklet_struct *t, + struct klitirqd_info *which, + int wakeup); +static void ___litmus_schedule_work(struct work_struct *w, + struct klitirqd_info *which, + int wakeup); + + + +inline unsigned int klitirqd_id(struct task_struct* tsk) +{ + int i; + for(i = 0; i < NR_LITMUS_SOFTIRQD; ++i) + { + 
if(klitirqds[i].klitirqd == tsk) + { + return i; + } + } + + BUG(); + + return 0; +} + + +inline static u32 litirq_pending_hi_irqoff(struct klitirqd_info* which) +{ + return (which->pending & LIT_TASKLET_HI); +} + +inline static u32 litirq_pending_low_irqoff(struct klitirqd_info* which) +{ + return (which->pending & LIT_TASKLET_LOW); +} + +inline static u32 litirq_pending_work_irqoff(struct klitirqd_info* which) +{ + return (which->pending & LIT_WORK); +} + +inline static u32 litirq_pending_irqoff(struct klitirqd_info* which) +{ + return(which->pending); +} + + +inline static u32 litirq_pending(struct klitirqd_info* which) +{ + unsigned long flags; + u32 pending; + + raw_spin_lock_irqsave(&which->lock, flags); + pending = litirq_pending_irqoff(which); + raw_spin_unlock_irqrestore(&which->lock, flags); + + return pending; +}; + +inline static u32 litirq_pending_with_owner(struct klitirqd_info* which, struct task_struct* owner) +{ + unsigned long flags; + u32 pending; + + raw_spin_lock_irqsave(&which->lock, flags); + pending = litirq_pending_irqoff(which); + if(pending) + { + if(which->current_owner != owner) + { + pending = 0; // owner switch! + } + } + raw_spin_unlock_irqrestore(&which->lock, flags); + + return pending; +} + + +inline static u32 litirq_pending_and_sem_and_owner(struct klitirqd_info* which, + struct mutex** sem, + struct task_struct** t) +{ + unsigned long flags; + u32 pending; + + /* init values */ + *sem = NULL; + *t = NULL; + + raw_spin_lock_irqsave(&which->lock, flags); + + pending = litirq_pending_irqoff(which); + if(pending) + { + if(which->current_owner != NULL) + { + *t = which->current_owner; + *sem = &tsk_rt(which->current_owner)->klitirqd_sem; + } + else + { + BUG(); + } + } + raw_spin_unlock_irqrestore(&which->lock, flags); + + if(likely(*sem)) + { + return pending; + } + else + { + return 0; + } +} + +/* returns true if the next piece of work to do is from a different owner. + */ +static int tasklet_ownership_change( + struct klitirqd_info* which, + enum pending_flags taskletQ) +{ + /* this function doesn't have to look at work objects since they have + priority below tasklets. */ + + unsigned long flags; + int ret = 0; + + raw_spin_lock_irqsave(&which->lock, flags); + + switch(taskletQ) + { + case LIT_TASKLET_HI: + if(litirq_pending_hi_irqoff(which)) + { + ret = (which->pending_tasklets_hi.head->owner != + which->current_owner); + } + break; + case LIT_TASKLET_LOW: + if(litirq_pending_low_irqoff(which)) + { + ret = (which->pending_tasklets.head->owner != + which->current_owner); + } + break; + default: + break; + } + + raw_spin_unlock_irqrestore(&which->lock, flags); + + TRACE_TASK(which->klitirqd, "ownership change needed: %d\n", ret); + + return ret; +} + + +static void __reeval_prio(struct klitirqd_info* which) +{ + struct task_struct* next_owner = NULL; + struct task_struct* klitirqd = which->klitirqd; + + /* Check in prio-order */ + u32 pending = litirq_pending_irqoff(which); + + //__dump_state(which, "__reeval_prio: before"); + + if(pending) + { + if(pending & LIT_TASKLET_HI) + { + next_owner = which->pending_tasklets_hi.head->owner; + } + else if(pending & LIT_TASKLET_LOW) + { + next_owner = which->pending_tasklets.head->owner; + } + else if(pending & LIT_WORK) + { + struct work_struct* work = + list_first_entry(&which->worklist, struct work_struct, entry); + next_owner = work->owner; + } + } + + if(next_owner != which->current_owner) + { + struct task_struct* old_owner = which->current_owner; + + /* bind the next owner. 
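+		 * The daemon then inherits next_owner's priority via
+		 * litmus->set_prio_inh_klitirqd() below, or has inheritance
+		 * cleared when nothing is pending.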
*/ + which->current_owner = next_owner; + mb(); + + if(next_owner != NULL) + { + if(!in_interrupt()) + { + TRACE_CUR("%s: Ownership change: %s/%d to %s/%d\n", __FUNCTION__, + ((tsk_rt(klitirqd)->inh_task) ? tsk_rt(klitirqd)->inh_task : klitirqd)->comm, + ((tsk_rt(klitirqd)->inh_task) ? tsk_rt(klitirqd)->inh_task : klitirqd)->pid, + next_owner->comm, next_owner->pid); + } + else + { + TRACE("%s: Ownership change: %s/%d to %s/%d\n", __FUNCTION__, + ((tsk_rt(klitirqd)->inh_task) ? tsk_rt(klitirqd)->inh_task : klitirqd)->comm, + ((tsk_rt(klitirqd)->inh_task) ? tsk_rt(klitirqd)->inh_task : klitirqd)->pid, + next_owner->comm, next_owner->pid); + } + + litmus->set_prio_inh_klitirqd(klitirqd, old_owner, next_owner); + } + else + { + if(likely(!in_interrupt())) + { + TRACE_CUR("%s: Ownership change: %s/%d to NULL (reverting)\n", + __FUNCTION__, klitirqd->comm, klitirqd->pid); + } + else + { + // is this a bug? + TRACE("%s: Ownership change: %s/%d to NULL (reverting)\n", + __FUNCTION__, klitirqd->comm, klitirqd->pid); + } + + BUG_ON(pending != 0); + litmus->clear_prio_inh_klitirqd(klitirqd, old_owner); + } + } + + //__dump_state(which, "__reeval_prio: after"); +} + +static void reeval_prio(struct klitirqd_info* which) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&which->lock, flags); + __reeval_prio(which); + raw_spin_unlock_irqrestore(&which->lock, flags); +} + + +static void wakeup_litirqd_locked(struct klitirqd_info* which) +{ + /* Interrupts are disabled: no need to stop preemption */ + if (which && which->klitirqd) + { + __reeval_prio(which); /* configure the proper priority */ + + if(which->klitirqd->state != TASK_RUNNING) + { + TRACE("%s: Waking up klitirqd: %s/%d\n", __FUNCTION__, + which->klitirqd->comm, which->klitirqd->pid); + + wake_up_process(which->klitirqd); + } + } +} + + +static void do_lit_tasklet(struct klitirqd_info* which, + struct tasklet_head* pending_tasklets) +{ + unsigned long flags; + struct tasklet_struct *list; + atomic_t* count; + + raw_spin_lock_irqsave(&which->lock, flags); + + //__dump_state(which, "do_lit_tasklet: before steal"); + + /* copy out the tasklets for our private use. */ + list = pending_tasklets->head; + pending_tasklets->head = NULL; + pending_tasklets->tail = &pending_tasklets->head; + + /* remove pending flag */ + which->pending &= (pending_tasklets == &which->pending_tasklets) ? + ~LIT_TASKLET_LOW : + ~LIT_TASKLET_HI; + + count = (pending_tasklets == &which->pending_tasklets) ? + &which->num_low_pending: + &which->num_hi_pending; + + //__dump_state(which, "do_lit_tasklet: after steal"); + + raw_spin_unlock_irqrestore(&which->lock, flags); + + + while(list) + { + struct tasklet_struct *t = list; + + /* advance, lest we forget */ + list = list->next; + + /* execute tasklet if it has my priority and is free */ + if ((t->owner == which->current_owner) && tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + { + BUG(); + } + TRACE_CUR("%s: Invoking tasklet.\n", __FUNCTION__); + t->func(t->data); + tasklet_unlock(t); + + atomic_dec(count); + + continue; /* process more tasklets */ + } + tasklet_unlock(t); + } + + TRACE_CUR("%s: Could not invoke tasklet. Requeuing.\n", __FUNCTION__); + + /* couldn't process tasklet. put it back at the end of the queue. 
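+		 * (wakeup == 0: this daemon is already running, so the requeue
+		 * skips the wakeup/priority re-evaluation.)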
*/ + if(pending_tasklets == &which->pending_tasklets) + ___litmus_tasklet_schedule(t, which, 0); + else + ___litmus_tasklet_hi_schedule(t, which, 0); + } +} + + +// returns 1 if priorities need to be changed to continue processing +// pending tasklets. +static int do_litirq(struct klitirqd_info* which) +{ + u32 pending; + int resched = 0; + + if(in_interrupt()) + { + TRACE("%s: exiting early: in interrupt context!\n", __FUNCTION__); + return(0); + } + + if(which->klitirqd != current) + { + TRACE_CUR("%s: exiting early: thread/info mismatch! Running %s/%d but given %s/%d.\n", + __FUNCTION__, current->comm, current->pid, + which->klitirqd->comm, which->klitirqd->pid); + return(0); + } + + if(!is_realtime(current)) + { + TRACE_CUR("%s: exiting early: klitirqd is not real-time. Sched Policy = %d\n", + __FUNCTION__, current->policy); + return(0); + } + + + /* We only handle tasklets & work objects, no need for RCU triggers? */ + + pending = litirq_pending(which); + if(pending) + { + /* extract the work to do and do it! */ + if(pending & LIT_TASKLET_HI) + { + TRACE_CUR("%s: Invoking HI tasklets.\n", __FUNCTION__); + do_lit_tasklet(which, &which->pending_tasklets_hi); + resched = tasklet_ownership_change(which, LIT_TASKLET_HI); + + if(resched) + { + TRACE_CUR("%s: HI tasklets of another owner remain. " + "Skipping any LOW tasklets.\n", __FUNCTION__); + } + } + + if(!resched && (pending & LIT_TASKLET_LOW)) + { + TRACE_CUR("%s: Invoking LOW tasklets.\n", __FUNCTION__); + do_lit_tasklet(which, &which->pending_tasklets); + resched = tasklet_ownership_change(which, LIT_TASKLET_LOW); + + if(resched) + { + TRACE_CUR("%s: LOW tasklets of another owner remain. " + "Skipping any work objects.\n", __FUNCTION__); + } + } + } + + return(resched); +} + + +static void do_work(struct klitirqd_info* which) +{ + unsigned long flags; + work_func_t f; + struct work_struct* work; + + // only execute one work-queue item to yield to tasklets. + // ...is this a good idea, or should we just batch them? + raw_spin_lock_irqsave(&which->lock, flags); + + if(!litirq_pending_work_irqoff(which)) + { + raw_spin_unlock_irqrestore(&which->lock, flags); + goto no_work; + } + + work = list_first_entry(&which->worklist, struct work_struct, entry); + list_del_init(&work->entry); + + if(list_empty(&which->worklist)) + { + which->pending &= ~LIT_WORK; + } + + raw_spin_unlock_irqrestore(&which->lock, flags); + + + + /* safe to read current_owner outside of lock since only this thread + may write to the pointer. */ + if(work->owner == which->current_owner) + { + TRACE_CUR("%s: Invoking work object.\n", __FUNCTION__); + // do the work! + work_clear_pending(work); + f = work->func; + f(work); /* can't touch 'work' after this point, + the user may have freed it. */ + + atomic_dec(&which->num_work_pending); + } + else + { + TRACE_CUR("%s: Could not invoke work object. Requeuing.\n", + __FUNCTION__); + ___litmus_schedule_work(work, which, 0); + } + +no_work: + return; +} + + +static int set_litmus_daemon_sched(void) +{ + /* set up a daemon job that will never complete. + it should only ever run on behalf of another + real-time task. 
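+	   The parameters below give the daemon zero execution cost and
+	   best-effort class, so it only runs at a priority inherited from
+	   the owner of its pending work.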
+ + TODO: Transition to a new job whenever a + new tasklet is handled */ + + int ret = 0; + + struct rt_task tp = { + .exec_cost = 0, + .period = 1000000000, /* dummy 1 second period */ + .phase = 0, + .cpu = task_cpu(current), + .budget_policy = NO_ENFORCEMENT, + .cls = RT_CLASS_BEST_EFFORT + }; + + struct sched_param param = { .sched_priority = 0}; + + + /* set task params, mark as proxy thread, and init other data */ + tsk_rt(current)->task_params = tp; + tsk_rt(current)->is_proxy_thread = 1; + tsk_rt(current)->cur_klitirqd = NULL; + //init_MUTEX(&tsk_rt(current)->klitirqd_sem); + mutex_init(&tsk_rt(current)->klitirqd_sem); + //init_completion(&tsk_rt(current)->klitirqd_sem); + atomic_set(&tsk_rt(current)->klitirqd_sem_stat, NOT_HELD); + + /* inform the OS we're SCHED_LITMUS -- + sched_setscheduler_nocheck() calls litmus_admit_task(). */ + sched_setscheduler_nocheck(current, SCHED_LITMUS, ¶m); + + return ret; +} + +static void enter_execution_phase(struct klitirqd_info* which, + struct mutex* sem, + struct task_struct* t) +{ + TRACE_CUR("%s: Trying to enter execution phase. " + "Acquiring semaphore of %s/%d\n", __FUNCTION__, + t->comm, t->pid); + down_and_set_stat(current, HELD, sem); + TRACE_CUR("%s: Execution phase entered! " + "Acquired semaphore of %s/%d\n", __FUNCTION__, + t->comm, t->pid); +} + +static void exit_execution_phase(struct klitirqd_info* which, + struct mutex* sem, + struct task_struct* t) +{ + TRACE_CUR("%s: Exiting execution phase. " + "Releasing semaphore of %s/%d\n", __FUNCTION__, + t->comm, t->pid); + if(atomic_read(&tsk_rt(current)->klitirqd_sem_stat) == HELD) + { + up_and_set_stat(current, NOT_HELD, sem); + TRACE_CUR("%s: Execution phase exited! " + "Released semaphore of %s/%d\n", __FUNCTION__, + t->comm, t->pid); + } + else + { + TRACE_CUR("%s: COULDN'T RELEASE SEMAPHORE BECAUSE ONE IS NOT HELD!\n", __FUNCTION__); + } +} + +/* main loop for klitsoftirqd */ +static int run_klitirqd(void* unused) +{ + struct klitirqd_info* which = &klitirqds[klitirqd_id(current)]; + struct mutex* sem; + struct task_struct* owner; + + int rt_status = set_litmus_daemon_sched(); + + if(rt_status != 0) + { + TRACE_CUR("%s: Failed to transition to rt-task.\n", __FUNCTION__); + goto rt_failed; + } + + atomic_inc(&num_ready_klitirqds); + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) + { + preempt_disable(); + if (!litirq_pending(which)) + { + /* sleep for work */ + TRACE_CUR("%s: No more tasklets or work objects. Going to sleep.\n", + __FUNCTION__); + preempt_enable_no_resched(); + schedule(); + + if(kthread_should_stop()) /* bail out */ + { + TRACE_CUR("%s:%d: Signaled to terminate.\n", __FUNCTION__, __LINE__); + continue; + } + + preempt_disable(); + } + + __set_current_state(TASK_RUNNING); + + while (litirq_pending_and_sem_and_owner(which, &sem, &owner)) + { + int needs_resched = 0; + + preempt_enable_no_resched(); + + BUG_ON(sem == NULL); + + // wait to enter execution phase; wait for 'current_owner' to block. + enter_execution_phase(which, sem, owner); + + if(kthread_should_stop()) + { + TRACE_CUR("%s:%d: Signaled to terminate.\n", __FUNCTION__, __LINE__); + break; + } + + preempt_disable(); + + /* Double check that there's still pending work and the owner hasn't + * changed. Pending items may have been flushed while we were sleeping. 
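+			 * (flush_pending() may have handed our items back to Linux
+			 * while we blocked on the owner's semaphore.)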
+ */ + if(litirq_pending_with_owner(which, owner)) + { + TRACE_CUR("%s: Executing tasklets and/or work objects.\n", + __FUNCTION__); + + needs_resched = do_litirq(which); + + preempt_enable_no_resched(); + + // work objects are preemptible. + if(!needs_resched) + { + do_work(which); + } + + // exit execution phase. + exit_execution_phase(which, sem, owner); + + TRACE_CUR("%s: Setting up next priority.\n", __FUNCTION__); + reeval_prio(which); /* check if we need to change priority here */ + } + else + { + TRACE_CUR("%s: Pending work was flushed! Prev owner was %s/%d\n", + __FUNCTION__, + owner->comm, owner->pid); + preempt_enable_no_resched(); + + // exit execution phase. + exit_execution_phase(which, sem, owner); + } + + cond_resched(); + preempt_disable(); + } + preempt_enable(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + atomic_dec(&num_ready_klitirqds); + +rt_failed: + litmus_exit_task(current); + + return rt_status; +} + + +struct klitirqd_launch_data +{ + int* cpu_affinity; + struct work_struct work; +}; + +/* executed by a kworker from workqueues */ +static void launch_klitirqd(struct work_struct *work) +{ + int i; + + struct klitirqd_launch_data* launch_data = + container_of(work, struct klitirqd_launch_data, work); + + TRACE("%s: Creating %d klitirqds\n", __FUNCTION__, NR_LITMUS_SOFTIRQD); + + /* create the daemon threads */ + for(i = 0; i < NR_LITMUS_SOFTIRQD; ++i) + { + if(launch_data->cpu_affinity) + { + klitirqds[i].klitirqd = + kthread_create( + run_klitirqd, + /* treat the affinity as a pointer, we'll cast it back later */ + (void*)(long long)launch_data->cpu_affinity[i], + "klitirqd_th%d/%d", + i, + launch_data->cpu_affinity[i]); + + /* litmus will put is in the right cluster. */ + kthread_bind(klitirqds[i].klitirqd, launch_data->cpu_affinity[i]); + } + else + { + klitirqds[i].klitirqd = + kthread_create( + run_klitirqd, + /* treat the affinity as a pointer, we'll cast it back later */ + (void*)(long long)(-1), + "klitirqd_th%d", + i); + } + } + + TRACE("%s: Launching %d klitirqds\n", __FUNCTION__, NR_LITMUS_SOFTIRQD); + + /* unleash the daemons */ + for(i = 0; i < NR_LITMUS_SOFTIRQD; ++i) + { + wake_up_process(klitirqds[i].klitirqd); + } + + if(launch_data->cpu_affinity) + kfree(launch_data->cpu_affinity); + kfree(launch_data); +} + + +void spawn_klitirqd(int* affinity) +{ + int i; + struct klitirqd_launch_data* delayed_launch; + + if(atomic_read(&num_ready_klitirqds) != 0) + { + TRACE("%s: At least one klitirqd is already running! Need to call kill_klitirqd()?\n"); + return; + } + + /* init the tasklet & work queues */ + for(i = 0; i < NR_LITMUS_SOFTIRQD; ++i) + { + klitirqds[i].terminating = 0; + klitirqds[i].pending = 0; + + klitirqds[i].num_hi_pending.counter = 0; + klitirqds[i].num_low_pending.counter = 0; + klitirqds[i].num_work_pending.counter = 0; + + klitirqds[i].pending_tasklets_hi.head = NULL; + klitirqds[i].pending_tasklets_hi.tail = &klitirqds[i].pending_tasklets_hi.head; + + klitirqds[i].pending_tasklets.head = NULL; + klitirqds[i].pending_tasklets.tail = &klitirqds[i].pending_tasklets.head; + + INIT_LIST_HEAD(&klitirqds[i].worklist); + + raw_spin_lock_init(&klitirqds[i].lock); + } + + /* wait to flush the initializations to memory since other threads + will access it. */ + mb(); + + /* tell a work queue to launch the threads. we can't make scheduling + calls since we're in an atomic state. 
*/ + TRACE("%s: Setting callback up to launch klitirqds\n", __FUNCTION__); + delayed_launch = kmalloc(sizeof(struct klitirqd_launch_data), GFP_ATOMIC); + if(affinity) + { + delayed_launch->cpu_affinity = + kmalloc(sizeof(int)*NR_LITMUS_SOFTIRQD, GFP_ATOMIC); + + memcpy(delayed_launch->cpu_affinity, affinity, + sizeof(int)*NR_LITMUS_SOFTIRQD); + } + else + { + delayed_launch->cpu_affinity = NULL; + } + INIT_WORK(&delayed_launch->work, launch_klitirqd); + schedule_work(&delayed_launch->work); +} + + +void kill_klitirqd(void) +{ + if(!klitirqd_is_dead()) + { + int i; + + TRACE("%s: Killing %d klitirqds\n", __FUNCTION__, NR_LITMUS_SOFTIRQD); + + for(i = 0; i < NR_LITMUS_SOFTIRQD; ++i) + { + if(klitirqds[i].terminating != 1) + { + klitirqds[i].terminating = 1; + mb(); /* just to be sure? */ + flush_pending(klitirqds[i].klitirqd, NULL); + + /* signal termination */ + kthread_stop(klitirqds[i].klitirqd); + } + } + } +} + + +int klitirqd_is_ready(void) +{ + return(atomic_read(&num_ready_klitirqds) == NR_LITMUS_SOFTIRQD); +} + +int klitirqd_is_dead(void) +{ + return(atomic_read(&num_ready_klitirqds) == 0); +} + + +struct task_struct* get_klitirqd(unsigned int k_id) +{ + return(klitirqds[k_id].klitirqd); +} + + +void flush_pending(struct task_struct* klitirqd_thread, + struct task_struct* owner) +{ + unsigned int k_id = klitirqd_id(klitirqd_thread); + struct klitirqd_info *which = &klitirqds[k_id]; + + unsigned long flags; + struct tasklet_struct *list; + + u32 work_flushed = 0; + + raw_spin_lock_irqsave(&which->lock, flags); + + //__dump_state(which, "flush_pending: before"); + + // flush hi tasklets. + if(litirq_pending_hi_irqoff(which)) + { + which->pending &= ~LIT_TASKLET_HI; + + list = which->pending_tasklets_hi.head; + which->pending_tasklets_hi.head = NULL; + which->pending_tasklets_hi.tail = &which->pending_tasklets_hi.head; + + TRACE("%s: Handing HI tasklets back to Linux.\n", __FUNCTION__); + + while(list) + { + struct tasklet_struct *t = list; + list = list->next; + + if(likely((t->owner == owner) || (owner == NULL))) + { + if(unlikely(!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))) + { + BUG(); + } + + work_flushed |= LIT_TASKLET_HI; + + t->owner = NULL; + + // WTF? + if(!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + { + atomic_dec(&which->num_hi_pending); + ___tasklet_hi_schedule(t); + } + else + { + TRACE("%s: dropped hi tasklet??\n", __FUNCTION__); + BUG(); + } + } + else + { + TRACE("%s: Could not flush a HI tasklet.\n", __FUNCTION__); + // put back on queue. + ___litmus_tasklet_hi_schedule(t, which, 0); + } + } + } + + // flush low tasklets. 
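+	// (same procedure as the HI queue above: tasklets belonging to 'owner'
+	//  go back to Linux via ___tasklet_schedule(); the rest are requeued.)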
+ if(litirq_pending_low_irqoff(which)) + { + which->pending &= ~LIT_TASKLET_LOW; + + list = which->pending_tasklets.head; + which->pending_tasklets.head = NULL; + which->pending_tasklets.tail = &which->pending_tasklets.head; + + TRACE("%s: Handing LOW tasklets back to Linux.\n", __FUNCTION__); + + while(list) + { + struct tasklet_struct *t = list; + list = list->next; + + if(likely((t->owner == owner) || (owner == NULL))) + { + if(unlikely(!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))) + { + BUG(); + } + + work_flushed |= LIT_TASKLET_LOW; + + t->owner = NULL; + sched_trace_tasklet_end(owner, 1ul); + + if(!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + { + atomic_dec(&which->num_low_pending); + ___tasklet_schedule(t); + } + else + { + TRACE("%s: dropped tasklet??\n", __FUNCTION__); + BUG(); + } + } + else + { + TRACE("%s: Could not flush a LOW tasklet.\n", __FUNCTION__); + // put back on queue + ___litmus_tasklet_schedule(t, which, 0); + } + } + } + + // flush work objects + if(litirq_pending_work_irqoff(which)) + { + which->pending &= ~LIT_WORK; + + TRACE("%s: Handing work objects back to Linux.\n", __FUNCTION__); + + while(!list_empty(&which->worklist)) + { + struct work_struct* work = + list_first_entry(&which->worklist, struct work_struct, entry); + list_del_init(&work->entry); + + if(likely((work->owner == owner) || (owner == NULL))) + { + work_flushed |= LIT_WORK; + atomic_dec(&which->num_work_pending); + + work->owner = NULL; + sched_trace_work_end(owner, current, 1ul); + __schedule_work(work); + } + else + { + TRACE("%s: Could not flush a work object.\n", __FUNCTION__); + // put back on queue + ___litmus_schedule_work(work, which, 0); + } + } + } + + //__dump_state(which, "flush_pending: after (before reeval prio)"); + + + mb(); /* commit changes to pending flags */ + + /* reset the scheduling priority */ + if(work_flushed) + { + __reeval_prio(which); + + /* Try to offload flushed tasklets to Linux's ksoftirqd. */ + if(work_flushed & (LIT_TASKLET_LOW | LIT_TASKLET_HI)) + { + wakeup_softirqd(); + } + } + else + { + TRACE_CUR("%s: no work flushed, so __reeval_prio() skipped\n", __FUNCTION__); + } + + raw_spin_unlock_irqrestore(&which->lock, flags); +} + + + + +static void ___litmus_tasklet_schedule(struct tasklet_struct *t, + struct klitirqd_info *which, + int wakeup) +{ + unsigned long flags; + u32 old_pending; + + t->next = NULL; + + raw_spin_lock_irqsave(&which->lock, flags); + + //__dump_state(which, "___litmus_tasklet_schedule: before queuing"); + + *(which->pending_tasklets.tail) = t; + which->pending_tasklets.tail = &t->next; + + old_pending = which->pending; + which->pending |= LIT_TASKLET_LOW; + + atomic_inc(&which->num_low_pending); + + mb(); + + if(!old_pending && wakeup) + { + wakeup_litirqd_locked(which); /* wake up the klitirqd */ + } + + //__dump_state(which, "___litmus_tasklet_schedule: after queuing"); + + raw_spin_unlock_irqrestore(&which->lock, flags); +} + +int __litmus_tasklet_schedule(struct tasklet_struct *t, unsigned int k_id) +{ + int ret = 0; /* assume failure */ + if(unlikely((t->owner == NULL) || !is_realtime(t->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + BUG(); + } + + if(unlikely(k_id >= NR_LITMUS_SOFTIRQD)) + { + TRACE("%s: No klitirqd_th%d!\n", __FUNCTION__, k_id); + BUG(); + } + + if(likely(!klitirqds[k_id].terminating)) + { + /* Can't accept tasklets while we're processing a workqueue + because they're handled by the same thread. This case is + very RARE. 
+ + TODO: Use a separate thread for work objects!!!!!! + */ + if(likely(atomic_read(&klitirqds[k_id].num_work_pending) == 0)) + { + ret = 1; + ___litmus_tasklet_schedule(t, &klitirqds[k_id], 1); + } + else + { + TRACE("%s: rejected tasklet because of pending work.\n", + __FUNCTION__); + } + } + return(ret); +} + +EXPORT_SYMBOL(__litmus_tasklet_schedule); + + +static void ___litmus_tasklet_hi_schedule(struct tasklet_struct *t, + struct klitirqd_info *which, + int wakeup) +{ + unsigned long flags; + u32 old_pending; + + t->next = NULL; + + raw_spin_lock_irqsave(&which->lock, flags); + + *(which->pending_tasklets_hi.tail) = t; + which->pending_tasklets_hi.tail = &t->next; + + old_pending = which->pending; + which->pending |= LIT_TASKLET_HI; + + atomic_inc(&which->num_hi_pending); + + mb(); + + if(!old_pending && wakeup) + { + wakeup_litirqd_locked(which); /* wake up the klitirqd */ + } + + raw_spin_unlock_irqrestore(&which->lock, flags); +} + +int __litmus_tasklet_hi_schedule(struct tasklet_struct *t, unsigned int k_id) +{ + int ret = 0; /* assume failure */ + if(unlikely((t->owner == NULL) || !is_realtime(t->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + BUG(); + } + + if(unlikely(k_id >= NR_LITMUS_SOFTIRQD)) + { + TRACE("%s: No klitirqd_th%d!\n", __FUNCTION__, k_id); + BUG(); + } + + if(unlikely(!klitirqd_is_ready())) + { + TRACE("%s: klitirqd is not ready!\n", __FUNCTION__, k_id); + BUG(); + } + + if(likely(!klitirqds[k_id].terminating)) + { + if(likely(atomic_read(&klitirqds[k_id].num_work_pending) == 0)) + { + ret = 1; + ___litmus_tasklet_hi_schedule(t, &klitirqds[k_id], 1); + } + else + { + TRACE("%s: rejected tasklet because of pending work.\n", + __FUNCTION__); + } + } + return(ret); +} + +EXPORT_SYMBOL(__litmus_tasklet_hi_schedule); + + +int __litmus_tasklet_hi_schedule_first(struct tasklet_struct *t, unsigned int k_id) +{ + int ret = 0; /* assume failure */ + u32 old_pending; + + BUG_ON(!irqs_disabled()); + + if(unlikely((t->owner == NULL) || !is_realtime(t->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + BUG(); + } + + if(unlikely(k_id >= NR_LITMUS_SOFTIRQD)) + { + TRACE("%s: No klitirqd_th%u!\n", __FUNCTION__, k_id); + BUG(); + } + + if(unlikely(!klitirqd_is_ready())) + { + TRACE("%s: klitirqd is not ready!\n", __FUNCTION__, k_id); + BUG(); + } + + if(likely(!klitirqds[k_id].terminating)) + { + raw_spin_lock(&klitirqds[k_id].lock); + + if(likely(atomic_read(&klitirqds[k_id].num_work_pending) == 0)) + { + ret = 1; // success! 
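+			// insert at the head of the HI queue rather than appending
+			// at the tail.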
+ + t->next = klitirqds[k_id].pending_tasklets_hi.head; + klitirqds[k_id].pending_tasklets_hi.head = t; + + old_pending = klitirqds[k_id].pending; + klitirqds[k_id].pending |= LIT_TASKLET_HI; + + atomic_inc(&klitirqds[k_id].num_hi_pending); + + mb(); + + if(!old_pending) + wakeup_litirqd_locked(&klitirqds[k_id]); /* wake up the klitirqd */ + } + else + { + TRACE("%s: rejected tasklet because of pending work.\n", + __FUNCTION__); + } + + raw_spin_unlock(&klitirqds[k_id].lock); + } + return(ret); +} + +EXPORT_SYMBOL(__litmus_tasklet_hi_schedule_first); + + + +static void ___litmus_schedule_work(struct work_struct *w, + struct klitirqd_info *which, + int wakeup) +{ + unsigned long flags; + u32 old_pending; + + raw_spin_lock_irqsave(&which->lock, flags); + + work_pending(w); + list_add_tail(&w->entry, &which->worklist); + + old_pending = which->pending; + which->pending |= LIT_WORK; + + atomic_inc(&which->num_work_pending); + + mb(); + + if(!old_pending && wakeup) + { + wakeup_litirqd_locked(which); /* wakeup the klitirqd */ + } + + raw_spin_unlock_irqrestore(&which->lock, flags); +} + +int __litmus_schedule_work(struct work_struct *w, unsigned int k_id) +{ + int ret = 1; /* assume success */ + if(unlikely(w->owner == NULL) || !is_realtime(w->owner)) + { + TRACE("%s: No owner associated with this work object!\n", __FUNCTION__); + BUG(); + } + + if(unlikely(k_id >= NR_LITMUS_SOFTIRQD)) + { + TRACE("%s: No klitirqd_th%u!\n", k_id); + BUG(); + } + + if(unlikely(!klitirqd_is_ready())) + { + TRACE("%s: klitirqd is not ready!\n", __FUNCTION__, k_id); + BUG(); + } + + if(likely(!klitirqds[k_id].terminating)) + ___litmus_schedule_work(w, &klitirqds[k_id], 1); + else + ret = 0; + return(ret); +} +EXPORT_SYMBOL(__litmus_schedule_work); + + +static int set_klitirqd_sem_status(unsigned long stat) +{ + TRACE_CUR("SETTING STATUS FROM %d TO %d\n", + atomic_read(&tsk_rt(current)->klitirqd_sem_stat), + stat); + atomic_set(&tsk_rt(current)->klitirqd_sem_stat, stat); + //mb(); + + return(0); +} + +static int set_klitirqd_sem_status_if_not_held(unsigned long stat) +{ + if(atomic_read(&tsk_rt(current)->klitirqd_sem_stat) != HELD) + { + return(set_klitirqd_sem_status(stat)); + } + return(-1); +} + + +void __down_and_reset_and_set_stat(struct task_struct* t, + enum klitirqd_sem_status to_reset, + enum klitirqd_sem_status to_set, + struct mutex* sem) +{ +#if 0 + struct rt_param* param = container_of(sem, struct rt_param, klitirqd_sem); + struct task_struct* task = container_of(param, struct task_struct, rt_param); + + TRACE_CUR("%s: entered. Locking semaphore of %s/%d\n", + __FUNCTION__, task->comm, task->pid); +#endif + + mutex_lock_sfx(sem, + set_klitirqd_sem_status_if_not_held, to_reset, + set_klitirqd_sem_status, to_set); +#if 0 + TRACE_CUR("%s: exiting. Have semaphore of %s/%d\n", + __FUNCTION__, task->comm, task->pid); +#endif +} + +void down_and_set_stat(struct task_struct* t, + enum klitirqd_sem_status to_set, + struct mutex* sem) +{ +#if 0 + struct rt_param* param = container_of(sem, struct rt_param, klitirqd_sem); + struct task_struct* task = container_of(param, struct task_struct, rt_param); + + TRACE_CUR("%s: entered. Locking semaphore of %s/%d\n", + __FUNCTION__, task->comm, task->pid); +#endif + + mutex_lock_sfx(sem, + NULL, 0, + set_klitirqd_sem_status, to_set); + +#if 0 + TRACE_CUR("%s: exiting. 
Have semaphore of %s/%d\n", + __FUNCTION__, task->comm, task->pid); +#endif +} + + +void up_and_set_stat(struct task_struct* t, + enum klitirqd_sem_status to_set, + struct mutex* sem) +{ +#if 0 + struct rt_param* param = container_of(sem, struct rt_param, klitirqd_sem); + struct task_struct* task = container_of(param, struct task_struct, rt_param); + + TRACE_CUR("%s: entered. Unlocking semaphore of %s/%d\n", + __FUNCTION__, + task->comm, task->pid); +#endif + + mutex_unlock_sfx(sem, NULL, 0, + set_klitirqd_sem_status, to_set); + +#if 0 + TRACE_CUR("%s: exiting. Unlocked semaphore of %s/%d\n", + __FUNCTION__, + task->comm, task->pid); +#endif +} + + + +void release_klitirqd_lock(struct task_struct* t) +{ + if(is_realtime(t) && (atomic_read(&tsk_rt(t)->klitirqd_sem_stat) == HELD)) + { + struct mutex* sem; + struct task_struct* owner = t; + + if(t->state == TASK_RUNNING) + { + TRACE_TASK(t, "NOT giving up klitirqd_sem because we're not blocked!\n"); + return; + } + + if(likely(!tsk_rt(t)->is_proxy_thread)) + { + sem = &tsk_rt(t)->klitirqd_sem; + } + else + { + unsigned int k_id = klitirqd_id(t); + owner = klitirqds[k_id].current_owner; + + BUG_ON(t != klitirqds[k_id].klitirqd); + + if(likely(owner)) + { + sem = &tsk_rt(owner)->klitirqd_sem; + } + else + { + BUG(); + + // We had the rug pulled out from under us. Abort attempt + // to reacquire the lock since our client no longer needs us. + TRACE_CUR("HUH?! How did this happen?\n"); + atomic_set(&tsk_rt(t)->klitirqd_sem_stat, NOT_HELD); + return; + } + } + + //TRACE_CUR("Releasing semaphore of %s/%d...\n", owner->comm, owner->pid); + up_and_set_stat(t, NEED_TO_REACQUIRE, sem); + //TRACE_CUR("Semaphore of %s/%d released!\n", owner->comm, owner->pid); + } + /* + else if(is_realtime(t)) + { + TRACE_CUR("%s: Nothing to do. Stat = %d\n", __FUNCTION__, tsk_rt(t)->klitirqd_sem_stat); + } + */ +} + +int reacquire_klitirqd_lock(struct task_struct* t) +{ + int ret = 0; + + if(is_realtime(t) && (atomic_read(&tsk_rt(t)->klitirqd_sem_stat) == NEED_TO_REACQUIRE)) + { + struct mutex* sem; + struct task_struct* owner = t; + + if(likely(!tsk_rt(t)->is_proxy_thread)) + { + sem = &tsk_rt(t)->klitirqd_sem; + } + else + { + unsigned int k_id = klitirqd_id(t); + //struct task_struct* owner = klitirqds[k_id].current_owner; + owner = klitirqds[k_id].current_owner; + + BUG_ON(t != klitirqds[k_id].klitirqd); + + if(likely(owner)) + { + sem = &tsk_rt(owner)->klitirqd_sem; + } + else + { + // We had the rug pulled out from under us. Abort attempt + // to reacquire the lock since our client no longer needs us. + TRACE_CUR("No longer needs to reacquire klitirqd_sem!\n"); + atomic_set(&tsk_rt(t)->klitirqd_sem_stat, NOT_HELD); + return(0); + } + } + + //TRACE_CUR("Trying to reacquire semaphore of %s/%d\n", owner->comm, owner->pid); + __down_and_reset_and_set_stat(t, REACQUIRING, HELD, sem); + //TRACE_CUR("Reacquired semaphore %s/%d\n", owner->comm, owner->pid); + } + /* + else if(is_realtime(t)) + { + TRACE_CUR("%s: Nothing to do. 
Stat = %d\n", __FUNCTION__, tsk_rt(t)->klitirqd_sem_stat); + } + */ + + return(ret); +} + diff --git a/litmus/locking.c b/litmus/locking.c index 2693f1aca859..cfce98e7480d 100644 --- a/litmus/locking.c +++ b/litmus/locking.c @@ -121,7 +121,6 @@ struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq) return(t); } - #else struct fdso_ops generic_lock_ops = {}; diff --git a/litmus/nvidia_info.c b/litmus/nvidia_info.c new file mode 100644 index 000000000000..78f035244d21 --- /dev/null +++ b/litmus/nvidia_info.c @@ -0,0 +1,526 @@ +#include +#include +#include + +#include +#include +#include + +typedef unsigned char NvV8; /* "void": enumerated or multiple fields */ +typedef unsigned short NvV16; /* "void": enumerated or multiple fields */ +typedef unsigned char NvU8; /* 0 to 255 */ +typedef unsigned short NvU16; /* 0 to 65535 */ +typedef signed char NvS8; /* -128 to 127 */ +typedef signed short NvS16; /* -32768 to 32767 */ +typedef float NvF32; /* IEEE Single Precision (S1E8M23) */ +typedef double NvF64; /* IEEE Double Precision (S1E11M52) */ +typedef unsigned int NvV32; /* "void": enumerated or multiple fields */ +typedef unsigned int NvU32; /* 0 to 4294967295 */ +typedef unsigned long long NvU64; /* 0 to 18446744073709551615 */ +typedef union +{ + volatile NvV8 Reg008[1]; + volatile NvV16 Reg016[1]; + volatile NvV32 Reg032[1]; +} litmus_nv_hwreg_t, * litmus_nv_phwreg_t; + +typedef struct +{ + NvU64 address; + NvU64 size; + NvU32 offset; + NvU32 *map; + litmus_nv_phwreg_t map_u; +} litmus_nv_aperture_t; + +typedef struct +{ + void *priv; /* private data */ + void *os_state; /* os-specific device state */ + + int rmInitialized; + int flags; + + /* PCI config info */ + NvU32 domain; + NvU16 bus; + NvU16 slot; + NvU16 vendor_id; + NvU16 device_id; + NvU16 subsystem_id; + NvU32 gpu_id; + void *handle; + + NvU32 pci_cfg_space[16]; + + /* physical characteristics */ + litmus_nv_aperture_t bars[3]; + litmus_nv_aperture_t *regs; + litmus_nv_aperture_t *fb, ud; + litmus_nv_aperture_t agp; + + NvU32 interrupt_line; + + NvU32 agp_config; + NvU32 agp_status; + + NvU32 primary_vga; + + NvU32 sim_env; + + NvU32 rc_timer_enabled; + + /* list of events allocated for this device */ + void *event_list; + + void *kern_mappings; + +} litmus_nv_state_t; + +typedef struct work_struct litmus_nv_task_t; + +typedef struct litmus_nv_work_s { + litmus_nv_task_t task; + void *data; +} litmus_nv_work_t; + +typedef struct litmus_nv_linux_state_s { + litmus_nv_state_t nv_state; + atomic_t usage_count; + + struct pci_dev *dev; + void *agp_bridge; + void *alloc_queue; + + void *timer_sp; + void *isr_sp; + void *pci_cfgchk_sp; + void *isr_bh_sp; + +#ifdef CONFIG_CUDA_4_0 + char registry_keys[512]; +#endif + + /* keep track of any pending bottom halfes */ + struct tasklet_struct tasklet; + litmus_nv_work_t work; + + /* get a timer callback every second */ + struct timer_list rc_timer; + + /* lock for linux-specific data, not used by core rm */ + struct semaphore ldata_lock; + + /* lock for linux-specific alloc queue */ + struct semaphore at_lock; + +#if 0 +#if defined(NV_USER_MAP) + /* list of user mappings */ + struct nv_usermap_s *usermap_list; + + /* lock for VMware-specific mapping list */ + struct semaphore mt_lock; +#endif /* defined(NV_USER_MAP) */ +#if defined(NV_PM_SUPPORT_OLD_STYLE_APM) + void *apm_nv_dev; +#endif +#endif + + NvU32 device_num; + struct litmus_nv_linux_state_s *next; +} litmus_nv_linux_state_t; + +void dump_nvidia_info(const struct tasklet_struct *t) +{ + litmus_nv_state_t* nvstate = NULL; + 
litmus_nv_linux_state_t* linuxstate = NULL; + struct pci_dev* pci = NULL; + + nvstate = (litmus_nv_state_t*)(t->data); + + if(nvstate) + { + TRACE("NV State:\n" + "\ttasklet ptr = %p\n" + "\tstate ptr = %p\n" + "\tprivate data ptr = %p\n" + "\tos state ptr = %p\n" + "\tdomain = %u\n" + "\tbus = %u\n" + "\tslot = %u\n" + "\tvender_id = %u\n" + "\tdevice_id = %u\n" + "\tsubsystem_id = %u\n" + "\tgpu_id = %u\n" + "\tinterrupt_line = %u\n", + t, + nvstate, + nvstate->priv, + nvstate->os_state, + nvstate->domain, + nvstate->bus, + nvstate->slot, + nvstate->vendor_id, + nvstate->device_id, + nvstate->subsystem_id, + nvstate->gpu_id, + nvstate->interrupt_line); + + linuxstate = container_of(nvstate, litmus_nv_linux_state_t, nv_state); + } + else + { + TRACE("INVALID NVSTATE????\n"); + } + + if(linuxstate) + { + int ls_offset = (void*)(&(linuxstate->device_num)) - (void*)(linuxstate); + int ns_offset_raw = (void*)(&(linuxstate->device_num)) - (void*)(&(linuxstate->nv_state)); + int ns_offset_desired = (void*)(&(linuxstate->device_num)) - (void*)(nvstate); + + + TRACE("LINUX NV State:\n" + "\tlinux nv state ptr: %p\n" + "\taddress of tasklet: %p\n" + "\taddress of work: %p\n" + "\tusage_count: %d\n" + "\tdevice_num: %u\n" + "\ttasklet addr == this tasklet: %d\n" + "\tpci: %p\n", + linuxstate, + &(linuxstate->tasklet), + &(linuxstate->work), + atomic_read(&(linuxstate->usage_count)), + linuxstate->device_num, + (t == &(linuxstate->tasklet)), + linuxstate->dev); + + pci = linuxstate->dev; + + TRACE("Offsets:\n" + "\tOffset from LinuxState: %d, %x\n" + "\tOffset from NVState: %d, %x\n" + "\tOffset from parameter: %d, %x\n" + "\tdevice_num: %u\n", + ls_offset, ls_offset, + ns_offset_raw, ns_offset_raw, + ns_offset_desired, ns_offset_desired, + *((u32*)((void*)nvstate + ns_offset_desired))); + } + else + { + TRACE("INVALID LINUXNVSTATE?????\n"); + } + +#if 0 + if(pci) + { + TRACE("PCI DEV Info:\n" + "pci device ptr: %p\n" + "\tdevfn = %d\n" + "\tvendor = %d\n" + "\tdevice = %d\n" + "\tsubsystem_vendor = %d\n" + "\tsubsystem_device = %d\n" + "\tslot # = %d\n", + pci, + pci->devfn, + pci->vendor, + pci->device, + pci->subsystem_vendor, + pci->subsystem_device, + pci->slot->number); + } + else + { + TRACE("INVALID PCIDEV PTR?????\n"); + } +#endif +} + +static struct module* nvidia_mod = NULL; +int init_nvidia_info(void) +{ + mutex_lock(&module_mutex); + nvidia_mod = find_module("nvidia"); + mutex_unlock(&module_mutex); + if(nvidia_mod != NULL) + { + TRACE("%s : Found NVIDIA module. Core Code: %p to %p\n", __FUNCTION__, + (void*)(nvidia_mod->module_core), + (void*)(nvidia_mod->module_core) + nvidia_mod->core_size); + init_nv_device_reg(); + return(0); + } + else + { + TRACE("%s : Could not find NVIDIA module! Loaded?\n", __FUNCTION__); + return(-1); + } +} + + +/* works with pointers to static data inside the module too. */ +int is_nvidia_func(void* func_addr) +{ + int ret = 0; + if(nvidia_mod) + { + ret = within_module_core((long unsigned int)func_addr, nvidia_mod); + /* + if(ret) + { + TRACE("%s : %p is in NVIDIA module: %d\n", + __FUNCTION__, func_addr, ret); + }*/ + } + + return(ret); +} + +u32 get_tasklet_nv_device_num(const struct tasklet_struct *t) +{ + // life is too short to use hard-coded offsets. update this later. 
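+	// t->data is the driver's nv state; container_of() recovers the
+	// enclosing litmus_nv_linux_state_t so device_num can be read without
+	// a driver-version-specific offset.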
+ litmus_nv_state_t* nvstate = (litmus_nv_state_t*)(t->data); + litmus_nv_linux_state_t* linuxstate = container_of(nvstate, litmus_nv_linux_state_t, nv_state); + + BUG_ON(linuxstate->device_num >= NV_DEVICE_NUM); + + return(linuxstate->device_num); + + //int DEVICE_NUM_OFFSET = (void*)(&(linuxstate->device_num)) - (void*)(nvstate); + +#if 0 + // offset determined though observed behavior of the NV driver. + //const int DEVICE_NUM_OFFSET = 0x480; // CUDA 4.0 RC1 + //const int DEVICE_NUM_OFFSET = 0x510; // CUDA 4.0 RC2 + + void* state = (void*)(t->data); + void* device_num_ptr = state + DEVICE_NUM_OFFSET; + + //dump_nvidia_info(t); + return(*((u32*)device_num_ptr)); +#endif +} + +u32 get_work_nv_device_num(const struct work_struct *t) +{ + // offset determined though observed behavior of the NV driver. + const int DEVICE_NUM_OFFSET = sizeof(struct work_struct); + void* state = (void*)(t); + void** device_num_ptr = state + DEVICE_NUM_OFFSET; + return(*((u32*)(*device_num_ptr))); +} + + + +typedef struct { + raw_spinlock_t lock; + struct task_struct *device_owner; +}nv_device_registry_t; + +static nv_device_registry_t NV_DEVICE_REG[NV_DEVICE_NUM]; + +int init_nv_device_reg(void) +{ + int i; + + //memset(NV_DEVICE_REG, 0, sizeof(NV_DEVICE_REG)); + + for(i = 0; i < NV_DEVICE_NUM; ++i) + { + raw_spin_lock_init(&NV_DEVICE_REG[i].lock); + NV_DEVICE_REG[i].device_owner = NULL; + } + + return(1); +} + +/* use to get nv_device_id by given owner. + (if return -1, can't get the assocaite device id)*/ +/* +int get_nv_device_id(struct task_struct* owner) +{ + int i; + if(!owner) + { + return(-1); + } + for(i = 0; i < NV_DEVICE_NUM; ++i) + { + if(NV_DEVICE_REG[i].device_owner == owner) + return(i); + } + return(-1); +} +*/ + + + +static int __reg_nv_device(int reg_device_id) +{ + struct task_struct* old = + cmpxchg(&NV_DEVICE_REG[reg_device_id].device_owner, + NULL, + current); + + mb(); + + if(likely(old == NULL)) + { + down_and_set_stat(current, HELD, &tsk_rt(current)->klitirqd_sem); + TRACE_CUR("%s: device %d registered.\n", __FUNCTION__, reg_device_id); + return(0); + } + else + { + TRACE_CUR("%s: device %d is already in use!\n", __FUNCTION__, reg_device_id); + return(-EBUSY); + } + +#if 0 + //unsigned long flags; + //raw_spin_lock_irqsave(&NV_DEVICE_REG[reg_device_id].lock, flags); + //lock_nv_registry(reg_device_id, &flags); + + if(likely(NV_DEVICE_REG[reg_device_id].device_owner == NULL)) + { + NV_DEVICE_REG[reg_device_id].device_owner = current; + mb(); // needed? + + // release spin lock before chance of going to sleep. 
+ //raw_spin_unlock_irqrestore(&NV_DEVICE_REG[reg_device_id].lock, flags); + //unlock_nv_registry(reg_device_id, &flags); + + down_and_set_stat(current, HELD, &tsk_rt(current)->klitirqd_sem); + TRACE_CUR("%s: device %d registered.\n", __FUNCTION__, reg_device_id); + return(0); + } + else + { + //raw_spin_unlock_irqrestore(&NV_DEVICE_REG[reg_device_id].lock, flags); + //unlock_nv_registry(reg_device_id, &flags); + + TRACE_CUR("%s: device %d is already in use!\n", __FUNCTION__, reg_device_id); + return(-EBUSY); + } +#endif +} + +static int __clear_reg_nv_device(int de_reg_device_id) +{ + int ret; + unsigned long flags; + struct task_struct* klitirqd_th = get_klitirqd(de_reg_device_id); + struct task_struct* old; + + lock_nv_registry(de_reg_device_id, &flags); + + old = cmpxchg(&NV_DEVICE_REG[de_reg_device_id].device_owner, + current, + NULL); + + mb(); + + if(likely(old == current)) + { + flush_pending(klitirqd_th, current); + //unlock_nv_registry(de_reg_device_id, &flags); + + up_and_set_stat(current, NOT_HELD, &tsk_rt(current)->klitirqd_sem); + + unlock_nv_registry(de_reg_device_id, &flags); + ret = 0; + + TRACE_CUR("%s: semaphore released.\n",__FUNCTION__); + } + else + { + unlock_nv_registry(de_reg_device_id, &flags); + ret = -EINVAL; + + if(old) + TRACE_CUR("%s: device %d is not registered for this process's use! %s/%d is!\n", + __FUNCTION__, de_reg_device_id, old->comm, old->pid); + else + TRACE_CUR("%s: device %d is not registered for this process's use! No one is!\n", + __FUNCTION__, de_reg_device_id); + } + + return(ret); +} + + +int reg_nv_device(int reg_device_id, int reg_action) +{ + int ret; + + if((reg_device_id < NV_DEVICE_NUM) && (reg_device_id >= 0)) + { + if(reg_action) + ret = __reg_nv_device(reg_device_id); + else + ret = __clear_reg_nv_device(reg_device_id); + } + else + { + ret = -ENODEV; + } + + return(ret); +} + +/* use to get the owner of nv_device_id. */ +struct task_struct* get_nv_device_owner(u32 target_device_id) +{ + struct task_struct* owner; + BUG_ON(target_device_id >= NV_DEVICE_NUM); + owner = NV_DEVICE_REG[target_device_id].device_owner; + return(owner); +} + +void lock_nv_registry(u32 target_device_id, unsigned long* flags) +{ + BUG_ON(target_device_id >= NV_DEVICE_NUM); + + if(in_interrupt()) + TRACE("Locking registry for %d.\n", target_device_id); + else + TRACE_CUR("Locking registry for %d.\n", target_device_id); + + raw_spin_lock_irqsave(&NV_DEVICE_REG[target_device_id].lock, *flags); +} + +void unlock_nv_registry(u32 target_device_id, unsigned long* flags) +{ + BUG_ON(target_device_id >= NV_DEVICE_NUM); + + if(in_interrupt()) + TRACE("Unlocking registry for %d.\n", target_device_id); + else + TRACE_CUR("Unlocking registry for %d.\n", target_device_id); + + raw_spin_unlock_irqrestore(&NV_DEVICE_REG[target_device_id].lock, *flags); +} + + +void increment_nv_int_count(u32 device) +{ + unsigned long flags; + struct task_struct* owner; + + lock_nv_registry(device, &flags); + + owner = NV_DEVICE_REG[device].device_owner; + if(owner) + { + atomic_inc(&tsk_rt(owner)->nv_int_count); + } + + unlock_nv_registry(device, &flags); +} +EXPORT_SYMBOL(increment_nv_int_count); + + diff --git a/litmus/preempt.c b/litmus/preempt.c index ebe2e3461895..08b98c3b57bf 100644 --- a/litmus/preempt.c +++ b/litmus/preempt.c @@ -30,8 +30,11 @@ void sched_state_will_schedule(struct task_struct* tsk) /* Litmus tasks should never be subject to a remote * set_tsk_need_resched(). 
*/ BUG_ON(is_realtime(tsk)); + +/* TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n", __builtin_return_address(0)); +*/ } /* Called by the IPI handler after another CPU called smp_send_resched(). */ @@ -43,13 +46,17 @@ void sched_state_ipi(void) /* Cause scheduler to be invoked. * This will cause a transition to WILL_SCHEDULE. */ set_tsk_need_resched(current); + /* TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n", current->comm, current->pid); + */ } else { /* ignore */ + /* TRACE_STATE("ignoring IPI in state %x (%s)\n", get_sched_state(), sched_state_name(get_sched_state())); + */ } } diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c index 73fe1c442a0d..9b0a8d3b624d 100644 --- a/litmus/sched_cedf.c +++ b/litmus/sched_cedf.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -45,7 +46,18 @@ /* to configure the cluster size */ #include -#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#ifdef CONFIG_LITMUS_SOFTIRQD +#include +#endif + +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif /* Reference configuration variable. Determines which cache level is used to * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that @@ -95,7 +107,7 @@ typedef struct clusterdomain { struct bheap_node *heap_node; struct bheap cpu_heap; /* lock for this cluster */ -#define lock domain.ready_lock +#define cedf_lock domain.ready_lock } cedf_domain_t; /* a cedf_domain per cluster; allocation is done at init/activation time */ @@ -257,21 +269,50 @@ static noinline void requeue(struct task_struct* task) } } +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* cedf_get_nearest_available_cpu( + cedf_domain_t *cluster, cpu_entry_t* start) +{ + cpu_entry_t* affinity; + + get_nearest_available_cpu(affinity, start, cedf_cpu_entries, -1); + + /* make sure CPU is in our cluster */ + if(affinity && cpu_isset(affinity->cpu, *cluster->cpu_map)) + return(affinity); + else + return(NULL); +} +#endif + + /* check for any necessary preemptions */ static void check_for_preemptions(cedf_domain_t *cluster) { struct task_struct *task; - cpu_entry_t* last; + cpu_entry_t *last; for(last = lowest_prio_cpu(cluster); edf_preemption_needed(&cluster->domain, last->linked); last = lowest_prio_cpu(cluster)) { /* preemption necessary */ task = __take_ready(&cluster->domain); - TRACE("check_for_preemptions: attempting to link task %d to %d\n", - task->pid, last->cpu); +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = + cedf_get_nearest_available_cpu(cluster, + &per_cpu(cedf_cpu_entries, task_cpu(task))); + if(affinity) + last = affinity; + else if(last->linked) + requeue(last->linked); + } +#else if (last->linked) requeue(last->linked); +#endif + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); link_task_to_cpu(task, last); preempt(last); } @@ -292,12 +333,12 @@ static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain); unsigned long flags; - raw_spin_lock_irqsave(&cluster->lock, flags); + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); __merge_ready(&cluster->domain, tasks); check_for_preemptions(cluster); - raw_spin_unlock_irqrestore(&cluster->lock, flags); + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); } /* caller holds cedf_lock */ @@ -307,6 +348,10 @@ static noinline void job_completion(struct task_struct *t, int forced) sched_trace_task_completion(t, forced); +#ifdef CONFIG_LITMUS_NVIDIA + atomic_set(&tsk_rt(t)->nv_int_count, 0); +#endif + 
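+	/* nv_int_count counts GPU interrupts handled on this task's behalf;
+	 * it is reset above so that each job starts at zero. */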
TRACE_TASK(t, "job_completion().\n"); /* set flags */ @@ -378,7 +423,7 @@ static struct task_struct* cedf_schedule(struct task_struct * prev) int out_of_time, sleep, preempt, np, exists, blocks; struct task_struct* next = NULL; - raw_spin_lock(&cluster->lock); + raw_spin_lock(&cluster->cedf_lock); clear_will_schedule(); /* sanity checking */ @@ -462,7 +507,7 @@ static struct task_struct* cedf_schedule(struct task_struct * prev) next = prev; sched_state_task_picked(); - raw_spin_unlock(&cluster->lock); + raw_spin_unlock(&cluster->cedf_lock); #ifdef WANT_ALL_SCHED_EVENTS TRACE("cedf_lock released, next=0x%p\n", next); @@ -504,7 +549,7 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running) /* the cluster doesn't change even if t is running */ cluster = task_cpu_cluster(t); - raw_spin_lock_irqsave(&cluster->domain.ready_lock, flags); + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); /* setup job params */ release_at(t, litmus_clock()); @@ -521,20 +566,22 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running) t->rt_param.linked_on = NO_CPU; cedf_job_arrival(t); - raw_spin_unlock_irqrestore(&(cluster->domain.ready_lock), flags); + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); } static void cedf_task_wake_up(struct task_struct *task) { unsigned long flags; - lt_t now; + //lt_t now; cedf_domain_t *cluster; TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); cluster = task_cpu_cluster(task); - raw_spin_lock_irqsave(&cluster->lock, flags); + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); + +#if 0 // sporadic task model /* We need to take suspensions because of semaphores into * account! If a job resumes after being suspended due to acquiring * a semaphore, it should never be treated as a new job release. @@ -556,8 +603,17 @@ static void cedf_task_wake_up(struct task_struct *task) } } } - cedf_job_arrival(task); - raw_spin_unlock_irqrestore(&cluster->lock, flags); +#endif + + //BUG_ON(tsk_rt(task)->linked_on != NO_CPU); + set_rt_flags(task, RT_F_RUNNING); // periodic model + + if(tsk_rt(task)->linked_on == NO_CPU) + cedf_job_arrival(task); + else + TRACE("WTF, mate?!\n"); + + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); } static void cedf_task_block(struct task_struct *t) @@ -570,9 +626,9 @@ static void cedf_task_block(struct task_struct *t) cluster = task_cpu_cluster(t); /* unlink if necessary */ - raw_spin_lock_irqsave(&cluster->lock, flags); + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); unlink(t); - raw_spin_unlock_irqrestore(&cluster->lock, flags); + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); BUG_ON(!is_realtime(t)); } @@ -584,7 +640,7 @@ static void cedf_task_exit(struct task_struct * t) cedf_domain_t *cluster = task_cpu_cluster(t); /* unlink if necessary */ - raw_spin_lock_irqsave(&cluster->lock, flags); + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); unlink(t); if (tsk_rt(t)->scheduled_on != NO_CPU) { cpu_entry_t *cpu; @@ -592,7 +648,7 @@ static void cedf_task_exit(struct task_struct * t) cpu->scheduled = NULL; tsk_rt(t)->scheduled_on = NO_CPU; } - raw_spin_unlock_irqrestore(&cluster->lock, flags); + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); @@ -603,6 +659,721 @@ static long cedf_admit_task(struct task_struct* tsk) return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 
0 : -EINVAL; } + + + + + + + + + + + + +#ifdef CONFIG_LITMUS_LOCKING + +#include + + +static void __set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + cedf_domain_t* cluster = task_cpu_cluster(t); + + if(prio_inh != NULL) + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + else + TRACE_TASK(t, "inherits priority from %p\n", prio_inh); + + sched_trace_eff_prio_change(t, prio_inh); + + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(cedf_cpu_entries, linked_on).hn); + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(cedf_cpu_entries, linked_on).hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&cluster->domain.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", __FUNCTION__); + + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = !bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node); + + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", __FUNCTION__); + } + raw_spin_unlock(&cluster->domain.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. */ + bheap_uncache_min(edf_ready_order, &cluster->domain.ready_queue); + check_for_preemptions(cluster); + } + } +} + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + cedf_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->cedf_lock); + + __set_priority_inheritance(t, prio_inh); + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inherits a new priority!\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, prio_inh); + } +#endif + + raw_spin_unlock(&cluster->cedf_lock); +} + + +/* called with IRQs off */ +static void __clear_priority_inheritance(struct task_struct* t) +{ + TRACE_TASK(t, "priority restored\n"); + + if(tsk_rt(t)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(t, NULL); + + tsk_rt(t)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. 
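+	 * Instead, unlink the task and re-run cedf_job_arrival() below, which
+	 * requeues it under its restored priority and re-checks for
+	 * preemptions.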
*/ + unlink(t); + cedf_job_arrival(t); + } + else + { + __set_priority_inheritance(t, NULL); + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inheritance set back to owner.\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + if(tsk_rt(tsk_rt(t)->cur_klitirqd)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(tsk_rt(t)->cur_klitirqd, t); + + tsk_rt(tsk_rt(t)->cur_klitirqd)->inh_task = t; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(tsk_rt(t)->cur_klitirqd); + cedf_job_arrival(tsk_rt(t)->cur_klitirqd); + } + else + { + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, t); + } + } +#endif +} + +/* called with IRQs off */ +static void clear_priority_inheritance(struct task_struct* t) +{ + cedf_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->cedf_lock); + __clear_priority_inheritance(t); + raw_spin_unlock(&cluster->cedf_lock); +} + + + +#ifdef CONFIG_LITMUS_SOFTIRQD +/* called with IRQs off */ +static void set_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner, + struct task_struct* new_owner) +{ + cedf_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->cedf_lock); + + if(old_owner != new_owner) + { + if(old_owner) + { + // unreachable? + tsk_rt(old_owner)->cur_klitirqd = NULL; + } + + TRACE_TASK(klitirqd, "giving ownership to %s/%d.\n", + new_owner->comm, new_owner->pid); + + tsk_rt(new_owner)->cur_klitirqd = klitirqd; + } + + __set_priority_inheritance(klitirqd, + (tsk_rt(new_owner)->inh_task == NULL) ? + new_owner : + tsk_rt(new_owner)->inh_task); + + raw_spin_unlock(&cluster->cedf_lock); +} + +/* called with IRQs off */ +static void clear_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner) +{ + cedf_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->cedf_lock); + + TRACE_TASK(klitirqd, "priority restored\n"); + + if(tsk_rt(klitirqd)->scheduled_on != NO_CPU) + { + tsk_rt(klitirqd)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. 
*/ + unlink(klitirqd); + cedf_job_arrival(klitirqd); + } + else + { + __set_priority_inheritance(klitirqd, NULL); + } + + tsk_rt(old_owner)->cur_klitirqd = NULL; + + raw_spin_unlock(&cluster->cedf_lock); +} +#endif // CONFIG_LITMUS_SOFTIRQD + + +/* ******************** KFMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct kfmlp_queue +{ + wait_queue_head_t wait; + struct task_struct* owner; + struct task_struct* hp_waiter; + int count; /* number of waiters + holder */ +}; + +struct kfmlp_semaphore +{ + struct litmus_lock litmus_lock; + + spinlock_t lock; + + int num_resources; /* aka k */ + struct kfmlp_queue *queues; /* array */ + struct kfmlp_queue *shortest_queue; /* pointer to shortest queue */ +}; + +static inline struct kfmlp_semaphore* kfmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct kfmlp_semaphore, litmus_lock); +} + +static inline int kfmlp_get_idx(struct kfmlp_semaphore* sem, + struct kfmlp_queue* queue) +{ + return (queue - &sem->queues[0]); +} + +static inline struct kfmlp_queue* kfmlp_get_queue(struct kfmlp_semaphore* sem, + struct task_struct* holder) +{ + int i; + for(i = 0; i < sem->num_resources; ++i) + if(sem->queues[i].owner == holder) + return(&sem->queues[i]); + return(NULL); +} + +/* caller is responsible for locking */ +static struct task_struct* kfmlp_find_hp_waiter(struct kfmlp_queue *kqueue, + struct task_struct *skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &kqueue->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + return found; +} + +static inline struct kfmlp_queue* kfmlp_find_shortest( + struct kfmlp_semaphore* sem, + struct kfmlp_queue* search_start) +{ + // we start our search at search_start instead of at the beginning of the + // queue list to load-balance across all resources. + struct kfmlp_queue* step = search_start; + struct kfmlp_queue* shortest = sem->shortest_queue; + + do + { + step = (step+1 != &sem->queues[sem->num_resources]) ? 
+ step+1 : &sem->queues[0]; + if(step->count < shortest->count) + { + shortest = step; + if(step->count == 0) + break; /* can't get any shorter */ + } + }while(step != search_start); + + return(shortest); +} + +static struct task_struct* kfmlp_remove_hp_waiter(struct kfmlp_semaphore* sem) +{ + /* must hold sem->lock */ + + struct kfmlp_queue *my_queue = NULL; + struct task_struct *max_hp = NULL; + + + struct list_head *pos; + struct task_struct *queued; + int i; + + for(i = 0; i < sem->num_resources; ++i) + { + if( (sem->queues[i].count > 1) && + ((my_queue == NULL) || + (edf_higher_prio(sem->queues[i].hp_waiter, my_queue->hp_waiter))) ) + { + my_queue = &sem->queues[i]; + } + } + + if(my_queue) + { + cedf_domain_t* cluster; + + max_hp = my_queue->hp_waiter; + BUG_ON(!max_hp); + + TRACE_CUR("queue %d: stealing %s/%d from queue %d\n", + kfmlp_get_idx(sem, my_queue), + max_hp->comm, max_hp->pid, + kfmlp_get_idx(sem, my_queue)); + + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, max_hp); + + /* + if(my_queue->hp_waiter) + TRACE_CUR("queue %d: new hp_waiter is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->hp_waiter->comm, + my_queue->hp_waiter->pid); + else + TRACE_CUR("queue %d: new hp_waiter is %p\n", + kfmlp_get_idx(sem, my_queue), NULL); + */ + + cluster = task_cpu_cluster(max_hp); + + raw_spin_lock(&cluster->cedf_lock); + + /* + if(my_queue->owner) + TRACE_CUR("queue %d: owner is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->owner->comm, + my_queue->owner->pid); + else + TRACE_CUR("queue %d: owner is %p\n", + kfmlp_get_idx(sem, my_queue), + NULL); + */ + + if(tsk_rt(my_queue->owner)->inh_task == max_hp) + { + __clear_priority_inheritance(my_queue->owner); + if(my_queue->hp_waiter != NULL) + { + __set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + raw_spin_unlock(&cluster->cedf_lock); + + list_for_each(pos, &my_queue->wait.task_list) + { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + /* Compare task prios, find high prio task. */ + if (queued == max_hp) + { + /* + TRACE_CUR("queue %d: found entry in wait queue. REMOVING!\n", + kfmlp_get_idx(sem, my_queue)); + */ + __remove_wait_queue(&my_queue->wait, + list_entry(pos, wait_queue_t, task_list)); + break; + } + } + --(my_queue->count); + } + + return(max_hp); +} + +int cedf_kfmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue* my_queue; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = sem->shortest_queue; + + if (my_queue->owner) { + /* resource is not free => must suspend and wait */ + TRACE_CUR("queue %d: Resource is not free => must suspend and wait.\n", + kfmlp_get_idx(sem, my_queue)); + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&my_queue->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, my_queue->hp_waiter)) + { + my_queue->hp_waiter = t; + if (edf_higher_prio(t, my_queue->owner)) + { + set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->lock, flags); + + /* We depend on the FIFO order. 
Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release (or steal). + */ + schedule(); + + + if(my_queue->owner == t) + { + TRACE_CUR("queue %d: acquired through waiting\n", + kfmlp_get_idx(sem, my_queue)); + } + else + { + /* this case may happen if our wait entry was stolen + between queues. record where we went.*/ + my_queue = kfmlp_get_queue(sem, t); + BUG_ON(!my_queue); + TRACE_CUR("queue %d: acquired through stealing\n", + kfmlp_get_idx(sem, my_queue)); + } + } + else + { + TRACE_CUR("queue %d: acquired immediately\n", + kfmlp_get_idx(sem, my_queue)); + + my_queue->owner = t; + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + spin_unlock_irqrestore(&sem->lock, flags); + } + + return kfmlp_get_idx(sem, my_queue); +} + +int cedf_kfmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + + if (!my_queue) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&my_queue->wait); + if (next) { + /* + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + /* next becomes the resouce holder */ + my_queue->owner = next; + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + + TRACE_CUR("queue %d: lock ownership passed to %s/%d\n", + kfmlp_get_idx(sem, my_queue), next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == my_queue->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, next); + if (my_queue->hp_waiter) + TRACE_TASK(my_queue->hp_waiter, "queue %d: is new highest-prio waiter\n", kfmlp_get_idx(sem, my_queue)); + else + TRACE("queue %d: no further waiters\n", kfmlp_get_idx(sem, my_queue)); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. 
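+			 * With CONFIG_LITMUS_SOFTIRQD, set_priority_inheritance()
+			 * below also forwards the inherited priority to any klitirqd
+			 * thread currently working on next's behalf.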
*/ + set_priority_inheritance(next, my_queue->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: looking to steal someone...\n", kfmlp_get_idx(sem, my_queue)); + + next = kfmlp_remove_hp_waiter(sem); /* returns NULL if nothing to steal */ + + /* + if(next) + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - steal\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + + my_queue->owner = next; + + if(next) + { + TRACE_CUR("queue %d: lock ownership passed to %s/%d (which was stolen)\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: no one to steal.\n", kfmlp_get_idx(sem, my_queue)); + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + } + } + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task) + clear_priority_inheritance(t); + +out: + spin_unlock_irqrestore(&sem->lock, flags); + + return err; +} + +int cedf_kfmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + owner = (my_queue) ? (my_queue->owner == t) : 0; + + spin_unlock_irqrestore(&sem->lock, flags); + + if (owner) + cedf_kfmlp_unlock(l); + + return 0; +} + +void cedf_kfmlp_free(struct litmus_lock* l) +{ + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + kfree(sem->queues); + kfree(sem); +} + +static struct litmus_lock_ops cedf_kfmlp_lock_ops = { + .close = cedf_kfmlp_close, + .lock = cedf_kfmlp_lock, + .unlock = cedf_kfmlp_unlock, + .deallocate = cedf_kfmlp_free, +}; + +static struct litmus_lock* cedf_new_kfmlp(void* __user arg, int* ret_code) +{ + struct kfmlp_semaphore* sem; + int num_resources = 0; + int i; + + if(!access_ok(VERIFY_READ, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(__copy_from_user(&num_resources, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(num_resources < 1) + { + *ret_code = -EINVAL; + return(NULL); + } + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if(!sem) + { + *ret_code = -ENOMEM; + return NULL; + } + + sem->queues = kmalloc(sizeof(struct kfmlp_queue)*num_resources, GFP_KERNEL); + if(!sem->queues) + { + kfree(sem); + *ret_code = -ENOMEM; + return NULL; + } + + sem->litmus_lock.ops = &cedf_kfmlp_lock_ops; + spin_lock_init(&sem->lock); + sem->num_resources = num_resources; + + for(i = 0; i < num_resources; ++i) + { + sem->queues[i].owner = NULL; + sem->queues[i].hp_waiter = NULL; + init_waitqueue_head(&sem->queues[i].wait); + sem->queues[i].count = 0; + } + + sem->shortest_queue = &sem->queues[0]; + + *ret_code = 0; + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + +static long cedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user arg) +{ + int err = -ENXIO; + + /* C-EDF currently only supports the FMLP for global resources + WITHIN a given cluster. DO NOT USE CROSS-CLUSTER! 
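+	 * Only KFMLP_SEM is recognized below; any other lock type leaves err
+	 * at its initial -ENXIO, which is returned to the caller.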
*/ + switch (type) { + case KFMLP_SEM: + *lock = cedf_new_kfmlp(arg, &err); + break; + }; + + return err; +} + +#endif // CONFIG_LITMUS_LOCKING + + + + + + /* total number of cluster */ static int num_clusters; /* we do not support cluster of different sizes */ @@ -746,6 +1517,40 @@ static long cedf_activate_plugin(void) break; } } + +#ifdef CONFIG_LITMUS_SOFTIRQD + { + /* distribute the daemons evenly across the clusters. */ + int* affinity = kmalloc(NR_LITMUS_SOFTIRQD * sizeof(int), GFP_ATOMIC); + int num_daemons_per_cluster = NR_LITMUS_SOFTIRQD / num_clusters; + int left_over = NR_LITMUS_SOFTIRQD % num_clusters; + + int daemon = 0; + for(i = 0; i < num_clusters; ++i) + { + int num_on_this_cluster = num_daemons_per_cluster; + if(left_over) + { + ++num_on_this_cluster; + --left_over; + } + + for(j = 0; j < num_on_this_cluster; ++j) + { + // first CPU of this cluster + affinity[daemon++] = i*cluster_size; + } + } + + spawn_klitirqd(affinity); + + kfree(affinity); + } +#endif + +#ifdef CONFIG_LITMUS_NVIDIA + init_nvidia_info(); +#endif free_cpumask_var(mask); clusters_allocated = 1; @@ -765,6 +1570,15 @@ static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = { .task_block = cedf_task_block, .admit_task = cedf_admit_task, .activate_plugin = cedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = cedf_allocate_lock, + .set_prio_inh = set_priority_inheritance, + .clear_prio_inh = clear_priority_inheritance, +#endif +#ifdef CONFIG_LITMUS_SOFTIRQD + .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, + .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, +#endif }; static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL; diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c index 3092797480f8..d04e0703c154 100644 --- a/litmus/sched_gsn_edf.c +++ b/litmus/sched_gsn_edf.c @@ -12,6 +12,8 @@ #include #include #include +#include + #include #include @@ -25,6 +27,19 @@ #include +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#ifdef CONFIG_LITMUS_SOFTIRQD +#include +#endif + +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif + + /* Overview of GSN-EDF operations. * * For a detailed explanation of GSN-EDF have a look at the FMLP paper. 
This @@ -253,21 +268,52 @@ static noinline void requeue(struct task_struct* task) } } +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t* start) +{ + cpu_entry_t* affinity; + + get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries, +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master +#else + -1 +#endif + ); + + return(affinity); +} +#endif + /* check for any necessary preemptions */ static void check_for_preemptions(void) { struct task_struct *task; - cpu_entry_t* last; + cpu_entry_t *last; for(last = lowest_prio_cpu(); edf_preemption_needed(&gsnedf, last->linked); last = lowest_prio_cpu()) { /* preemption necessary */ task = __take_ready(&gsnedf); - TRACE("check_for_preemptions: attempting to link task %d to %d\n", - task->pid, last->cpu); + +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = gsnedf_get_nearest_available_cpu( + &per_cpu(gsnedf_cpu_entries, task_cpu(task))); + if(affinity) + last = affinity; + else if(last->linked) + requeue(last->linked); + } +#else if (last->linked) requeue(last->linked); +#endif + + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + link_task_to_cpu(task, last); preempt(last); } @@ -277,7 +323,7 @@ static void check_for_preemptions(void) static noinline void gsnedf_job_arrival(struct task_struct* task) { BUG_ON(!task); - + requeue(task); check_for_preemptions(); } @@ -298,9 +344,13 @@ static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) static noinline void job_completion(struct task_struct *t, int forced) { BUG_ON(!t); - + sched_trace_task_completion(t, forced); +#ifdef CONFIG_LITMUS_NVIDIA + atomic_set(&tsk_rt(t)->nv_int_count, 0); +#endif + TRACE_TASK(t, "job_completion().\n"); /* set flags */ @@ -401,17 +451,19 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev) TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); #endif + /* if (exists) TRACE_TASK(prev, "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " "state:%d sig:%d\n", blocks, out_of_time, np, sleep, preempt, prev->state, signal_pending(prev)); + */ + if (entry->linked && preempt) TRACE_TASK(prev, "will be preempted by %s/%d\n", entry->linked->comm, entry->linked->pid); - /* If a task blocks we have no choice but to reschedule. */ if (blocks) @@ -456,12 +508,15 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev) entry->scheduled->rt_param.scheduled_on = NO_CPU; TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); } - } else + } + else + { /* Only override Linux scheduler if we have a real-time task * scheduled that needs to continue. */ if (exists) next = prev; + } sched_state_task_picked(); @@ -486,8 +541,9 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev) static void gsnedf_finish_switch(struct task_struct *prev) { cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); - + entry->scheduled = is_realtime(current) ? current : NULL; + #ifdef WANT_ALL_SCHED_EVENTS TRACE_TASK(prev, "switched away from\n"); #endif @@ -536,11 +592,14 @@ static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) static void gsnedf_task_wake_up(struct task_struct *task) { unsigned long flags; - lt_t now; - + lt_t now; + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); raw_spin_lock_irqsave(&gsnedf_lock, flags); + + +#if 0 // sporadic task model /* We need to take suspensions because of semaphores into * account! 
If a job resumes after being suspended due to acquiring * a semaphore, it should never be treated as a new job release. @@ -562,19 +621,26 @@ static void gsnedf_task_wake_up(struct task_struct *task) } } } +#else // periodic task model + set_rt_flags(task, RT_F_RUNNING); +#endif + gsnedf_job_arrival(task); raw_spin_unlock_irqrestore(&gsnedf_lock, flags); } static void gsnedf_task_block(struct task_struct *t) { + // TODO: is this called on preemption?? unsigned long flags; TRACE_TASK(t, "block at %llu\n", litmus_clock()); /* unlink if necessary */ raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); BUG_ON(!is_realtime(t)); @@ -608,51 +674,53 @@ static long gsnedf_admit_task(struct task_struct* tsk) #include -/* called with IRQs off */ -static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) + +static void __set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) { int linked_on; - int check_preempt = 0; - - raw_spin_lock(&gsnedf_lock); - - TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + int check_preempt = 0; + + if(prio_inh != NULL) + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + else + TRACE_TASK(t, "inherits priority from %p\n", prio_inh); + + sched_trace_eff_prio_change(t, prio_inh); + tsk_rt(t)->inh_task = prio_inh; - + linked_on = tsk_rt(t)->linked_on; - + /* If it is scheduled, then we need to reorder the CPU heap. */ if (linked_on != NO_CPU) { TRACE_TASK(t, "%s: linked on %d\n", - __FUNCTION__, linked_on); + __FUNCTION__, linked_on); /* Holder is scheduled; need to re-order CPUs. * We can't use heap_decrease() here since * the cpu_heap is ordered in reverse direction, so * it is actually an increase. */ bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, - gsnedf_cpus[linked_on]->hn); + gsnedf_cpus[linked_on]->hn); bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, - gsnedf_cpus[linked_on]->hn); + gsnedf_cpus[linked_on]->hn); } else { /* holder may be queued: first stop queue changes */ raw_spin_lock(&gsnedf.release_lock); if (is_queued(t)) { - TRACE_TASK(t, "%s: is queued\n", - __FUNCTION__); + TRACE_TASK(t, "%s: is queued\n", __FUNCTION__); + /* We need to update the position of holder in some * heap. Note that this could be a release heap if we * budget enforcement is used and this job overran. */ - check_preempt = - !bheap_decrease(edf_ready_order, - tsk_rt(t)->heap_node); + check_preempt = !bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node); + } else { /* Nothing to do: if it is not queued and not linked * then it is either sleeping or currently being moved * by other code (e.g., a timer interrupt handler) that * will use the correct priority when enqueuing the * task. */ - TRACE_TASK(t, "%s: is NOT queued => Done.\n", - __FUNCTION__); + TRACE_TASK(t, "%s: is NOT queued => Done.\n", __FUNCTION__); } raw_spin_unlock(&gsnedf.release_lock); @@ -666,34 +734,148 @@ static void set_priority_inheritance(struct task_struct* t, struct task_struct* /* heap_decrease() hit the top level of the heap: make * sure preemption checks get the right task, not the * potentially stale cache. 
*/ - bheap_uncache_min(edf_ready_order, - &gsnedf.ready_queue); + bheap_uncache_min(edf_ready_order, &gsnedf.ready_queue); check_for_preemptions(); } } +} +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + raw_spin_lock(&gsnedf_lock); + + __set_priority_inheritance(t, prio_inh); + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inherits a new priority!\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, prio_inh); + } +#endif + raw_spin_unlock(&gsnedf_lock); } + +/* called with IRQs off */ +static void __clear_priority_inheritance(struct task_struct* t) +{ + TRACE_TASK(t, "priority restored\n"); + + if(tsk_rt(t)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(t, NULL); + + tsk_rt(t)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + gsnedf_job_arrival(t); + } + else + { + __set_priority_inheritance(t, NULL); + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inheritance set back to owner.\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + if(tsk_rt(tsk_rt(t)->cur_klitirqd)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(tsk_rt(t)->cur_klitirqd, t); + + tsk_rt(tsk_rt(t)->cur_klitirqd)->inh_task = t; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(tsk_rt(t)->cur_klitirqd); + gsnedf_job_arrival(tsk_rt(t)->cur_klitirqd); + } + else + { + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, t); + } + } +#endif +} + /* called with IRQs off */ static void clear_priority_inheritance(struct task_struct* t) { raw_spin_lock(&gsnedf_lock); + __clear_priority_inheritance(t); + raw_spin_unlock(&gsnedf_lock); +} - /* A job only stops inheriting a priority when it releases a - * resource. Thus we can make the following assumption.*/ - BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU); - - TRACE_TASK(t, "priority restored\n"); - tsk_rt(t)->inh_task = NULL; +#ifdef CONFIG_LITMUS_SOFTIRQD +/* called with IRQs off */ +static void set_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner, + struct task_struct* new_owner) +{ + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&gsnedf_lock); + + if(old_owner != new_owner) + { + if(old_owner) + { + // unreachable? + tsk_rt(old_owner)->cur_klitirqd = NULL; + } + + TRACE_TASK(klitirqd, "giving ownership to %s/%d.\n", + new_owner->comm, new_owner->pid); - /* Check if rescheduling is necessary. We can't use heap_decrease() - * since the priority was effectively lowered. */ - unlink(t); - gsnedf_job_arrival(t); + tsk_rt(new_owner)->cur_klitirqd = klitirqd; + } + + __set_priority_inheritance(klitirqd, + (tsk_rt(new_owner)->inh_task == NULL) ? + new_owner : + tsk_rt(new_owner)->inh_task); + + raw_spin_unlock(&gsnedf_lock); +} +/* called with IRQs off */ +static void clear_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner) +{ + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&gsnedf_lock); + + TRACE_TASK(klitirqd, "priority restored\n"); + + if(tsk_rt(klitirqd)->scheduled_on != NO_CPU) + { + tsk_rt(klitirqd)->inh_task = NULL; + + /* Check if rescheduling is necessary. 
We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(klitirqd); + gsnedf_job_arrival(klitirqd); + } + else + { + __set_priority_inheritance(klitirqd, NULL); + } + + tsk_rt(old_owner)->cur_klitirqd = NULL; + raw_spin_unlock(&gsnedf_lock); } +#endif /* ******************** FMLP support ********************** */ @@ -892,11 +1074,477 @@ static struct litmus_lock* gsnedf_new_fmlp(void) return &sem->litmus_lock; } + + + + + + +/* ******************** KFMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct kfmlp_queue +{ + wait_queue_head_t wait; + struct task_struct* owner; + struct task_struct* hp_waiter; + int count; /* number of waiters + holder */ +}; + +struct kfmlp_semaphore +{ + struct litmus_lock litmus_lock; + + spinlock_t lock; + + int num_resources; /* aka k */ + + struct kfmlp_queue *queues; /* array */ + struct kfmlp_queue *shortest_queue; /* pointer to shortest queue */ +}; + +static inline struct kfmlp_semaphore* kfmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct kfmlp_semaphore, litmus_lock); +} + +static inline int kfmlp_get_idx(struct kfmlp_semaphore* sem, + struct kfmlp_queue* queue) +{ + return (queue - &sem->queues[0]); +} + +static inline struct kfmlp_queue* kfmlp_get_queue(struct kfmlp_semaphore* sem, + struct task_struct* holder) +{ + int i; + for(i = 0; i < sem->num_resources; ++i) + if(sem->queues[i].owner == holder) + return(&sem->queues[i]); + return(NULL); +} + +/* caller is responsible for locking */ +static struct task_struct* kfmlp_find_hp_waiter(struct kfmlp_queue *kqueue, + struct task_struct *skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &kqueue->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + return found; +} + +static inline struct kfmlp_queue* kfmlp_find_shortest( + struct kfmlp_semaphore* sem, + struct kfmlp_queue* search_start) +{ + // we start our search at search_start instead of at the beginning of the + // queue list to load-balance across all resources. + struct kfmlp_queue* step = search_start; + struct kfmlp_queue* shortest = sem->shortest_queue; + + do + { + step = (step+1 != &sem->queues[sem->num_resources]) ? 
+ step+1 : &sem->queues[0]; + if(step->count < shortest->count) + { + shortest = step; + if(step->count == 0) + break; /* can't get any shorter */ + } + }while(step != search_start); + + return(shortest); +} + +static struct task_struct* kfmlp_remove_hp_waiter(struct kfmlp_semaphore* sem) +{ + /* must hold sem->lock */ + + struct kfmlp_queue *my_queue = NULL; + struct task_struct *max_hp = NULL; + + + struct list_head *pos; + struct task_struct *queued; + int i; + + for(i = 0; i < sem->num_resources; ++i) + { + if( (sem->queues[i].count > 1) && + ((my_queue == NULL) || + (edf_higher_prio(sem->queues[i].hp_waiter, my_queue->hp_waiter))) ) + { + my_queue = &sem->queues[i]; + } + } + + if(my_queue) + { + max_hp = my_queue->hp_waiter; + + BUG_ON(!max_hp); + + TRACE_CUR("queue %d: stealing %s/%d from queue %d\n", + kfmlp_get_idx(sem, my_queue), + max_hp->comm, max_hp->pid, + kfmlp_get_idx(sem, my_queue)); + + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, max_hp); + + /* + if(my_queue->hp_waiter) + TRACE_CUR("queue %d: new hp_waiter is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->hp_waiter->comm, + my_queue->hp_waiter->pid); + else + TRACE_CUR("queue %d: new hp_waiter is %p\n", + kfmlp_get_idx(sem, my_queue), NULL); + */ + + raw_spin_lock(&gsnedf_lock); + + /* + if(my_queue->owner) + TRACE_CUR("queue %d: owner is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->owner->comm, + my_queue->owner->pid); + else + TRACE_CUR("queue %d: owner is %p\n", + kfmlp_get_idx(sem, my_queue), + NULL); + */ + + if(tsk_rt(my_queue->owner)->inh_task == max_hp) + { + __clear_priority_inheritance(my_queue->owner); + if(my_queue->hp_waiter != NULL) + { + __set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + raw_spin_unlock(&gsnedf_lock); + + list_for_each(pos, &my_queue->wait.task_list) + { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + /* Compare task prios, find high prio task. */ + if (queued == max_hp) + { + /* + TRACE_CUR("queue %d: found entry in wait queue. REMOVING!\n", + kfmlp_get_idx(sem, my_queue)); + */ + __remove_wait_queue(&my_queue->wait, + list_entry(pos, wait_queue_t, task_list)); + break; + } + } + --(my_queue->count); + } + + return(max_hp); +} + +int gsnedf_kfmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue* my_queue; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = sem->shortest_queue; + + if (my_queue->owner) { + /* resource is not free => must suspend and wait */ + TRACE_CUR("queue %d: Resource is not free => must suspend and wait.\n", + kfmlp_get_idx(sem, my_queue)); + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&my_queue->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, my_queue->hp_waiter)) + { + my_queue->hp_waiter = t; + if (edf_higher_prio(t, my_queue->owner)) + { + set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->lock, flags); + + /* We depend on the FIFO order. 
Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release (or steal). + */ + schedule(); + + + if(my_queue->owner == t) + { + TRACE_CUR("queue %d: acquired through waiting\n", + kfmlp_get_idx(sem, my_queue)); + } + else + { + /* this case may happen if our wait entry was stolen + between queues. record where we went. */ + my_queue = kfmlp_get_queue(sem, t); + + BUG_ON(!my_queue); + TRACE_CUR("queue %d: acquired through stealing\n", + kfmlp_get_idx(sem, my_queue)); + } + } + else + { + TRACE_CUR("queue %d: acquired immediately\n", + kfmlp_get_idx(sem, my_queue)); + + my_queue->owner = t; + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + spin_unlock_irqrestore(&sem->lock, flags); + } + + return kfmlp_get_idx(sem, my_queue); +} + +int gsnedf_kfmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + + if (!my_queue) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&my_queue->wait); + if (next) { + /* + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + /* next becomes the resouce holder */ + my_queue->owner = next; + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + + TRACE_CUR("queue %d: lock ownership passed to %s/%d\n", + kfmlp_get_idx(sem, my_queue), next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == my_queue->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, next); + if (my_queue->hp_waiter) + TRACE_TASK(my_queue->hp_waiter, "queue %d: is new highest-prio waiter\n", kfmlp_get_idx(sem, my_queue)); + else + TRACE("queue %d: no further waiters\n", kfmlp_get_idx(sem, my_queue)); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. 
*/ + set_priority_inheritance(next, my_queue->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: looking to steal someone...\n", kfmlp_get_idx(sem, my_queue)); + + next = kfmlp_remove_hp_waiter(sem); /* returns NULL if nothing to steal */ + + /* + if(next) + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - steal\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + + my_queue->owner = next; + + if(next) + { + TRACE_CUR("queue %d: lock ownership passed to %s/%d (which was stolen)\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: no one to steal.\n", kfmlp_get_idx(sem, my_queue)); + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + } + } + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task) + clear_priority_inheritance(t); + +out: + spin_unlock_irqrestore(&sem->lock, flags); + + return err; +} + +int gsnedf_kfmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + owner = (my_queue) ? (my_queue->owner == t) : 0; + + spin_unlock_irqrestore(&sem->lock, flags); + + if (owner) + gsnedf_kfmlp_unlock(l); + + return 0; +} + +void gsnedf_kfmlp_free(struct litmus_lock* l) +{ + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + kfree(sem->queues); + kfree(sem); +} + +static struct litmus_lock_ops gsnedf_kfmlp_lock_ops = { + .close = gsnedf_kfmlp_close, + .lock = gsnedf_kfmlp_lock, + .unlock = gsnedf_kfmlp_unlock, + .deallocate = gsnedf_kfmlp_free, +}; + +static struct litmus_lock* gsnedf_new_kfmlp(void* __user arg, int* ret_code) +{ + struct kfmlp_semaphore* sem; + int num_resources = 0; + int i; + + if(!access_ok(VERIFY_READ, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(__copy_from_user(&num_resources, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(num_resources < 1) + { + *ret_code = -EINVAL; + return(NULL); + } + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if(!sem) + { + *ret_code = -ENOMEM; + return NULL; + } + + sem->queues = kmalloc(sizeof(struct kfmlp_queue)*num_resources, GFP_KERNEL); + if(!sem->queues) + { + kfree(sem); + *ret_code = -ENOMEM; + return NULL; + } + + sem->litmus_lock.ops = &gsnedf_kfmlp_lock_ops; + spin_lock_init(&sem->lock); + sem->num_resources = num_resources; + + for(i = 0; i < num_resources; ++i) + { + sem->queues[i].owner = NULL; + sem->queues[i].hp_waiter = NULL; + init_waitqueue_head(&sem->queues[i].wait); + sem->queues[i].count = 0; + } + + sem->shortest_queue = &sem->queues[0]; + + *ret_code = 0; + return &sem->litmus_lock; +} + + + + + /* **** lock constructor **** */ static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, - void* __user unused) + void* __user arg) { int err = -ENXIO; @@ -911,7 +1559,10 @@ static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, else err = -ENOMEM; break; - + + case KFMLP_SEM: + *lock = gsnedf_new_kfmlp(arg, &err); + break; }; return err; @@ -919,7 +1570,6 @@ static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, #endif - static long gsnedf_activate_plugin(void) { int cpu; @@ -946,6 +1596,15 @@ static long gsnedf_activate_plugin(void) } #endif 
} + +#ifdef CONFIG_LITMUS_SOFTIRQD + spawn_klitirqd(NULL); +#endif + +#ifdef CONFIG_LITMUS_NVIDIA + init_nvidia_info(); +#endif + return 0; } @@ -963,8 +1622,15 @@ static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { .admit_task = gsnedf_admit_task, .activate_plugin = gsnedf_activate_plugin, #ifdef CONFIG_LITMUS_LOCKING - .allocate_lock = gsnedf_allocate_lock, + .allocate_lock = gsnedf_allocate_lock, + .set_prio_inh = set_priority_inheritance, + .clear_prio_inh = clear_priority_inheritance, +#endif +#ifdef CONFIG_LITMUS_SOFTIRQD + .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, + .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, #endif + }; diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c index e6952896dc4b..1bca2e1a33cd 100644 --- a/litmus/sched_litmus.c +++ b/litmus/sched_litmus.c @@ -103,7 +103,9 @@ litmus_schedule(struct rq *rq, struct task_struct *prev) } #ifdef __ARCH_WANT_UNLOCKED_CTXSW if (next->oncpu) + { TRACE_TASK(next, "waiting for !oncpu"); + } while (next->oncpu) { cpu_relax(); mb(); diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c index d54886df1f57..8802670a4b0b 100644 --- a/litmus/sched_plugin.c +++ b/litmus/sched_plugin.c @@ -129,6 +129,27 @@ static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type, return -ENXIO; } +static void litmus_dummy_set_prio_inh(struct task_struct* a, struct task_struct* b) +{ +} + +static void litmus_dummy_clear_prio_inh(struct task_struct* t) +{ +} + +#endif + +#ifdef CONFIG_LITMUS_SOFTIRQD +static void litmus_dummy_set_prio_inh_klitirq(struct task_struct* klitirqd, + struct task_struct* old_owner, + struct task_struct* new_owner) +{ +} + +static void litmus_dummy_clear_prio_inh_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner) +{ +} #endif @@ -149,6 +170,12 @@ struct sched_plugin linux_sched_plugin = { .deactivate_plugin = litmus_dummy_deactivate_plugin, #ifdef CONFIG_LITMUS_LOCKING .allocate_lock = litmus_dummy_allocate_lock, + .set_prio_inh = litmus_dummy_set_prio_inh, + .clear_prio_inh = litmus_dummy_clear_prio_inh, +#endif +#ifdef CONFIG_LITMUS_SOFTIRQD + .set_prio_inh_klitirqd = litmus_dummy_set_prio_inh_klitirq, + .clear_prio_inh_klitirqd = litmus_dummy_clear_prio_inh_klitirqd, #endif .admit_task = litmus_dummy_admit_task }; @@ -187,6 +214,8 @@ int register_sched_plugin(struct sched_plugin* plugin) CHECK(deactivate_plugin); #ifdef CONFIG_LITMUS_LOCKING CHECK(allocate_lock); + CHECK(set_prio_inh); + CHECK(clear_prio_inh); #endif CHECK(admit_task); diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c index 5ef8d09ab41f..7aeb99b668d3 100644 --- a/litmus/sched_task_trace.c +++ b/litmus/sched_task_trace.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -16,13 +17,13 @@ #include -#define NO_EVENTS (1 << CONFIG_SCHED_TASK_TRACE_SHIFT) +#define NUM_EVENTS (1 << (CONFIG_SCHED_TASK_TRACE_SHIFT+11)) #define now() litmus_clock() struct local_buffer { - struct st_event_record record[NO_EVENTS]; - char flag[NO_EVENTS]; + struct st_event_record record[NUM_EVENTS]; + char flag[NUM_EVENTS]; struct ft_buffer ftbuf; }; @@ -41,7 +42,7 @@ static int __init init_sched_task_trace(void) int i, ok = 0, err; printk("Allocated %u sched_trace_xxx() events per CPU " "(buffer size: %d bytes)\n", - NO_EVENTS, (int) sizeof(struct local_buffer)); + NUM_EVENTS, (int) sizeof(struct local_buffer)); err = ftdev_init(&st_dev, THIS_MODULE, num_online_cpus(), "sched_trace"); @@ -50,7 +51,7 @@ static int __init 
init_sched_task_trace(void) for (i = 0; i < st_dev.minor_cnt; i++) { buf = &per_cpu(st_event_buffer, i); - ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS, + ok += init_ft_buffer(&buf->ftbuf, NUM_EVENTS, sizeof(struct st_event_record), buf->flag, buf->record); @@ -154,7 +155,8 @@ feather_callback void do_sched_trace_task_switch_to(unsigned long id, { struct task_struct *t = (struct task_struct*) _task; struct st_event_record* rec; - if (is_realtime(t)) { + //if (is_realtime(t)) /* comment out to trace EVERYTHING */ + { rec = get_record(ST_SWITCH_TO, t); if (rec) { rec->data.switch_to.when = now(); @@ -169,7 +171,8 @@ feather_callback void do_sched_trace_task_switch_away(unsigned long id, { struct task_struct *t = (struct task_struct*) _task; struct st_event_record* rec; - if (is_realtime(t)) { + //if (is_realtime(t)) /* comment out to trace EVERYTHING */ + { rec = get_record(ST_SWITCH_AWAY, t); if (rec) { rec->data.switch_away.when = now(); @@ -188,6 +191,7 @@ feather_callback void do_sched_trace_task_completion(unsigned long id, if (rec) { rec->data.completion.when = now(); rec->data.completion.forced = forced; + rec->data.completion.nv_int_count = (u16)atomic_read(&tsk_rt(t)->nv_int_count); put_record(rec); } } @@ -239,3 +243,201 @@ feather_callback void do_sched_trace_action(unsigned long id, put_record(rec); } } + + +feather_callback void do_sched_trace_tasklet_release(unsigned long id, + unsigned long _owner) +{ + struct task_struct *t = (struct task_struct*) _owner; + struct st_event_record *rec = get_record(ST_TASKLET_RELEASE, t); + + if (rec) { + rec->data.tasklet_release.when = now(); + put_record(rec); + } +} + + +feather_callback void do_sched_trace_tasklet_begin(unsigned long id, + unsigned long _owner) +{ + struct task_struct *t = (struct task_struct*) _owner; + struct st_event_record *rec = get_record(ST_TASKLET_BEGIN, t); + + if (rec) { + rec->data.tasklet_begin.when = now(); + + if(!in_interrupt()) + rec->data.tasklet_begin.exe_pid = current->pid; + else + rec->data.tasklet_begin.exe_pid = 0; + + put_record(rec); + } +} +EXPORT_SYMBOL(do_sched_trace_tasklet_begin); + + +feather_callback void do_sched_trace_tasklet_end(unsigned long id, + unsigned long _owner, + unsigned long _flushed) +{ + struct task_struct *t = (struct task_struct*) _owner; + struct st_event_record *rec = get_record(ST_TASKLET_END, t); + + if (rec) { + rec->data.tasklet_end.when = now(); + rec->data.tasklet_end.flushed = _flushed; + + if(!in_interrupt()) + rec->data.tasklet_end.exe_pid = current->pid; + else + rec->data.tasklet_end.exe_pid = 0; + + put_record(rec); + } +} +EXPORT_SYMBOL(do_sched_trace_tasklet_end); + + +feather_callback void do_sched_trace_work_release(unsigned long id, + unsigned long _owner) +{ + struct task_struct *t = (struct task_struct*) _owner; + struct st_event_record *rec = get_record(ST_WORK_RELEASE, t); + + if (rec) { + rec->data.work_release.when = now(); + put_record(rec); + } +} + + +feather_callback void do_sched_trace_work_begin(unsigned long id, + unsigned long _owner, + unsigned long _exe) +{ + struct task_struct *t = (struct task_struct*) _owner; + struct st_event_record *rec = get_record(ST_WORK_BEGIN, t); + + if (rec) { + struct task_struct *exe = (struct task_struct*) _exe; + rec->data.work_begin.exe_pid = exe->pid; + rec->data.work_begin.when = now(); + put_record(rec); + } +} +EXPORT_SYMBOL(do_sched_trace_work_begin); + + +feather_callback void do_sched_trace_work_end(unsigned long id, + unsigned long _owner, + unsigned long _exe, + unsigned long _flushed) +{ + struct 
task_struct *t = (struct task_struct*) _owner; + struct st_event_record *rec = get_record(ST_WORK_END, t); + + if (rec) { + struct task_struct *exe = (struct task_struct*) _exe; + rec->data.work_end.exe_pid = exe->pid; + rec->data.work_end.flushed = _flushed; + rec->data.work_end.when = now(); + put_record(rec); + } +} +EXPORT_SYMBOL(do_sched_trace_work_end); + + +feather_callback void do_sched_trace_eff_prio_change(unsigned long id, + unsigned long _task, + unsigned long _inh) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record *rec = get_record(ST_EFF_PRIO_CHANGE, t); + + if (rec) { + struct task_struct *inh = (struct task_struct*) _inh; + rec->data.effective_priority_change.when = now(); + rec->data.effective_priority_change.inh_pid = (inh != NULL) ? + inh->pid : + 0xffff; + + put_record(rec); + } +} + + +/* pray for no nesting of nv interrupts on same CPU... */ +struct tracing_interrupt_map +{ + int active; + int count; + unsigned long data[128]; // assume nesting less than 128... +}; +DEFINE_PER_CPU(struct tracing_interrupt_map, active_interrupt_tracing); + +feather_callback void do_sched_trace_nv_interrupt_begin(unsigned long id, + unsigned long _device) +{ + struct st_event_record *rec; + + { + struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id()); + if(int_map->active == 0xcafebabe) + { + int_map->count++; + } + else + { + int_map->active = 0xcafebabe; + int_map->count = 1; + } + int_map->data[int_map->count-1] = _device; + } + + rec = get_record(ST_NV_INTERRUPT_BEGIN, NULL); + if(rec) { + u32 device = _device; + rec->data.nv_interrupt_begin.when = now(); + rec->data.nv_interrupt_begin.device = device; + put_record(rec); + } +} +EXPORT_SYMBOL(do_sched_trace_nv_interrupt_begin); + +/* +int is_interrupt_tracing_active(void) +{ + struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id()); + if(int_map->active == 0xcafebabe) + return 1; + return 0; +} +*/ + +feather_callback void do_sched_trace_nv_interrupt_end(unsigned long id, unsigned long unused) +{ + struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id()); + if(int_map->active == 0xcafebabe) + { + struct st_event_record *rec = get_record(ST_NV_INTERRUPT_END, NULL); + + int_map->count--; + if(int_map->count == 0) + int_map->active = 0; + + if(rec) { + rec->data.nv_interrupt_end.when = now(); + rec->data.nv_interrupt_end.device = int_map->data[int_map->count]; + put_record(rec); + } + } +} +EXPORT_SYMBOL(do_sched_trace_nv_interrupt_end); + + + + + + diff --git a/litmus/sched_trace_external.c b/litmus/sched_trace_external.c new file mode 100644 index 000000000000..d7d7d8bae298 --- /dev/null +++ b/litmus/sched_trace_external.c @@ -0,0 +1,45 @@ +#include + +#include +#include + +void __sched_trace_tasklet_begin_external(struct task_struct* t) +{ + sched_trace_tasklet_begin(t); +} +EXPORT_SYMBOL(__sched_trace_tasklet_begin_external); + +void __sched_trace_tasklet_end_external(struct task_struct* t, unsigned long flushed) +{ + sched_trace_tasklet_end(t, flushed); +} +EXPORT_SYMBOL(__sched_trace_tasklet_end_external); + + + +void __sched_trace_work_begin_external(struct task_struct* t, struct task_struct* e) +{ + sched_trace_work_begin(t, e); +} +EXPORT_SYMBOL(__sched_trace_work_begin_external); + +void __sched_trace_work_end_external(struct task_struct* t, struct task_struct* e, unsigned long f) +{ + sched_trace_work_end(t, e, f); +} +EXPORT_SYMBOL(__sched_trace_work_end_external); + + + +void 
__sched_trace_nv_interrupt_begin_external(u32 device) +{ + unsigned long _device = device; + sched_trace_nv_interrupt_begin(_device); +} +EXPORT_SYMBOL(__sched_trace_nv_interrupt_begin_external); + +void __sched_trace_nv_interrupt_end_external(void) +{ + sched_trace_nv_interrupt_end(); +} +EXPORT_SYMBOL(__sched_trace_nv_interrupt_end_external); -- cgit v1.2.2 From 5d7dcfa10ea0dd283773a301e3ce610a7797d582 Mon Sep 17 00:00:00 2001 From: Glenn Elliott Date: Wed, 11 Jan 2012 14:37:13 -0500 Subject: PAI implementation, C-RM, C-FIFO. --- include/linux/interrupt.h | 2 +- include/litmus/fifo_common.h | 25 + include/litmus/litmus.h | 4 +- include/litmus/nvidia_info.h | 3 +- include/litmus/rm_common.h | 25 + include/litmus/rm_srt_common.h | 25 + include/litmus/sched_plugin.h | 11 + include/litmus/sched_trace.h | 8 +- include/litmus/sched_trace_external.h | 22 +- include/litmus/trace.h | 14 + kernel/sched.c | 4 +- kernel/softirq.c | 4 + kernel/workqueue.c | 2 +- litmus/Kconfig | 52 +- litmus/Makefile | 3 +- litmus/edf_common.c | 39 + litmus/fifo_common.c | 124 +++ litmus/litmus_pai_softirq.c | 64 ++ litmus/litmus_softirq.c | 2 +- litmus/nvidia_info.c | 24 +- litmus/rm_common.c | 160 ++++ litmus/rm_srt_common.c | 167 ++++ litmus/sched_cedf.c | 229 ++++- litmus/sched_cfifo.c | 1611 +++++++++++++++++++++++++++++++++ litmus/sched_crm.c | 1611 +++++++++++++++++++++++++++++++++ litmus/sched_crm_srt.c | 1611 +++++++++++++++++++++++++++++++++ litmus/sched_gsn_edf.c | 10 +- litmus/sched_plugin.c | 11 + litmus/sched_task_trace.c | 26 +- litmus/sched_trace_external.c | 23 +- 30 files changed, 5874 insertions(+), 42 deletions(-) create mode 100644 include/litmus/fifo_common.h create mode 100644 include/litmus/rm_common.h create mode 100644 include/litmus/rm_srt_common.h create mode 100644 litmus/fifo_common.c create mode 100644 litmus/litmus_pai_softirq.c create mode 100644 litmus/rm_common.c create mode 100644 litmus/rm_srt_common.c create mode 100644 litmus/sched_cfifo.c create mode 100644 litmus/sched_crm.c create mode 100644 litmus/sched_crm_srt.c diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5d22f5342376..a2f2880d5517 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -460,7 +460,7 @@ struct tasklet_struct void (*func)(unsigned long); unsigned long data; -#ifdef CONFIG_LITMUS_SOFTIRQD +#if defined(CONFIG_LITMUS_SOFTIRQD) || defined(CONFIG_LITMUS_PAI_SOFTIRQD) struct task_struct *owner; #endif }; diff --git a/include/litmus/fifo_common.h b/include/litmus/fifo_common.h new file mode 100644 index 000000000000..12cfbfea41ee --- /dev/null +++ b/include/litmus/fifo_common.h @@ -0,0 +1,25 @@ +/* + * EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. 
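+ * These are the FIFO-queue counterparts of the EDF helpers declared in
+ * include/litmus/edf_common.h.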
+ * + */ + +#ifndef __UNC_FIFO_COMMON_H__ +#define __UNC_FIFO_COMMON_H__ + +#include + +void fifo_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int fifo_higher_prio(struct task_struct* first, + struct task_struct* second); + +int fifo_ready_order(struct bheap_node* a, struct bheap_node* b); + +int fifo_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +#endif diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h index 3df242bf272f..829c1c5ab91f 100644 --- a/include/litmus/litmus.h +++ b/include/litmus/litmus.h @@ -118,7 +118,9 @@ static inline lt_t litmus_clock(void) #define earlier_release(a, b) (lt_before(\ (a)->rt_param.job_params.release,\ (b)->rt_param.job_params.release)) - +#define shorter_period(a, b) (lt_before(\ + (a)->rt_param.task_params.period,\ + (b)->rt_param.task_params.period)) void preempt_if_preemptable(struct task_struct* t, int on_cpu); #ifdef CONFIG_LITMUS_LOCKING diff --git a/include/litmus/nvidia_info.h b/include/litmus/nvidia_info.h index 579301d77cf5..9e07a27fdee3 100644 --- a/include/litmus/nvidia_info.h +++ b/include/litmus/nvidia_info.h @@ -7,7 +7,8 @@ #include -#define NV_DEVICE_NUM NR_LITMUS_SOFTIRQD +//#define NV_DEVICE_NUM NR_LITMUS_SOFTIRQD +#define NV_DEVICE_NUM CONFIG_NV_DEVICE_NUM int init_nvidia_info(void); diff --git a/include/litmus/rm_common.h b/include/litmus/rm_common.h new file mode 100644 index 000000000000..5991b0b4e758 --- /dev/null +++ b/include/litmus/rm_common.h @@ -0,0 +1,25 @@ +/* + * EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. + * + */ + +#ifndef __UNC_RM_COMMON_H__ +#define __UNC_RM_COMMON_H__ + +#include + +void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int rm_higher_prio(struct task_struct* first, + struct task_struct* second); + +int rm_ready_order(struct bheap_node* a, struct bheap_node* b); + +int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +#endif diff --git a/include/litmus/rm_srt_common.h b/include/litmus/rm_srt_common.h new file mode 100644 index 000000000000..78aa287327a2 --- /dev/null +++ b/include/litmus/rm_srt_common.h @@ -0,0 +1,25 @@ +/* + * EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. 
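+ * Rate-monotonic (soft real-time) counterparts of the EDF helpers in
+ * include/litmus/edf_common.h; priority is determined by task period
+ * (cf. the shorter_period() macro added to include/litmus/litmus.h).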
+ * + */ + +#ifndef __UNC_RM_SRT_COMMON_H__ +#define __UNC_RM_SRT_COMMON_H__ + +#include + +void rm_srt_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int rm_srt_higher_prio(struct task_struct* first, + struct task_struct* second); + +int rm_srt_ready_order(struct bheap_node* a, struct bheap_node* b); + +int rm_srt_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +#endif diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h index df50930d14a0..12a9ab65a673 100644 --- a/include/litmus/sched_plugin.h +++ b/include/litmus/sched_plugin.h @@ -11,6 +11,10 @@ #include #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +#include +#endif + /************************ setup/tear down ********************/ typedef long (*activate_plugin_t) (void); @@ -69,6 +73,9 @@ typedef void (*set_prio_inh_klitirq_t)(struct task_struct* klitirqd, typedef void (*clear_prio_inh_klitirqd_t)(struct task_struct* klitirqd, struct task_struct* old_owner); + +typedef int (*enqueue_pai_tasklet_t)(struct tasklet_struct* tasklet); + /********************* sys call backends ********************/ /* This function causes the caller to sleep until the next release */ typedef long (*complete_job_t) (void); @@ -115,6 +122,10 @@ struct sched_plugin { set_prio_inh_klitirq_t set_prio_inh_klitirqd; clear_prio_inh_klitirqd_t clear_prio_inh_klitirqd; #endif + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + enqueue_pai_tasklet_t enqueue_pai_tasklet; +#endif } __attribute__ ((__aligned__(SMP_CACHE_BYTES))); diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h index 1486c778aff8..232c7588d103 100644 --- a/include/litmus/sched_trace.h +++ b/include/litmus/sched_trace.h @@ -127,13 +127,13 @@ struct st_effective_priority_change_data { struct st_nv_interrupt_begin_data { u64 when; u32 device; - u8 __unused[4]; + u32 serialNumber; } __attribute__((packed)); struct st_nv_interrupt_end_data { u64 when; u32 device; - u8 __unused[4]; + u32 serialNumber; } __attribute__((packed)); #define DATA(x) struct st_ ## x ## _data x; @@ -328,8 +328,8 @@ feather_callback void do_sched_trace_nv_interrupt_end(unsigned long id, #define sched_trace_nv_interrupt_begin(d) \ SCHED_TRACE(SCHED_TRACE_BASE_ID + 18, do_sched_trace_nv_interrupt_begin, d) -#define sched_trace_nv_interrupt_end() \ - SCHED_TRACE(SCHED_TRACE_BASE_ID + 19, do_sched_trace_nv_interrupt_end, 0ul) +#define sched_trace_nv_interrupt_end(d) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 19, do_sched_trace_nv_interrupt_end, d) #define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */ diff --git a/include/litmus/sched_trace_external.h b/include/litmus/sched_trace_external.h index c2c872639880..90424d5c564c 100644 --- a/include/litmus/sched_trace_external.h +++ b/include/litmus/sched_trace_external.h @@ -34,9 +34,25 @@ static inline void sched_trace_nv_interrupt_begin_external(u32 device) __sched_trace_nv_interrupt_begin_external(device); } -extern void __sched_trace_nv_interrupt_end_external(void); -static inline void sched_trace_nv_interrupt_end_external(void) +extern void __sched_trace_nv_interrupt_end_external(u32 device); +static inline void sched_trace_nv_interrupt_end_external(u32 device) { - __sched_trace_nv_interrupt_end_external(); + __sched_trace_nv_interrupt_end_external(device); } + +#ifdef CONFIG_LITMUS_NVIDIA + +#define EX_TS(evt) \ +extern void __##evt(void); \ +static inline void EX_##evt(void) { __##evt(); } + +EX_TS(TS_NV_TOPISR_START) +EX_TS(TS_NV_TOPISR_END) +EX_TS(TS_NV_BOTISR_START) +EX_TS(TS_NV_BOTISR_END) 
+EX_TS(TS_NV_RELEASE_BOTISR_START) +EX_TS(TS_NV_RELEASE_BOTISR_END) + +#endif + #endif diff --git a/include/litmus/trace.h b/include/litmus/trace.h index 05f487263f28..aa3ee4a6757b 100644 --- a/include/litmus/trace.h +++ b/include/litmus/trace.h @@ -100,4 +100,18 @@ feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu) #define TS_SEND_RESCHED_END DTIMESTAMP(191, TSK_UNKNOWN) + +#ifdef CONFIG_LITMUS_NVIDIA + +#define TS_NV_TOPISR_START TIMESTAMP(200) +#define TS_NV_TOPISR_END TIMESTAMP(201) + +#define TS_NV_BOTISR_START TIMESTAMP(202) +#define TS_NV_BOTISR_END TIMESTAMP(203) + +#define TS_NV_RELEASE_BOTISR_START TIMESTAMP(204) +#define TS_NV_RELEASE_BOTISR_END TIMESTAMP(205) + +#endif + #endif /* !_SYS_TRACE_H_ */ diff --git a/kernel/sched.c b/kernel/sched.c index 3162605ffc91..3aa2be09122b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3897,8 +3897,10 @@ need_resched_nonpreemptible: if (need_resched()) goto need_resched; +#ifdef LITMUS_SOFTIRQD reacquire_klitirqd_lock(prev); - +#endif + srp_ceiling_block(); } EXPORT_SYMBOL(schedule); diff --git a/kernel/softirq.c b/kernel/softirq.c index be4b8fab3637..ae77c5c1d17e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -35,6 +35,7 @@ #ifdef CONFIG_LITMUS_NVIDIA #include +#include #endif /* @@ -441,6 +442,9 @@ void __tasklet_schedule(struct tasklet_struct *t) if(likely(_litmus_tasklet_schedule(t,nvidia_device))) { unlock_nv_registry(nvidia_device, &flags); + + TS_NV_RELEASE_BOTISR_END; + return; } else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8139208eaee1..637cadac2627 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2531,7 +2531,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); */ int schedule_work(struct work_struct *work) { -#ifdef CONFIG_LITMUS_NVIDIA +#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD) if(is_nvidia_func(work->func)) { u32 nvidiaDevice = get_work_nv_device_num(work); diff --git a/litmus/Kconfig b/litmus/Kconfig index 7e865d4dd703..5109cf7db7f6 100644 --- a/litmus/Kconfig +++ b/litmus/Kconfig @@ -218,18 +218,41 @@ config LITMUS_THREAD_ALL_SOFTIRQ If unsure, say No. + +choice + prompt "Scheduling of interrupt bottom-halves in Litmus." + default LITMUS_SOFTIRQD_NONE + depends on LITMUS_LOCKING && !LITMUS_THREAD_ALL_SOFTIRQ + help + Schedule tasklets with known priorities in Litmus. + +config LITMUS_SOFTIRQD_NONE + bool "No tasklet scheduling in Litmus." + help + Don't schedule tasklets in Litmus. Default. + config LITMUS_SOFTIRQD - bool "Spawn klitirqd interrupt handling threads." - depends on LITMUS_LOCKING - default n - help - Create klitirqd interrupt handling threads. Work must be - specifically dispatched to these workers. (Softirqs for - Litmus tasks are not magically redirected to klitirqd.) + bool "Spawn klitirqd interrupt handling threads." + help + Create klitirqd interrupt handling threads. Work must be + specifically dispatched to these workers. (Softirqs for + Litmus tasks are not magically redirected to klitirqd.) - G-EDF ONLY for now! + G-EDF/RM, C-EDF/RM ONLY for now! - If unsure, say No. + +config LITMUS_PAI_SOFTIRQD + bool "Defer tasklets to context switch points." + help + Only execute scheduled tasklet bottom halves at + scheduling points. Trades context switch overhead + at the cost of non-preemptive durations of bottom half + processing. + + G-EDF/RM, C-EDF/RM ONLY for now! + +endchoice + config NR_LITMUS_SOFTIRQD int "Number of klitirqd." 
@@ -241,13 +264,22 @@ config NR_LITMUS_SOFTIRQD config LITMUS_NVIDIA bool "Litmus handling of NVIDIA interrupts." - depends on LITMUS_SOFTIRQD + depends on LITMUS_SOFTIRQD || LITMUS_PAI_SOFTIRQD default n help Direct tasklets from NVIDIA devices to Litmus's klitirqd. If unsure, say No. +config NV_DEVICE_NUM + int "Number of NVIDIA GPUs." + depends on LITMUS_SOFTIRQD || LITMUS_PAI_SOFTIRQD + range 1 4096 + default "1" + help + Should be (<= to the number of CPUs) and + (<= to the number of GPUs) in your system. + choice prompt "CUDA/Driver Version Support" default CUDA_4_0 diff --git a/litmus/Makefile b/litmus/Makefile index 892e01c2e1b3..869939e2270c 100644 --- a/litmus/Makefile +++ b/litmus/Makefile @@ -19,7 +19,7 @@ obj-y = sched_plugin.o litmus.o \ sched_gsn_edf.o \ sched_psn_edf.o -obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o +obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o sched_cfifo.o fifo_common.o sched_crm.o rm_common.o sched_crm_srt.o rm_srt_common.o obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o @@ -29,4 +29,5 @@ obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o obj-$(CONFIG_LITMUS_SOFTIRQD) += litmus_softirq.o +obj-$(CONFIG_LITMUS_PAI_SOFTIRQD) += litmus_pai_softirq.o obj-$(CONFIG_LITMUS_NVIDIA) += nvidia_info.o sched_trace_external.o diff --git a/litmus/edf_common.c b/litmus/edf_common.c index fbd67ab5f467..0a06d7a26c00 100644 --- a/litmus/edf_common.c +++ b/litmus/edf_common.c @@ -63,7 +63,45 @@ int edf_higher_prio(struct task_struct* first, #endif + if (!is_realtime(second_task)) + return true; + + if (earlier_deadline(first_task, second_task)) + return true; + + if (get_deadline(first_task) == get_deadline(second_task)) + { + if (shorter_period(first_task, second_task)) + { + return true; + } + if (get_rt_period(first_task) == get_rt_period(second_task)) + { +#ifdef CONFIG_LITMUS_SOFTIRQD + if (first_task->rt_param.is_proxy_thread < second_task->rt_param.is_proxy_thread) + { + return true; + } + if (first_task->rt_param.is_proxy_thread == second_task->rt_param.is_proxy_thread) + { +#endif + if (first_task->pid < second_task->pid) + { + return true; + } + if (first_task->pid == second_task->pid) + { + return !second->rt_param.inh_task; + } +#ifdef CONFIG_LITMUS_SOFTIRQD + } +#endif + } + } + + return false; +#if 0 return !is_realtime(second_task) || #ifdef CONFIG_LITMUS_SOFTIRQD @@ -88,6 +126,7 @@ int edf_higher_prio(struct task_struct* first, */ (first_task->pid == second_task->pid && !second->rt_param.inh_task))); +#endif } int edf_ready_order(struct bheap_node* a, struct bheap_node* b) diff --git a/litmus/fifo_common.c b/litmus/fifo_common.c new file mode 100644 index 000000000000..c94510a171d9 --- /dev/null +++ b/litmus/fifo_common.c @@ -0,0 +1,124 @@ +/* + * kernel/fifo_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +/* fifo_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int fifo_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* There is no point in comparing a task to itself. 
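+ * (Overall ordering: non-real-time tasks and, absent inheritance, proxy
+ * threads lose; otherwise the job with the earlier release time wins and
+ * release-time ties are broken by PID.)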
*/ + if (first && first == second) { + TRACE_TASK(first, + "WARNING: pointless edf priority comparison.\n"); + return 0; + } + + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + +#ifdef CONFIG_LITMUS_LOCKING + + /* Check for inherited priorities. Change task + * used for comparison in such a case. + */ + if (unlikely(first->rt_param.inh_task)) + first_task = first->rt_param.inh_task; + if (unlikely(second->rt_param.inh_task)) + second_task = second->rt_param.inh_task; + + /* Check for priority boosting. Tie-break by start of boosting. + */ + if (unlikely(is_priority_boosted(first_task))) { + /* first_task is boosted, how about second_task? */ + if (!is_priority_boosted(second_task) || + lt_before(get_boost_start(first_task), + get_boost_start(second_task))) + return 1; + else + return 0; + } else if (unlikely(is_priority_boosted(second_task))) + /* second_task is boosted, first is not*/ + return 0; + +#endif + + + return !is_realtime(second_task) || + +#ifdef CONFIG_LITMUS_SOFTIRQD + /* proxy threads always lose w/o inheritance. */ + (first_task->rt_param.is_proxy_thread < + second_task->rt_param.is_proxy_thread) || +#endif + + /* is the deadline of the first task earlier? + * Then it has higher priority. + */ + earlier_release(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_release(first_task) == get_release(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +} + +int fifo_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return fifo_higher_prio(bheap2task(a), bheap2task(b)); +} + +void fifo_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, fifo_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! + */ +int fifo_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for fifo_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!__jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. + */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || fifo_higher_prio(__next_ready(rt), t); +} diff --git a/litmus/litmus_pai_softirq.c b/litmus/litmus_pai_softirq.c new file mode 100644 index 000000000000..b31eeb8a2538 --- /dev/null +++ b/litmus/litmus_pai_softirq.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + + + +int __litmus_tasklet_schedule(struct tasklet_struct *t, unsigned int k_id) +{ + int ret = 0; /* assume failure */ + if(unlikely((t->owner == NULL) || !is_realtime(t->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + BUG(); + } + + ret = litmus->enqueue_pai_tasklet(t); + + return(ret); +} + +EXPORT_SYMBOL(__litmus_tasklet_schedule); + + + +// failure causes default Linux handling. 
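+// (The PAI path only re-routes regular tasklets through the active plugin's
+// enqueue_pai_tasklet() callback above; the hi-priority tasklet and
+// work-queue hooks below return 0 so callers fall back to the stock Linux
+// softirq/workqueue handling.)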
+int __litmus_tasklet_hi_schedule(struct tasklet_struct *t, unsigned int k_id) +{ + int ret = 0; /* assume failure */ + return(ret); +} +EXPORT_SYMBOL(__litmus_tasklet_hi_schedule); + + +// failure causes default Linux handling. +int __litmus_tasklet_hi_schedule_first(struct tasklet_struct *t, unsigned int k_id) +{ + int ret = 0; /* assume failure */ + return(ret); +} +EXPORT_SYMBOL(__litmus_tasklet_hi_schedule_first); + + +// failure causes default Linux handling. +int __litmus_schedule_work(struct work_struct *w, unsigned int k_id) +{ + int ret = 0; /* assume failure */ + return(ret); +} +EXPORT_SYMBOL(__litmus_schedule_work); + diff --git a/litmus/litmus_softirq.c b/litmus/litmus_softirq.c index 271e770dbaea..f5cca964b6c6 100644 --- a/litmus/litmus_softirq.c +++ b/litmus/litmus_softirq.c @@ -1166,7 +1166,7 @@ int __litmus_tasklet_schedule(struct tasklet_struct *t, unsigned int k_id) TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); BUG(); } - + if(unlikely(k_id >= NR_LITMUS_SOFTIRQD)) { TRACE("%s: No klitirqd_th%d!\n", __FUNCTION__, k_id); diff --git a/litmus/nvidia_info.c b/litmus/nvidia_info.c index 78f035244d21..d17152138c63 100644 --- a/litmus/nvidia_info.c +++ b/litmus/nvidia_info.c @@ -361,6 +361,7 @@ int get_nv_device_id(struct task_struct* owner) static int __reg_nv_device(int reg_device_id) { + int ret = 0; struct task_struct* old = cmpxchg(&NV_DEVICE_REG[reg_device_id].device_owner, NULL, @@ -370,16 +371,21 @@ static int __reg_nv_device(int reg_device_id) if(likely(old == NULL)) { +#ifdef CONFIG_LITMUS_SOFTIRQD down_and_set_stat(current, HELD, &tsk_rt(current)->klitirqd_sem); +#endif TRACE_CUR("%s: device %d registered.\n", __FUNCTION__, reg_device_id); - return(0); } else { TRACE_CUR("%s: device %d is already in use!\n", __FUNCTION__, reg_device_id); - return(-EBUSY); + ret = -EBUSY; } - + + return(ret); + + + #if 0 //unsigned long flags; //raw_spin_lock_irqsave(&NV_DEVICE_REG[reg_device_id].lock, flags); @@ -411,19 +417,22 @@ static int __reg_nv_device(int reg_device_id) static int __clear_reg_nv_device(int de_reg_device_id) { - int ret; - unsigned long flags; - struct task_struct* klitirqd_th = get_klitirqd(de_reg_device_id); + int ret = 0; struct task_struct* old; +#ifdef CONFIG_LITMUS_SOFTIRQD + unsigned long flags; + struct task_struct* klitirqd_th = get_klitirqd(de_reg_device_id); lock_nv_registry(de_reg_device_id, &flags); +#endif old = cmpxchg(&NV_DEVICE_REG[de_reg_device_id].device_owner, current, NULL); mb(); - + +#ifdef CONFIG_LITMUS_SOFTIRQD if(likely(old == current)) { flush_pending(klitirqd_th, current); @@ -448,6 +457,7 @@ static int __clear_reg_nv_device(int de_reg_device_id) TRACE_CUR("%s: device %d is not registered for this process's use! No one is!\n", __FUNCTION__, de_reg_device_id); } +#endif return(ret); } diff --git a/litmus/rm_common.c b/litmus/rm_common.c new file mode 100644 index 000000000000..88f83bcbd9d8 --- /dev/null +++ b/litmus/rm_common.c @@ -0,0 +1,160 @@ +/* + * kernel/rm_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +/* rm_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int rm_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* There is no point in comparing a task to itself. 
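+ * (Overall ordering is rate-monotonic: the task with the shorter period has
+ * higher priority; only on a period tie do the proxy-thread and PID
+ * comparisons below apply.)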
*/ + if (first && first == second) { + TRACE_TASK(first, + "WARNING: pointless edf priority comparison.\n"); + return 0; + } + + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + +#ifdef CONFIG_LITMUS_LOCKING + + /* Check for inherited priorities. Change task + * used for comparison in such a case. + */ + if (unlikely(first->rt_param.inh_task)) + first_task = first->rt_param.inh_task; + if (unlikely(second->rt_param.inh_task)) + second_task = second->rt_param.inh_task; + + /* Check for priority boosting. Tie-break by start of boosting. + */ + if (unlikely(is_priority_boosted(first_task))) { + /* first_task is boosted, how about second_task? */ + if (!is_priority_boosted(second_task) || + lt_before(get_boost_start(first_task), + get_boost_start(second_task))) + return 1; + else + return 0; + } else if (unlikely(is_priority_boosted(second_task))) + /* second_task is boosted, first is not*/ + return 0; + +#endif + + if (!is_realtime(second_task)) + return true; + + if (shorter_period(first_task, second_task)) + return true; + + if (get_rt_period(first_task) == get_rt_period(second_task)) + { +#ifdef CONFIG_LITMUS_SOFTIRQD + if (first_task->rt_param.is_proxy_thread < second_task->rt_param.is_proxy_thread) + { + return true; + } + if (first_task->rt_param.is_proxy_thread == second_task->rt_param.is_proxy_thread) + { +#endif + if (first_task->pid < second_task->pid) + { + return true; + } + if (first_task->pid == second_task->pid) + { + return !second->rt_param.inh_task; + } +#ifdef CONFIG_LITMUS_SOFTIRQD + } +#endif + } + + return false; + +#if 0 + return !is_realtime(second_task) || + shorter_period(first_task, second_task) || + ((get_rt_period(first_task) == get_rt_period(second_task)) && earlier_deadline(first_task, second_task)) + +#ifdef CONFIG_LITMUS_SOFTIRQD + /* proxy threads always lose w/o inheritance. */ + (first_task->rt_param.is_proxy_thread < + second_task->rt_param.is_proxy_thread) || +#endif + + /* is the period of the first task shorter? + * Then it has higher priority. + */ + shorter_period(first_task, second_task) || + + (earlier_deadline(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_rt_period(first_task) == get_rt_period(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +#endif +} + +int rm_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return rm_higher_prio(bheap2task(a), bheap2task(b)); +} + +void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, rm_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! + */ +int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for rm_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!__jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. 
+ */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || rm_higher_prio(__next_ready(rt), t); +} diff --git a/litmus/rm_srt_common.c b/litmus/rm_srt_common.c new file mode 100644 index 000000000000..f58a8007678f --- /dev/null +++ b/litmus/rm_srt_common.c @@ -0,0 +1,167 @@ +/* + * kernel/rm_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +/* rm_srt_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int rm_srt_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* There is no point in comparing a task to itself. */ + if (first && first == second) { + TRACE_TASK(first, + "WARNING: pointless edf priority comparison.\n"); + return 0; + } + + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + +#ifdef CONFIG_LITMUS_LOCKING + + /* Check for inherited priorities. Change task + * used for comparison in such a case. + */ + if (unlikely(first->rt_param.inh_task)) + first_task = first->rt_param.inh_task; + if (unlikely(second->rt_param.inh_task)) + second_task = second->rt_param.inh_task; + + /* Check for priority boosting. Tie-break by start of boosting. + */ + if (unlikely(is_priority_boosted(first_task))) { + /* first_task is boosted, how about second_task? */ + if (!is_priority_boosted(second_task) || + lt_before(get_boost_start(first_task), + get_boost_start(second_task))) + return 1; + else + return 0; + } else if (unlikely(is_priority_boosted(second_task))) + /* second_task is boosted, first is not*/ + return 0; + +#endif + + if (!is_realtime(second_task)) + return true; + + if (shorter_period(first_task, second_task)) + return true; + + if (get_rt_period(first_task) == get_rt_period(second_task)) + { + if (earlier_deadline(first_task, second_task)) + { + return true; + } + if(get_deadline(first_task) == get_deadline(second_task)) + { +#ifdef CONFIG_LITMUS_SOFTIRQD + if (first_task->rt_param.is_proxy_thread < second_task->rt_param.is_proxy_thread) + { + return true; + } + if (first_task->rt_param.is_proxy_thread == second_task->rt_param.is_proxy_thread) + { +#endif + if (first_task->pid < second_task->pid) + { + return true; + } + if (first_task->pid == second_task->pid) + { + return !second->rt_param.inh_task; + } +#ifdef CONFIG_LITMUS_SOFTIRQD + } +#endif + } + } + + return false; + +#if 0 + return !is_realtime(second_task) || + shorter_period(first_task, second_task) || + ((get_rt_period(first_task) == get_rt_period(second_task)) && earlier_deadline(first_task, second_task)) + +#ifdef CONFIG_LITMUS_SOFTIRQD + /* proxy threads always lose w/o inheritance. */ + (first_task->rt_param.is_proxy_thread < + second_task->rt_param.is_proxy_thread) || +#endif + + /* is the period of the first task shorter? + * Then it has higher priority. + */ + shorter_period(first_task, second_task) || + + (earlier_deadline(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_rt_period(first_task) == get_rt_period(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. 
+ */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +#endif +} + +int rm_srt_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return rm_srt_higher_prio(bheap2task(a), bheap2task(b)); +} + +void rm_srt_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, rm_srt_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! + */ +int rm_srt_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for rm_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!__jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. + */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || rm_srt_higher_prio(__next_ready(rt), t); +} diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c index 9b0a8d3b624d..f0356de60b2f 100644 --- a/litmus/sched_cedf.c +++ b/litmus/sched_cedf.c @@ -55,6 +55,10 @@ #include #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +#include +#endif + #ifdef CONFIG_LITMUS_NVIDIA #include #endif @@ -91,6 +95,15 @@ DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries); #define test_will_schedule(cpu) \ (atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule)) + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +struct tasklet_head +{ + struct tasklet_struct *head; + struct tasklet_struct **tail; +}; +#endif + /* * In C-EDF there is a cedf domain _per_ cluster * The number of clusters is dynamically determined accordingly to the @@ -108,6 +121,12 @@ typedef struct clusterdomain { struct bheap cpu_heap; /* lock for this cluster */ #define cedf_lock domain.ready_lock + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + struct tasklet_head pending_tasklets; +#endif + } cedf_domain_t; /* a cedf_domain per cluster; allocation is done at init/activation time */ @@ -395,6 +414,198 @@ static void cedf_tick(struct task_struct* t) } } + + + + + + + + + + + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + + +void __do_lit_tasklet(struct tasklet_struct* tasklet) +{ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) + { + BUG(); + } + TRACE("%s: Invoking tasklet with owner pid = %d.\n", __FUNCTION__, tasklet->owner->pid); + tasklet->func(tasklet->data); + tasklet_unlock(tasklet); + +} + +void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* next) +{ + int work_to_do = 1; + struct tasklet_struct *tasklet = NULL; + + TRACE("%s: entered.\n", __FUNCTION__); + + while(work_to_do) { + // remove tasklet at head of list if it has higher priority. + raw_spin_lock(&cluster->cedf_lock); + // remove tasklet at head. 
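+			// (Only dequeue the head tasklet if its owner has higher priority than
+			// 'next', the task this CPU is about to schedule; otherwise leave it
+			// queued for a later scheduling point.)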
+ if(cluster->pending_tasklets.head != NULL) { + tasklet = cluster->pending_tasklets.head; + + if(edf_higher_prio(tasklet->owner, next)) { + // remove the tasklet from the queue + cluster->pending_tasklets.head = tasklet->next; + + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + } + else { + TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id()); + tasklet = NULL; + } + } + else { + //TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__); + } + raw_spin_unlock(&cluster->cedf_lock); + + if(tasklet) { + __do_lit_tasklet(tasklet); + tasklet = NULL; + } + else { + work_to_do = 0; + } + } + + TRACE("%s: exited.\n", __FUNCTION__); +} + + +void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) +{ + struct tasklet_struct* step; + + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n"); + while(step != NULL){ + TRACE("%s: %d\n", __FUNCTION__, step->owner); + step = step->next; + } + TRACE("%s: done.\n", __FUNCTION__); + + + step = cluster->pending_tasklets.head; + if(step == NULL) { + TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid); + // insert at tail. + tasklet->next = NULL; + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &tasklet->next; + } + else if((*cluster->pending_tasklets.tail != NULL) && + edf_higher_prio((*cluster->pending_tasklets.tail)->owner, tasklet->owner)) { + // insert at tail. + TRACE("%s: tasklet belongs at end. inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid); + + tasklet->next = NULL; + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &tasklet->next; + } + else { + // insert the tasklet somewhere in the middle. + + while(step->next && edf_higher_prio(step->next->owner, tasklet->owner)) { + step = step->next; + } + + // insert tasklet right before step->next. + + TRACE("%s: tasklet belongs at end. inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1); + + tasklet->next = step->next; + step->next = tasklet; + + // patch up the head if needed. + if(cluster->pending_tasklets.head == step) + { + TRACE("%s: %d is the new tasklet queue head.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.head = tasklet; + } + } + + + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %d\n", __FUNCTION__, step->owner); + step = step->next; + } + TRACE("%s: done.\n", __FUNCTION__); + +// TODO: Maintain this list in priority order. 
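+// (The commented-out tail-append below is the original insertion path; the
+// logic above already keeps the list ordered by edf_higher_prio(), so the
+// TODO appears to be addressed.)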
+// tasklet->next = NULL; +// *(cluster->pending_tasklets.tail) = tasklet; +// cluster->pending_tasklets.tail = &tasklet->next; +} + +int enqueue_pai_tasklet(struct tasklet_struct* tasklet) +{ + cedf_domain_t* cluster = task_cpu_cluster(tasklet->owner); + cpu_entry_t *lowest; + unsigned long flags; + + if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + BUG(); + } + + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); + + lowest = lowest_prio_cpu(cluster); + if (edf_higher_prio(tasklet->owner, lowest->linked)) { + if (smp_processor_id() == lowest->cpu) { + TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__); + // execute the tasklet now. + __do_lit_tasklet(tasklet); + } + else { + // preempt the lowest CPU + __add_pai_tasklet(tasklet, cluster); + + TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, lowest->cpu); + + preempt(lowest); + } + } + + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); + + return(1); // success +} + + +#endif + + + + + + + + + + + + + + + + + /* Getting schedule() right is a bit tricky. schedule() may not make any * assumptions on the state of the current task since it may be called for a * number of reasons. The reasons include a scheduler_tick() determined that it @@ -507,8 +718,13 @@ static struct task_struct* cedf_schedule(struct task_struct * prev) next = prev; sched_state_task_picked(); + raw_spin_unlock(&cluster->cedf_lock); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + do_lit_tasklets(cluster, next); +#endif + #ifdef WANT_ALL_SCHED_EVENTS TRACE("cedf_lock released, next=0x%p\n", next); @@ -518,7 +734,6 @@ static struct task_struct* cedf_schedule(struct task_struct * prev) TRACE("becomes idle at %llu.\n", litmus_clock()); #endif - return next; } @@ -1467,6 +1682,13 @@ static long cedf_activate_plugin(void) bheap_init(&(cedf[i].cpu_heap)); edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs); + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + cedf[i].pending_tasklets.head = NULL; + cedf[i].pending_tasklets.tail = &cedf[i].pending_tasklets.head; +#endif + + if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) return -ENOMEM; } @@ -1578,7 +1800,10 @@ static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = { #ifdef CONFIG_LITMUS_SOFTIRQD .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, -#endif +#endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + .enqueue_pai_tasklet = enqueue_pai_tasklet, +#endif }; static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL; diff --git a/litmus/sched_cfifo.c b/litmus/sched_cfifo.c new file mode 100644 index 000000000000..f515446f76ed --- /dev/null +++ b/litmus/sched_cfifo.c @@ -0,0 +1,1611 @@ +/* + * litmus/sched_cfifo.c + * + * Implementation of the C-FIFO scheduling algorithm. + * + * This implementation is based on G-EDF: + * - CPUs are clustered around L2 or L3 caches. + * - Clusters topology is automatically detected (this is arch dependent + * and is working only on x86 at the moment --- and only with modern + * cpus that exports cpuid4 information) + * - The plugins _does not_ attempt to put tasks in the right cluster i.e. + * the programmer needs to be aware of the topology to place tasks + * in the desired cluster + * - default clustering is around L2 cache (cache index = 2) + * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all + * online_cpus are placed in a single cluster). 
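+ * - Unlike C-EDF, jobs within a cluster are ordered by job release time
+ *   (FIFO) via fifo_higher_prio() in litmus/fifo_common.c, not by deadline.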
+ * + * For details on functions, take a look at sched_gsn_edf.c + * + * Currently, we do not support changes in the number of online cpus. + * If the num_online_cpus() dynamically changes, the plugin is broken. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* to configure the cluster size */ +#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#ifdef CONFIG_LITMUS_SOFTIRQD +#include +#endif + +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif + +/* Reference configuration variable. Determines which cache level is used to + * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that + * all CPUs form a single cluster (just like GSN-EDF). + */ +static enum cache_level cluster_config = GLOBAL_CLUSTER; + +struct clusterdomain; + +/* cpu_entry_t - maintain the linked and scheduled state + * + * A cpu also contains a pointer to the cfifo_domain_t cluster + * that owns it (struct clusterdomain*) + */ +typedef struct { + int cpu; + struct clusterdomain* cluster; /* owning cluster */ + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + atomic_t will_schedule; /* prevent unneeded IPIs */ + struct bheap_node* hn; +} cpu_entry_t; + +/* one cpu_entry_t per CPU */ +DEFINE_PER_CPU(cpu_entry_t, cfifo_cpu_entries); + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(cfifo_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(cfifo_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(cfifo_cpu_entries, cpu).will_schedule)) + +/* + * In C-FIFO there is a cfifo domain _per_ cluster + * The number of clusters is dynamically determined accordingly to the + * total cpu number and the cluster size + */ +typedef struct clusterdomain { + /* rt_domain for this cluster */ + rt_domain_t domain; + /* cpus in this cluster */ + cpu_entry_t* *cpus; + /* map of this cluster cpus */ + cpumask_var_t cpu_map; + /* the cpus queue themselves according to priority in here */ + struct bheap_node *heap_node; + struct bheap cpu_heap; + /* lock for this cluster */ +#define cfifo_lock domain.ready_lock +} cfifo_domain_t; + +/* a cfifo_domain per cluster; allocation is done at init/activation time */ +cfifo_domain_t *cfifo; + +#define remote_cluster(cpu) ((cfifo_domain_t *) per_cpu(cfifo_cpu_entries, cpu).cluster) +#define task_cpu_cluster(task) remote_cluster(get_partition(task)) + +/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling + * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose + * information during the initialization of the plugin (e.g., topology) +#define WANT_ALL_SCHED_EVENTS + */ +#define VERBOSE_INIT + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return fifo_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold cfifo lock. 
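+ * (The heap keeps the lowest-priority CPU at the root, so lowest_prio_cpu()
+ * below is a simple bheap_peek().)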
+ */ +static void update_cpu_position(cpu_entry_t *entry) +{ + cfifo_domain_t *cluster = entry->cluster; + + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, + &cluster->cpu_heap, + entry->hn); + + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn); +} + +/* caller must hold cfifo lock */ +static cpu_entry_t* lowest_prio_cpu(cfifo_domain_t *cluster) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(cfifo_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold cfifo_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(cfifo_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + * + * in C-FIFO case is should be somewhere in the queue for + * its domain, therefore and we can get the domain using + * task_cpu_cluster + */ + remove(&(task_cpu_cluster(t))->domain, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold cfifo_lock. 
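+ * (The task is placed in its cluster's C-FIFO domain: onto the ready queue
+ * if its job has been released, otherwise onto the release queue.)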
+ */ +static noinline void requeue(struct task_struct* task) +{ + cfifo_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&cluster->domain, task); + else { + /* it has got to wait */ + add_release(&cluster->domain, task); + } +} + +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* cfifo_get_nearest_available_cpu( + cfifo_domain_t *cluster, cpu_entry_t* start) +{ + cpu_entry_t* affinity; + + get_nearest_available_cpu(affinity, start, cfifo_cpu_entries, -1); + + /* make sure CPU is in our cluster */ + if(affinity && cpu_isset(affinity->cpu, *cluster->cpu_map)) + return(affinity); + else + return(NULL); +} +#endif + + +/* check for any necessary preemptions */ +static void check_for_preemptions(cfifo_domain_t *cluster) +{ + struct task_struct *task; + cpu_entry_t *last; + + for(last = lowest_prio_cpu(cluster); + fifo_preemption_needed(&cluster->domain, last->linked); + last = lowest_prio_cpu(cluster)) { + /* preemption necessary */ + task = __take_ready(&cluster->domain); +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = + cfifo_get_nearest_available_cpu(cluster, + &per_cpu(cfifo_cpu_entries, task_cpu(task))); + if(affinity) + last = affinity; + else if(last->linked) + requeue(last->linked); + } +#else + if (last->linked) + requeue(last->linked); +#endif + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* cfifo_job_arrival: task is either resumed or released */ +static noinline void cfifo_job_arrival(struct task_struct* task) +{ + cfifo_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + + requeue(task); + check_for_preemptions(cluster); +} + +static void cfifo_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + cfifo_domain_t* cluster = container_of(rt, cfifo_domain_t, domain); + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + + __merge_ready(&cluster->domain, tasks); + check_for_preemptions(cluster); + + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); +} + +/* caller holds cfifo_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + +#ifdef CONFIG_LITMUS_NVIDIA + atomic_set(&tsk_rt(t)->nv_int_count, 0); +#endif + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + cfifo_job_arrival(t); +} + +/* cfifo_tick - this function is called for every local timer + * interrupt. 
+ * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void cfifo_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + set_will_schedule(); + TRACE("cfifo_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("cfifo_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* cfifo_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(cfifo_cpu_entries); + cfifo_domain_t *cluster = entry->cluster; + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + + raw_spin_lock(&cluster->cfifo_lock); + clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked cfifo_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. 
We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&cluster->domain), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + sched_state_task_picked(); + raw_spin_unlock(&cluster->cfifo_lock); + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("cfifo_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void cfifo_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(cfifo_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + + +/* Prepare a task for running in RT mode + */ +static void cfifo_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + cfifo_domain_t* cluster; + + TRACE("gsn edf: task new %d\n", t->pid); + + /* the cluster doesn't change even if t is running */ + cluster = task_cpu_cluster(t); + + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + + /* setup job params */ + release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(cfifo_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + cfifo_job_arrival(t); + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); +} + +static void cfifo_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + //lt_t now; + cfifo_domain_t *cluster; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(task); + + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + +#if 0 // sporadic task model + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } +#endif + + //BUG_ON(tsk_rt(task)->linked_on != NO_CPU); + set_rt_flags(task, RT_F_RUNNING); // periodic model + + if(tsk_rt(task)->linked_on == NO_CPU) + cfifo_job_arrival(task); + else + TRACE("WTF, mate?!\n"); + + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); +} + +static void cfifo_task_block(struct task_struct *t) +{ + unsigned long flags; + cfifo_domain_t *cluster; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void cfifo_task_exit(struct task_struct * t) +{ + unsigned long flags; + cfifo_domain_t *cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cpu_entry_t *cpu; + cpu = &per_cpu(cfifo_cpu_entries, tsk_rt(t)->scheduled_on); + cpu->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + +static long cfifo_admit_task(struct task_struct* tsk) +{ + return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; +} + + + + + + + + + + + + + +#ifdef CONFIG_LITMUS_LOCKING + +#include + + +static void __set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + cfifo_domain_t* cluster = task_cpu_cluster(t); + + if(prio_inh != NULL) + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + else + TRACE_TASK(t, "inherits priority from %p\n", prio_inh); + + sched_trace_eff_prio_change(t, prio_inh); + + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(cfifo_cpu_entries, linked_on).hn); + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(cfifo_cpu_entries, linked_on).hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&cluster->domain.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", __FUNCTION__); + + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = !bheap_decrease(fifo_ready_order, tsk_rt(t)->heap_node); + + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. 
*/ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", __FUNCTION__); + } + raw_spin_unlock(&cluster->domain.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. */ + bheap_uncache_min(fifo_ready_order, &cluster->domain.ready_queue); + check_for_preemptions(cluster); + } + } +} + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + cfifo_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->cfifo_lock); + + __set_priority_inheritance(t, prio_inh); + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inherits a new priority!\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, prio_inh); + } +#endif + + raw_spin_unlock(&cluster->cfifo_lock); +} + + +/* called with IRQs off */ +static void __clear_priority_inheritance(struct task_struct* t) +{ + TRACE_TASK(t, "priority restored\n"); + + if(tsk_rt(t)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(t, NULL); + + tsk_rt(t)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + cfifo_job_arrival(t); + } + else + { + __set_priority_inheritance(t, NULL); + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inheritance set back to owner.\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + if(tsk_rt(tsk_rt(t)->cur_klitirqd)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(tsk_rt(t)->cur_klitirqd, t); + + tsk_rt(tsk_rt(t)->cur_klitirqd)->inh_task = t; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(tsk_rt(t)->cur_klitirqd); + cfifo_job_arrival(tsk_rt(t)->cur_klitirqd); + } + else + { + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, t); + } + } +#endif +} + +/* called with IRQs off */ +static void clear_priority_inheritance(struct task_struct* t) +{ + cfifo_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->cfifo_lock); + __clear_priority_inheritance(t); + raw_spin_unlock(&cluster->cfifo_lock); +} + + + +#ifdef CONFIG_LITMUS_SOFTIRQD +/* called with IRQs off */ +static void set_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner, + struct task_struct* new_owner) +{ + cfifo_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->cfifo_lock); + + if(old_owner != new_owner) + { + if(old_owner) + { + // unreachable? + tsk_rt(old_owner)->cur_klitirqd = NULL; + } + + TRACE_TASK(klitirqd, "giving ownership to %s/%d.\n", + new_owner->comm, new_owner->pid); + + tsk_rt(new_owner)->cur_klitirqd = klitirqd; + } + + __set_priority_inheritance(klitirqd, + (tsk_rt(new_owner)->inh_task == NULL) ? 
+ new_owner : + tsk_rt(new_owner)->inh_task); + + raw_spin_unlock(&cluster->cfifo_lock); +} + +/* called with IRQs off */ +static void clear_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner) +{ + cfifo_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->cfifo_lock); + + TRACE_TASK(klitirqd, "priority restored\n"); + + if(tsk_rt(klitirqd)->scheduled_on != NO_CPU) + { + tsk_rt(klitirqd)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(klitirqd); + cfifo_job_arrival(klitirqd); + } + else + { + __set_priority_inheritance(klitirqd, NULL); + } + + tsk_rt(old_owner)->cur_klitirqd = NULL; + + raw_spin_unlock(&cluster->cfifo_lock); +} +#endif // CONFIG_LITMUS_SOFTIRQD + + +/* ******************** KFMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct kfmlp_queue +{ + wait_queue_head_t wait; + struct task_struct* owner; + struct task_struct* hp_waiter; + int count; /* number of waiters + holder */ +}; + +struct kfmlp_semaphore +{ + struct litmus_lock litmus_lock; + + spinlock_t lock; + + int num_resources; /* aka k */ + struct kfmlp_queue *queues; /* array */ + struct kfmlp_queue *shortest_queue; /* pointer to shortest queue */ +}; + +static inline struct kfmlp_semaphore* kfmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct kfmlp_semaphore, litmus_lock); +} + +static inline int kfmlp_get_idx(struct kfmlp_semaphore* sem, + struct kfmlp_queue* queue) +{ + return (queue - &sem->queues[0]); +} + +static inline struct kfmlp_queue* kfmlp_get_queue(struct kfmlp_semaphore* sem, + struct task_struct* holder) +{ + int i; + for(i = 0; i < sem->num_resources; ++i) + if(sem->queues[i].owner == holder) + return(&sem->queues[i]); + return(NULL); +} + +/* caller is responsible for locking */ +static struct task_struct* kfmlp_find_hp_waiter(struct kfmlp_queue *kqueue, + struct task_struct *skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &kqueue->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && fifo_higher_prio(queued, found)) + found = queued; + } + return found; +} + +static inline struct kfmlp_queue* kfmlp_find_shortest( + struct kfmlp_semaphore* sem, + struct kfmlp_queue* search_start) +{ + // we start our search at search_start instead of at the beginning of the + // queue list to load-balance across all resources. + struct kfmlp_queue* step = search_start; + struct kfmlp_queue* shortest = sem->shortest_queue; + + do + { + step = (step+1 != &sem->queues[sem->num_resources]) ? 
+ step+1 : &sem->queues[0]; + if(step->count < shortest->count) + { + shortest = step; + if(step->count == 0) + break; /* can't get any shorter */ + } + }while(step != search_start); + + return(shortest); +} + +static struct task_struct* kfmlp_remove_hp_waiter(struct kfmlp_semaphore* sem) +{ + /* must hold sem->lock */ + + struct kfmlp_queue *my_queue = NULL; + struct task_struct *max_hp = NULL; + + + struct list_head *pos; + struct task_struct *queued; + int i; + + for(i = 0; i < sem->num_resources; ++i) + { + if( (sem->queues[i].count > 1) && + ((my_queue == NULL) || + (fifo_higher_prio(sem->queues[i].hp_waiter, my_queue->hp_waiter))) ) + { + my_queue = &sem->queues[i]; + } + } + + if(my_queue) + { + cfifo_domain_t* cluster; + + max_hp = my_queue->hp_waiter; + BUG_ON(!max_hp); + + TRACE_CUR("queue %d: stealing %s/%d from queue %d\n", + kfmlp_get_idx(sem, my_queue), + max_hp->comm, max_hp->pid, + kfmlp_get_idx(sem, my_queue)); + + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, max_hp); + + /* + if(my_queue->hp_waiter) + TRACE_CUR("queue %d: new hp_waiter is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->hp_waiter->comm, + my_queue->hp_waiter->pid); + else + TRACE_CUR("queue %d: new hp_waiter is %p\n", + kfmlp_get_idx(sem, my_queue), NULL); + */ + + cluster = task_cpu_cluster(max_hp); + + raw_spin_lock(&cluster->cfifo_lock); + + /* + if(my_queue->owner) + TRACE_CUR("queue %d: owner is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->owner->comm, + my_queue->owner->pid); + else + TRACE_CUR("queue %d: owner is %p\n", + kfmlp_get_idx(sem, my_queue), + NULL); + */ + + if(tsk_rt(my_queue->owner)->inh_task == max_hp) + { + __clear_priority_inheritance(my_queue->owner); + if(my_queue->hp_waiter != NULL) + { + __set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + raw_spin_unlock(&cluster->cfifo_lock); + + list_for_each(pos, &my_queue->wait.task_list) + { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + /* Compare task prios, find high prio task. */ + if (queued == max_hp) + { + /* + TRACE_CUR("queue %d: found entry in wait queue. REMOVING!\n", + kfmlp_get_idx(sem, my_queue)); + */ + __remove_wait_queue(&my_queue->wait, + list_entry(pos, wait_queue_t, task_list)); + break; + } + } + --(my_queue->count); + } + + return(max_hp); +} + +int cfifo_kfmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue* my_queue; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = sem->shortest_queue; + + if (my_queue->owner) { + /* resource is not free => must suspend and wait */ + TRACE_CUR("queue %d: Resource is not free => must suspend and wait.\n", + kfmlp_get_idx(sem, my_queue)); + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&my_queue->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (fifo_higher_prio(t, my_queue->hp_waiter)) + { + my_queue->hp_waiter = t; + if (fifo_higher_prio(t, my_queue->owner)) + { + set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->lock, flags); + + /* We depend on the FIFO order. 
Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release (or steal). + */ + schedule(); + + + if(my_queue->owner == t) + { + TRACE_CUR("queue %d: acquired through waiting\n", + kfmlp_get_idx(sem, my_queue)); + } + else + { + /* this case may happen if our wait entry was stolen + between queues. record where we went.*/ + my_queue = kfmlp_get_queue(sem, t); + BUG_ON(!my_queue); + TRACE_CUR("queue %d: acquired through stealing\n", + kfmlp_get_idx(sem, my_queue)); + } + } + else + { + TRACE_CUR("queue %d: acquired immediately\n", + kfmlp_get_idx(sem, my_queue)); + + my_queue->owner = t; + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + spin_unlock_irqrestore(&sem->lock, flags); + } + + return kfmlp_get_idx(sem, my_queue); +} + +int cfifo_kfmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + + if (!my_queue) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&my_queue->wait); + if (next) { + /* + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + /* next becomes the resouce holder */ + my_queue->owner = next; + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + + TRACE_CUR("queue %d: lock ownership passed to %s/%d\n", + kfmlp_get_idx(sem, my_queue), next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == my_queue->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, next); + if (my_queue->hp_waiter) + TRACE_TASK(my_queue->hp_waiter, "queue %d: is new highest-prio waiter\n", kfmlp_get_idx(sem, my_queue)); + else + TRACE("queue %d: no further waiters\n", kfmlp_get_idx(sem, my_queue)); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. 
*/ + set_priority_inheritance(next, my_queue->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: looking to steal someone...\n", kfmlp_get_idx(sem, my_queue)); + + next = kfmlp_remove_hp_waiter(sem); /* returns NULL if nothing to steal */ + + /* + if(next) + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - steal\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + + my_queue->owner = next; + + if(next) + { + TRACE_CUR("queue %d: lock ownership passed to %s/%d (which was stolen)\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: no one to steal.\n", kfmlp_get_idx(sem, my_queue)); + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + } + } + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task) + clear_priority_inheritance(t); + +out: + spin_unlock_irqrestore(&sem->lock, flags); + + return err; +} + +int cfifo_kfmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + owner = (my_queue) ? (my_queue->owner == t) : 0; + + spin_unlock_irqrestore(&sem->lock, flags); + + if (owner) + cfifo_kfmlp_unlock(l); + + return 0; +} + +void cfifo_kfmlp_free(struct litmus_lock* l) +{ + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + kfree(sem->queues); + kfree(sem); +} + +static struct litmus_lock_ops cfifo_kfmlp_lock_ops = { + .close = cfifo_kfmlp_close, + .lock = cfifo_kfmlp_lock, + .unlock = cfifo_kfmlp_unlock, + .deallocate = cfifo_kfmlp_free, +}; + +static struct litmus_lock* cfifo_new_kfmlp(void* __user arg, int* ret_code) +{ + struct kfmlp_semaphore* sem; + int num_resources = 0; + int i; + + if(!access_ok(VERIFY_READ, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(__copy_from_user(&num_resources, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(num_resources < 1) + { + *ret_code = -EINVAL; + return(NULL); + } + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if(!sem) + { + *ret_code = -ENOMEM; + return NULL; + } + + sem->queues = kmalloc(sizeof(struct kfmlp_queue)*num_resources, GFP_KERNEL); + if(!sem->queues) + { + kfree(sem); + *ret_code = -ENOMEM; + return NULL; + } + + sem->litmus_lock.ops = &cfifo_kfmlp_lock_ops; + spin_lock_init(&sem->lock); + sem->num_resources = num_resources; + + for(i = 0; i < num_resources; ++i) + { + sem->queues[i].owner = NULL; + sem->queues[i].hp_waiter = NULL; + init_waitqueue_head(&sem->queues[i].wait); + sem->queues[i].count = 0; + } + + sem->shortest_queue = &sem->queues[0]; + + *ret_code = 0; + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + +static long cfifo_allocate_lock(struct litmus_lock **lock, int type, + void* __user arg) +{ + int err = -ENXIO; + + /* C-FIFO currently only supports the FMLP for global resources + WITHIN a given cluster. DO NOT USE CROSS-CLUSTER! 
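+
+	   A minimal usage sketch (hypothetical caller, not a call site added by
+	   this patch; assumes dispatch through the generic litmus_lock ops
+	   installed by cfifo_new_kfmlp() above):
+
+		struct litmus_lock *l;			// obtained from cfifo_new_kfmlp()
+		int replica = l->ops->lock(l);		// blocks; returns the index of the
+							// acquired replica, or -EPERM if the
+							// caller is not a real-time task
+		// ... access resource instance 'replica' ...
+		l->ops->unlock(l);			// passes ownership to the next FIFO
+							// waiter or steals one from elsewhere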
*/ + switch (type) { + case KFMLP_SEM: + *lock = cfifo_new_kfmlp(arg, &err); + break; + }; + + return err; +} + +#endif // CONFIG_LITMUS_LOCKING + + + + + + +/* total number of cluster */ +static int num_clusters; +/* we do not support cluster of different sizes */ +static unsigned int cluster_size; + +#ifdef VERBOSE_INIT +static void print_cluster_topology(cpumask_var_t mask, int cpu) +{ + int chk; + char buf[255]; + + chk = cpulist_scnprintf(buf, 254, mask); + buf[chk] = '\0'; + printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf); + +} +#endif + +static int clusters_allocated = 0; + +static void cleanup_cfifo(void) +{ + int i; + + if (clusters_allocated) { + for (i = 0; i < num_clusters; i++) { + kfree(cfifo[i].cpus); + kfree(cfifo[i].heap_node); + free_cpumask_var(cfifo[i].cpu_map); + } + + kfree(cfifo); + } +} + +static long cfifo_activate_plugin(void) +{ + int i, j, cpu, ccpu, cpu_count; + cpu_entry_t *entry; + + cpumask_var_t mask; + int chk = 0; + + /* de-allocate old clusters, if any */ + cleanup_cfifo(); + + printk(KERN_INFO "C-FIFO: Activate Plugin, cluster configuration = %d\n", + cluster_config); + + /* need to get cluster_size first */ + if(!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + + if (unlikely(cluster_config == GLOBAL_CLUSTER)) { + cluster_size = num_online_cpus(); + } else { + chk = get_shared_cpu_map(mask, 0, cluster_config); + if (chk) { + /* if chk != 0 then it is the max allowed index */ + printk(KERN_INFO "C-FIFO: Cluster configuration = %d " + "is not supported on this hardware.\n", + cluster_config); + /* User should notice that the configuration failed, so + * let's bail out. */ + return -EINVAL; + } + + cluster_size = cpumask_weight(mask); + } + + if ((num_online_cpus() % cluster_size) != 0) { + /* this can't be right, some cpus are left out */ + printk(KERN_ERR "C-FIFO: Trying to group %d cpus in %d!\n", + num_online_cpus(), cluster_size); + return -1; + } + + num_clusters = num_online_cpus() / cluster_size; + printk(KERN_INFO "C-FIFO: %d cluster(s) of size = %d\n", + num_clusters, cluster_size); + + /* initialize clusters */ + cfifo = kmalloc(num_clusters * sizeof(cfifo_domain_t), GFP_ATOMIC); + for (i = 0; i < num_clusters; i++) { + + cfifo[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t), + GFP_ATOMIC); + cfifo[i].heap_node = kmalloc( + cluster_size * sizeof(struct bheap_node), + GFP_ATOMIC); + bheap_init(&(cfifo[i].cpu_heap)); + fifo_domain_init(&(cfifo[i].domain), NULL, cfifo_release_jobs); + + if(!zalloc_cpumask_var(&cfifo[i].cpu_map, GFP_ATOMIC)) + return -ENOMEM; + } + + /* cycle through cluster and add cpus to them */ + for (i = 0; i < num_clusters; i++) { + + for_each_online_cpu(cpu) { + /* check if the cpu is already in a cluster */ + for (j = 0; j < num_clusters; j++) + if (cpumask_test_cpu(cpu, cfifo[j].cpu_map)) + break; + /* if it is in a cluster go to next cpu */ + if (j < num_clusters && + cpumask_test_cpu(cpu, cfifo[j].cpu_map)) + continue; + + /* this cpu isn't in any cluster */ + /* get the shared cpus */ + if (unlikely(cluster_config == GLOBAL_CLUSTER)) + cpumask_copy(mask, cpu_online_mask); + else + get_shared_cpu_map(mask, cpu, cluster_config); + + cpumask_copy(cfifo[i].cpu_map, mask); +#ifdef VERBOSE_INIT + print_cluster_topology(mask, cpu); +#endif + /* add cpus to current cluster and init cpu_entry_t */ + cpu_count = 0; + for_each_cpu(ccpu, cfifo[i].cpu_map) { + + entry = &per_cpu(cfifo_cpu_entries, ccpu); + cfifo[i].cpus[cpu_count] = entry; + atomic_set(&entry->will_schedule, 0); + entry->cpu = ccpu; + 
entry->cluster = &cfifo[i]; + entry->hn = &(cfifo[i].heap_node[cpu_count]); + bheap_node_init(&entry->hn, entry); + + cpu_count++; + + entry->linked = NULL; + entry->scheduled = NULL; + update_cpu_position(entry); + } + /* done with this cluster */ + break; + } + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + { + /* distribute the daemons evenly across the clusters. */ + int* affinity = kmalloc(NR_LITMUS_SOFTIRQD * sizeof(int), GFP_ATOMIC); + int num_daemons_per_cluster = NR_LITMUS_SOFTIRQD / num_clusters; + int left_over = NR_LITMUS_SOFTIRQD % num_clusters; + + int daemon = 0; + for(i = 0; i < num_clusters; ++i) + { + int num_on_this_cluster = num_daemons_per_cluster; + if(left_over) + { + ++num_on_this_cluster; + --left_over; + } + + for(j = 0; j < num_on_this_cluster; ++j) + { + // first CPU of this cluster + affinity[daemon++] = i*cluster_size; + } + } + + spawn_klitirqd(affinity); + + kfree(affinity); + } +#endif + +#ifdef CONFIG_LITMUS_NVIDIA + init_nvidia_info(); +#endif + + free_cpumask_var(mask); + clusters_allocated = 1; + return 0; +} + +/* Plugin object */ +static struct sched_plugin cfifo_plugin __cacheline_aligned_in_smp = { + .plugin_name = "C-FIFO", + .finish_switch = cfifo_finish_switch, + .tick = cfifo_tick, + .task_new = cfifo_task_new, + .complete_job = complete_job, + .task_exit = cfifo_task_exit, + .schedule = cfifo_schedule, + .task_wake_up = cfifo_task_wake_up, + .task_block = cfifo_task_block, + .admit_task = cfifo_admit_task, + .activate_plugin = cfifo_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = cfifo_allocate_lock, + .set_prio_inh = set_priority_inheritance, + .clear_prio_inh = clear_priority_inheritance, +#endif +#ifdef CONFIG_LITMUS_SOFTIRQD + .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, + .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, +#endif +}; + +static struct proc_dir_entry *cluster_file = NULL, *cfifo_dir = NULL; + +static int __init init_cfifo(void) +{ + int err, fs; + + err = register_sched_plugin(&cfifo_plugin); + if (!err) { + fs = make_plugin_proc_dir(&cfifo_plugin, &cfifo_dir); + if (!fs) + cluster_file = create_cluster_file(cfifo_dir, &cluster_config); + else + printk(KERN_ERR "Could not allocate C-FIFO procfs dir.\n"); + } + return err; +} + +static void clean_cfifo(void) +{ + cleanup_cfifo(); + if (cluster_file) + remove_proc_entry("cluster", cfifo_dir); + if (cfifo_dir) + remove_plugin_proc_dir(&cfifo_plugin); +} + +module_init(init_cfifo); +module_exit(clean_cfifo); diff --git a/litmus/sched_crm.c b/litmus/sched_crm.c new file mode 100644 index 000000000000..061b29eaff7e --- /dev/null +++ b/litmus/sched_crm.c @@ -0,0 +1,1611 @@ +/* + * litmus/sched_crm.c + * + * Implementation of the C-RM scheduling algorithm. + * + * This implementation is based on G-EDF: + * - CPUs are clustered around L2 or L3 caches. + * - Clusters topology is automatically detected (this is arch dependent + * and is working only on x86 at the moment --- and only with modern + * cpus that exports cpuid4 information) + * - The plugins _does not_ attempt to put tasks in the right cluster i.e. + * the programmer needs to be aware of the topology to place tasks + * in the desired cluster + * - default clustering is around L2 cache (cache index = 2) + * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all + * online_cpus are placed in a single cluster). + * + * For details on functions, take a look at sched_gsn_edf.c + * + * Currently, we do not support changes in the number of online cpus. 
+ * If the num_online_cpus() dynamically changes, the plugin is broken. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* to configure the cluster size */ +#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#ifdef CONFIG_LITMUS_SOFTIRQD +#include +#endif + +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif + +/* Reference configuration variable. Determines which cache level is used to + * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that + * all CPUs form a single cluster (just like GSN-EDF). + */ +static enum cache_level cluster_config = GLOBAL_CLUSTER; + +struct clusterdomain; + +/* cpu_entry_t - maintain the linked and scheduled state + * + * A cpu also contains a pointer to the crm_domain_t cluster + * that owns it (struct clusterdomain*) + */ +typedef struct { + int cpu; + struct clusterdomain* cluster; /* owning cluster */ + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + atomic_t will_schedule; /* prevent unneeded IPIs */ + struct bheap_node* hn; +} cpu_entry_t; + +/* one cpu_entry_t per CPU */ +DEFINE_PER_CPU(cpu_entry_t, crm_cpu_entries); + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(crm_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(crm_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(crm_cpu_entries, cpu).will_schedule)) + +/* + * In C-RM there is a crm domain _per_ cluster + * The number of clusters is dynamically determined accordingly to the + * total cpu number and the cluster size + */ +typedef struct clusterdomain { + /* rt_domain for this cluster */ + rt_domain_t domain; + /* cpus in this cluster */ + cpu_entry_t* *cpus; + /* map of this cluster cpus */ + cpumask_var_t cpu_map; + /* the cpus queue themselves according to priority in here */ + struct bheap_node *heap_node; + struct bheap cpu_heap; + /* lock for this cluster */ +#define crm_lock domain.ready_lock +} crm_domain_t; + +/* a crm_domain per cluster; allocation is done at init/activation time */ +crm_domain_t *crm; + +#define remote_cluster(cpu) ((crm_domain_t *) per_cpu(crm_cpu_entries, cpu).cluster) +#define task_cpu_cluster(task) remote_cluster(get_partition(task)) + +/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling + * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose + * information during the initialization of the plugin (e.g., topology) +#define WANT_ALL_SCHED_EVENTS + */ +#define VERBOSE_INIT + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return rm_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold crm lock. 
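+ *
+ * Illustrative note: cpu_lower_prio() above inverts its arguments, so this
+ * heap keeps the CPU whose linked task has the lowest priority at the top.
+ * That lets preemption checks peek in O(1), as check_for_preemptions()
+ * below does:
+ *
+ *	cpu_entry_t *victim = lowest_prio_cpu(cluster);
+ *	if (rm_preemption_needed(&cluster->domain, victim->linked))
+ *		... link the next ready task to 'victim' and preempt it ...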
+ */ +static void update_cpu_position(cpu_entry_t *entry) +{ + crm_domain_t *cluster = entry->cluster; + + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, + &cluster->cpu_heap, + entry->hn); + + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn); +} + +/* caller must hold crm lock */ +static cpu_entry_t* lowest_prio_cpu(crm_domain_t *cluster) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(crm_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold crm_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(crm_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + * + * in C-RM case is should be somewhere in the queue for + * its domain, therefore and we can get the domain using + * task_cpu_cluster + */ + remove(&(task_cpu_cluster(t))->domain, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold crm_lock. 
+ */ +static noinline void requeue(struct task_struct* task) +{ + crm_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&cluster->domain, task); + else { + /* it has got to wait */ + add_release(&cluster->domain, task); + } +} + +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* crm_get_nearest_available_cpu( + crm_domain_t *cluster, cpu_entry_t* start) +{ + cpu_entry_t* affinity; + + get_nearest_available_cpu(affinity, start, crm_cpu_entries, -1); + + /* make sure CPU is in our cluster */ + if(affinity && cpu_isset(affinity->cpu, *cluster->cpu_map)) + return(affinity); + else + return(NULL); +} +#endif + + +/* check for any necessary preemptions */ +static void check_for_preemptions(crm_domain_t *cluster) +{ + struct task_struct *task; + cpu_entry_t *last; + + for(last = lowest_prio_cpu(cluster); + rm_preemption_needed(&cluster->domain, last->linked); + last = lowest_prio_cpu(cluster)) { + /* preemption necessary */ + task = __take_ready(&cluster->domain); +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = + crm_get_nearest_available_cpu(cluster, + &per_cpu(crm_cpu_entries, task_cpu(task))); + if(affinity) + last = affinity; + else if(last->linked) + requeue(last->linked); + } +#else + if (last->linked) + requeue(last->linked); +#endif + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* crm_job_arrival: task is either resumed or released */ +static noinline void crm_job_arrival(struct task_struct* task) +{ + crm_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + + requeue(task); + check_for_preemptions(cluster); +} + +static void crm_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + crm_domain_t* cluster = container_of(rt, crm_domain_t, domain); + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + + __merge_ready(&cluster->domain, tasks); + check_for_preemptions(cluster); + + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); +} + +/* caller holds crm_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + +#ifdef CONFIG_LITMUS_NVIDIA + atomic_set(&tsk_rt(t)->nv_int_count, 0); +#endif + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + crm_job_arrival(t); +} + +/* crm_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void crm_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + set_will_schedule(); + TRACE("crm_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("crm_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. 
schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* crm_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(crm_cpu_entries); + crm_domain_t *cluster = entry->cluster; + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + + raw_spin_lock(&cluster->crm_lock); + clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked crm_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&cluster->domain), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? 
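+		 *
+		 * Branch summary (descriptive): if entry->linked is non-NULL it
+		 * becomes 'next' and records this CPU in scheduled_on; if it is
+		 * NULL, 'next' stays NULL and the CPU falls back to background
+		 * (non-real-time) work. Either way, a previously scheduled task
+		 * that is no longer linked here has its scheduled_on reset to
+		 * NO_CPU just below.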
*/ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + sched_state_task_picked(); + raw_spin_unlock(&cluster->crm_lock); + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("crm_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void crm_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(crm_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + + +/* Prepare a task for running in RT mode + */ +static void crm_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + crm_domain_t* cluster; + + TRACE("gsn edf: task new %d\n", t->pid); + + /* the cluster doesn't change even if t is running */ + cluster = task_cpu_cluster(t); + + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + + /* setup job params */ + release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(crm_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + crm_job_arrival(t); + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); +} + +static void crm_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + //lt_t now; + crm_domain_t *cluster; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(task); + + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + +#if 0 // sporadic task model + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
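+	 *
+	 * (Note: this sporadic-release path is disabled by the "#if 0" above;
+	 *  the active code below handles every wake-up under the periodic
+	 *  model and simply sets RT_F_RUNNING again.)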
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } +#endif + + //BUG_ON(tsk_rt(task)->linked_on != NO_CPU); + set_rt_flags(task, RT_F_RUNNING); // periodic model + + if(tsk_rt(task)->linked_on == NO_CPU) + crm_job_arrival(task); + else + TRACE("WTF, mate?!\n"); + + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); +} + +static void crm_task_block(struct task_struct *t) +{ + unsigned long flags; + crm_domain_t *cluster; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void crm_task_exit(struct task_struct * t) +{ + unsigned long flags; + crm_domain_t *cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cpu_entry_t *cpu; + cpu = &per_cpu(crm_cpu_entries, tsk_rt(t)->scheduled_on); + cpu->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + +static long crm_admit_task(struct task_struct* tsk) +{ + return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; +} + + + + + + + + + + + + + +#ifdef CONFIG_LITMUS_LOCKING + +#include + + +static void __set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + crm_domain_t* cluster = task_cpu_cluster(t); + + if(prio_inh != NULL) + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + else + TRACE_TASK(t, "inherits priority from %p\n", prio_inh); + + sched_trace_eff_prio_change(t, prio_inh); + + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(crm_cpu_entries, linked_on).hn); + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(crm_cpu_entries, linked_on).hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&cluster->domain.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", __FUNCTION__); + + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = !bheap_decrease(rm_ready_order, tsk_rt(t)->heap_node); + + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. 
*/ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", __FUNCTION__); + } + raw_spin_unlock(&cluster->domain.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. */ + bheap_uncache_min(rm_ready_order, &cluster->domain.ready_queue); + check_for_preemptions(cluster); + } + } +} + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + crm_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->crm_lock); + + __set_priority_inheritance(t, prio_inh); + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inherits a new priority!\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, prio_inh); + } +#endif + + raw_spin_unlock(&cluster->crm_lock); +} + + +/* called with IRQs off */ +static void __clear_priority_inheritance(struct task_struct* t) +{ + TRACE_TASK(t, "priority restored\n"); + + if(tsk_rt(t)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(t, NULL); + + tsk_rt(t)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + crm_job_arrival(t); + } + else + { + __set_priority_inheritance(t, NULL); + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inheritance set back to owner.\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + if(tsk_rt(tsk_rt(t)->cur_klitirqd)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(tsk_rt(t)->cur_klitirqd, t); + + tsk_rt(tsk_rt(t)->cur_klitirqd)->inh_task = t; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(tsk_rt(t)->cur_klitirqd); + crm_job_arrival(tsk_rt(t)->cur_klitirqd); + } + else + { + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, t); + } + } +#endif +} + +/* called with IRQs off */ +static void clear_priority_inheritance(struct task_struct* t) +{ + crm_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->crm_lock); + __clear_priority_inheritance(t); + raw_spin_unlock(&cluster->crm_lock); +} + + + +#ifdef CONFIG_LITMUS_SOFTIRQD +/* called with IRQs off */ +static void set_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner, + struct task_struct* new_owner) +{ + crm_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->crm_lock); + + if(old_owner != new_owner) + { + if(old_owner) + { + // unreachable? + tsk_rt(old_owner)->cur_klitirqd = NULL; + } + + TRACE_TASK(klitirqd, "giving ownership to %s/%d.\n", + new_owner->comm, new_owner->pid); + + tsk_rt(new_owner)->cur_klitirqd = klitirqd; + } + + __set_priority_inheritance(klitirqd, + (tsk_rt(new_owner)->inh_task == NULL) ? 
+ new_owner : + tsk_rt(new_owner)->inh_task); + + raw_spin_unlock(&cluster->crm_lock); +} + +/* called with IRQs off */ +static void clear_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner) +{ + crm_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->crm_lock); + + TRACE_TASK(klitirqd, "priority restored\n"); + + if(tsk_rt(klitirqd)->scheduled_on != NO_CPU) + { + tsk_rt(klitirqd)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(klitirqd); + crm_job_arrival(klitirqd); + } + else + { + __set_priority_inheritance(klitirqd, NULL); + } + + tsk_rt(old_owner)->cur_klitirqd = NULL; + + raw_spin_unlock(&cluster->crm_lock); +} +#endif // CONFIG_LITMUS_SOFTIRQD + + +/* ******************** KFMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct kfmlp_queue +{ + wait_queue_head_t wait; + struct task_struct* owner; + struct task_struct* hp_waiter; + int count; /* number of waiters + holder */ +}; + +struct kfmlp_semaphore +{ + struct litmus_lock litmus_lock; + + spinlock_t lock; + + int num_resources; /* aka k */ + struct kfmlp_queue *queues; /* array */ + struct kfmlp_queue *shortest_queue; /* pointer to shortest queue */ +}; + +static inline struct kfmlp_semaphore* kfmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct kfmlp_semaphore, litmus_lock); +} + +static inline int kfmlp_get_idx(struct kfmlp_semaphore* sem, + struct kfmlp_queue* queue) +{ + return (queue - &sem->queues[0]); +} + +static inline struct kfmlp_queue* kfmlp_get_queue(struct kfmlp_semaphore* sem, + struct task_struct* holder) +{ + int i; + for(i = 0; i < sem->num_resources; ++i) + if(sem->queues[i].owner == holder) + return(&sem->queues[i]); + return(NULL); +} + +/* caller is responsible for locking */ +static struct task_struct* kfmlp_find_hp_waiter(struct kfmlp_queue *kqueue, + struct task_struct *skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &kqueue->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && rm_higher_prio(queued, found)) + found = queued; + } + return found; +} + +static inline struct kfmlp_queue* kfmlp_find_shortest( + struct kfmlp_semaphore* sem, + struct kfmlp_queue* search_start) +{ + // we start our search at search_start instead of at the beginning of the + // queue list to load-balance across all resources. + struct kfmlp_queue* step = search_start; + struct kfmlp_queue* shortest = sem->shortest_queue; + + do + { + step = (step+1 != &sem->queues[sem->num_resources]) ? 
+ step+1 : &sem->queues[0]; + if(step->count < shortest->count) + { + shortest = step; + if(step->count == 0) + break; /* can't get any shorter */ + } + }while(step != search_start); + + return(shortest); +} + +static struct task_struct* kfmlp_remove_hp_waiter(struct kfmlp_semaphore* sem) +{ + /* must hold sem->lock */ + + struct kfmlp_queue *my_queue = NULL; + struct task_struct *max_hp = NULL; + + + struct list_head *pos; + struct task_struct *queued; + int i; + + for(i = 0; i < sem->num_resources; ++i) + { + if( (sem->queues[i].count > 1) && + ((my_queue == NULL) || + (rm_higher_prio(sem->queues[i].hp_waiter, my_queue->hp_waiter))) ) + { + my_queue = &sem->queues[i]; + } + } + + if(my_queue) + { + crm_domain_t* cluster; + + max_hp = my_queue->hp_waiter; + BUG_ON(!max_hp); + + TRACE_CUR("queue %d: stealing %s/%d from queue %d\n", + kfmlp_get_idx(sem, my_queue), + max_hp->comm, max_hp->pid, + kfmlp_get_idx(sem, my_queue)); + + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, max_hp); + + /* + if(my_queue->hp_waiter) + TRACE_CUR("queue %d: new hp_waiter is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->hp_waiter->comm, + my_queue->hp_waiter->pid); + else + TRACE_CUR("queue %d: new hp_waiter is %p\n", + kfmlp_get_idx(sem, my_queue), NULL); + */ + + cluster = task_cpu_cluster(max_hp); + + raw_spin_lock(&cluster->crm_lock); + + /* + if(my_queue->owner) + TRACE_CUR("queue %d: owner is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->owner->comm, + my_queue->owner->pid); + else + TRACE_CUR("queue %d: owner is %p\n", + kfmlp_get_idx(sem, my_queue), + NULL); + */ + + if(tsk_rt(my_queue->owner)->inh_task == max_hp) + { + __clear_priority_inheritance(my_queue->owner); + if(my_queue->hp_waiter != NULL) + { + __set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + raw_spin_unlock(&cluster->crm_lock); + + list_for_each(pos, &my_queue->wait.task_list) + { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + /* Compare task prios, find high prio task. */ + if (queued == max_hp) + { + /* + TRACE_CUR("queue %d: found entry in wait queue. REMOVING!\n", + kfmlp_get_idx(sem, my_queue)); + */ + __remove_wait_queue(&my_queue->wait, + list_entry(pos, wait_queue_t, task_list)); + break; + } + } + --(my_queue->count); + } + + return(max_hp); +} + +int crm_kfmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue* my_queue; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = sem->shortest_queue; + + if (my_queue->owner) { + /* resource is not free => must suspend and wait */ + TRACE_CUR("queue %d: Resource is not free => must suspend and wait.\n", + kfmlp_get_idx(sem, my_queue)); + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&my_queue->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (rm_higher_prio(t, my_queue->hp_waiter)) + { + my_queue->hp_waiter = t; + if (rm_higher_prio(t, my_queue->owner)) + { + set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->lock, flags); + + /* We depend on the FIFO order. 
Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release (or steal). + */ + schedule(); + + + if(my_queue->owner == t) + { + TRACE_CUR("queue %d: acquired through waiting\n", + kfmlp_get_idx(sem, my_queue)); + } + else + { + /* this case may happen if our wait entry was stolen + between queues. record where we went.*/ + my_queue = kfmlp_get_queue(sem, t); + BUG_ON(!my_queue); + TRACE_CUR("queue %d: acquired through stealing\n", + kfmlp_get_idx(sem, my_queue)); + } + } + else + { + TRACE_CUR("queue %d: acquired immediately\n", + kfmlp_get_idx(sem, my_queue)); + + my_queue->owner = t; + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + spin_unlock_irqrestore(&sem->lock, flags); + } + + return kfmlp_get_idx(sem, my_queue); +} + +int crm_kfmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + + if (!my_queue) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&my_queue->wait); + if (next) { + /* + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + /* next becomes the resouce holder */ + my_queue->owner = next; + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + + TRACE_CUR("queue %d: lock ownership passed to %s/%d\n", + kfmlp_get_idx(sem, my_queue), next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == my_queue->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, next); + if (my_queue->hp_waiter) + TRACE_TASK(my_queue->hp_waiter, "queue %d: is new highest-prio waiter\n", kfmlp_get_idx(sem, my_queue)); + else + TRACE("queue %d: no further waiters\n", kfmlp_get_idx(sem, my_queue)); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. 
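+			 *
+			 * Worked example (illustrative; under RM, a shorter
+			 * period means higher priority): suppose 'next'
+			 * (period 50ms) enqueued first and a later waiter W
+			 * (period 10ms) is now hp_waiter. FIFO order hands
+			 * the replica to 'next', but the call below sets
+			 * tsk_rt(next)->inh_task = W, so 'next' is scheduled
+			 * with W's priority until it unlocks and
+			 * clear_priority_inheritance() restores its own.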
*/ + set_priority_inheritance(next, my_queue->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: looking to steal someone...\n", kfmlp_get_idx(sem, my_queue)); + + next = kfmlp_remove_hp_waiter(sem); /* returns NULL if nothing to steal */ + + /* + if(next) + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - steal\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + + my_queue->owner = next; + + if(next) + { + TRACE_CUR("queue %d: lock ownership passed to %s/%d (which was stolen)\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: no one to steal.\n", kfmlp_get_idx(sem, my_queue)); + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + } + } + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task) + clear_priority_inheritance(t); + +out: + spin_unlock_irqrestore(&sem->lock, flags); + + return err; +} + +int crm_kfmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + owner = (my_queue) ? (my_queue->owner == t) : 0; + + spin_unlock_irqrestore(&sem->lock, flags); + + if (owner) + crm_kfmlp_unlock(l); + + return 0; +} + +void crm_kfmlp_free(struct litmus_lock* l) +{ + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + kfree(sem->queues); + kfree(sem); +} + +static struct litmus_lock_ops crm_kfmlp_lock_ops = { + .close = crm_kfmlp_close, + .lock = crm_kfmlp_lock, + .unlock = crm_kfmlp_unlock, + .deallocate = crm_kfmlp_free, +}; + +static struct litmus_lock* crm_new_kfmlp(void* __user arg, int* ret_code) +{ + struct kfmlp_semaphore* sem; + int num_resources = 0; + int i; + + if(!access_ok(VERIFY_READ, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(__copy_from_user(&num_resources, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(num_resources < 1) + { + *ret_code = -EINVAL; + return(NULL); + } + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if(!sem) + { + *ret_code = -ENOMEM; + return NULL; + } + + sem->queues = kmalloc(sizeof(struct kfmlp_queue)*num_resources, GFP_KERNEL); + if(!sem->queues) + { + kfree(sem); + *ret_code = -ENOMEM; + return NULL; + } + + sem->litmus_lock.ops = &crm_kfmlp_lock_ops; + spin_lock_init(&sem->lock); + sem->num_resources = num_resources; + + for(i = 0; i < num_resources; ++i) + { + sem->queues[i].owner = NULL; + sem->queues[i].hp_waiter = NULL; + init_waitqueue_head(&sem->queues[i].wait); + sem->queues[i].count = 0; + } + + sem->shortest_queue = &sem->queues[0]; + + *ret_code = 0; + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + +static long crm_allocate_lock(struct litmus_lock **lock, int type, + void* __user arg) +{ + int err = -ENXIO; + + /* C-RM currently only supports the FMLP for global resources + WITHIN a given cluster. DO NOT USE CROSS-CLUSTER! 
*/ + switch (type) { + case KFMLP_SEM: + *lock = crm_new_kfmlp(arg, &err); + break; + }; + + return err; +} + +#endif // CONFIG_LITMUS_LOCKING + + + + + + +/* total number of cluster */ +static int num_clusters; +/* we do not support cluster of different sizes */ +static unsigned int cluster_size; + +#ifdef VERBOSE_INIT +static void print_cluster_topology(cpumask_var_t mask, int cpu) +{ + int chk; + char buf[255]; + + chk = cpulist_scnprintf(buf, 254, mask); + buf[chk] = '\0'; + printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf); + +} +#endif + +static int clusters_allocated = 0; + +static void cleanup_crm(void) +{ + int i; + + if (clusters_allocated) { + for (i = 0; i < num_clusters; i++) { + kfree(crm[i].cpus); + kfree(crm[i].heap_node); + free_cpumask_var(crm[i].cpu_map); + } + + kfree(crm); + } +} + +static long crm_activate_plugin(void) +{ + int i, j, cpu, ccpu, cpu_count; + cpu_entry_t *entry; + + cpumask_var_t mask; + int chk = 0; + + /* de-allocate old clusters, if any */ + cleanup_crm(); + + printk(KERN_INFO "C-RM: Activate Plugin, cluster configuration = %d\n", + cluster_config); + + /* need to get cluster_size first */ + if(!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + + if (unlikely(cluster_config == GLOBAL_CLUSTER)) { + cluster_size = num_online_cpus(); + } else { + chk = get_shared_cpu_map(mask, 0, cluster_config); + if (chk) { + /* if chk != 0 then it is the max allowed index */ + printk(KERN_INFO "C-RM: Cluster configuration = %d " + "is not supported on this hardware.\n", + cluster_config); + /* User should notice that the configuration failed, so + * let's bail out. */ + return -EINVAL; + } + + cluster_size = cpumask_weight(mask); + } + + if ((num_online_cpus() % cluster_size) != 0) { + /* this can't be right, some cpus are left out */ + printk(KERN_ERR "C-RM: Trying to group %d cpus in %d!\n", + num_online_cpus(), cluster_size); + return -1; + } + + num_clusters = num_online_cpus() / cluster_size; + printk(KERN_INFO "C-RM: %d cluster(s) of size = %d\n", + num_clusters, cluster_size); + + /* initialize clusters */ + crm = kmalloc(num_clusters * sizeof(crm_domain_t), GFP_ATOMIC); + for (i = 0; i < num_clusters; i++) { + + crm[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t), + GFP_ATOMIC); + crm[i].heap_node = kmalloc( + cluster_size * sizeof(struct bheap_node), + GFP_ATOMIC); + bheap_init(&(crm[i].cpu_heap)); + rm_domain_init(&(crm[i].domain), NULL, crm_release_jobs); + + if(!zalloc_cpumask_var(&crm[i].cpu_map, GFP_ATOMIC)) + return -ENOMEM; + } + + /* cycle through cluster and add cpus to them */ + for (i = 0; i < num_clusters; i++) { + + for_each_online_cpu(cpu) { + /* check if the cpu is already in a cluster */ + for (j = 0; j < num_clusters; j++) + if (cpumask_test_cpu(cpu, crm[j].cpu_map)) + break; + /* if it is in a cluster go to next cpu */ + if (j < num_clusters && + cpumask_test_cpu(cpu, crm[j].cpu_map)) + continue; + + /* this cpu isn't in any cluster */ + /* get the shared cpus */ + if (unlikely(cluster_config == GLOBAL_CLUSTER)) + cpumask_copy(mask, cpu_online_mask); + else + get_shared_cpu_map(mask, cpu, cluster_config); + + cpumask_copy(crm[i].cpu_map, mask); +#ifdef VERBOSE_INIT + print_cluster_topology(mask, cpu); +#endif + /* add cpus to current cluster and init cpu_entry_t */ + cpu_count = 0; + for_each_cpu(ccpu, crm[i].cpu_map) { + + entry = &per_cpu(crm_cpu_entries, ccpu); + crm[i].cpus[cpu_count] = entry; + atomic_set(&entry->will_schedule, 0); + entry->cpu = ccpu; + entry->cluster = &crm[i]; + entry->hn = 
&(crm[i].heap_node[cpu_count]); + bheap_node_init(&entry->hn, entry); + + cpu_count++; + + entry->linked = NULL; + entry->scheduled = NULL; + update_cpu_position(entry); + } + /* done with this cluster */ + break; + } + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + { + /* distribute the daemons evenly across the clusters. */ + int* affinity = kmalloc(NR_LITMUS_SOFTIRQD * sizeof(int), GFP_ATOMIC); + int num_daemons_per_cluster = NR_LITMUS_SOFTIRQD / num_clusters; + int left_over = NR_LITMUS_SOFTIRQD % num_clusters; + + int daemon = 0; + for(i = 0; i < num_clusters; ++i) + { + int num_on_this_cluster = num_daemons_per_cluster; + if(left_over) + { + ++num_on_this_cluster; + --left_over; + } + + for(j = 0; j < num_on_this_cluster; ++j) + { + // first CPU of this cluster + affinity[daemon++] = i*cluster_size; + } + } + + spawn_klitirqd(affinity); + + kfree(affinity); + } +#endif + +#ifdef CONFIG_LITMUS_NVIDIA + init_nvidia_info(); +#endif + + free_cpumask_var(mask); + clusters_allocated = 1; + return 0; +} + +/* Plugin object */ +static struct sched_plugin crm_plugin __cacheline_aligned_in_smp = { + .plugin_name = "C-RM", + .finish_switch = crm_finish_switch, + .tick = crm_tick, + .task_new = crm_task_new, + .complete_job = complete_job, + .task_exit = crm_task_exit, + .schedule = crm_schedule, + .task_wake_up = crm_task_wake_up, + .task_block = crm_task_block, + .admit_task = crm_admit_task, + .activate_plugin = crm_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = crm_allocate_lock, + .set_prio_inh = set_priority_inheritance, + .clear_prio_inh = clear_priority_inheritance, +#endif +#ifdef CONFIG_LITMUS_SOFTIRQD + .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, + .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, +#endif +}; + +static struct proc_dir_entry *cluster_file = NULL, *crm_dir = NULL; + +static int __init init_crm(void) +{ + int err, fs; + + err = register_sched_plugin(&crm_plugin); + if (!err) { + fs = make_plugin_proc_dir(&crm_plugin, &crm_dir); + if (!fs) + cluster_file = create_cluster_file(crm_dir, &cluster_config); + else + printk(KERN_ERR "Could not allocate C-RM procfs dir.\n"); + } + return err; +} + +static void clean_crm(void) +{ + cleanup_crm(); + if (cluster_file) + remove_proc_entry("cluster", crm_dir); + if (crm_dir) + remove_plugin_proc_dir(&crm_plugin); +} + +module_init(init_crm); +module_exit(clean_crm); diff --git a/litmus/sched_crm_srt.c b/litmus/sched_crm_srt.c new file mode 100644 index 000000000000..4473f35e64cd --- /dev/null +++ b/litmus/sched_crm_srt.c @@ -0,0 +1,1611 @@ +/* + * litmus/sched_crm_srt.c + * + * Implementation of the C-RM-SRT scheduling algorithm. + * + * This implementation is based on G-EDF: + * - CPUs are clustered around L2 or L3 caches. + * - Clusters topology is automatically detected (this is arch dependent + * and is working only on x86 at the moment --- and only with modern + * cpus that exports cpuid4 information) + * - The plugins _does not_ attempt to put tasks in the right cluster i.e. + * the programmer needs to be aware of the topology to place tasks + * in the desired cluster + * - default clustering is around L2 cache (cache index = 2) + * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all + * online_cpus are placed in a single cluster). + * + * For details on functions, take a look at sched_gsn_edf.c + * + * Currently, we do not support changes in the number of online cpus. + * If the num_online_cpus() dynamically changes, the plugin is broken. 
+ * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* to configure the cluster size */ +#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#ifdef CONFIG_LITMUS_SOFTIRQD +#include +#endif + +#ifdef CONFIG_LITMUS_NVIDIA +#include +#endif + +/* Reference configuration variable. Determines which cache level is used to + * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that + * all CPUs form a single cluster (just like GSN-EDF). + */ +static enum cache_level cluster_config = GLOBAL_CLUSTER; + +struct clusterdomain; + +/* cpu_entry_t - maintain the linked and scheduled state + * + * A cpu also contains a pointer to the crm_srt_domain_t cluster + * that owns it (struct clusterdomain*) + */ +typedef struct { + int cpu; + struct clusterdomain* cluster; /* owning cluster */ + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + atomic_t will_schedule; /* prevent unneeded IPIs */ + struct bheap_node* hn; +} cpu_entry_t; + +/* one cpu_entry_t per CPU */ +DEFINE_PER_CPU(cpu_entry_t, crm_srt_cpu_entries); + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(crm_srt_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(crm_srt_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(crm_srt_cpu_entries, cpu).will_schedule)) + +/* + * In C-RM-SRT there is a crm_srt domain _per_ cluster + * The number of clusters is dynamically determined accordingly to the + * total cpu number and the cluster size + */ +typedef struct clusterdomain { + /* rt_domain for this cluster */ + rt_domain_t domain; + /* cpus in this cluster */ + cpu_entry_t* *cpus; + /* map of this cluster cpus */ + cpumask_var_t cpu_map; + /* the cpus queue themselves according to priority in here */ + struct bheap_node *heap_node; + struct bheap cpu_heap; + /* lock for this cluster */ +#define crm_srt_lock domain.ready_lock +} crm_srt_domain_t; + +/* a crm_srt_domain per cluster; allocation is done at init/activation time */ +crm_srt_domain_t *crm_srt; + +#define remote_cluster(cpu) ((crm_srt_domain_t *) per_cpu(crm_srt_cpu_entries, cpu).cluster) +#define task_cpu_cluster(task) remote_cluster(get_partition(task)) + +/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling + * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose + * information during the initialization of the plugin (e.g., topology) +#define WANT_ALL_SCHED_EVENTS + */ +#define VERBOSE_INIT + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return rm_srt_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold crm_srt lock. 
+ */ +static void update_cpu_position(cpu_entry_t *entry) +{ + crm_srt_domain_t *cluster = entry->cluster; + + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, + &cluster->cpu_heap, + entry->hn); + + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn); +} + +/* caller must hold crm_srt lock */ +static cpu_entry_t* lowest_prio_cpu(crm_srt_domain_t *cluster) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(crm_srt_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold crm_srt_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(crm_srt_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + * + * in C-RM-SRT case is should be somewhere in the queue for + * its domain, therefore and we can get the domain using + * task_cpu_cluster + */ + remove(&(task_cpu_cluster(t))->domain, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold crm_srt_lock. 
+ */ +static noinline void requeue(struct task_struct* task) +{ + crm_srt_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&cluster->domain, task); + else { + /* it has got to wait */ + add_release(&cluster->domain, task); + } +} + +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* crm_srt_get_nearest_available_cpu( + crm_srt_domain_t *cluster, cpu_entry_t* start) +{ + cpu_entry_t* affinity; + + get_nearest_available_cpu(affinity, start, crm_srt_cpu_entries, -1); + + /* make sure CPU is in our cluster */ + if(affinity && cpu_isset(affinity->cpu, *cluster->cpu_map)) + return(affinity); + else + return(NULL); +} +#endif + + +/* check for any necessary preemptions */ +static void check_for_preemptions(crm_srt_domain_t *cluster) +{ + struct task_struct *task; + cpu_entry_t *last; + + for(last = lowest_prio_cpu(cluster); + rm_srt_preemption_needed(&cluster->domain, last->linked); + last = lowest_prio_cpu(cluster)) { + /* preemption necessary */ + task = __take_ready(&cluster->domain); +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = + crm_srt_get_nearest_available_cpu(cluster, + &per_cpu(crm_srt_cpu_entries, task_cpu(task))); + if(affinity) + last = affinity; + else if(last->linked) + requeue(last->linked); + } +#else + if (last->linked) + requeue(last->linked); +#endif + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* crm_srt_job_arrival: task is either resumed or released */ +static noinline void crm_srt_job_arrival(struct task_struct* task) +{ + crm_srt_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + + requeue(task); + check_for_preemptions(cluster); +} + +static void crm_srt_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + crm_srt_domain_t* cluster = container_of(rt, crm_srt_domain_t, domain); + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + + __merge_ready(&cluster->domain, tasks); + check_for_preemptions(cluster); + + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); +} + +/* caller holds crm_srt_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + +#ifdef CONFIG_LITMUS_NVIDIA + atomic_set(&tsk_rt(t)->nv_int_count, 0); +#endif + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + crm_srt_job_arrival(t); +} + +/* crm_srt_tick - this function is called for every local timer + * interrupt. 
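check_for_preemptions() above keeps taking the highest-priority ready job as long as it beats the job linked to the cluster's lowest-priority CPU. A toy, self-contained version of that loop; integer priorities and a linear scan stand in for rm_srt_preemption_needed() and the per-cluster CPU bheap, and the requeue of the displaced job is only noted in a comment.

#include <stdio.h>

#define NCPUS	2
#define IDLE	0	/* "priority" of an idle CPU; larger number = higher priority */

static int cpu_prio[NCPUS] = { IDLE, IDLE };	/* priority of each CPU's linked job */

static int lowest_prio_cpu(void)
{
	int c, low = 0;
	for (c = 1; c < NCPUS; c++)
		if (cpu_prio[c] < cpu_prio[low])
			low = c;
	return low;
}

/* 'ready' holds job priorities, highest first; returns how many got linked */
static int check_for_preemptions_toy(int *ready, int nready)
{
	int linked = 0;

	while (nready) {
		int cpu = lowest_prio_cpu();
		if (ready[0] <= cpu_prio[cpu])
			break;			/* no CPU would be preempted */
		/* the real code requeues the displaced job (cpu_prio[cpu]) here */
		cpu_prio[cpu] = ready[0];	/* link the highest-priority job */
		ready++;
		nready--;
		linked++;
	}
	return linked;
}

int main(void)
{
	int ready[] = { 9, 7, 3 };
	printf("linked %d job(s)\n", check_for_preemptions_toy(ready, 3));	/* 2 */
	return 0;
}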
+ * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void crm_srt_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + set_will_schedule(); + TRACE("crm_srt_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("crm_srt_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* crm_srt_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(crm_srt_cpu_entries); + crm_srt_domain_t *cluster = entry->cluster; + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + + raw_spin_lock(&cluster->crm_srt_lock); + clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked crm_srt_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. 
We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&cluster->domain), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + sched_state_task_picked(); + raw_spin_unlock(&cluster->crm_srt_lock); + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("crm_srt_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void crm_srt_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(crm_srt_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + + +/* Prepare a task for running in RT mode + */ +static void crm_srt_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + crm_srt_domain_t* cluster; + + TRACE("gsn edf: task new %d\n", t->pid); + + /* the cluster doesn't change even if t is running */ + cluster = task_cpu_cluster(t); + + raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + + /* setup job params */ + release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(crm_srt_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + crm_srt_job_arrival(t); + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); +} + +static void crm_srt_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + //lt_t now; + crm_srt_domain_t *cluster; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(task); + + raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + +#if 0 // sporadic task model + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
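The #if 0 block around this wake-up path describes a sporadic policy: a job that resumes after its deadline has already passed gets a fresh release at the current time, otherwise it simply continues (the periodic view is what actually compiles in). A rough sketch of that policy, using illustrative types rather than the LITMUS^RT ones:

#include <stdio.h>

typedef unsigned long long lt_t;

struct toy_job {
	lt_t release;
	lt_t deadline;
	lt_t period;
};

static void sporadic_wake_up(struct toy_job *j, lt_t now)
{
	if (now >= j->deadline) {
		/* tardy: treat the resumption as a fresh sporadic release */
		j->release  = now;
		j->deadline = now + j->period;
	}
	/* otherwise: keep the current release/deadline (the periodic view) */
}

int main(void)
{
	struct toy_job j = { 0, 10, 10 };

	sporadic_wake_up(&j, 25);
	printf("release=%llu deadline=%llu\n", j.release, j.deadline);	/* 25, 35 */
	return 0;
}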
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } +#endif + + //BUG_ON(tsk_rt(task)->linked_on != NO_CPU); + set_rt_flags(task, RT_F_RUNNING); // periodic model + + if(tsk_rt(task)->linked_on == NO_CPU) + crm_srt_job_arrival(task); + else + TRACE("WTF, mate?!\n"); + + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); +} + +static void crm_srt_task_block(struct task_struct *t) +{ + unsigned long flags; + crm_srt_domain_t *cluster; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void crm_srt_task_exit(struct task_struct * t) +{ + unsigned long flags; + crm_srt_domain_t *cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cpu_entry_t *cpu; + cpu = &per_cpu(crm_srt_cpu_entries, tsk_rt(t)->scheduled_on); + cpu->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + +static long crm_srt_admit_task(struct task_struct* tsk) +{ + return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; +} + + + + + + + + + + + + + +#ifdef CONFIG_LITMUS_LOCKING + +#include + + +static void __set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + crm_srt_domain_t* cluster = task_cpu_cluster(t); + + if(prio_inh != NULL) + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + else + TRACE_TASK(t, "inherits priority from %p\n", prio_inh); + + sched_trace_eff_prio_change(t, prio_inh); + + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(crm_srt_cpu_entries, linked_on).hn); + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, + per_cpu(crm_srt_cpu_entries, linked_on).hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&cluster->domain.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", __FUNCTION__); + + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = !bheap_decrease(rm_srt_ready_order, tsk_rt(t)->heap_node); + + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. 
*/ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", __FUNCTION__); + } + raw_spin_unlock(&cluster->domain.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. */ + bheap_uncache_min(rm_srt_ready_order, &cluster->domain.ready_queue); + check_for_preemptions(cluster); + } + } +} + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + crm_srt_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->crm_srt_lock); + + __set_priority_inheritance(t, prio_inh); + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inherits a new priority!\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, prio_inh); + } +#endif + + raw_spin_unlock(&cluster->crm_srt_lock); +} + + +/* called with IRQs off */ +static void __clear_priority_inheritance(struct task_struct* t) +{ + TRACE_TASK(t, "priority restored\n"); + + if(tsk_rt(t)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(t, NULL); + + tsk_rt(t)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + crm_srt_job_arrival(t); + } + else + { + __set_priority_inheritance(t, NULL); + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + if(tsk_rt(t)->cur_klitirqd != NULL) + { + TRACE_TASK(t, "%s/%d inheritance set back to owner.\n", + tsk_rt(t)->cur_klitirqd->comm, tsk_rt(t)->cur_klitirqd->pid); + + if(tsk_rt(tsk_rt(t)->cur_klitirqd)->scheduled_on != NO_CPU) + { + sched_trace_eff_prio_change(tsk_rt(t)->cur_klitirqd, t); + + tsk_rt(tsk_rt(t)->cur_klitirqd)->inh_task = t; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(tsk_rt(t)->cur_klitirqd); + crm_srt_job_arrival(tsk_rt(t)->cur_klitirqd); + } + else + { + __set_priority_inheritance(tsk_rt(t)->cur_klitirqd, t); + } + } +#endif +} + +/* called with IRQs off */ +static void clear_priority_inheritance(struct task_struct* t) +{ + crm_srt_domain_t* cluster = task_cpu_cluster(t); + + raw_spin_lock(&cluster->crm_srt_lock); + __clear_priority_inheritance(t); + raw_spin_unlock(&cluster->crm_srt_lock); +} + + + +#ifdef CONFIG_LITMUS_SOFTIRQD +/* called with IRQs off */ +static void set_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner, + struct task_struct* new_owner) +{ + crm_srt_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->crm_srt_lock); + + if(old_owner != new_owner) + { + if(old_owner) + { + // unreachable? + tsk_rt(old_owner)->cur_klitirqd = NULL; + } + + TRACE_TASK(klitirqd, "giving ownership to %s/%d.\n", + new_owner->comm, new_owner->pid); + + tsk_rt(new_owner)->cur_klitirqd = klitirqd; + } + + __set_priority_inheritance(klitirqd, + (tsk_rt(new_owner)->inh_task == NULL) ? 
+ new_owner : + tsk_rt(new_owner)->inh_task); + + raw_spin_unlock(&cluster->crm_srt_lock); +} + +/* called with IRQs off */ +static void clear_priority_inheritance_klitirqd(struct task_struct* klitirqd, + struct task_struct* old_owner) +{ + crm_srt_domain_t* cluster = task_cpu_cluster(klitirqd); + + BUG_ON(!(tsk_rt(klitirqd)->is_proxy_thread)); + + raw_spin_lock(&cluster->crm_srt_lock); + + TRACE_TASK(klitirqd, "priority restored\n"); + + if(tsk_rt(klitirqd)->scheduled_on != NO_CPU) + { + tsk_rt(klitirqd)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(klitirqd); + crm_srt_job_arrival(klitirqd); + } + else + { + __set_priority_inheritance(klitirqd, NULL); + } + + tsk_rt(old_owner)->cur_klitirqd = NULL; + + raw_spin_unlock(&cluster->crm_srt_lock); +} +#endif // CONFIG_LITMUS_SOFTIRQD + + +/* ******************** KFMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct kfmlp_queue +{ + wait_queue_head_t wait; + struct task_struct* owner; + struct task_struct* hp_waiter; + int count; /* number of waiters + holder */ +}; + +struct kfmlp_semaphore +{ + struct litmus_lock litmus_lock; + + spinlock_t lock; + + int num_resources; /* aka k */ + struct kfmlp_queue *queues; /* array */ + struct kfmlp_queue *shortest_queue; /* pointer to shortest queue */ +}; + +static inline struct kfmlp_semaphore* kfmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct kfmlp_semaphore, litmus_lock); +} + +static inline int kfmlp_get_idx(struct kfmlp_semaphore* sem, + struct kfmlp_queue* queue) +{ + return (queue - &sem->queues[0]); +} + +static inline struct kfmlp_queue* kfmlp_get_queue(struct kfmlp_semaphore* sem, + struct task_struct* holder) +{ + int i; + for(i = 0; i < sem->num_resources; ++i) + if(sem->queues[i].owner == holder) + return(&sem->queues[i]); + return(NULL); +} + +/* caller is responsible for locking */ +static struct task_struct* kfmlp_find_hp_waiter(struct kfmlp_queue *kqueue, + struct task_struct *skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &kqueue->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && rm_srt_higher_prio(queued, found)) + found = queued; + } + return found; +} + +static inline struct kfmlp_queue* kfmlp_find_shortest( + struct kfmlp_semaphore* sem, + struct kfmlp_queue* search_start) +{ + // we start our search at search_start instead of at the beginning of the + // queue list to load-balance across all resources. + struct kfmlp_queue* step = search_start; + struct kfmlp_queue* shortest = sem->shortest_queue; + + do + { + step = (step+1 != &sem->queues[sem->num_resources]) ? 
+ step+1 : &sem->queues[0]; + if(step->count < shortest->count) + { + shortest = step; + if(step->count == 0) + break; /* can't get any shorter */ + } + }while(step != search_start); + + return(shortest); +} + +static struct task_struct* kfmlp_remove_hp_waiter(struct kfmlp_semaphore* sem) +{ + /* must hold sem->lock */ + + struct kfmlp_queue *my_queue = NULL; + struct task_struct *max_hp = NULL; + + + struct list_head *pos; + struct task_struct *queued; + int i; + + for(i = 0; i < sem->num_resources; ++i) + { + if( (sem->queues[i].count > 1) && + ((my_queue == NULL) || + (rm_srt_higher_prio(sem->queues[i].hp_waiter, my_queue->hp_waiter))) ) + { + my_queue = &sem->queues[i]; + } + } + + if(my_queue) + { + crm_srt_domain_t* cluster; + + max_hp = my_queue->hp_waiter; + BUG_ON(!max_hp); + + TRACE_CUR("queue %d: stealing %s/%d from queue %d\n", + kfmlp_get_idx(sem, my_queue), + max_hp->comm, max_hp->pid, + kfmlp_get_idx(sem, my_queue)); + + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, max_hp); + + /* + if(my_queue->hp_waiter) + TRACE_CUR("queue %d: new hp_waiter is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->hp_waiter->comm, + my_queue->hp_waiter->pid); + else + TRACE_CUR("queue %d: new hp_waiter is %p\n", + kfmlp_get_idx(sem, my_queue), NULL); + */ + + cluster = task_cpu_cluster(max_hp); + + raw_spin_lock(&cluster->crm_srt_lock); + + /* + if(my_queue->owner) + TRACE_CUR("queue %d: owner is %s/%d\n", + kfmlp_get_idx(sem, my_queue), + my_queue->owner->comm, + my_queue->owner->pid); + else + TRACE_CUR("queue %d: owner is %p\n", + kfmlp_get_idx(sem, my_queue), + NULL); + */ + + if(tsk_rt(my_queue->owner)->inh_task == max_hp) + { + __clear_priority_inheritance(my_queue->owner); + if(my_queue->hp_waiter != NULL) + { + __set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + raw_spin_unlock(&cluster->crm_srt_lock); + + list_for_each(pos, &my_queue->wait.task_list) + { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + /* Compare task prios, find high prio task. */ + if (queued == max_hp) + { + /* + TRACE_CUR("queue %d: found entry in wait queue. REMOVING!\n", + kfmlp_get_idx(sem, my_queue)); + */ + __remove_wait_queue(&my_queue->wait, + list_entry(pos, wait_queue_t, task_list)); + break; + } + } + --(my_queue->count); + } + + return(max_hp); +} + +int crm_srt_kfmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue* my_queue; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = sem->shortest_queue; + + if (my_queue->owner) { + /* resource is not free => must suspend and wait */ + TRACE_CUR("queue %d: Resource is not free => must suspend and wait.\n", + kfmlp_get_idx(sem, my_queue)); + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&my_queue->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (rm_srt_higher_prio(t, my_queue->hp_waiter)) + { + my_queue->hp_waiter = t; + if (rm_srt_higher_prio(t, my_queue->owner)) + { + set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); + } + } + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->lock, flags); + + /* We depend on the FIFO order. 
Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release (or steal). + */ + schedule(); + + + if(my_queue->owner == t) + { + TRACE_CUR("queue %d: acquired through waiting\n", + kfmlp_get_idx(sem, my_queue)); + } + else + { + /* this case may happen if our wait entry was stolen + between queues. record where we went.*/ + my_queue = kfmlp_get_queue(sem, t); + BUG_ON(!my_queue); + TRACE_CUR("queue %d: acquired through stealing\n", + kfmlp_get_idx(sem, my_queue)); + } + } + else + { + TRACE_CUR("queue %d: acquired immediately\n", + kfmlp_get_idx(sem, my_queue)); + + my_queue->owner = t; + + ++(my_queue->count); + sem->shortest_queue = kfmlp_find_shortest(sem, my_queue); + + spin_unlock_irqrestore(&sem->lock, flags); + } + + return kfmlp_get_idx(sem, my_queue); +} + +int crm_srt_kfmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + + if (!my_queue) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&my_queue->wait); + if (next) { + /* + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + /* next becomes the resouce holder */ + my_queue->owner = next; + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + + TRACE_CUR("queue %d: lock ownership passed to %s/%d\n", + kfmlp_get_idx(sem, my_queue), next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == my_queue->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, next); + if (my_queue->hp_waiter) + TRACE_TASK(my_queue->hp_waiter, "queue %d: is new highest-prio waiter\n", kfmlp_get_idx(sem, my_queue)); + else + TRACE("queue %d: no further waiters\n", kfmlp_get_idx(sem, my_queue)); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. 
*/ + set_priority_inheritance(next, my_queue->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: looking to steal someone...\n", kfmlp_get_idx(sem, my_queue)); + + next = kfmlp_remove_hp_waiter(sem); /* returns NULL if nothing to steal */ + + /* + if(next) + TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - steal\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + */ + + my_queue->owner = next; + + if(next) + { + TRACE_CUR("queue %d: lock ownership passed to %s/%d (which was stolen)\n", + kfmlp_get_idx(sem, my_queue), + next->comm, next->pid); + + /* wake up next */ + wake_up_process(next); + } + else + { + TRACE_CUR("queue %d: no one to steal.\n", kfmlp_get_idx(sem, my_queue)); + + --(my_queue->count); + if(my_queue->count < sem->shortest_queue->count) + { + sem->shortest_queue = my_queue; + } + } + } + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task) + clear_priority_inheritance(t); + +out: + spin_unlock_irqrestore(&sem->lock, flags); + + return err; +} + +int crm_srt_kfmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + struct kfmlp_queue *my_queue; + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->lock, flags); + + my_queue = kfmlp_get_queue(sem, t); + owner = (my_queue) ? (my_queue->owner == t) : 0; + + spin_unlock_irqrestore(&sem->lock, flags); + + if (owner) + crm_srt_kfmlp_unlock(l); + + return 0; +} + +void crm_srt_kfmlp_free(struct litmus_lock* l) +{ + struct kfmlp_semaphore *sem = kfmlp_from_lock(l); + kfree(sem->queues); + kfree(sem); +} + +static struct litmus_lock_ops crm_srt_kfmlp_lock_ops = { + .close = crm_srt_kfmlp_close, + .lock = crm_srt_kfmlp_lock, + .unlock = crm_srt_kfmlp_unlock, + .deallocate = crm_srt_kfmlp_free, +}; + +static struct litmus_lock* crm_srt_new_kfmlp(void* __user arg, int* ret_code) +{ + struct kfmlp_semaphore* sem; + int num_resources = 0; + int i; + + if(!access_ok(VERIFY_READ, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(__copy_from_user(&num_resources, arg, sizeof(num_resources))) + { + *ret_code = -EINVAL; + return(NULL); + } + if(num_resources < 1) + { + *ret_code = -EINVAL; + return(NULL); + } + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if(!sem) + { + *ret_code = -ENOMEM; + return NULL; + } + + sem->queues = kmalloc(sizeof(struct kfmlp_queue)*num_resources, GFP_KERNEL); + if(!sem->queues) + { + kfree(sem); + *ret_code = -ENOMEM; + return NULL; + } + + sem->litmus_lock.ops = &crm_srt_kfmlp_lock_ops; + spin_lock_init(&sem->lock); + sem->num_resources = num_resources; + + for(i = 0; i < num_resources; ++i) + { + sem->queues[i].owner = NULL; + sem->queues[i].hp_waiter = NULL; + init_waitqueue_head(&sem->queues[i].wait); + sem->queues[i].count = 0; + } + + sem->shortest_queue = &sem->queues[0]; + + *ret_code = 0; + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + +static long crm_srt_allocate_lock(struct litmus_lock **lock, int type, + void* __user arg) +{ + int err = -ENXIO; + + /* C-RM-SRT currently only supports the FMLP for global resources + WITHIN a given cluster. DO NOT USE CROSS-CLUSTER! 
*/ + switch (type) { + case KFMLP_SEM: + *lock = crm_srt_new_kfmlp(arg, &err); + break; + }; + + return err; +} + +#endif // CONFIG_LITMUS_LOCKING + + + + + + +/* total number of cluster */ +static int num_clusters; +/* we do not support cluster of different sizes */ +static unsigned int cluster_size; + +#ifdef VERBOSE_INIT +static void print_cluster_topology(cpumask_var_t mask, int cpu) +{ + int chk; + char buf[255]; + + chk = cpulist_scnprintf(buf, 254, mask); + buf[chk] = '\0'; + printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf); + +} +#endif + +static int clusters_allocated = 0; + +static void cleanup_crm_srt(void) +{ + int i; + + if (clusters_allocated) { + for (i = 0; i < num_clusters; i++) { + kfree(crm_srt[i].cpus); + kfree(crm_srt[i].heap_node); + free_cpumask_var(crm_srt[i].cpu_map); + } + + kfree(crm_srt); + } +} + +static long crm_srt_activate_plugin(void) +{ + int i, j, cpu, ccpu, cpu_count; + cpu_entry_t *entry; + + cpumask_var_t mask; + int chk = 0; + + /* de-allocate old clusters, if any */ + cleanup_crm_srt(); + + printk(KERN_INFO "C-RM-SRT: Activate Plugin, cluster configuration = %d\n", + cluster_config); + + /* need to get cluster_size first */ + if(!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + + if (unlikely(cluster_config == GLOBAL_CLUSTER)) { + cluster_size = num_online_cpus(); + } else { + chk = get_shared_cpu_map(mask, 0, cluster_config); + if (chk) { + /* if chk != 0 then it is the max allowed index */ + printk(KERN_INFO "C-RM-SRT: Cluster configuration = %d " + "is not supported on this hardware.\n", + cluster_config); + /* User should notice that the configuration failed, so + * let's bail out. */ + return -EINVAL; + } + + cluster_size = cpumask_weight(mask); + } + + if ((num_online_cpus() % cluster_size) != 0) { + /* this can't be right, some cpus are left out */ + printk(KERN_ERR "C-RM-SRT: Trying to group %d cpus in %d!\n", + num_online_cpus(), cluster_size); + return -1; + } + + num_clusters = num_online_cpus() / cluster_size; + printk(KERN_INFO "C-RM-SRT: %d cluster(s) of size = %d\n", + num_clusters, cluster_size); + + /* initialize clusters */ + crm_srt = kmalloc(num_clusters * sizeof(crm_srt_domain_t), GFP_ATOMIC); + for (i = 0; i < num_clusters; i++) { + + crm_srt[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t), + GFP_ATOMIC); + crm_srt[i].heap_node = kmalloc( + cluster_size * sizeof(struct bheap_node), + GFP_ATOMIC); + bheap_init(&(crm_srt[i].cpu_heap)); + rm_srt_domain_init(&(crm_srt[i].domain), NULL, crm_srt_release_jobs); + + if(!zalloc_cpumask_var(&crm_srt[i].cpu_map, GFP_ATOMIC)) + return -ENOMEM; + } + + /* cycle through cluster and add cpus to them */ + for (i = 0; i < num_clusters; i++) { + + for_each_online_cpu(cpu) { + /* check if the cpu is already in a cluster */ + for (j = 0; j < num_clusters; j++) + if (cpumask_test_cpu(cpu, crm_srt[j].cpu_map)) + break; + /* if it is in a cluster go to next cpu */ + if (j < num_clusters && + cpumask_test_cpu(cpu, crm_srt[j].cpu_map)) + continue; + + /* this cpu isn't in any cluster */ + /* get the shared cpus */ + if (unlikely(cluster_config == GLOBAL_CLUSTER)) + cpumask_copy(mask, cpu_online_mask); + else + get_shared_cpu_map(mask, cpu, cluster_config); + + cpumask_copy(crm_srt[i].cpu_map, mask); +#ifdef VERBOSE_INIT + print_cluster_topology(mask, cpu); +#endif + /* add cpus to current cluster and init cpu_entry_t */ + cpu_count = 0; + for_each_cpu(ccpu, crm_srt[i].cpu_map) { + + entry = &per_cpu(crm_srt_cpu_entries, ccpu); + crm_srt[i].cpus[cpu_count] = entry; + 
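The cluster-formation loop in crm_srt_activate_plugin() above picks each online CPU that is not yet in any cluster and adopts that CPU's shared-cache mask as the next cluster's CPU map. A stand-alone sketch of the same scan, with plain bitmasks instead of cpumask_var_t and a made-up shared_mask() in place of get_shared_cpu_map(); unlike the kernel code, which precomputes num_clusters, the cluster count here simply falls out of the scan.

#include <stdio.h>

#define NCPUS	4

/* made-up topology: CPUs {0,1} share one cache, CPUs {2,3} share another */
static unsigned shared_mask(int cpu)
{
	return (cpu < 2) ? 0x3u : 0xcu;
}

int main(void)
{
	unsigned cluster_map[NCPUS] = { 0 };	/* at most one cluster per CPU */
	int num_clusters = 0, cpu, j, assigned;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		assigned = 0;
		for (j = 0; j < num_clusters; j++)
			if (cluster_map[j] & (1u << cpu))
				assigned = 1;
		if (assigned)
			continue;	/* already covered by an earlier cluster */
		cluster_map[num_clusters++] = shared_mask(cpu);
	}

	for (j = 0; j < num_clusters; j++)
		printf("cluster %d: cpu mask 0x%x\n", j, cluster_map[j]);
	return 0;
}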
atomic_set(&entry->will_schedule, 0); + entry->cpu = ccpu; + entry->cluster = &crm_srt[i]; + entry->hn = &(crm_srt[i].heap_node[cpu_count]); + bheap_node_init(&entry->hn, entry); + + cpu_count++; + + entry->linked = NULL; + entry->scheduled = NULL; + update_cpu_position(entry); + } + /* done with this cluster */ + break; + } + } + +#ifdef CONFIG_LITMUS_SOFTIRQD + { + /* distribute the daemons evenly across the clusters. */ + int* affinity = kmalloc(NR_LITMUS_SOFTIRQD * sizeof(int), GFP_ATOMIC); + int num_daemons_per_cluster = NR_LITMUS_SOFTIRQD / num_clusters; + int left_over = NR_LITMUS_SOFTIRQD % num_clusters; + + int daemon = 0; + for(i = 0; i < num_clusters; ++i) + { + int num_on_this_cluster = num_daemons_per_cluster; + if(left_over) + { + ++num_on_this_cluster; + --left_over; + } + + for(j = 0; j < num_on_this_cluster; ++j) + { + // first CPU of this cluster + affinity[daemon++] = i*cluster_size; + } + } + + spawn_klitirqd(affinity); + + kfree(affinity); + } +#endif + +#ifdef CONFIG_LITMUS_NVIDIA + init_nvidia_info(); +#endif + + free_cpumask_var(mask); + clusters_allocated = 1; + return 0; +} + +/* Plugin object */ +static struct sched_plugin crm_srt_plugin __cacheline_aligned_in_smp = { + .plugin_name = "C-RM-SRT", + .finish_switch = crm_srt_finish_switch, + .tick = crm_srt_tick, + .task_new = crm_srt_task_new, + .complete_job = complete_job, + .task_exit = crm_srt_task_exit, + .schedule = crm_srt_schedule, + .task_wake_up = crm_srt_task_wake_up, + .task_block = crm_srt_task_block, + .admit_task = crm_srt_admit_task, + .activate_plugin = crm_srt_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = crm_srt_allocate_lock, + .set_prio_inh = set_priority_inheritance, + .clear_prio_inh = clear_priority_inheritance, +#endif +#ifdef CONFIG_LITMUS_SOFTIRQD + .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, + .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, +#endif +}; + +static struct proc_dir_entry *cluster_file = NULL, *crm_srt_dir = NULL; + +static int __init init_crm_srt(void) +{ + int err, fs; + + err = register_sched_plugin(&crm_srt_plugin); + if (!err) { + fs = make_plugin_proc_dir(&crm_srt_plugin, &crm_srt_dir); + if (!fs) + cluster_file = create_cluster_file(crm_srt_dir, &cluster_config); + else + printk(KERN_ERR "Could not allocate C-RM-SRT procfs dir.\n"); + } + return err; +} + +static void clean_crm_srt(void) +{ + cleanup_crm_srt(); + if (cluster_file) + remove_proc_entry("cluster", crm_srt_dir); + if (crm_srt_dir) + remove_plugin_proc_dir(&crm_srt_plugin); +} + +module_init(init_crm_srt); +module_exit(clean_crm_srt); diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c index d04e0703c154..ac7685fe69f0 100644 --- a/litmus/sched_gsn_edf.c +++ b/litmus/sched_gsn_edf.c @@ -1155,12 +1155,14 @@ static inline struct kfmlp_queue* kfmlp_find_shortest( { step = (step+1 != &sem->queues[sem->num_resources]) ? step+1 : &sem->queues[0]; + if(step->count < shortest->count) { shortest = step; if(step->count == 0) break; /* can't get any shorter */ } + }while(step != search_start); return(shortest); @@ -1369,7 +1371,9 @@ int gsnedf_kfmlp_unlock(struct litmus_lock* l) my_queue->owner = next; --(my_queue->count); - if(my_queue->count < sem->shortest_queue->count) + // the '=' of '<=' is a dumb method to attempt to build + // affinity until tasks can tell us where they ran last... 
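The KFMLP hunks above adjust the shortest-queue bookkeeping: the scan starts just past the queue that was last used and walks the k queues circularly, and relaxing the comparison to '<=' when a queue's count drops lets that queue reclaim "shortest" status on ties, the crude affinity the comment describes. A minimal sketch of the circular scan with plain integer counts instead of the kfmlp structs; note the kernel version seeds 'shortest' with the cached sem->shortest_queue rather than the start position.

#include <stdio.h>

#define K	3	/* number of replicas, i.e. queues */

static int find_shortest(const int count[K], int start)
{
	int shortest = start;
	int step = start;

	do {
		step = (step + 1) % K;
		if (count[step] < count[shortest]) {
			shortest = step;
			if (count[step] == 0)
				break;	/* cannot get any shorter */
		}
	} while (step != start);

	return shortest;
}

int main(void)
{
	int count[K] = { 2, 1, 1 };

	/* starting after queue 0 picks queue 1, spreading load over replicas */
	printf("shortest queue: %d\n", find_shortest(count, 0));
	return 0;
}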
+ if(my_queue->count <= sem->shortest_queue->count) { sem->shortest_queue = my_queue; } @@ -1428,7 +1432,9 @@ int gsnedf_kfmlp_unlock(struct litmus_lock* l) TRACE_CUR("queue %d: no one to steal.\n", kfmlp_get_idx(sem, my_queue)); --(my_queue->count); - if(my_queue->count < sem->shortest_queue->count) + // the '=' of '<=' is a dumb method to attempt to build + // affinity until tasks can tell us where they ran last... + if(my_queue->count <= sem->shortest_queue->count) { sem->shortest_queue = my_queue; } diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c index 8802670a4b0b..e393d749baf5 100644 --- a/litmus/sched_plugin.c +++ b/litmus/sched_plugin.c @@ -152,6 +152,14 @@ static void litmus_dummy_clear_prio_inh_klitirqd(struct task_struct* klitirqd, } #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +static int litmus_dummy_enqueue_pai_tasklet(struct tasklet_struct* t) +{ + TRACE("PAI Tasklet unsupported in this plugin!!!!!!\n"); + return(0); // failure. +} +#endif + /* The default scheduler plugin. It doesn't do anything and lets Linux do its * job. @@ -176,6 +184,9 @@ struct sched_plugin linux_sched_plugin = { #ifdef CONFIG_LITMUS_SOFTIRQD .set_prio_inh_klitirqd = litmus_dummy_set_prio_inh_klitirq, .clear_prio_inh_klitirqd = litmus_dummy_clear_prio_inh_klitirqd, +#endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + .enqueue_pai_tasklet = litmus_dummy_enqueue_pai_tasklet, #endif .admit_task = litmus_dummy_admit_task }; diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c index 7aeb99b668d3..d079df2b292a 100644 --- a/litmus/sched_task_trace.c +++ b/litmus/sched_task_trace.c @@ -191,7 +191,9 @@ feather_callback void do_sched_trace_task_completion(unsigned long id, if (rec) { rec->data.completion.when = now(); rec->data.completion.forced = forced; +#ifdef LITMUS_NVIDIA rec->data.completion.nv_int_count = (u16)atomic_read(&tsk_rt(t)->nv_int_count); +#endif put_record(rec); } } @@ -367,24 +369,29 @@ feather_callback void do_sched_trace_eff_prio_change(unsigned long id, } } - /* pray for no nesting of nv interrupts on same CPU... */ struct tracing_interrupt_map { int active; int count; unsigned long data[128]; // assume nesting less than 128... 
+ unsigned long serial[128]; }; DEFINE_PER_CPU(struct tracing_interrupt_map, active_interrupt_tracing); + +DEFINE_PER_CPU(u32, intCounter); + feather_callback void do_sched_trace_nv_interrupt_begin(unsigned long id, unsigned long _device) { struct st_event_record *rec; + u32 serialNum; { + u32* serial; struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id()); - if(int_map->active == 0xcafebabe) + if(!int_map->active == 0xcafebabe) { int_map->count++; } @@ -393,7 +400,12 @@ feather_callback void do_sched_trace_nv_interrupt_begin(unsigned long id, int_map->active = 0xcafebabe; int_map->count = 1; } - int_map->data[int_map->count-1] = _device; + //int_map->data[int_map->count-1] = _device; + + serial = &per_cpu(intCounter, smp_processor_id()); + *serial += num_online_cpus(); + serialNum = *serial; + int_map->serial[int_map->count-1] = serialNum; } rec = get_record(ST_NV_INTERRUPT_BEGIN, NULL); @@ -401,6 +413,7 @@ feather_callback void do_sched_trace_nv_interrupt_begin(unsigned long id, u32 device = _device; rec->data.nv_interrupt_begin.when = now(); rec->data.nv_interrupt_begin.device = device; + rec->data.nv_interrupt_begin.serialNumber = serialNum; put_record(rec); } } @@ -416,7 +429,7 @@ int is_interrupt_tracing_active(void) } */ -feather_callback void do_sched_trace_nv_interrupt_end(unsigned long id, unsigned long unused) +feather_callback void do_sched_trace_nv_interrupt_end(unsigned long id, unsigned long _device) { struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id()); if(int_map->active == 0xcafebabe) @@ -428,8 +441,11 @@ feather_callback void do_sched_trace_nv_interrupt_end(unsigned long id, unsigned int_map->active = 0; if(rec) { + u32 device = _device; rec->data.nv_interrupt_end.when = now(); - rec->data.nv_interrupt_end.device = int_map->data[int_map->count]; + //rec->data.nv_interrupt_end.device = int_map->data[int_map->count]; + rec->data.nv_interrupt_end.device = device; + rec->data.nv_interrupt_end.serialNumber = int_map->serial[int_map->count]; put_record(rec); } } diff --git a/litmus/sched_trace_external.c b/litmus/sched_trace_external.c index d7d7d8bae298..5b7e6152416a 100644 --- a/litmus/sched_trace_external.c +++ b/litmus/sched_trace_external.c @@ -1,5 +1,6 @@ #include +#include #include #include @@ -38,8 +39,26 @@ void __sched_trace_nv_interrupt_begin_external(u32 device) } EXPORT_SYMBOL(__sched_trace_nv_interrupt_begin_external); -void __sched_trace_nv_interrupt_end_external(void) +void __sched_trace_nv_interrupt_end_external(u32 device) { - sched_trace_nv_interrupt_end(); + unsigned long _device = device; + sched_trace_nv_interrupt_end(_device); } EXPORT_SYMBOL(__sched_trace_nv_interrupt_end_external); + + +#ifdef CONFIG_LITMUS_NVIDIA + +#define EXX_TS(evt) \ +void __##evt(void) { evt; } \ +EXPORT_SYMBOL(__##evt); + +EXX_TS(TS_NV_TOPISR_START) +EXX_TS(TS_NV_TOPISR_END) +EXX_TS(TS_NV_BOTISR_START) +EXX_TS(TS_NV_BOTISR_END) +EXX_TS(TS_NV_RELEASE_BOTISR_START) +EXX_TS(TS_NV_RELEASE_BOTISR_END) + +#endif + -- cgit v1.2.2 From 53a6dbb9f5337e77fce9c2672488c1c5e0621beb Mon Sep 17 00:00:00 2001 From: Glenn Elliott Date: Sat, 14 Jan 2012 14:20:07 -0500 Subject: Completed PAI for C-EDF. 
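The C-EDF changes below maintain a per-cluster, priority-ordered list of pending tasklets with a tail pointer (struct tasklet_head). A minimal, self-contained sketch of that kind of ordered insert; the integer 'prio' and higher_prio() are illustrative stand-ins for the owner task and edf_higher_prio(), and the pointer-to-pointer walk is just one way to keep the list sorted, not the exact code in the patch.

#include <stddef.h>
#include <stdio.h>

struct toy_tasklet {
	struct toy_tasklet *next;
	int prio;			/* larger = higher priority */
};

struct toy_head {
	struct toy_tasklet *head;
	struct toy_tasklet **tail;	/* points at the last 'next' field (or at head) */
};

static int higher_prio(const struct toy_tasklet *a, const struct toy_tasklet *b)
{
	return a->prio > b->prio;
}

/* insert t so the list stays sorted, highest priority first */
static void pai_insert(struct toy_head *q, struct toy_tasklet *t)
{
	struct toy_tasklet **link = &q->head;

	while (*link && higher_prio(*link, t))
		link = &(*link)->next;

	t->next = *link;
	*link = t;
	if (!t->next)
		q->tail = &t->next;	/* appended at the end: fix up the tail */
}

int main(void)
{
	struct toy_tasklet a = { NULL, 5 }, b = { NULL, 9 }, c = { NULL, 7 };
	struct toy_head q = { NULL, &q.head };
	struct toy_tasklet *it;

	pai_insert(&q, &a);
	pai_insert(&q, &b);
	pai_insert(&q, &c);
	for (it = q.head; it; it = it->next)
		printf("prio %d\n", it->prio);	/* prints 9, 7, 5 */
	return 0;
}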
--- include/litmus/sched_plugin.h | 2 + include/litmus/sched_trace_external.h | 20 +++ kernel/sched.c | 5 + kernel/softirq.c | 3 - litmus/litmus_softirq.c | 5 + litmus/sched_cedf.c | 319 +++++++++++++++++++++++++++++----- litmus/sched_plugin.c | 8 +- litmus/sched_trace_external.c | 8 +- 8 files changed, 316 insertions(+), 54 deletions(-) diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h index 12a9ab65a673..3fc64f832fef 100644 --- a/include/litmus/sched_plugin.h +++ b/include/litmus/sched_plugin.h @@ -75,6 +75,7 @@ typedef void (*clear_prio_inh_klitirqd_t)(struct task_struct* klitirqd, typedef int (*enqueue_pai_tasklet_t)(struct tasklet_struct* tasklet); +typedef void (*run_tasklets_t)(struct task_struct* next); /********************* sys call backends ********************/ /* This function causes the caller to sleep until the next release */ @@ -125,6 +126,7 @@ struct sched_plugin { #ifdef CONFIG_LITMUS_PAI_SOFTIRQD enqueue_pai_tasklet_t enqueue_pai_tasklet; + run_tasklets_t run_tasklets; #endif } __attribute__ ((__aligned__(SMP_CACHE_BYTES))); diff --git a/include/litmus/sched_trace_external.h b/include/litmus/sched_trace_external.h index 90424d5c564c..e70e45e4cf51 100644 --- a/include/litmus/sched_trace_external.h +++ b/include/litmus/sched_trace_external.h @@ -4,6 +4,8 @@ #ifndef _LINUX_SCHED_TRACE_EXTERNAL_H_ #define _LINUX_SCHED_TRACE_EXTERNAL_H_ + +#ifdef CONFIG_SCHED_TASK_TRACE extern void __sched_trace_tasklet_begin_external(struct task_struct* t); static inline void sched_trace_tasklet_begin_external(struct task_struct* t) { @@ -28,6 +30,7 @@ static inline void sched_trace_work_end_external(struct task_struct* t, struct t __sched_trace_work_end_external(t, e, f); } +#ifdef CONFIG_LITMUS_NVIDIA extern void __sched_trace_nv_interrupt_begin_external(u32 device); static inline void sched_trace_nv_interrupt_begin_external(u32 device) { @@ -39,6 +42,23 @@ static inline void sched_trace_nv_interrupt_end_external(u32 device) { __sched_trace_nv_interrupt_end_external(device); } +#endif + +#else + +// no tracing. 
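The #else branch just above provides no-op stubs when CONFIG_SCHED_TASK_TRACE is off, so the call sites compile away. The header pattern it follows, with illustrative names rather than the real hooks, looks roughly like this:

#ifdef CONFIG_TOY_TRACE
extern void __toy_trace_event(int id);		/* real, EXPORT_SYMBOL'd body elsewhere */
static inline void toy_trace_event(int id)
{
	__toy_trace_event(id);
}
#else
static inline void toy_trace_event(int id) { }	/* compiles away entirely */
#endif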
+static inline void sched_trace_tasklet_begin_external(struct task_struct* t){} +static inline void sched_trace_tasklet_end_external(struct task_struct* t, unsigned long flushed){} +static inline void sched_trace_work_begin_external(struct task_struct* t, struct task_struct* e){} +static inline void sched_trace_work_end_external(struct task_struct* t, struct task_struct* e, unsigned long f){} + +#ifdef CONFIG_LITMUS_NVIDIA +static inline void sched_trace_nv_interrupt_begin_external(u32 device){} +static inline void sched_trace_nv_interrupt_end_external(u32 device){} +#endif + +#endif + #ifdef CONFIG_LITMUS_NVIDIA diff --git a/kernel/sched.c b/kernel/sched.c index 3aa2be09122b..08b725cd9182 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2883,6 +2883,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + trace_sched_switch(prev, next); mm = next->mm; oldmm = prev->active_mm; @@ -3901,6 +3902,10 @@ need_resched_nonpreemptible: reacquire_klitirqd_lock(prev); #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + litmus->run_tasklets(prev); +#endif + srp_ceiling_block(); } EXPORT_SYMBOL(schedule); diff --git a/kernel/softirq.c b/kernel/softirq.c index ae77c5c1d17e..d3217c54d2bf 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -442,9 +442,6 @@ void __tasklet_schedule(struct tasklet_struct *t) if(likely(_litmus_tasklet_schedule(t,nvidia_device))) { unlock_nv_registry(nvidia_device, &flags); - - TS_NV_RELEASE_BOTISR_END; - return; } else diff --git a/litmus/litmus_softirq.c b/litmus/litmus_softirq.c index f5cca964b6c6..c49676c6d3a7 100644 --- a/litmus/litmus_softirq.c +++ b/litmus/litmus_softirq.c @@ -470,6 +470,9 @@ static void do_lit_tasklet(struct klitirqd_info* which, /* execute tasklet if it has my priority and is free */ if ((t->owner == which->current_owner) && tasklet_trylock(t)) { if (!atomic_read(&t->count)) { + + sched_trace_tasklet_begin(t->owner); + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { BUG(); @@ -480,6 +483,8 @@ static void do_lit_tasklet(struct klitirqd_info* which, atomic_dec(count); + sched_trace_tasklet_end(t->owner, 0ul); + continue; /* process more tasklets */ } tasklet_unlock(t); diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c index f0356de60b2f..4924da21865e 100644 --- a/litmus/sched_cedf.c +++ b/litmus/sched_cedf.c @@ -124,6 +124,7 @@ typedef struct clusterdomain { #ifdef CONFIG_LITMUS_PAI_SOFTIRQD + raw_spinlock_t tasklet_lock; struct tasklet_head pending_tasklets; #endif @@ -429,36 +430,137 @@ static void cedf_tick(struct task_struct* t) #ifdef CONFIG_LITMUS_PAI_SOFTIRQD -void __do_lit_tasklet(struct tasklet_struct* tasklet) +void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) - { + if (!atomic_read(&tasklet->count)) { + sched_trace_tasklet_begin(tasklet->owner); + + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) + { + BUG(); + } + TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", __FUNCTION__, tasklet->owner->pid, flushed); + tasklet->func(tasklet->data); + tasklet_unlock(tasklet); + + sched_trace_tasklet_end(tasklet->owner, flushed); + } + else { BUG(); } - TRACE("%s: Invoking tasklet with owner pid = %d.\n", __FUNCTION__, tasklet->owner->pid); - tasklet->func(tasklet->data); - tasklet_unlock(tasklet); +} + + +void __extract_tasklets(cedf_domain_t* cluster, struct task_struct* task, struct tasklet_head* task_tasklets) +{ + struct 
tasklet_struct* step; + struct tasklet_struct* tasklet; + struct tasklet_struct* prev; + + task_tasklets->head = NULL; + task_tasklets->tail = &(task_tasklets->head); + + prev = NULL; + for(step = cluster->pending_tasklets.head; step != NULL; step = step->next) + { + if(step->owner == task) + { + TRACE("%s: Found tasklet to flush: %d\n", __FUNCTION__, step->owner->pid); + + tasklet = step; + + if(prev) { + prev->next = tasklet->next; + } + else if(cluster->pending_tasklets.head == tasklet) { + // we're at the head. + cluster->pending_tasklets.head = tasklet->next; + } + + if(cluster->pending_tasklets.tail == &tasklet) { + // we're at the tail + if(prev) { + cluster->pending_tasklets.tail = &prev; + } + else { + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + } + + tasklet->next = NULL; + *(task_tasklets->tail) = tasklet; + task_tasklets->tail = &(tasklet->next); + } + else { + prev = step; + } + } +} + +void flush_tasklets(cedf_domain_t* cluster, struct task_struct* task) +{ + unsigned long flags; + struct tasklet_head task_tasklets; + struct tasklet_struct* step; + + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); + __extract_tasklets(cluster, task, &task_tasklets); + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); + + if(cluster->pending_tasklets.head != NULL) { + TRACE("%s: Flushing tasklets for %d...\n", __FUNCTION__, task->pid); + } + + // now execute any flushed tasklets. + for(step = cluster->pending_tasklets.head; step != NULL; /**/) + { + struct tasklet_struct* temp = step->next; + + step->next = NULL; + __do_lit_tasklet(step, 1ul); + step = temp; + } } -void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* next) + +void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task) { int work_to_do = 1; struct tasklet_struct *tasklet = NULL; - - TRACE("%s: entered.\n", __FUNCTION__); + struct tasklet_struct *step; + unsigned long flags; while(work_to_do) { // remove tasklet at head of list if it has higher priority. - raw_spin_lock(&cluster->cedf_lock); - // remove tasklet at head. + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); + + + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + + if(cluster->pending_tasklets.head != NULL) { + // remove tasklet at head. 
tasklet = cluster->pending_tasklets.head; - if(edf_higher_prio(tasklet->owner, next)) { + if(edf_higher_prio(tasklet->owner, sched_task)) { + + if(NULL == tasklet->next) { + // tasklet is at the head, list only has one element + TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + // remove the tasklet from the queue cluster->pending_tasklets.head = tasklet->next; - + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); } else { @@ -467,12 +569,24 @@ void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* next) } } else { - //TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__); + TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__); } - raw_spin_unlock(&cluster->cedf_lock); + + + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + + + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); if(tasklet) { - __do_lit_tasklet(tasklet); + __do_lit_tasklet(tasklet, 0ul); tasklet = NULL; } else { @@ -480,7 +594,50 @@ void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* next) } } - TRACE("%s: exited.\n", __FUNCTION__); + //TRACE("%s: exited.\n", __FUNCTION__); +} + + +void run_tasklets(struct task_struct* sched_task) +{ + cedf_domain_t* cluster; + +#if 0 + int task_is_rt = is_realtime(sched_task); + cedf_domain_t* cluster; + + if(is_realtime(sched_task)) { + cluster = task_cpu_cluster(sched_task); + } + else { + cluster = remote_cluster(get_cpu()); + } + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + + do_lit_tasklets(cluster, sched_task); + } + + if(!task_is_rt) { + put_cpu_no_resched(); + } +#else + + preempt_disable(); + + cluster = (is_realtime(sched_task)) ? + task_cpu_cluster(sched_task) : + remote_cluster(smp_processor_id()); + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + do_lit_tasklets(cluster, sched_task); + } + + preempt_enable_no_resched(); + +#endif } @@ -489,41 +646,47 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) struct tasklet_struct* step; step = cluster->pending_tasklets.head; - TRACE("%s: (BEFORE) dumping tasklet queue...\n"); + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); while(step != NULL){ - TRACE("%s: %d\n", __FUNCTION__, step->owner); + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); step = step->next; } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); TRACE("%s: done.\n", __FUNCTION__); + tasklet->next = NULL; // make sure there are no old values floating around + step = cluster->pending_tasklets.head; if(step == NULL) { TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid); // insert at tail. 
- tasklet->next = NULL; *(cluster->pending_tasklets.tail) = tasklet; - cluster->pending_tasklets.tail = &tasklet->next; + cluster->pending_tasklets.tail = &(tasklet->next); } - else if((*cluster->pending_tasklets.tail != NULL) && - edf_higher_prio((*cluster->pending_tasklets.tail)->owner, tasklet->owner)) { + else if((*(cluster->pending_tasklets.tail) != NULL) && + edf_higher_prio((*(cluster->pending_tasklets.tail))->owner, tasklet->owner)) { // insert at tail. TRACE("%s: tasklet belongs at end. inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid); - tasklet->next = NULL; *(cluster->pending_tasklets.tail) = tasklet; - cluster->pending_tasklets.tail = &tasklet->next; + cluster->pending_tasklets.tail = &(tasklet->next); } else { + + WARN_ON(1 == 1); + // insert the tasklet somewhere in the middle. - + + TRACE("%s: tasklet belongs somewhere in the middle.\n", __FUNCTION__); + while(step->next && edf_higher_prio(step->next->owner, tasklet->owner)) { step = step->next; } // insert tasklet right before step->next. - TRACE("%s: tasklet belongs at end. inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1); + TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1); tasklet->next = step->next; step->next = tasklet; @@ -540,9 +703,10 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) step = cluster->pending_tasklets.head; TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); while(step != NULL){ - TRACE("%s: %d\n", __FUNCTION__, step->owner); + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); step = step->next; } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); TRACE("%s: done.\n", __FUNCTION__); // TODO: Maintain this list in priority order. @@ -553,37 +717,89 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) int enqueue_pai_tasklet(struct tasklet_struct* tasklet) { - cedf_domain_t* cluster = task_cpu_cluster(tasklet->owner); - cpu_entry_t *lowest; + cedf_domain_t *cluster = NULL; + cpu_entry_t *targetCPU = NULL; + int thisCPU; + int runLocal = 0; + int runNow = 0; unsigned long flags; if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner))) { TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); - BUG(); + return 0; } + cluster = task_cpu_cluster(tasklet->owner); + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); - lowest = lowest_prio_cpu(cluster); - if (edf_higher_prio(tasklet->owner, lowest->linked)) { - if (smp_processor_id() == lowest->cpu) { - TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__); - // execute the tasklet now. - __do_lit_tasklet(tasklet); + thisCPU = smp_processor_id(); + +#if 1 +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = NULL; + + // use this CPU if it is in our cluster and isn't running any RT work. + if(cpu_isset(thisCPU, *cluster->cpu_map) && (__get_cpu_var(cedf_cpu_entries).linked == NULL)) { + affinity = &(__get_cpu_var(cedf_cpu_entries)); } else { - // preempt the lowest CPU - __add_pai_tasklet(tasklet, cluster); - - TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, lowest->cpu); - - preempt(lowest); + // this CPU is busy or shouldn't run tasklet in this cluster. 
+ // look for available near by CPUs. + // NOTE: Affinity towards owner and not this CPU. Is this right? + affinity = + cedf_get_nearest_available_cpu(cluster, + &per_cpu(cedf_cpu_entries, task_cpu(tasklet->owner))); } + + targetCPU = affinity; + } +#endif +#endif + + if (targetCPU == NULL) { + targetCPU = lowest_prio_cpu(cluster); + } + + if (edf_higher_prio(tasklet->owner, targetCPU->linked)) { + if (thisCPU == targetCPU->cpu) { + TRACE("%s: Run tasklet locally (and now).\n", __FUNCTION__); + runLocal = 1; + runNow = 1; + } + else { + TRACE("%s: Run tasklet remotely (and now).\n", __FUNCTION__); + runLocal = 0; + runNow = 1; + } + } + else { + runLocal = 0; + runNow = 0; + } + + if(!runLocal) { + // enqueue the tasklet + __add_pai_tasklet(tasklet, cluster); } raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); + + if (runLocal /*&& runNow */) { // runNow == 1 is implied + TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__); + __do_lit_tasklet(tasklet, 0ul); + } + else if (runNow /*&& !runLocal */) { // runLocal == 0 is implied + TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, targetCPU->cpu); + preempt(targetCPU); // need to be protected by cedf_lock? + } + else { + TRACE("%s: Scheduling of tasklet was deferred.\n", __FUNCTION__); + } + return(1); // success } @@ -721,9 +937,14 @@ static struct task_struct* cedf_schedule(struct task_struct * prev) raw_spin_unlock(&cluster->cedf_lock); + /* #ifdef CONFIG_LITMUS_PAI_SOFTIRQD - do_lit_tasklets(cluster, next); -#endif + if(cluster->pending_tasklets.head != NULL) // peak at data. normally locked with cluster->cedf_lock + { + do_lit_tasklets(cluster, next); + } +#endif +*/ #ifdef WANT_ALL_SCHED_EVENTS TRACE("cedf_lock released, next=0x%p\n", next); @@ -865,6 +1086,10 @@ static void cedf_task_exit(struct task_struct * t) } raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); } @@ -1684,8 +1909,9 @@ static long cedf_activate_plugin(void) #ifdef CONFIG_LITMUS_PAI_SOFTIRQD + raw_spin_lock_init(&(cedf[i].tasklet_lock)); cedf[i].pending_tasklets.head = NULL; - cedf[i].pending_tasklets.tail = &cedf[i].pending_tasklets.head; + cedf[i].pending_tasklets.tail = &(cedf[i].pending_tasklets.head); #endif @@ -1803,6 +2029,7 @@ static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = { #endif #ifdef CONFIG_LITMUS_PAI_SOFTIRQD .enqueue_pai_tasklet = enqueue_pai_tasklet, + .run_tasklets = run_tasklets, #endif }; diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c index e393d749baf5..d977e80aa32f 100644 --- a/litmus/sched_plugin.c +++ b/litmus/sched_plugin.c @@ -155,9 +155,14 @@ static void litmus_dummy_clear_prio_inh_klitirqd(struct task_struct* klitirqd, #ifdef CONFIG_LITMUS_PAI_SOFTIRQD static int litmus_dummy_enqueue_pai_tasklet(struct tasklet_struct* t) { - TRACE("PAI Tasklet unsupported in this plugin!!!!!!\n"); + TRACE("%s: PAI Tasklet unsupported in this plugin!!!!!!\n", __FUNCTION__); return(0); // failure. 
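/*
 * enqueue_pai_tasklet() in the plugins above boils down to a three-way
 * decision once a target CPU has been chosen: run the tasklet immediately on
 * this CPU, enqueue it and preempt the remote target, or enqueue it and defer.
 * The sketch below captures just that decision; numeric deadlines (smaller is
 * higher priority) stand in for edf_higher_prio() against the target CPU's
 * linked task, and an idle target can be modeled as a very large deadline.
 */
#include <stdio.h>

enum pai_action { PAI_RUN_LOCAL, PAI_PREEMPT_REMOTE, PAI_DEFER };

/* Stand-in priority test: earlier deadline wins. */
static int higher_prio(long a_deadline, long b_deadline)
{
        return a_deadline < b_deadline;
}

/* PAI_RUN_LOCAL: run right here without enqueueing.
 * PAI_PREEMPT_REMOTE: enqueue, then preempt() the target CPU.
 * PAI_DEFER: enqueue only; a later scheduling point will drain it. */
static enum pai_action classify(long owner_deadline, long target_linked_deadline,
                                int this_cpu, int target_cpu)
{
        if (!higher_prio(owner_deadline, target_linked_deadline))
                return PAI_DEFER;               /* the owner loses: deferred */
        if (this_cpu == target_cpu)
                return PAI_RUN_LOCAL;           /* run the tasklet right here */
        return PAI_PREEMPT_REMOTE;              /* wake up the remote CPU */
}

int main(void)
{
        printf("%d\n", classify(10, 50, 0, 0)); /* PAI_RUN_LOCAL */
        printf("%d\n", classify(10, 50, 0, 3)); /* PAI_PREEMPT_REMOTE */
        printf("%d\n", classify(90, 50, 0, 3)); /* PAI_DEFER */
        return 0;
}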
} + +static void litmus_dummy_run_tasklets(struct task_struct* t) +{ + //TRACE("%s: PAI Tasklet unsupported in this plugin!!!!!!\n", __FUNCTION__); +} #endif @@ -187,6 +192,7 @@ struct sched_plugin linux_sched_plugin = { #endif #ifdef CONFIG_LITMUS_PAI_SOFTIRQD .enqueue_pai_tasklet = litmus_dummy_enqueue_pai_tasklet, + .run_tasklets = litmus_dummy_run_tasklets, #endif .admit_task = litmus_dummy_admit_task }; diff --git a/litmus/sched_trace_external.c b/litmus/sched_trace_external.c index 5b7e6152416a..cf8e1d78aa77 100644 --- a/litmus/sched_trace_external.c +++ b/litmus/sched_trace_external.c @@ -34,15 +34,15 @@ EXPORT_SYMBOL(__sched_trace_work_end_external); void __sched_trace_nv_interrupt_begin_external(u32 device) { - unsigned long _device = device; - sched_trace_nv_interrupt_begin(_device); + //unsigned long _device = device; + sched_trace_nv_interrupt_begin((unsigned long)device); } EXPORT_SYMBOL(__sched_trace_nv_interrupt_begin_external); void __sched_trace_nv_interrupt_end_external(u32 device) { - unsigned long _device = device; - sched_trace_nv_interrupt_end(_device); + //unsigned long _device = device; + sched_trace_nv_interrupt_end((unsigned long)device); } EXPORT_SYMBOL(__sched_trace_nv_interrupt_end_external); -- cgit v1.2.2 From 1a582a2c5e361e01a4c64f185bb1a23c3f70701a Mon Sep 17 00:00:00 2001 From: Glenn Elliott Date: Sat, 14 Jan 2012 16:56:47 -0500 Subject: Port PAI interrupts to GSN-EDF, C-RM/RM-SRT/FIFO. --- litmus/sched_cedf.c | 32 ++-- litmus/sched_cfifo.c | 450 ++++++++++++++++++++++++++++++++++++++++++++++++- litmus/sched_crm.c | 448 +++++++++++++++++++++++++++++++++++++++++++++++- litmus/sched_crm_srt.c | 445 +++++++++++++++++++++++++++++++++++++++++++++++- litmus/sched_gsn_edf.c | 434 ++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 1787 insertions(+), 22 deletions(-) diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c index 4924da21865e..02106f455c0f 100644 --- a/litmus/sched_cedf.c +++ b/litmus/sched_cedf.c @@ -124,7 +124,6 @@ typedef struct clusterdomain { #ifdef CONFIG_LITMUS_PAI_SOFTIRQD - raw_spinlock_t tasklet_lock; struct tasklet_head pending_tasklets; #endif @@ -430,7 +429,7 @@ static void cedf_tick(struct task_struct* t) #ifdef CONFIG_LITMUS_PAI_SOFTIRQD -void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) +static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) { if (!atomic_read(&tasklet->count)) { sched_trace_tasklet_begin(tasklet->owner); @@ -451,7 +450,7 @@ void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) } -void __extract_tasklets(cedf_domain_t* cluster, struct task_struct* task, struct tasklet_head* task_tasklets) +static void __extract_tasklets(cedf_domain_t* cluster, struct task_struct* task, struct tasklet_head* task_tasklets) { struct tasklet_struct* step; struct tasklet_struct* tasklet; @@ -497,7 +496,7 @@ void __extract_tasklets(cedf_domain_t* cluster, struct task_struct* task, struct } } -void flush_tasklets(cedf_domain_t* cluster, struct task_struct* task) +static void flush_tasklets(cedf_domain_t* cluster, struct task_struct* task) { unsigned long flags; struct tasklet_head task_tasklets; @@ -524,18 +523,18 @@ void flush_tasklets(cedf_domain_t* cluster, struct task_struct* task) } -void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task) +static void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task) { int work_to_do = 1; struct tasklet_struct *tasklet = NULL; - struct tasklet_struct *step; + //struct 
tasklet_struct *step; unsigned long flags; while(work_to_do) { // remove tasklet at head of list if it has higher priority. raw_spin_lock_irqsave(&cluster->cedf_lock, flags); - +/* step = cluster->pending_tasklets.head; TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); while(step != NULL){ @@ -544,6 +543,7 @@ void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task) } TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); TRACE("%s: done.\n", __FUNCTION__); + */ if(cluster->pending_tasklets.head != NULL) { @@ -573,6 +573,7 @@ void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task) } + /* step = cluster->pending_tasklets.head; TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); while(step != NULL){ @@ -581,6 +582,7 @@ void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task) } TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); TRACE("%s: done.\n", __FUNCTION__); + */ raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); @@ -598,7 +600,7 @@ void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task) } -void run_tasklets(struct task_struct* sched_task) +static void run_tasklets(struct task_struct* sched_task) { cedf_domain_t* cluster; @@ -641,10 +643,11 @@ void run_tasklets(struct task_struct* sched_task) } -void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) +static void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) { struct tasklet_struct* step; + /* step = cluster->pending_tasklets.head; TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); while(step != NULL){ @@ -653,6 +656,7 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) } TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); TRACE("%s: done.\n", __FUNCTION__); + */ tasklet->next = NULL; // make sure there are no old values floating around @@ -674,7 +678,7 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) } else { - WARN_ON(1 == 1); + //WARN_ON(1 == 1); // insert the tasklet somewhere in the middle. @@ -699,7 +703,7 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) } } - + /* step = cluster->pending_tasklets.head; TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); while(step != NULL){ @@ -707,7 +711,8 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) step = step->next; } TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); - TRACE("%s: done.\n", __FUNCTION__); + TRACE("%s: done.\n", __FUNCTION__); + */ // TODO: Maintain this list in priority order. 
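/*
 * The commented-out TODO above concerns keeping the pending list in priority
 * order, which __add_pai_tasklet() handles with separate head, middle, and
 * tail cases.  The sketch below shows the same highest-priority-first ordering
 * with a comparator parameter, since the C-EDF, C-FIFO, C-RM, C-RM-SRT, and
 * GSN-EDF copies of this insertion logic differ essentially only in whether
 * edf_higher_prio(), fifo_higher_prio(), rm_higher_prio(), or
 * rm_srt_higher_prio() is used; ot_tasklet and edf_like() are stand-ins.
 */
#include <assert.h>
#include <stddef.h>

struct ot_tasklet {                      /* stand-in for struct tasklet_struct */
        struct ot_tasklet *next;
        long deadline;                   /* proxy for the owner's priority */
};

struct ot_head {
        struct ot_tasklet *head;
        struct ot_tasklet **tail;
};

/* Comparator slot for the per-plugin priority test. */
typedef int (*higher_prio_fn)(const struct ot_tasklet *a, const struct ot_tasklet *b);

static int edf_like(const struct ot_tasklet *a, const struct ot_tasklet *b)
{
        return a->deadline < b->deadline;        /* earlier deadline wins */
}

/* Keep the queue sorted highest-priority-first.  The pointer-to-pointer
 * cursor makes head, middle, and tail insertion one code path, so no
 * separate head/tail patch-up is needed. */
static void ordered_insert(struct ot_head *q, struct ot_tasklet *t, higher_prio_fn hp)
{
        struct ot_tasklet **cursor = &q->head;

        while (*cursor && hp(*cursor, t))
                cursor = &(*cursor)->next;       /* skip everything that outranks t */

        t->next = *cursor;
        *cursor = t;
        if (t->next == NULL)
                q->tail = &t->next;              /* t became the new last element */
}

int main(void)
{
        struct ot_head q = { .head = NULL, .tail = &q.head };
        struct ot_tasklet a = { .deadline = 30 }, b = { .deadline = 10 }, c = { .deadline = 20 };

        ordered_insert(&q, &a, edf_like);
        ordered_insert(&q, &b, edf_like);
        ordered_insert(&q, &c, edf_like);
        assert(q.head == &b && b.next == &c && c.next == &a && q.tail == &a.next);
        return 0;
}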
// tasklet->next = NULL; @@ -715,7 +720,7 @@ void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster) // cluster->pending_tasklets.tail = &tasklet->next; } -int enqueue_pai_tasklet(struct tasklet_struct* tasklet) +static int enqueue_pai_tasklet(struct tasklet_struct* tasklet) { cedf_domain_t *cluster = NULL; cpu_entry_t *targetCPU = NULL; @@ -1909,7 +1914,6 @@ static long cedf_activate_plugin(void) #ifdef CONFIG_LITMUS_PAI_SOFTIRQD - raw_spin_lock_init(&(cedf[i].tasklet_lock)); cedf[i].pending_tasklets.head = NULL; cedf[i].pending_tasklets.tail = &(cedf[i].pending_tasklets.head); #endif diff --git a/litmus/sched_cfifo.c b/litmus/sched_cfifo.c index f515446f76ed..689b2dbe5fae 100644 --- a/litmus/sched_cfifo.c +++ b/litmus/sched_cfifo.c @@ -55,6 +55,10 @@ #include #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +#include +#endif + #ifdef CONFIG_LITMUS_NVIDIA #include #endif @@ -91,6 +95,15 @@ DEFINE_PER_CPU(cpu_entry_t, cfifo_cpu_entries); #define test_will_schedule(cpu) \ (atomic_read(&per_cpu(cfifo_cpu_entries, cpu).will_schedule)) + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +struct tasklet_head +{ + struct tasklet_struct *head; + struct tasklet_struct **tail; +}; +#endif + /* * In C-FIFO there is a cfifo domain _per_ cluster * The number of clusters is dynamically determined accordingly to the @@ -108,6 +121,12 @@ typedef struct clusterdomain { struct bheap cpu_heap; /* lock for this cluster */ #define cfifo_lock domain.ready_lock + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + struct tasklet_head pending_tasklets; +#endif + } cfifo_domain_t; /* a cfifo_domain per cluster; allocation is done at init/activation time */ @@ -251,7 +270,7 @@ static void preempt(cpu_entry_t *entry) preempt_if_preemptable(entry->scheduled, entry->cpu); } -/* requeue - Put an unlinked task into gsn-edf domain. +/* requeue - Put an unlinked task into c-fifo domain. * Caller must hold cfifo_lock. */ static noinline void requeue(struct task_struct* task) @@ -395,6 +414,419 @@ static void cfifo_tick(struct task_struct* t) } } + + + + + + + + + + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + + +static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) +{ + if (!atomic_read(&tasklet->count)) { + sched_trace_tasklet_begin(tasklet->owner); + + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) + { + BUG(); + } + TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", __FUNCTION__, tasklet->owner->pid, flushed); + tasklet->func(tasklet->data); + tasklet_unlock(tasklet); + + sched_trace_tasklet_end(tasklet->owner, flushed); + } + else { + BUG(); + } +} + + +static void __extract_tasklets(cfifo_domain_t* cluster, struct task_struct* task, struct tasklet_head* task_tasklets) +{ + struct tasklet_struct* step; + struct tasklet_struct* tasklet; + struct tasklet_struct* prev; + + task_tasklets->head = NULL; + task_tasklets->tail = &(task_tasklets->head); + + prev = NULL; + for(step = cluster->pending_tasklets.head; step != NULL; step = step->next) + { + if(step->owner == task) + { + TRACE("%s: Found tasklet to flush: %d\n", __FUNCTION__, step->owner->pid); + + tasklet = step; + + if(prev) { + prev->next = tasklet->next; + } + else if(cluster->pending_tasklets.head == tasklet) { + // we're at the head. 
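/*
 * __extract_tasklets() above splices every tasklet owned by an exiting task
 * out of the pending queue so flush_tasklets() can run them with the flushed
 * flag set.  Below is an equivalent splice written with a pointer-to-pointer
 * cursor, which makes the head, middle, and tail cases one code path;
 * pq_tasklet / pq_head are stand-in types and owner_pid replaces the
 * task_struct owner pointer.
 */
#include <assert.h>
#include <stddef.h>

struct pq_tasklet {                      /* stand-in for struct tasklet_struct */
        struct pq_tasklet *next;
        int owner_pid;
};

struct pq_head {                         /* mirrors struct tasklet_head */
        struct pq_tasklet *head;
        struct pq_tasklet **tail;
};

/* Move every tasklet owned by `pid` from `q` to `out`, preserving order.
 * Walking with a pointer-to-pointer cursor updates the head and interior
 * links uniformly; the tail is fixed up only if the last element moved. */
static void splice_out_owner(struct pq_head *q, struct pq_head *out, int pid)
{
        struct pq_tasklet **cursor = &q->head;

        out->head = NULL;
        out->tail = &out->head;

        while (*cursor) {
                struct pq_tasklet *t = *cursor;
                if (t->owner_pid == pid) {
                        *cursor = t->next;       /* unlink from the pending queue */
                        if (q->tail == &t->next) /* t was the last element */
                                q->tail = cursor;
                        t->next = NULL;
                        *out->tail = t;          /* append to the private list */
                        out->tail = &t->next;
                } else {
                        cursor = &t->next;       /* keep t, advance */
                }
        }
}

int main(void)
{
        struct pq_tasklet a = { .owner_pid = 1 }, b = { .owner_pid = 2 }, c = { .owner_pid = 1 };
        struct pq_head q = { .head = &a, .tail = &c.next }, mine;

        a.next = &b; b.next = &c; c.next = NULL;
        splice_out_owner(&q, &mine, 1);
        assert(q.head == &b && q.tail == &b.next);           /* only owner 2 remains */
        assert(mine.head == &a && a.next == &c && mine.tail == &c.next);
        return 0;
}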
+ cluster->pending_tasklets.head = tasklet->next; + } + + if(cluster->pending_tasklets.tail == &tasklet) { + // we're at the tail + if(prev) { + cluster->pending_tasklets.tail = &prev; + } + else { + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + } + + tasklet->next = NULL; + *(task_tasklets->tail) = tasklet; + task_tasklets->tail = &(tasklet->next); + } + else { + prev = step; + } + } +} + +static void flush_tasklets(cfifo_domain_t* cluster, struct task_struct* task) +{ + unsigned long flags; + struct tasklet_head task_tasklets; + struct tasklet_struct* step; + + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + __extract_tasklets(cluster, task, &task_tasklets); + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); + + if(cluster->pending_tasklets.head != NULL) { + TRACE("%s: Flushing tasklets for %d...\n", __FUNCTION__, task->pid); + } + + // now execute any flushed tasklets. + for(step = cluster->pending_tasklets.head; step != NULL; /**/) + { + struct tasklet_struct* temp = step->next; + + step->next = NULL; + __do_lit_tasklet(step, 1ul); + + step = temp; + } +} + + +static void do_lit_tasklets(cfifo_domain_t* cluster, struct task_struct* sched_task) +{ + int work_to_do = 1; + struct tasklet_struct *tasklet = NULL; + //struct tasklet_struct *step; + unsigned long flags; + + while(work_to_do) { + // remove tasklet at head of list if it has higher priority. + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + + if(cluster->pending_tasklets.head != NULL) { + // remove tasklet at head. + tasklet = cluster->pending_tasklets.head; + + if(fifo_higher_prio(tasklet->owner, sched_task)) { + + if(NULL == tasklet->next) { + // tasklet is at the head, list only has one element + TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + + // remove the tasklet from the queue + cluster->pending_tasklets.head = tasklet->next; + + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + } + else { + TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id()); + tasklet = NULL; + } + } + else { + TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__); + } + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? 
(*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); + + if(tasklet) { + __do_lit_tasklet(tasklet, 0ul); + tasklet = NULL; + } + else { + work_to_do = 0; + } + } + + //TRACE("%s: exited.\n", __FUNCTION__); +} + + +static void run_tasklets(struct task_struct* sched_task) +{ + cfifo_domain_t* cluster; + +#if 0 + int task_is_rt = is_realtime(sched_task); + cfifo_domain_t* cluster; + + if(is_realtime(sched_task)) { + cluster = task_cpu_cluster(sched_task); + } + else { + cluster = remote_cluster(get_cpu()); + } + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + + do_lit_tasklets(cluster, sched_task); + } + + if(!task_is_rt) { + put_cpu_no_resched(); + } +#else + + preempt_disable(); + + cluster = (is_realtime(sched_task)) ? + task_cpu_cluster(sched_task) : + remote_cluster(smp_processor_id()); + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + do_lit_tasklets(cluster, sched_task); + } + + preempt_enable_no_resched(); + +#endif +} + + +static void __add_pai_tasklet(struct tasklet_struct* tasklet, cfifo_domain_t* cluster) +{ + struct tasklet_struct* step; + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + + tasklet->next = NULL; // make sure there are no old values floating around + + step = cluster->pending_tasklets.head; + if(step == NULL) { + TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid); + // insert at tail. + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &(tasklet->next); + } + else if((*(cluster->pending_tasklets.tail) != NULL) && + fifo_higher_prio((*(cluster->pending_tasklets.tail))->owner, tasklet->owner)) { + // insert at tail. + TRACE("%s: tasklet belongs at end. inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid); + + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &(tasklet->next); + } + else { + + //WARN_ON(1 == 1); + + // insert the tasklet somewhere in the middle. + + TRACE("%s: tasklet belongs somewhere in the middle.\n", __FUNCTION__); + + while(step->next && fifo_higher_prio(step->next->owner, tasklet->owner)) { + step = step->next; + } + + // insert tasklet right before step->next. + + TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1); + + tasklet->next = step->next; + step->next = tasklet; + + // patch up the head if needed. 
+ if(cluster->pending_tasklets.head == step) + { + TRACE("%s: %d is the new tasklet queue head.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.head = tasklet; + } + } + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + // TODO: Maintain this list in priority order. + // tasklet->next = NULL; + // *(cluster->pending_tasklets.tail) = tasklet; + // cluster->pending_tasklets.tail = &tasklet->next; +} + +static int enqueue_pai_tasklet(struct tasklet_struct* tasklet) +{ + cfifo_domain_t *cluster = NULL; + cpu_entry_t *targetCPU = NULL; + int thisCPU; + int runLocal = 0; + int runNow = 0; + unsigned long flags; + + if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + return 0; + } + + cluster = task_cpu_cluster(tasklet->owner); + + raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); + + thisCPU = smp_processor_id(); + +#if 1 +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = NULL; + + // use this CPU if it is in our cluster and isn't running any RT work. + if(cpu_isset(thisCPU, *cluster->cpu_map) && (__get_cpu_var(cfifo_cpu_entries).linked == NULL)) { + affinity = &(__get_cpu_var(cfifo_cpu_entries)); + } + else { + // this CPU is busy or shouldn't run tasklet in this cluster. + // look for available near by CPUs. + // NOTE: Affinity towards owner and not this CPU. Is this right? + affinity = + cfifo_get_nearest_available_cpu(cluster, + &per_cpu(cfifo_cpu_entries, task_cpu(tasklet->owner))); + } + + targetCPU = affinity; + } +#endif +#endif + + if (targetCPU == NULL) { + targetCPU = lowest_prio_cpu(cluster); + } + + if (fifo_higher_prio(tasklet->owner, targetCPU->linked)) { + if (thisCPU == targetCPU->cpu) { + TRACE("%s: Run tasklet locally (and now).\n", __FUNCTION__); + runLocal = 1; + runNow = 1; + } + else { + TRACE("%s: Run tasklet remotely (and now).\n", __FUNCTION__); + runLocal = 0; + runNow = 1; + } + } + else { + runLocal = 0; + runNow = 0; + } + + if(!runLocal) { + // enqueue the tasklet + __add_pai_tasklet(tasklet, cluster); + } + + raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); + + + if (runLocal /*&& runNow */) { // runNow == 1 is implied + TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__); + __do_lit_tasklet(tasklet, 0ul); + } + else if (runNow /*&& !runLocal */) { // runLocal == 0 is implied + TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, targetCPU->cpu); + preempt(targetCPU); // need to be protected by cfifo_lock? + } + else { + TRACE("%s: Scheduling of tasklet was deferred.\n", __FUNCTION__); + } + + return(1); // success +} + + +#endif + + + + + + + + + + + + + + + + + + + + /* Getting schedule() right is a bit tricky. schedule() may not make any * assumptions on the state of the current task since it may be called for a * number of reasons. 
The reasons include a scheduler_tick() determined that it @@ -544,7 +976,7 @@ static void cfifo_task_new(struct task_struct * t, int on_rq, int running) cpu_entry_t* entry; cfifo_domain_t* cluster; - TRACE("gsn edf: task new %d\n", t->pid); + TRACE("cfifo: task new %d\n", t->pid); /* the cluster doesn't change even if t is running */ cluster = task_cpu_cluster(t); @@ -650,6 +1082,10 @@ static void cfifo_task_exit(struct task_struct * t) } raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); } @@ -1467,6 +1903,12 @@ static long cfifo_activate_plugin(void) bheap_init(&(cfifo[i].cpu_heap)); fifo_domain_init(&(cfifo[i].domain), NULL, cfifo_release_jobs); + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + cfifo[i].pending_tasklets.head = NULL; + cfifo[i].pending_tasklets.tail = &(cfifo[i].pending_tasklets.head); +#endif + if(!zalloc_cpumask_var(&cfifo[i].cpu_map, GFP_ATOMIC)) return -ENOMEM; } @@ -1578,6 +2020,10 @@ static struct sched_plugin cfifo_plugin __cacheline_aligned_in_smp = { #ifdef CONFIG_LITMUS_SOFTIRQD .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, +#endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + .enqueue_pai_tasklet = enqueue_pai_tasklet, + .run_tasklets = run_tasklets, #endif }; diff --git a/litmus/sched_crm.c b/litmus/sched_crm.c index 061b29eaff7e..fd7fab982998 100644 --- a/litmus/sched_crm.c +++ b/litmus/sched_crm.c @@ -55,6 +55,10 @@ #include #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +#include +#endif + #ifdef CONFIG_LITMUS_NVIDIA #include #endif @@ -91,6 +95,14 @@ DEFINE_PER_CPU(cpu_entry_t, crm_cpu_entries); #define test_will_schedule(cpu) \ (atomic_read(&per_cpu(crm_cpu_entries, cpu).will_schedule)) +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +struct tasklet_head +{ + struct tasklet_struct *head; + struct tasklet_struct **tail; +}; +#endif + /* * In C-RM there is a crm domain _per_ cluster * The number of clusters is dynamically determined accordingly to the @@ -108,6 +120,10 @@ typedef struct clusterdomain { struct bheap cpu_heap; /* lock for this cluster */ #define crm_lock domain.ready_lock + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + struct tasklet_head pending_tasklets; +#endif } crm_domain_t; /* a crm_domain per cluster; allocation is done at init/activation time */ @@ -251,7 +267,7 @@ static void preempt(cpu_entry_t *entry) preempt_if_preemptable(entry->scheduled, entry->cpu); } -/* requeue - Put an unlinked task into gsn-edf domain. +/* requeue - Put an unlinked task into c-rm domain. * Caller must hold crm_lock. 
*/ static noinline void requeue(struct task_struct* task) @@ -394,6 +410,421 @@ static void crm_tick(struct task_struct* t) } } } + + + + + + + + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + + +static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) +{ + if (!atomic_read(&tasklet->count)) { + sched_trace_tasklet_begin(tasklet->owner); + + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) + { + BUG(); + } + TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", __FUNCTION__, tasklet->owner->pid, flushed); + tasklet->func(tasklet->data); + tasklet_unlock(tasklet); + + sched_trace_tasklet_end(tasklet->owner, flushed); + } + else { + BUG(); + } +} + + +static void __extract_tasklets(crm_domain_t* cluster, struct task_struct* task, struct tasklet_head* task_tasklets) +{ + struct tasklet_struct* step; + struct tasklet_struct* tasklet; + struct tasklet_struct* prev; + + task_tasklets->head = NULL; + task_tasklets->tail = &(task_tasklets->head); + + prev = NULL; + for(step = cluster->pending_tasklets.head; step != NULL; step = step->next) + { + if(step->owner == task) + { + TRACE("%s: Found tasklet to flush: %d\n", __FUNCTION__, step->owner->pid); + + tasklet = step; + + if(prev) { + prev->next = tasklet->next; + } + else if(cluster->pending_tasklets.head == tasklet) { + // we're at the head. + cluster->pending_tasklets.head = tasklet->next; + } + + if(cluster->pending_tasklets.tail == &tasklet) { + // we're at the tail + if(prev) { + cluster->pending_tasklets.tail = &prev; + } + else { + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + } + + tasklet->next = NULL; + *(task_tasklets->tail) = tasklet; + task_tasklets->tail = &(tasklet->next); + } + else { + prev = step; + } + } +} + +static void flush_tasklets(crm_domain_t* cluster, struct task_struct* task) +{ + unsigned long flags; + struct tasklet_head task_tasklets; + struct tasklet_struct* step; + + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + __extract_tasklets(cluster, task, &task_tasklets); + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); + + if(cluster->pending_tasklets.head != NULL) { + TRACE("%s: Flushing tasklets for %d...\n", __FUNCTION__, task->pid); + } + + // now execute any flushed tasklets. + for(step = cluster->pending_tasklets.head; step != NULL; /**/) + { + struct tasklet_struct* temp = step->next; + + step->next = NULL; + __do_lit_tasklet(step, 1ul); + + step = temp; + } +} + + +static void do_lit_tasklets(crm_domain_t* cluster, struct task_struct* sched_task) +{ + int work_to_do = 1; + struct tasklet_struct *tasklet = NULL; + //struct tasklet_struct *step; + unsigned long flags; + + while(work_to_do) { + // remove tasklet at head of list if it has higher priority. + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + if(cluster->pending_tasklets.head != NULL) { + // remove tasklet at head. 
+ tasklet = cluster->pending_tasklets.head; + + if(rm_higher_prio(tasklet->owner, sched_task)) { + + if(NULL == tasklet->next) { + // tasklet is at the head, list only has one element + TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + + // remove the tasklet from the queue + cluster->pending_tasklets.head = tasklet->next; + + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + } + else { + TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id()); + tasklet = NULL; + } + } + else { + TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__); + } + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); + + if(tasklet) { + __do_lit_tasklet(tasklet, 0ul); + tasklet = NULL; + } + else { + work_to_do = 0; + } + } + + //TRACE("%s: exited.\n", __FUNCTION__); +} + + +static void run_tasklets(struct task_struct* sched_task) +{ + crm_domain_t* cluster; + +#if 0 + int task_is_rt = is_realtime(sched_task); + crm_domain_t* cluster; + + if(is_realtime(sched_task)) { + cluster = task_cpu_cluster(sched_task); + } + else { + cluster = remote_cluster(get_cpu()); + } + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + + do_lit_tasklets(cluster, sched_task); + } + + if(!task_is_rt) { + put_cpu_no_resched(); + } +#else + + preempt_disable(); + + cluster = (is_realtime(sched_task)) ? + task_cpu_cluster(sched_task) : + remote_cluster(smp_processor_id()); + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + do_lit_tasklets(cluster, sched_task); + } + + preempt_enable_no_resched(); + +#endif +} + + +static void __add_pai_tasklet(struct tasklet_struct* tasklet, crm_domain_t* cluster) +{ + struct tasklet_struct* step; + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + tasklet->next = NULL; // make sure there are no old values floating around + + step = cluster->pending_tasklets.head; + if(step == NULL) { + TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid); + // insert at tail. + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &(tasklet->next); + } + else if((*(cluster->pending_tasklets.tail) != NULL) && + rm_higher_prio((*(cluster->pending_tasklets.tail))->owner, tasklet->owner)) { + // insert at tail. + TRACE("%s: tasklet belongs at end. 
inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid); + + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &(tasklet->next); + } + else { + + //WARN_ON(1 == 1); + + // insert the tasklet somewhere in the middle. + + TRACE("%s: tasklet belongs somewhere in the middle.\n", __FUNCTION__); + + while(step->next && rm_higher_prio(step->next->owner, tasklet->owner)) { + step = step->next; + } + + // insert tasklet right before step->next. + + TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1); + + tasklet->next = step->next; + step->next = tasklet; + + // patch up the head if needed. + if(cluster->pending_tasklets.head == step) + { + TRACE("%s: %d is the new tasklet queue head.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.head = tasklet; + } + } + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + // TODO: Maintain this list in priority order. + // tasklet->next = NULL; + // *(cluster->pending_tasklets.tail) = tasklet; + // cluster->pending_tasklets.tail = &tasklet->next; +} + +static int enqueue_pai_tasklet(struct tasklet_struct* tasklet) +{ + crm_domain_t *cluster = NULL; + cpu_entry_t *targetCPU = NULL; + int thisCPU; + int runLocal = 0; + int runNow = 0; + unsigned long flags; + + if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + return 0; + } + + cluster = task_cpu_cluster(tasklet->owner); + + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + + thisCPU = smp_processor_id(); + +#if 1 +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = NULL; + + // use this CPU if it is in our cluster and isn't running any RT work. + if(cpu_isset(thisCPU, *cluster->cpu_map) && (__get_cpu_var(crm_cpu_entries).linked == NULL)) { + affinity = &(__get_cpu_var(crm_cpu_entries)); + } + else { + // this CPU is busy or shouldn't run tasklet in this cluster. + // look for available near by CPUs. + // NOTE: Affinity towards owner and not this CPU. Is this right? 
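/*
 * When CONFIG_SCHED_CPU_AFFINITY is set, the code above prefers the local CPU
 * if it is idle and in the owner's cluster, then a CPU near the owner's CPU,
 * and finally the cluster's lowest-priority CPU.  The sketch below encodes
 * only that fallback chain; cpu_slot is a stand-in for cpu_entry_t, a NULL
 * `local` argument models "this CPU is outside the cluster", and `nearest`
 * stands in for the plugin's *_get_nearest_available_cpu() helper.
 */
#include <stdio.h>

#define NO_TASK (-1L)                    /* stand-in for a CPU with nothing linked */

struct cpu_slot {                        /* stand-in for cpu_entry_t */
        int  cpu;
        long linked_deadline;            /* NO_TASK if the CPU is idle */
};

/* Fallback chain for choosing the target CPU of a PAI tasklet. */
static struct cpu_slot *pick_target(struct cpu_slot *local,
                                    struct cpu_slot *nearest,
                                    struct cpu_slot *lowest_prio)
{
        if (local && local->linked_deadline == NO_TASK)
                return local;            /* idle local CPU: cheapest choice */
        if (nearest)
                return nearest;          /* cache-friendly CPU near the owner */
        return lowest_prio;              /* otherwise preempt the weakest CPU */
}

int main(void)
{
        struct cpu_slot busy_local = { 0, 100 }, near_owner = { 2, NO_TASK }, weakest = { 3, 900 };

        printf("target cpu = %d\n", pick_target(&busy_local, &near_owner, &weakest)->cpu); /* 2 */
        printf("target cpu = %d\n", pick_target(&busy_local, NULL, &weakest)->cpu);        /* 3 */
        return 0;
}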
+ affinity = + crm_get_nearest_available_cpu(cluster, + &per_cpu(crm_cpu_entries, task_cpu(tasklet->owner))); + } + + targetCPU = affinity; + } +#endif +#endif + + if (targetCPU == NULL) { + targetCPU = lowest_prio_cpu(cluster); + } + + if (rm_higher_prio(tasklet->owner, targetCPU->linked)) { + if (thisCPU == targetCPU->cpu) { + TRACE("%s: Run tasklet locally (and now).\n", __FUNCTION__); + runLocal = 1; + runNow = 1; + } + else { + TRACE("%s: Run tasklet remotely (and now).\n", __FUNCTION__); + runLocal = 0; + runNow = 1; + } + } + else { + runLocal = 0; + runNow = 0; + } + + if(!runLocal) { + // enqueue the tasklet + __add_pai_tasklet(tasklet, cluster); + } + + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); + + + if (runLocal /*&& runNow */) { // runNow == 1 is implied + TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__); + __do_lit_tasklet(tasklet, 0ul); + } + else if (runNow /*&& !runLocal */) { // runLocal == 0 is implied + TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, targetCPU->cpu); + preempt(targetCPU); // need to be protected by crm_lock? + } + else { + TRACE("%s: Scheduling of tasklet was deferred.\n", __FUNCTION__); + } + + return(1); // success +} + + +#endif + + + + + + + + + + + + + + + + + + + + + + + + + + /* Getting schedule() right is a bit tricky. schedule() may not make any * assumptions on the state of the current task since it may be called for a @@ -544,7 +975,7 @@ static void crm_task_new(struct task_struct * t, int on_rq, int running) cpu_entry_t* entry; crm_domain_t* cluster; - TRACE("gsn edf: task new %d\n", t->pid); + TRACE("crm: task new %d\n", t->pid); /* the cluster doesn't change even if t is running */ cluster = task_cpu_cluster(t); @@ -650,6 +1081,10 @@ static void crm_task_exit(struct task_struct * t) } raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); } @@ -1467,6 +1902,11 @@ static long crm_activate_plugin(void) bheap_init(&(crm[i].cpu_heap)); rm_domain_init(&(crm[i].domain), NULL, crm_release_jobs); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + crm[i].pending_tasklets.head = NULL; + crm[i].pending_tasklets.tail = &(crm[i].pending_tasklets.head); +#endif + if(!zalloc_cpumask_var(&crm[i].cpu_map, GFP_ATOMIC)) return -ENOMEM; } @@ -1578,6 +2018,10 @@ static struct sched_plugin crm_plugin __cacheline_aligned_in_smp = { #ifdef CONFIG_LITMUS_SOFTIRQD .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, +#endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + .enqueue_pai_tasklet = enqueue_pai_tasklet, + .run_tasklets = run_tasklets, #endif }; diff --git a/litmus/sched_crm_srt.c b/litmus/sched_crm_srt.c index 4473f35e64cd..c0004354573d 100644 --- a/litmus/sched_crm_srt.c +++ b/litmus/sched_crm_srt.c @@ -55,6 +55,10 @@ #include #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +#include +#endif + #ifdef CONFIG_LITMUS_NVIDIA #include #endif @@ -91,6 +95,15 @@ DEFINE_PER_CPU(cpu_entry_t, crm_srt_cpu_entries); #define test_will_schedule(cpu) \ (atomic_read(&per_cpu(crm_srt_cpu_entries, cpu).will_schedule)) + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +struct tasklet_head +{ + struct tasklet_struct *head; + struct tasklet_struct **tail; +}; +#endif + /* * In C-RM-SRT there is a crm_srt domain _per_ cluster * The number of clusters is dynamically determined accordingly to the @@ -108,6 +121,12 @@ typedef struct clusterdomain { struct bheap cpu_heap; /* 
lock for this cluster */ #define crm_srt_lock domain.ready_lock + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + struct tasklet_head pending_tasklets; +#endif + } crm_srt_domain_t; /* a crm_srt_domain per cluster; allocation is done at init/activation time */ @@ -251,7 +270,7 @@ static void preempt(cpu_entry_t *entry) preempt_if_preemptable(entry->scheduled, entry->cpu); } -/* requeue - Put an unlinked task into gsn-edf domain. +/* requeue - Put an unlinked task into c-rm-srt domain. * Caller must hold crm_srt_lock. */ static noinline void requeue(struct task_struct* task) @@ -395,6 +414,415 @@ static void crm_srt_tick(struct task_struct* t) } } + + + + + + + + + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + + +static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) +{ + if (!atomic_read(&tasklet->count)) { + sched_trace_tasklet_begin(tasklet->owner); + + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) + { + BUG(); + } + TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", __FUNCTION__, tasklet->owner->pid, flushed); + tasklet->func(tasklet->data); + tasklet_unlock(tasklet); + + sched_trace_tasklet_end(tasklet->owner, flushed); + } + else { + BUG(); + } +} + + +static void __extract_tasklets(crm_srt_domain_t* cluster, struct task_struct* task, struct tasklet_head* task_tasklets) +{ + struct tasklet_struct* step; + struct tasklet_struct* tasklet; + struct tasklet_struct* prev; + + task_tasklets->head = NULL; + task_tasklets->tail = &(task_tasklets->head); + + prev = NULL; + for(step = cluster->pending_tasklets.head; step != NULL; step = step->next) + { + if(step->owner == task) + { + TRACE("%s: Found tasklet to flush: %d\n", __FUNCTION__, step->owner->pid); + + tasklet = step; + + if(prev) { + prev->next = tasklet->next; + } + else if(cluster->pending_tasklets.head == tasklet) { + // we're at the head. + cluster->pending_tasklets.head = tasklet->next; + } + + if(cluster->pending_tasklets.tail == &tasklet) { + // we're at the tail + if(prev) { + cluster->pending_tasklets.tail = &prev; + } + else { + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + } + + tasklet->next = NULL; + *(task_tasklets->tail) = tasklet; + task_tasklets->tail = &(tasklet->next); + } + else { + prev = step; + } + } +} + +static void flush_tasklets(crm_srt_domain_t* cluster, struct task_struct* task) +{ + unsigned long flags; + struct tasklet_head task_tasklets; + struct tasklet_struct* step; + + raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + __extract_tasklets(cluster, task, &task_tasklets); + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); + + if(cluster->pending_tasklets.head != NULL) { + TRACE("%s: Flushing tasklets for %d...\n", __FUNCTION__, task->pid); + } + + // now execute any flushed tasklets. + for(step = cluster->pending_tasklets.head; step != NULL; /**/) + { + struct tasklet_struct* temp = step->next; + + step->next = NULL; + __do_lit_tasklet(step, 1ul); + + step = temp; + } +} + + +static void do_lit_tasklets(crm_srt_domain_t* cluster, struct task_struct* sched_task) +{ + int work_to_do = 1; + struct tasklet_struct *tasklet = NULL; + //struct tasklet_struct *step; + unsigned long flags; + + while(work_to_do) { + // remove tasklet at head of list if it has higher priority. 
+ raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + if(cluster->pending_tasklets.head != NULL) { + // remove tasklet at head. + tasklet = cluster->pending_tasklets.head; + + if(rm_srt_higher_prio(tasklet->owner, sched_task)) { + + if(NULL == tasklet->next) { + // tasklet is at the head, list only has one element + TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); + } + + // remove the tasklet from the queue + cluster->pending_tasklets.head = tasklet->next; + + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + } + else { + TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id()); + tasklet = NULL; + } + } + else { + TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__); + } + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); + + if(tasklet) { + __do_lit_tasklet(tasklet, 0ul); + tasklet = NULL; + } + else { + work_to_do = 0; + } + } + + //TRACE("%s: exited.\n", __FUNCTION__); +} + + +static void run_tasklets(struct task_struct* sched_task) +{ + crm_srt_domain_t* cluster; + +#if 0 + int task_is_rt = is_realtime(sched_task); + crm_srt_domain_t* cluster; + + if(is_realtime(sched_task)) { + cluster = task_cpu_cluster(sched_task); + } + else { + cluster = remote_cluster(get_cpu()); + } + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + + do_lit_tasklets(cluster, sched_task); + } + + if(!task_is_rt) { + put_cpu_no_resched(); + } +#else + + preempt_disable(); + + cluster = (is_realtime(sched_task)) ? + task_cpu_cluster(sched_task) : + remote_cluster(smp_processor_id()); + + if(cluster && cluster->pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + do_lit_tasklets(cluster, sched_task); + } + + preempt_enable_no_resched(); + +#endif +} + + +static void __add_pai_tasklet(struct tasklet_struct* tasklet, crm_srt_domain_t* cluster) +{ + struct tasklet_struct* step; + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? 
(*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + tasklet->next = NULL; // make sure there are no old values floating around + + step = cluster->pending_tasklets.head; + if(step == NULL) { + TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid); + // insert at tail. + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &(tasklet->next); + } + else if((*(cluster->pending_tasklets.tail) != NULL) && + rm_srt_higher_prio((*(cluster->pending_tasklets.tail))->owner, tasklet->owner)) { + // insert at tail. + TRACE("%s: tasklet belongs at end. inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid); + + *(cluster->pending_tasklets.tail) = tasklet; + cluster->pending_tasklets.tail = &(tasklet->next); + } + else { + + //WARN_ON(1 == 1); + + // insert the tasklet somewhere in the middle. + + TRACE("%s: tasklet belongs somewhere in the middle.\n", __FUNCTION__); + + while(step->next && rm_srt_higher_prio(step->next->owner, tasklet->owner)) { + step = step->next; + } + + // insert tasklet right before step->next. + + TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1); + + tasklet->next = step->next; + step->next = tasklet; + + // patch up the head if needed. + if(cluster->pending_tasklets.head == step) + { + TRACE("%s: %d is the new tasklet queue head.\n", __FUNCTION__, tasklet->owner->pid); + cluster->pending_tasklets.head = tasklet; + } + } + + /* + step = cluster->pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + // TODO: Maintain this list in priority order. + // tasklet->next = NULL; + // *(cluster->pending_tasklets.tail) = tasklet; + // cluster->pending_tasklets.tail = &tasklet->next; +} + +static int enqueue_pai_tasklet(struct tasklet_struct* tasklet) +{ + crm_srt_domain_t *cluster = NULL; + cpu_entry_t *targetCPU = NULL; + int thisCPU; + int runLocal = 0; + int runNow = 0; + unsigned long flags; + + if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + return 0; + } + + cluster = task_cpu_cluster(tasklet->owner); + + raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); + + thisCPU = smp_processor_id(); + +#if 1 +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = NULL; + + // use this CPU if it is in our cluster and isn't running any RT work. + if(cpu_isset(thisCPU, *cluster->cpu_map) && (__get_cpu_var(crm_srt_cpu_entries).linked == NULL)) { + affinity = &(__get_cpu_var(crm_srt_cpu_entries)); + } + else { + // this CPU is busy or shouldn't run tasklet in this cluster. + // look for available near by CPUs. + // NOTE: Affinity towards owner and not this CPU. Is this right? 
+ affinity = + crm_srt_get_nearest_available_cpu(cluster, + &per_cpu(crm_srt_cpu_entries, task_cpu(tasklet->owner))); + } + + targetCPU = affinity; + } +#endif +#endif + + if (targetCPU == NULL) { + targetCPU = lowest_prio_cpu(cluster); + } + + if (rm_srt_higher_prio(tasklet->owner, targetCPU->linked)) { + if (thisCPU == targetCPU->cpu) { + TRACE("%s: Run tasklet locally (and now).\n", __FUNCTION__); + runLocal = 1; + runNow = 1; + } + else { + TRACE("%s: Run tasklet remotely (and now).\n", __FUNCTION__); + runLocal = 0; + runNow = 1; + } + } + else { + runLocal = 0; + runNow = 0; + } + + if(!runLocal) { + // enqueue the tasklet + __add_pai_tasklet(tasklet, cluster); + } + + raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); + + + if (runLocal /*&& runNow */) { // runNow == 1 is implied + TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__); + __do_lit_tasklet(tasklet, 0ul); + } + else if (runNow /*&& !runLocal */) { // runLocal == 0 is implied + TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, targetCPU->cpu); + preempt(targetCPU); // need to be protected by crm_srt_lock? + } + else { + TRACE("%s: Scheduling of tasklet was deferred.\n", __FUNCTION__); + } + + return(1); // success +} + + +#endif + + + + + + + + + + + + + + + + + + + /* Getting schedule() right is a bit tricky. schedule() may not make any * assumptions on the state of the current task since it may be called for a * number of reasons. The reasons include a scheduler_tick() determined that it @@ -544,7 +972,7 @@ static void crm_srt_task_new(struct task_struct * t, int on_rq, int running) cpu_entry_t* entry; crm_srt_domain_t* cluster; - TRACE("gsn edf: task new %d\n", t->pid); + TRACE("crm srt: task new %d\n", t->pid); /* the cluster doesn't change even if t is running */ cluster = task_cpu_cluster(t); @@ -650,6 +1078,10 @@ static void crm_srt_task_exit(struct task_struct * t) } raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); } @@ -1467,6 +1899,11 @@ static long crm_srt_activate_plugin(void) bheap_init(&(crm_srt[i].cpu_heap)); rm_srt_domain_init(&(crm_srt[i].domain), NULL, crm_srt_release_jobs); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + crm_srt[i].pending_tasklets.head = NULL; + crm_srt[i].pending_tasklets.tail = &(crm_srt[i].pending_tasklets.head); +#endif + if(!zalloc_cpumask_var(&crm_srt[i].cpu_map, GFP_ATOMIC)) return -ENOMEM; } @@ -1578,6 +2015,10 @@ static struct sched_plugin crm_srt_plugin __cacheline_aligned_in_smp = { #ifdef CONFIG_LITMUS_SOFTIRQD .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, +#endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + .enqueue_pai_tasklet = enqueue_pai_tasklet, + .run_tasklets = run_tasklets, #endif }; diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c index ac7685fe69f0..b40ff7ba4f0e 100644 --- a/litmus/sched_gsn_edf.c +++ b/litmus/sched_gsn_edf.c @@ -35,6 +35,10 @@ #include #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +#include +#endif + #ifdef CONFIG_LITMUS_NVIDIA #include #endif @@ -126,6 +130,16 @@ static struct bheap gsnedf_cpu_heap; static rt_domain_t gsnedf; #define gsnedf_lock (gsnedf.ready_lock) +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +struct tasklet_head +{ + struct tasklet_struct *head; + struct tasklet_struct **tail; +}; + +struct tasklet_head gsnedf_pending_tasklets; +#endif + /* Uncomment this if you want to see all scheduling 
decisions in the * TRACE() log. @@ -393,6 +407,410 @@ static void gsnedf_tick(struct task_struct* t) } } + + + + + + + + + + + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + + +static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) +{ + if (!atomic_read(&tasklet->count)) { + sched_trace_tasklet_begin(tasklet->owner); + + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) + { + BUG(); + } + TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", __FUNCTION__, tasklet->owner->pid, flushed); + tasklet->func(tasklet->data); + tasklet_unlock(tasklet); + + sched_trace_tasklet_end(tasklet->owner, flushed); + } + else { + BUG(); + } +} + + +static void __extract_tasklets(struct task_struct* task, struct tasklet_head* task_tasklets) +{ + struct tasklet_struct* step; + struct tasklet_struct* tasklet; + struct tasklet_struct* prev; + + task_tasklets->head = NULL; + task_tasklets->tail = &(task_tasklets->head); + + prev = NULL; + for(step = gsnedf_pending_tasklets.head; step != NULL; step = step->next) + { + if(step->owner == task) + { + TRACE("%s: Found tasklet to flush: %d\n", __FUNCTION__, step->owner->pid); + + tasklet = step; + + if(prev) { + prev->next = tasklet->next; + } + else if(gsnedf_pending_tasklets.head == tasklet) { + // we're at the head. + gsnedf_pending_tasklets.head = tasklet->next; + } + + if(gsnedf_pending_tasklets.tail == &tasklet) { + // we're at the tail + if(prev) { + gsnedf_pending_tasklets.tail = &prev; + } + else { + gsnedf_pending_tasklets.tail = &(gsnedf_pending_tasklets.head); + } + } + + tasklet->next = NULL; + *(task_tasklets->tail) = tasklet; + task_tasklets->tail = &(tasklet->next); + } + else { + prev = step; + } + } +} + +static void flush_tasklets(struct task_struct* task) +{ + unsigned long flags; + struct tasklet_head task_tasklets; + struct tasklet_struct* step; + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + __extract_tasklets(task, &task_tasklets); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + if(gsnedf_pending_tasklets.head != NULL) { + TRACE("%s: Flushing tasklets for %d...\n", __FUNCTION__, task->pid); + } + + // now execute any flushed tasklets. + for(step = gsnedf_pending_tasklets.head; step != NULL; /**/) + { + struct tasklet_struct* temp = step->next; + + step->next = NULL; + __do_lit_tasklet(step, 1ul); + + step = temp; + } +} + + +static void do_lit_tasklets(struct task_struct* sched_task) +{ + int work_to_do = 1; + struct tasklet_struct *tasklet = NULL; + //struct tasklet_struct *step; + unsigned long flags; + + while(work_to_do) { + // remove tasklet at head of list if it has higher priority. + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + /* + step = gsnedf_pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(gsnedf_pending_tasklets.tail), (*(gsnedf_pending_tasklets.tail) != NULL) ? (*(gsnedf_pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + + if(gsnedf_pending_tasklets.head != NULL) { + // remove tasklet at head. 
+ tasklet = gsnedf_pending_tasklets.head; + + if(edf_higher_prio(tasklet->owner, sched_task)) { + + if(NULL == tasklet->next) { + // tasklet is at the head, list only has one element + TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + gsnedf_pending_tasklets.tail = &(gsnedf_pending_tasklets.head); + } + + // remove the tasklet from the queue + gsnedf_pending_tasklets.head = tasklet->next; + + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + } + else { + TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id()); + tasklet = NULL; + } + } + else { + TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__); + } + + + /* + step = gsnedf_pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(gsnedf_pending_tasklets.tail), (*(gsnedf_pending_tasklets.tail) != NULL) ? (*(gsnedf_pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + if(tasklet) { + __do_lit_tasklet(tasklet, 0ul); + tasklet = NULL; + } + else { + work_to_do = 0; + } + } + + //TRACE("%s: exited.\n", __FUNCTION__); +} + + +static void run_tasklets(struct task_struct* sched_task) +{ +#if 0 + int task_is_rt = is_realtime(sched_task); + cedf_domain_t* cluster; + + if(is_realtime(sched_task)) { + cluster = task_cpu_cluster(sched_task); + } + else { + cluster = remote_cluster(get_cpu()); + } + + if(cluster && gsnedf_pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + + do_lit_tasklets(cluster, sched_task); + } + + if(!task_is_rt) { + put_cpu_no_resched(); + } +#else + + preempt_disable(); + + if(gsnedf_pending_tasklets.head != NULL) { + TRACE("%s: There are tasklets to process.\n", __FUNCTION__); + do_lit_tasklets(sched_task); + } + + preempt_enable_no_resched(); + +#endif +} + + +static void __add_pai_tasklet(struct tasklet_struct* tasklet) +{ + struct tasklet_struct* step; + + /* + step = gsnedf_pending_tasklets.head; + TRACE("%s: (BEFORE) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(gsnedf_pending_tasklets.tail), (*(gsnedf_pending_tasklets.tail) != NULL) ? (*(gsnedf_pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + + tasklet->next = NULL; // make sure there are no old values floating around + + step = gsnedf_pending_tasklets.head; + if(step == NULL) { + TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid); + // insert at tail. + *(gsnedf_pending_tasklets.tail) = tasklet; + gsnedf_pending_tasklets.tail = &(tasklet->next); + } + else if((*(gsnedf_pending_tasklets.tail) != NULL) && + edf_higher_prio((*(gsnedf_pending_tasklets.tail))->owner, tasklet->owner)) { + // insert at tail. + TRACE("%s: tasklet belongs at end. inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid); + + *(gsnedf_pending_tasklets.tail) = tasklet; + gsnedf_pending_tasklets.tail = &(tasklet->next); + } + else { + + //WARN_ON(1 == 1); + + // insert the tasklet somewhere in the middle. 
+ + TRACE("%s: tasklet belongs somewhere in the middle.\n", __FUNCTION__); + + while(step->next && edf_higher_prio(step->next->owner, tasklet->owner)) { + step = step->next; + } + + // insert tasklet right before step->next. + + TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1); + + tasklet->next = step->next; + step->next = tasklet; + + // patch up the head if needed. + if(gsnedf_pending_tasklets.head == step) + { + TRACE("%s: %d is the new tasklet queue head.\n", __FUNCTION__, tasklet->owner->pid); + gsnedf_pending_tasklets.head = tasklet; + } + } + + /* + step = gsnedf_pending_tasklets.head; + TRACE("%s: (AFTER) dumping tasklet queue...\n", __FUNCTION__); + while(step != NULL){ + TRACE("%s: %p (%d)\n", __FUNCTION__, step, step->owner->pid); + step = step->next; + } + TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(gsnedf_pending_tasklets.tail), (*(gsnedf_pending_tasklets.tail) != NULL) ? (*(gsnedf_pending_tasklets.tail))->owner->pid : -1); + TRACE("%s: done.\n", __FUNCTION__); + */ + + // TODO: Maintain this list in priority order. + // tasklet->next = NULL; + // *(gsnedf_pending_tasklets.tail) = tasklet; + // gsnedf_pending_tasklets.tail = &tasklet->next; +} + +static int enqueue_pai_tasklet(struct tasklet_struct* tasklet) +{ + cpu_entry_t *targetCPU = NULL; + int thisCPU; + int runLocal = 0; + int runNow = 0; + unsigned long flags; + + if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner))) + { + TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__); + return 0; + } + + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + thisCPU = smp_processor_id(); + +#if 1 +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t* affinity = NULL; + + // use this CPU if it is in our cluster and isn't running any RT work. + if( +#ifdef CONFIG_RELEASE_MASTER + (thisCPU != gsnedf.release_master) && +#endif + (__get_cpu_var(gsnedf_cpu_entries).linked == NULL)) { + affinity = &(__get_cpu_var(gsnedf_cpu_entries)); + } + else { + // this CPU is busy or shouldn't run tasklet in this cluster. + // look for available near by CPUs. + // NOTE: Affinity towards owner and not this CPU. Is this right? + affinity = + gsnedf_get_nearest_available_cpu( + &per_cpu(gsnedf_cpu_entries, task_cpu(tasklet->owner))); + } + + targetCPU = affinity; + } +#endif +#endif + + if (targetCPU == NULL) { + targetCPU = lowest_prio_cpu(); + } + + if (edf_higher_prio(tasklet->owner, targetCPU->linked)) { + if (thisCPU == targetCPU->cpu) { + TRACE("%s: Run tasklet locally (and now).\n", __FUNCTION__); + runLocal = 1; + runNow = 1; + } + else { + TRACE("%s: Run tasklet remotely (and now).\n", __FUNCTION__); + runLocal = 0; + runNow = 1; + } + } + else { + runLocal = 0; + runNow = 0; + } + + if(!runLocal) { + // enqueue the tasklet + __add_pai_tasklet(tasklet); + } + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + + if (runLocal /*&& runNow */) { // runNow == 1 is implied + TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__); + __do_lit_tasklet(tasklet, 0ul); + } + else if (runNow /*&& !runLocal */) { // runLocal == 0 is implied + TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, targetCPU->cpu); + preempt(targetCPU); // need to be protected by cedf_lock? + } + else { + TRACE("%s: Scheduling of tasklet was deferred.\n", __FUNCTION__); + } + + return(1); // success +} + + +#endif + + + + + + + + + + + + + /* Getting schedule() right is a bit tricky. 
schedule() may not make any * assumptions on the state of the current task since it may be called for a * number of reasons. The reasons include a scheduler_tick() determined that it @@ -592,7 +1010,7 @@ static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) static void gsnedf_task_wake_up(struct task_struct *task) { unsigned long flags; - lt_t now; + //lt_t now; TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); @@ -660,6 +1078,10 @@ static void gsnedf_task_exit(struct task_struct * t) } raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(t); +#endif + BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); } @@ -1602,6 +2024,11 @@ static long gsnedf_activate_plugin(void) } #endif } + +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + gsnedf_pending_tasklets.head = NULL; + gsnedf_pending_tasklets.tail = &(gsnedf_pending_tasklets.head); +#endif #ifdef CONFIG_LITMUS_SOFTIRQD spawn_klitirqd(NULL); @@ -1636,7 +2063,10 @@ static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { .set_prio_inh_klitirqd = set_priority_inheritance_klitirqd, .clear_prio_inh_klitirqd = clear_priority_inheritance_klitirqd, #endif - +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + .enqueue_pai_tasklet = enqueue_pai_tasklet, + .run_tasklets = run_tasklets, +#endif }; -- cgit v1.2.2 From f5264e2cb8213dad425cb2d2db564edbc443a51a Mon Sep 17 00:00:00 2001 From: Glenn Elliott Date: Fri, 20 Jan 2012 11:09:15 -0500 Subject: Fix bugs in tracing and PAI handling --- include/litmus/sched_plugin.h | 2 -- include/litmus/trace.h | 6 ++++ litmus/sched_cedf.c | 71 +++++++++++++++++++++++++++++++++--------- litmus/sched_cfifo.c | 14 ++++++--- litmus/sched_crm.c | 72 ++++++++++++++++++++++++++++++++++--------- litmus/sched_crm_srt.c | 14 ++++++--- litmus/sched_gsn_edf.c | 15 ++++++--- 7 files changed, 151 insertions(+), 43 deletions(-) diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h index 3fc64f832fef..8fdf05dd7cd3 100644 --- a/include/litmus/sched_plugin.h +++ b/include/litmus/sched_plugin.h @@ -11,9 +11,7 @@ #include #endif -#ifdef CONFIG_LITMUS_PAI_SOFTIRQD #include -#endif /************************ setup/tear down ********************/ diff --git a/include/litmus/trace.h b/include/litmus/trace.h index aa3ee4a6757b..09d409b60268 100644 --- a/include/litmus/trace.h +++ b/include/litmus/trace.h @@ -114,4 +114,10 @@ feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu) #endif +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD +#define TS_NV_SCHED_BOTISR_START TIMESTAMP(206) +#define TS_NV_SCHED_BOTISR_END TIMESTAMP(207) +#endif + + #endif /* !_SYS_TRACE_H_ */ diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c index 02106f455c0f..9aa5822c3834 100644 --- a/litmus/sched_cedf.c +++ b/litmus/sched_cedf.c @@ -57,6 +57,7 @@ #ifdef CONFIG_LITMUS_PAI_SOFTIRQD #include +#include #endif #ifdef CONFIG_LITMUS_NVIDIA @@ -432,17 +433,24 @@ static void cedf_tick(struct task_struct* t) static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) { if (!atomic_read(&tasklet->count)) { - sched_trace_tasklet_begin(tasklet->owner); + if(tasklet->owner) { + sched_trace_tasklet_begin(tasklet->owner); + } if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) { BUG(); } - TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", __FUNCTION__, tasklet->owner->pid, flushed); + TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", + __FUNCTION__, + (tasklet->owner) ? 
tasklet->owner->pid : -1, + (tasklet->owner) ? 0 : 1); tasklet->func(tasklet->data); tasklet_unlock(tasklet); - - sched_trace_tasklet_end(tasklet->owner, flushed); + + if(tasklet->owner) { + sched_trace_tasklet_end(tasklet->owner, flushed); + } } else { BUG(); @@ -498,6 +506,7 @@ static void __extract_tasklets(cedf_domain_t* cluster, struct task_struct* task, static void flush_tasklets(cedf_domain_t* cluster, struct task_struct* task) { +#if 0 unsigned long flags; struct tasklet_head task_tasklets; struct tasklet_struct* step; @@ -520,6 +529,27 @@ static void flush_tasklets(cedf_domain_t* cluster, struct task_struct* task) step = temp; } +#endif + + // lazy flushing. + // just change ownership to NULL and let an idle processor + // take care of it. :P + + struct tasklet_struct* step; + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->cedf_lock, flags); + + for(step = cluster->pending_tasklets.head; step != NULL; step = step->next) + { + if(step->owner == task) + { + TRACE("%s: Found tasklet to flush: %d\n", __FUNCTION__, step->owner->pid); + step->owner = NULL; + } + } + + raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); } @@ -531,6 +561,9 @@ static void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_ta unsigned long flags; while(work_to_do) { + + TS_NV_SCHED_BOTISR_START; + // remove tasklet at head of list if it has higher priority. raw_spin_lock_irqsave(&cluster->cedf_lock, flags); @@ -544,7 +577,6 @@ static void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_ta TRACE("%s: tail = %p (%d)\n", __FUNCTION__, *(cluster->pending_tasklets.tail), (*(cluster->pending_tasklets.tail) != NULL) ? (*(cluster->pending_tasklets.tail))->owner->pid : -1); TRACE("%s: done.\n", __FUNCTION__); */ - if(cluster->pending_tasklets.head != NULL) { // remove tasklet at head. @@ -554,17 +586,17 @@ static void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_ta if(NULL == tasklet->next) { // tasklet is at the head, list only has one element - TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, (tasklet->owner) ? tasklet->owner->pid : -1); cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); } // remove the tasklet from the queue cluster->pending_tasklets.head = tasklet->next; - TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, (tasklet->owner) ? tasklet->owner->pid : -1); } else { - TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id()); + TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, (tasklet->owner) ? tasklet->owner->pid : -1, smp_processor_id()); tasklet = NULL; } } @@ -584,9 +616,11 @@ static void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_ta TRACE("%s: done.\n", __FUNCTION__); */ - raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); + + TS_NV_SCHED_BOTISR_END; + if(tasklet) { __do_lit_tasklet(tasklet, 0ul); tasklet = NULL; @@ -690,7 +724,16 @@ static void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* clu // insert tasklet right before step->next. - TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? 
step->next->owner->pid : -1); + TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, + tasklet->owner->pid, + (step->owner) ? + step->owner->pid : + -1, + (step->next) ? + ((step->next->owner) ? + step->next->owner->pid : + -1) : + -1); tasklet->next = step->next; step->next = tasklet; @@ -1080,6 +1123,10 @@ static void cedf_task_exit(struct task_struct * t) unsigned long flags; cedf_domain_t *cluster = task_cpu_cluster(t); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + /* unlink if necessary */ raw_spin_lock_irqsave(&cluster->cedf_lock, flags); unlink(t); @@ -1091,10 +1138,6 @@ static void cedf_task_exit(struct task_struct * t) } raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); -#ifdef CONFIG_LITMUS_PAI_SOFTIRQD - flush_tasklets(cluster, t); -#endif - BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); } diff --git a/litmus/sched_cfifo.c b/litmus/sched_cfifo.c index 689b2dbe5fae..7fbdec3f1d15 100644 --- a/litmus/sched_cfifo.c +++ b/litmus/sched_cfifo.c @@ -57,6 +57,7 @@ #ifdef CONFIG_LITMUS_PAI_SOFTIRQD #include +#include #endif #ifdef CONFIG_LITMUS_NVIDIA @@ -530,6 +531,9 @@ static void do_lit_tasklets(cfifo_domain_t* cluster, struct task_struct* sched_t unsigned long flags; while(work_to_do) { + + TS_NV_SCHED_BOTISR_START; + // remove tasklet at head of list if it has higher priority. raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); @@ -584,6 +588,8 @@ static void do_lit_tasklets(cfifo_domain_t* cluster, struct task_struct* sched_t raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); + TS_NV_SCHED_BOTISR_END; + if(tasklet) { __do_lit_tasklet(tasklet, 0ul); tasklet = NULL; @@ -1071,6 +1077,10 @@ static void cfifo_task_exit(struct task_struct * t) unsigned long flags; cfifo_domain_t *cluster = task_cpu_cluster(t); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + /* unlink if necessary */ raw_spin_lock_irqsave(&cluster->cfifo_lock, flags); unlink(t); @@ -1081,10 +1091,6 @@ static void cfifo_task_exit(struct task_struct * t) tsk_rt(t)->scheduled_on = NO_CPU; } raw_spin_unlock_irqrestore(&cluster->cfifo_lock, flags); - -#ifdef CONFIG_LITMUS_PAI_SOFTIRQD - flush_tasklets(cluster, t); -#endif BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); diff --git a/litmus/sched_crm.c b/litmus/sched_crm.c index fd7fab982998..e51de10557f9 100644 --- a/litmus/sched_crm.c +++ b/litmus/sched_crm.c @@ -57,6 +57,7 @@ #ifdef CONFIG_LITMUS_PAI_SOFTIRQD #include +#include #endif #ifdef CONFIG_LITMUS_NVIDIA @@ -425,17 +426,24 @@ static void crm_tick(struct task_struct* t) static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed) { if (!atomic_read(&tasklet->count)) { - sched_trace_tasklet_begin(tasklet->owner); + if(tasklet->owner) { + sched_trace_tasklet_begin(tasklet->owner); + } if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state)) { BUG(); } - TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", __FUNCTION__, tasklet->owner->pid, flushed); + TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n", + __FUNCTION__, + (tasklet->owner) ? tasklet->owner->pid : -1, + (tasklet->owner) ? 
0 : 1); tasklet->func(tasklet->data); tasklet_unlock(tasklet); - sched_trace_tasklet_end(tasklet->owner, flushed); + if(tasklet->owner) { + sched_trace_tasklet_end(tasklet->owner, flushed); + } } else { BUG(); @@ -491,6 +499,7 @@ static void __extract_tasklets(crm_domain_t* cluster, struct task_struct* task, static void flush_tasklets(crm_domain_t* cluster, struct task_struct* task) { +#if 0 unsigned long flags; struct tasklet_head task_tasklets; struct tasklet_struct* step; @@ -513,6 +522,27 @@ static void flush_tasklets(crm_domain_t* cluster, struct task_struct* task) step = temp; } +#endif + + // lazy flushing. + // just change ownership to NULL and let an idle processor + // take care of it. :P + + struct tasklet_struct* step; + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->crm_lock, flags); + + for(step = cluster->pending_tasklets.head; step != NULL; step = step->next) + { + if(step->owner == task) + { + TRACE("%s: Found tasklet to flush: %d\n", __FUNCTION__, step->owner->pid); + step->owner = NULL; + } + } + + raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); } @@ -524,6 +554,9 @@ static void do_lit_tasklets(crm_domain_t* cluster, struct task_struct* sched_tas unsigned long flags; while(work_to_do) { + + TS_NV_SCHED_BOTISR_START; + // remove tasklet at head of list if it has higher priority. raw_spin_lock_irqsave(&cluster->crm_lock, flags); @@ -546,17 +579,17 @@ static void do_lit_tasklets(crm_domain_t* cluster, struct task_struct* sched_tas if(NULL == tasklet->next) { // tasklet is at the head, list only has one element - TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, (tasklet->owner) ? tasklet->owner->pid : -1); cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head); } // remove the tasklet from the queue cluster->pending_tasklets.head = tasklet->next; - TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid); + TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, (tasklet->owner) ? tasklet->owner->pid : -1); } else { - TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id()); + TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, (tasklet->owner) ? tasklet->owner->pid : -1, smp_processor_id()); tasklet = NULL; } } @@ -577,6 +610,8 @@ static void do_lit_tasklets(crm_domain_t* cluster, struct task_struct* sched_tas raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); + TS_NV_SCHED_BOTISR_END; + if(tasklet) { __do_lit_tasklet(tasklet, 0ul); tasklet = NULL; @@ -619,8 +654,8 @@ static void run_tasklets(struct task_struct* sched_task) preempt_disable(); cluster = (is_realtime(sched_task)) ? - task_cpu_cluster(sched_task) : - remote_cluster(smp_processor_id()); + task_cpu_cluster(sched_task) : + remote_cluster(smp_processor_id()); if(cluster && cluster->pending_tasklets.head != NULL) { TRACE("%s: There are tasklets to process.\n", __FUNCTION__); @@ -679,8 +714,17 @@ static void __add_pai_tasklet(struct tasklet_struct* tasklet, crm_domain_t* clus // insert tasklet right before step->next. - TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? 
step->next->owner->pid : -1); - + TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, + tasklet->owner->pid, + (step->owner) ? + step->owner->pid : + -1, + (step->next) ? + ((step->next->owner) ? + step->next->owner->pid : + -1) : + -1); + tasklet->next = step->next; step->next = tasklet; @@ -1070,6 +1114,10 @@ static void crm_task_exit(struct task_struct * t) unsigned long flags; crm_domain_t *cluster = task_cpu_cluster(t); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + /* unlink if necessary */ raw_spin_lock_irqsave(&cluster->crm_lock, flags); unlink(t); @@ -1080,10 +1128,6 @@ static void crm_task_exit(struct task_struct * t) tsk_rt(t)->scheduled_on = NO_CPU; } raw_spin_unlock_irqrestore(&cluster->crm_lock, flags); - -#ifdef CONFIG_LITMUS_PAI_SOFTIRQD - flush_tasklets(cluster, t); -#endif BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); diff --git a/litmus/sched_crm_srt.c b/litmus/sched_crm_srt.c index c0004354573d..f0064d486953 100644 --- a/litmus/sched_crm_srt.c +++ b/litmus/sched_crm_srt.c @@ -57,6 +57,7 @@ #ifdef CONFIG_LITMUS_PAI_SOFTIRQD #include +#include #endif #ifdef CONFIG_LITMUS_NVIDIA @@ -529,6 +530,9 @@ static void do_lit_tasklets(crm_srt_domain_t* cluster, struct task_struct* sched unsigned long flags; while(work_to_do) { + + TS_NV_SCHED_BOTISR_START; + // remove tasklet at head of list if it has higher priority. raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); @@ -582,6 +586,8 @@ static void do_lit_tasklets(crm_srt_domain_t* cluster, struct task_struct* sched raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); + TS_NV_SCHED_BOTISR_END; + if(tasklet) { __do_lit_tasklet(tasklet, 0ul); tasklet = NULL; @@ -1067,6 +1073,10 @@ static void crm_srt_task_exit(struct task_struct * t) unsigned long flags; crm_srt_domain_t *cluster = task_cpu_cluster(t); +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(cluster, t); +#endif + /* unlink if necessary */ raw_spin_lock_irqsave(&cluster->crm_srt_lock, flags); unlink(t); @@ -1077,10 +1087,6 @@ static void crm_srt_task_exit(struct task_struct * t) tsk_rt(t)->scheduled_on = NO_CPU; } raw_spin_unlock_irqrestore(&cluster->crm_srt_lock, flags); - -#ifdef CONFIG_LITMUS_PAI_SOFTIRQD - flush_tasklets(cluster, t); -#endif BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c index b40ff7ba4f0e..30c745fe33a7 100644 --- a/litmus/sched_gsn_edf.c +++ b/litmus/sched_gsn_edf.c @@ -37,6 +37,7 @@ #ifdef CONFIG_LITMUS_PAI_SOFTIRQD #include +#include #endif #ifdef CONFIG_LITMUS_NVIDIA @@ -523,6 +524,9 @@ static void do_lit_tasklets(struct task_struct* sched_task) unsigned long flags; while(work_to_do) { + + TS_NV_SCHED_BOTISR_START; + // remove tasklet at head of list if it has higher priority. 
raw_spin_lock_irqsave(&gsnedf_lock, flags); @@ -576,9 +580,10 @@ static void do_lit_tasklets(struct task_struct* sched_task) TRACE("%s: done.\n", __FUNCTION__); */ - raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + TS_NV_SCHED_BOTISR_END; + if(tasklet) { __do_lit_tasklet(tasklet, 0ul); tasklet = NULL; @@ -1069,6 +1074,10 @@ static void gsnedf_task_exit(struct task_struct * t) { unsigned long flags; +#ifdef CONFIG_LITMUS_PAI_SOFTIRQD + flush_tasklets(t); +#endif + /* unlink if necessary */ raw_spin_lock_irqsave(&gsnedf_lock, flags); unlink(t); @@ -1077,10 +1086,6 @@ static void gsnedf_task_exit(struct task_struct * t) tsk_rt(t)->scheduled_on = NO_CPU; } raw_spin_unlock_irqrestore(&gsnedf_lock, flags); - -#ifdef CONFIG_LITMUS_PAI_SOFTIRQD - flush_tasklets(t); -#endif BUG_ON(!is_realtime(t)); TRACE_TASK(t, "RIP\n"); -- cgit v1.2.2 From 3d1c6d44d3f133909d1c594351c2b7c779b1d7d4 Mon Sep 17 00:00:00 2001 From: Glenn Elliott Date: Sun, 4 Mar 2012 16:09:04 -0500 Subject: Some cleanup of PAI --- arch/x86/kernel/irq.c | 10 ---------- kernel/softirq.c | 1 + kernel/workqueue.c | 3 ++- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 50abbc6b7429..433cd154333c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -248,17 +248,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) __func__, smp_processor_id(), vector, irq); } -//#ifndef CONFIG_LITMUS_NVIDIA irq_exit(); -//#else - /* skip softirqs if we're tracing an interrupt top-half */ - /* comment out if-statement if we want to trace with bh on. */ - //if(!is_interrupt_tracing_active()) -// irq_exit(); - - -// sched_trace_nv_interrupt_end(); -//#endif set_irq_regs(old_regs); return 1; diff --git a/kernel/softirq.c b/kernel/softirq.c index d3217c54d2bf..7a6f500570f1 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -439,6 +439,7 @@ void __tasklet_schedule(struct tasklet_struct *t) t->owner = device_owner; sched_trace_tasklet_release(t->owner); + if(likely(_litmus_tasklet_schedule(t,nvidia_device))) { unlock_nv_registry(nvidia_device, &flags); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 637cadac2627..2293aadbb1ab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2531,6 +2531,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); */ int schedule_work(struct work_struct *work) { +#if 0 #if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD) if(is_nvidia_func(work->func)) { @@ -2583,7 +2584,7 @@ int schedule_work(struct work_struct *work) unlock_nv_registry(nvidiaDevice, &flags); } #endif - +#endif return(__schedule_work(work)); } EXPORT_SYMBOL(schedule_work); -- cgit v1.2.2
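
Aside (not part of the patch series): the PAI hunks above all revolve around one data structure -- a pending-tasklet queue kept as a singly-linked list in priority order, addressed through a head pointer and a tail pointer-to-pointer, drained only when the head tasklet outranks the task scheduled on the local CPU. The following is a minimal user-space sketch of that idea for experimentation. Everything in it is a stand-in of mine: the integer prio field and higher_prio() replace edf_higher_prio() on the owning real-time task, there is no locking, and the insertion is written as a simple pointer-to-pointer walk rather than copying the patch's __add_pai_tasklet() verbatim.

/*
 * pai_queue_sketch.c -- user-space model of the PAI pending-tasklet
 * queue: head pointer plus tail pointer-to-pointer, kept sorted so
 * the highest-priority tasklet sits at the head.  Illustrative only.
 */
#include <stdio.h>
#include <stddef.h>

struct lit_tasklet {
	struct lit_tasklet *next;
	int owner_pid;
	int prio;                    /* smaller value = higher priority (stand-in) */
};

struct tasklet_head {
	struct lit_tasklet *head;
	struct lit_tasklet **tail;   /* points at the last ->next, or at head */
};

static int higher_prio(const struct lit_tasklet *a, const struct lit_tasklet *b)
{
	return a->prio < b->prio;
}

static void queue_init(struct tasklet_head *q)
{
	q->head = NULL;
	q->tail = &q->head;          /* same idiom as pending_tasklets.tail = &head */
}

/* Insert t so the queue stays sorted, highest priority at the head. */
static void pai_insert(struct tasklet_head *q, struct lit_tasklet *t)
{
	struct lit_tasklet **link = &q->head;

	while (*link && higher_prio(*link, t))
		link = &(*link)->next;   /* skip everything that outranks t */

	t->next = *link;
	*link = t;
	if (t->next == NULL)
		q->tail = &t->next;      /* t became the new last element */
}

/* Dequeue the head only if it outranks the task scheduled on this CPU,
 * mirroring the check done under the cluster lock in do_lit_tasklets(). */
static struct lit_tasklet *pai_pop_if_higher(struct tasklet_head *q, int sched_prio)
{
	struct lit_tasklet *t = q->head;

	if (t == NULL || t->prio >= sched_prio)
		return NULL;

	q->head = t->next;
	if (q->head == NULL)
		q->tail = &q->head;      /* queue is empty again */
	t->next = NULL;
	return t;
}

int main(void)
{
	struct tasklet_head q;
	struct lit_tasklet a = { NULL, 101, 30 };
	struct lit_tasklet b = { NULL, 102, 10 };
	struct lit_tasklet c = { NULL, 103, 20 };
	struct lit_tasklet *t;

	queue_init(&q);
	pai_insert(&q, &a);
	pai_insert(&q, &b);
	pai_insert(&q, &c);

	/* pretend the locally scheduled task has priority 25 */
	while ((t = pai_pop_if_higher(&q, 25)) != NULL)
		printf("run tasklet of pid %d (prio %d)\n", t->owner_pid, t->prio);

	/* pids 102 and 103 run; pid 101 (prio 30) stays deferred */
	return 0;
}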
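
The follow-up commit also switches task exit from eager extraction to a "lazy flush": the exiting task only clears the owner field of its queued tasklets, and whichever CPU later drains the queue still runs them, just without the sched_trace begin/end hooks. Below is a small, self-contained user-space illustration of that behavior; printf stands in for the trace points, a string pointer stands in for the task_struct pointer, and all names are mine rather than the kernel's.

/*
 * lazy_flush_sketch.c -- illustration of the lazy-flush idea: clear
 * ownership on exit, keep executing the bottom half, skip tracing
 * when no owner is recorded.
 */
#include <stdio.h>
#include <stddef.h>

struct lit_tasklet {
	struct lit_tasklet *next;
	const char *owner;                   /* NULL once the owning task exited */
	void (*func)(unsigned long);
	unsigned long data;
};

/* Walk the pending list and drop ownership for the exiting task. */
static void lazy_flush(struct lit_tasklet *head, const char *exiting_owner)
{
	for (; head != NULL; head = head->next)
		if (head->owner == exiting_owner)
			head->owner = NULL;          /* work is kept, tracing is not */
}

/* Execute one tasklet, tracing only if an owner is still recorded. */
static void run_one(struct lit_tasklet *t)
{
	if (t->owner)
		printf("trace: tasklet_begin for %s\n", t->owner);
	t->func(t->data);                    /* bottom half still executes */
	if (t->owner)
		printf("trace: tasklet_end for %s\n", t->owner);
}

static void bh(unsigned long d)
{
	printf("bottom half ran, data=%lu\n", d);
}

int main(void)
{
	const char *task_a = "task-A";
	const char *task_b = "task-B";
	struct lit_tasklet t2 = { NULL, task_b, bh, 2 };
	struct lit_tasklet t1 = { &t2,  task_a, bh, 1 };

	lazy_flush(&t1, task_a);             /* task-A exits */
	run_one(&t1);                        /* runs, but untraced */
	run_one(&t2);                        /* still traced for task-B */
	return 0;
}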