path: root/litmus
Diffstat (limited to 'litmus')
-rw-r--r--  litmus/Kconfig                    212
-rw-r--r--  litmus/Makefile                     9
-rw-r--r--  litmus/affinity.c                   2
-rw-r--r--  litmus/aux_tasks.c                529
-rw-r--r--  litmus/budget.c                    16
-rw-r--r--  litmus/edf_common.c               264
-rw-r--r--  litmus/fdso.c                      15
-rw-r--r--  litmus/gpu_affinity.c             231
-rw-r--r--  litmus/ikglp_lock.c              2976
-rw-r--r--  litmus/jobs.c                       2
-rw-r--r--  litmus/kexclu_affinity.c           92
-rw-r--r--  litmus/kfmlp_lock.c              1003
-rw-r--r--  litmus/litmus.c                   280
-rw-r--r--  litmus/litmus_pai_softirq.c        64
-rw-r--r--  litmus/litmus_proc.c               17
-rw-r--r--  litmus/litmus_softirq.c          1205
-rw-r--r--  litmus/locking.c                  447
-rw-r--r--  litmus/nvidia_info.c             1137
-rw-r--r--  litmus/preempt.c                   32
-rw-r--r--  litmus/rsm_lock.c                 796
-rw-r--r--  litmus/rt_domain.c                 13
-rw-r--r--  litmus/sched_cedf.c              1138
-rw-r--r--  litmus/sched_gsn_edf.c           1195
-rw-r--r--  litmus/sched_litmus.c               4
-rw-r--r--  litmus/sched_pfp.c                 40
-rw-r--r--  litmus/sched_plugin.c             167
-rw-r--r--  litmus/sched_psn_edf.c             41
-rw-r--r--  litmus/sched_task_trace.c         282
-rw-r--r--  litmus/sched_trace_external.c      64
29 files changed, 11980 insertions, 293 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index bd6635c8de08..594c54342bdc 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -34,8 +34,70 @@ config RELEASE_MASTER
34 (http://www.cs.unc.edu/~anderson/papers.html). 34 (http://www.cs.unc.edu/~anderson/papers.html).
35 Currently only supported by GSN-EDF. 35 Currently only supported by GSN-EDF.
36 36
37config REALTIME_AUX_TASKS
38 bool "Real-Time Auxiliary Tasks"
39 depends on LITMUS_LOCKING
40 default n
41 help
42 Adds a system call that forces all non-real-time threads in a process
43 to become auxiliary real-time tasks. These tasks inherit the priority of
44 the highest-prio *BLOCKED* (but NOT blocked on a Litmus lock) real-time
45 task (non-auxiliary) in the process. This allows the integration of COTS
46 code that has background helper threads used primarily for message passing
47 and synchronization. If these background threads are NOT real-time scheduled,
48 then unbounded priority inversions may occur if a real-time task blocks on
49 a non-real-time thread.
50
51 Beware of the following pitfalls:
52 1) Auxiliary threads should not be CPU intensive. They should mostly
53 block on mutexes and condition variables. Violating this will
54 likely prevent meaningful analysis.
55 2) Since there may be more than one auxiliary thread per process,
56 priority inversions may occur with respect to single-threaded
57 task models if/when one of these threads is scheduled simultaneously
58 with another of the same identity.
59
60choice
61 prompt "Scheduling prioritization of AUX tasks."
62 default REALTIME_AUX_TASK_PRIORITY_BOOSTED
63 help
64 Select the prioritization method for auxiliary tasks.
65
66config REALTIME_AUX_TASK_PRIORITY_BOOSTED
67 bool "Boosted"
68 help
69 Run all auxiliary task threads at the maximum priority. Useful for
70 temporarily working around bugs during development.
71
72config REALTIME_AUX_TASK_PRIORITY_INHERITANCE
73 bool "Inheritance"
74 help
75 Auxiliary tasks inherit the maximum priority from blocked real-time
76 threads within the same process.
77
78 Additional pitfall:
79 3) Busy-wait deadlock is likely if normal real-time tasks and
80 auxiliary tasks synchronize using _preemptive_ spinlocks that do
81 not use priority inheritance.
82
83 These pitfalls are mitigated by the fact that auxiliary tasks only
84 inherit priorities from blocked tasks (blocking signifies that the
85 blocked task _may_ be waiting on an auxiliary task to perform some
86 work). Further, auxiliary tasks without an inherited priority are
87 _always_ scheduled with a priority less than any normal real-time task.
88
89 NOTE: Aux tasks do not _directly_ inherit a priority from rt tasks that
90 are blocked on Litmus locks. Aux tasks should be COTS code that knows nothing
91 of Litmus, so they won't hold Litmus locks. Nothing the aux task can do can
92 _directly_ unblock the rt task blocked on a Litmus lock. However, the lock
93 holder that blocks the rt task CAN block on I/O and contribute its priority
94 to the aux tasks. Aux tasks may still _indirectly_ inherit the priority of
95 the blocked rt task via the lock holder.
96endchoice
97
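As a hypothetical illustration of the system call described above, a process could mark its helper threads as auxiliary tasks roughly as sketched below. The flag names AUX_ENABLE, AUX_CURRENT, and AUX_FUTURE appear in aux_tasks.c later in this patch, but the flag values, the syscall number, and the wrapper name here are placeholders (the real entry point would normally come from the LITMUS^RT userspace headers), so treat this as a sketch of the intended usage rather than the actual interface.

    #define _GNU_SOURCE
    #include <unistd.h>
    #include <sys/syscall.h>

    /* Placeholder values -- the real definitions live in the LITMUS^RT headers. */
    #define AUX_CURRENT (1 << 0)
    #define AUX_FUTURE  (1 << 1)
    #define AUX_ENABLE  (1 << 2)
    #define __NR_set_aux_tasks 400    /* hypothetical syscall number */

    /* Turn all current and future non-real-time threads of the calling process
     * into auxiliary tasks, e.g. before a COTS runtime spawns helper threads. */
    static long enable_aux_tasks(void)
    {
            return syscall(__NR_set_aux_tasks, AUX_ENABLE | AUX_CURRENT | AUX_FUTURE);
    }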
37endmenu 98endmenu
38 99
100
39menu "Real-Time Synchronization" 101menu "Real-Time Synchronization"
40 102
41config NP_SECTION 103config NP_SECTION
@@ -60,6 +122,42 @@ config LITMUS_LOCKING
60 Say Yes if you want to include locking protocols such as the FMLP and 122 Say Yes if you want to include locking protocols such as the FMLP and
61 Baker's SRP. 123 Baker's SRP.
62 124
125config LITMUS_AFFINITY_LOCKING
126 bool "Enable affinity infrastructure in k-exclusion locking protocols."
127 depends on LITMUS_LOCKING
128 default n
129 help
130 Enable affinity tracking infrastructure in k-exclusion locking protocols.
131 This only enables the *infrastructure*, not actual affinity algorithms.
132
133 If unsure, say No.
134
135config LITMUS_NESTED_LOCKING
136 bool "Support for nested inheritance in locking protocols"
137 depends on LITMUS_LOCKING
138 default n
139 help
140 Enable nested priority inheritance.
141
142config LITMUS_DGL_SUPPORT
143 bool "Support for dynamic group locks"
144 depends on LITMUS_NESTED_LOCKING
145 default n
146 help
147 Enable dynamic group lock support.
148
149config LITMUS_MAX_DGL_SIZE
150 int "Maximum size of a dynamic group lock."
151 depends on LITMUS_DGL_SUPPORT
152 range 1 128
153 default "10"
154 help
155 Dynamic group lock data structures are allocated on the process
156 stack when a group is requested. We set a maximum number of
157 locks in a dynamic group lock to avoid dynamic allocation.
158
159 TODO: Batch DGL requests exceeding LITMUS_MAX_DGL_SIZE.
160
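To make the stack-allocation argument above concrete, the hedged sketch below shows the general pattern a compile-time bound enables; the names are invented for illustration and are not the patch's actual DGL structures.

    #define MY_MAX_DGL_SIZE 10    /* analogous to CONFIG_LITMUS_MAX_DGL_SIZE */

    struct my_dgl_request {
            int nr_locks;
            void *locks[MY_MAX_DGL_SIZE];    /* fixed-size array: no dynamic allocation */
    };

    static int my_lock_group(void * const *locks, int n)
    {
            struct my_dgl_request req;    /* fits on the stack because of the bound */
            int i;

            if (n > MY_MAX_DGL_SIZE)
                    return -1;    /* cf. the TODO above about batching oversized requests */

            req.nr_locks = n;
            for (i = 0; i < n; i++)
                    req.locks[i] = locks[i];

            /* ... acquire all n locks as one atomic group request ... */
            return 0;
    }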
63endmenu 161endmenu
64 162
65menu "Performance Enhancements" 163menu "Performance Enhancements"
@@ -112,6 +210,14 @@ choice
112 Break ties between two jobs, A and B, with equal deadlines by using a 210 Break ties between two jobs, A and B, with equal deadlines by using a
113 uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job 211 uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job
114 A has ~50% of winning a given tie-break. 212 A has ~50% of winning a given tie-break.
213
214 NOTES:
215 * This method doesn't work very well if a tied job has a low-valued
216 hash while the jobs it ties with do not make progress (that is,
217 they don't increment to new job numbers). The job with the low-valued
218 hash will lose most tie-breaks. This is usually not a problem
219 unless you are doing something funky in Litmus (e.g., worker threads
220 that do not increment job numbers).
115 221
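A minimal sketch of the hash tie-break described in the note above: two jobs with equal deadlines are ordered by comparing hash(pid, job_no), so each side wins roughly half the time. The mixing function below is a generic 64-bit finalizer chosen for illustration; the hash actually used by edf_hash() in this patch may differ.

    #include <stdint.h>

    static uint64_t mix64(uint64_t x)
    {
            /* splitmix64-style finalizer; any reasonable mixing function works */
            x ^= x >> 30; x *= 0xbf58476d1ce4e5b9ULL;
            x ^= x >> 27; x *= 0x94d049bb133111ebULL;
            x ^= x >> 31;
            return x;
    }

    static uint64_t job_hash(int pid, unsigned int job_no)
    {
            return mix64(((uint64_t)pid << 32) | job_no);
    }

    /* returns nonzero if job A wins the deadline tie against job B */
    static int hash_tie_break(int pid_a, unsigned int job_a, int pid_b, unsigned int job_b)
    {
            return job_hash(pid_a, job_a) < job_hash(pid_b, job_b);
    }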
116 config EDF_PID_TIE_BREAK 222 config EDF_PID_TIE_BREAK
117 bool "PID-based Tie Breaks" 223 bool "PID-based Tie Breaks"
@@ -167,7 +273,7 @@ config SCHED_TASK_TRACE
167config SCHED_TASK_TRACE_SHIFT 273config SCHED_TASK_TRACE_SHIFT
168 int "Buffer size for sched_trace_xxx() events" 274 int "Buffer size for sched_trace_xxx() events"
169 depends on SCHED_TASK_TRACE 275 depends on SCHED_TASK_TRACE
170 range 8 13 276 range 8 15
171 default 9 277 default 9
172 help 278 help
173 279
@@ -279,4 +385,108 @@ config PREEMPT_STATE_TRACE
279 385
280endmenu 386endmenu
281 387
388menu "Interrupt Handling"
389
390choice
391 prompt "Scheduling of interrupt bottom-halves in Litmus."
392 default LITMUS_SOFTIRQD_NONE
393 depends on LITMUS_LOCKING
394 help
395 Schedule tasklets with known priorities in Litmus.
396
397config LITMUS_SOFTIRQD_NONE
398 bool "No tasklet scheduling in Litmus."
399 help
400 Don't schedule tasklets in Litmus. Default.
401
402config LITMUS_SOFTIRQD
403 bool "Spawn klmirqd interrupt handling threads."
404 help
405 Create klmirqd interrupt handling threads. Work must be
406 specifically dispatched to these workers. (Softirqs for
407 Litmus tasks are not magically redirected to klmirqd.)
408
409 G-EDF, C-EDF ONLY for now!
410
411
412config LITMUS_PAI_SOFTIRQD
413 bool "Defer tasklets to context switch points."
414 help
415 Only execute scheduled tasklet bottom halves at
416 scheduling points. Reduces context-switch overhead
417 at the cost of non-preemptive durations of bottom-half
418 processing.
419
420 G-EDF, C-EDF ONLY for now!
421
422endchoice
423
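The PAI option above can be pictured with the small userspace model below: tasklets raised in interrupt context are only recorded, and the queued bottom halves run as a batch at the next scheduling point. This is a conceptual sketch of the deferral idea only, not the kernel implementation in litmus_pai_softirq.c.

    #define MAX_DEFERRED 32

    typedef void (*bottom_half_fn)(unsigned long data);

    struct deferred { bottom_half_fn fn; unsigned long data; };

    static struct deferred pending[MAX_DEFERRED];
    static int n_pending;

    /* called from "interrupt" context: just record the work */
    static void raise_tasklet(bottom_half_fn fn, unsigned long data)
    {
            if (n_pending < MAX_DEFERRED)
                    pending[n_pending++] = (struct deferred){ fn, data };
    }

    /* called at a scheduling point: run everything, non-preemptively */
    static void flush_at_schedule_point(void)
    {
            int i;
            for (i = 0; i < n_pending; i++)
                    pending[i].fn(pending[i].data);
            n_pending = 0;
    }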
424
425config LITMUS_NVIDIA
426 bool "Litmus handling of NVIDIA interrupts."
427 default n
428 help
429 Direct tasklets from NVIDIA devices to Litmus's klmirqd
430 or PAI interrupt handling routines.
431
432 If unsure, say No.
433
434config LITMUS_AFFINITY_AWARE_GPU_ASSINGMENT
435 bool "Enable affinity-aware heuristics to improve GPU assignment."
436 depends on LITMUS_NVIDIA && LITMUS_AFFINITY_LOCKING
437 default n
438 help
439 Enable several heuristics to improve the assignment
440 of GPUs to real-time tasks to reduce the overheads
441 of memory migrations.
442
443 If unsure, say No.
444
445config NV_DEVICE_NUM
446 int "Number of NVIDIA GPUs."
447 depends on LITMUS_SOFTIRQD || LITMUS_PAI_SOFTIRQD
448 range 1 16
449 default "1"
450 help
451 Should be no greater than the number of CPUs and
452 no greater than the number of GPUs in your system.
453
454choice
455 prompt "CUDA/Driver Version Support"
456 default CUDA_5_0
457 depends on LITMUS_NVIDIA
458 help
459 Select the version of CUDA/driver to support.
460
461config CUDA_5_0
462 bool "CUDA 5.0"
463 depends on LITMUS_NVIDIA && REALTIME_AUX_TASKS
464 help
465 Support CUDA 5.0 RCx (dev. driver version: x86_64-304.33)
466
467config CUDA_4_0
468 bool "CUDA 4.0"
469 depends on LITMUS_NVIDIA
470 help
471 Support CUDA 4.0 RC2 (dev. driver version: x86_64-270.40)
472
473config CUDA_3_2
474 bool "CUDA 3.2"
475 depends on LITMUS_NVIDIA
476 help
477 Support CUDA 3.2 (dev. driver version: x86_64-260.24)
478
479endchoice
480
481config LITMUS_NV_KLMIRQD_DEBUG
482 bool "Raise fake sporadic tasklets to test nv klmirqd threads."
483 depends on LITMUS_NVIDIA && LITMUS_SOFTIRQD
484 default n
485 help
486 Causes tasklets to be sporadically dispatched to waiting klmirqd
487 threads. WARNING! Kernel panic may occur if you switch between
488 LITMUS plugins!
489
490endmenu
491
282endmenu 492endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
index d26ca7076b62..67d8b8ee72bc 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -18,6 +18,7 @@ obj-y = sched_plugin.o litmus.o \
18 bheap.o \ 18 bheap.o \
19 binheap.o \ 19 binheap.o \
20 ctrldev.o \ 20 ctrldev.o \
21 aux_tasks.o \
21 sched_gsn_edf.o \ 22 sched_gsn_edf.o \
22 sched_psn_edf.o \ 23 sched_psn_edf.o \
23 sched_pfp.o 24 sched_pfp.o
@@ -30,3 +31,11 @@ obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
30obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o 31obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
31obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o 32obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
32obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o 33obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
34
35obj-$(CONFIG_LITMUS_LOCKING) += kfmlp_lock.o
36obj-$(CONFIG_LITMUS_NESTED_LOCKING) += rsm_lock.o ikglp_lock.o
37obj-$(CONFIG_LITMUS_SOFTIRQD) += litmus_softirq.o
38obj-$(CONFIG_LITMUS_PAI_SOFTIRQD) += litmus_pai_softirq.o
39obj-$(CONFIG_LITMUS_NVIDIA) += nvidia_info.o sched_trace_external.o
40
41obj-$(CONFIG_LITMUS_AFFINITY_LOCKING) += kexclu_affinity.o gpu_affinity.o
diff --git a/litmus/affinity.c b/litmus/affinity.c
index 3fa6dd789400..cd93249b5506 100644
--- a/litmus/affinity.c
+++ b/litmus/affinity.c
@@ -26,7 +26,7 @@ void init_topology(void) {
26 cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]); 26 cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]);
27 } 27 }
28 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n", 28 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
29 cpu, neigh_info[cpu].size[i], i, 29 cpu, neigh_info[cpu].size[i], i,
30 *cpumask_bits(neigh_info[cpu].neighbors[i])); 30 *cpumask_bits(neigh_info[cpu].neighbors[i]));
31 } 31 }
32 32
diff --git a/litmus/aux_tasks.c b/litmus/aux_tasks.c
new file mode 100644
index 000000000000..ef26bba3be77
--- /dev/null
+++ b/litmus/aux_tasks.c
@@ -0,0 +1,529 @@
1#include <litmus/sched_plugin.h>
2#include <litmus/trace.h>
3#include <litmus/litmus.h>
4
5#ifdef CONFIG_REALTIME_AUX_TASKS
6#include <litmus/rt_param.h>
7#include <litmus/aux_tasks.h>
8
9#include <linux/time.h>
10
11#define AUX_SLICE_NR_JIFFIES 1
12#define AUX_SLICE_NS ((NSEC_PER_SEC / HZ) * AUX_SLICE_NR_JIFFIES)
13
14static int admit_aux_task(struct task_struct *t)
15{
16 int retval = 0;
17 struct task_struct *leader = t->group_leader;
18
19 /* Budget enforcement increments job numbers, and job numbers are used in
20 * tie-breaking of aux_tasks. This method helps ensure:
21 * 1) aux threads with no inherited priority cannot starve one another (they
22 * share the CPUs equally).
23 * 2) aux threads that inherit the same priority cannot starve each other.
24 *
25 * Assuming aux threads are well-behaved (they do very little work and
26 * suspend), risk of starvation should not be an issue, but this is a
27 * fail-safe.
28 */
29 struct rt_task tp = {
30 .period = AUX_SLICE_NS,
31 .relative_deadline = AUX_SLICE_NS,
32 .exec_cost = AUX_SLICE_NS, /* allow full utilization with budget tracking */
33 .phase = 0,
34 .cpu = task_cpu(leader), /* take CPU of group leader */
35 .budget_policy = QUANTUM_ENFORCEMENT,
36 .budget_signal_policy = NO_SIGNALS,
37 .cls = RT_CLASS_BEST_EFFORT
38 };
39
40 struct sched_param param = { .sched_priority = 0};
41
42 tsk_rt(t)->task_params = tp;
43 retval = sched_setscheduler_nocheck(t, SCHED_LITMUS, &param);
44
45 return retval;
46}
47
48int exit_aux_task(struct task_struct *t)
49{
50 int retval = 0;
51
52 BUG_ON(!tsk_rt(t)->is_aux_task);
53
54 TRACE_CUR("Aux task %s/%d is exiting from %s/%d.\n", t->comm, t->pid, t->group_leader->comm, t->group_leader->pid);
55
56 tsk_rt(t)->is_aux_task = 0;
57
58#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
59 list_del(&tsk_rt(t)->aux_task_node);
60 if (tsk_rt(t)->inh_task) {
61 litmus->__decrease_prio(t, NULL);
62 }
63#endif
64
65 return retval;
66}
67
68static int aux_tasks_increase_priority(struct task_struct *leader, struct task_struct *hp)
69{
70 int retval = 0;
71
72#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
73 struct list_head *pos;
74
75 TRACE_CUR("Increasing priority of aux tasks in group %s/%d.\n", leader->comm, leader->pid);
76
77 list_for_each(pos, &tsk_aux(leader)->aux_tasks) {
78 struct task_struct *aux =
79 container_of(list_entry(pos, struct rt_param, aux_task_node),
80 struct task_struct, rt_param);
81
82 if (!is_realtime(aux)) {
83 TRACE_CUR("skipping non-real-time aux task %s/%d\n", aux->comm, aux->pid);
84 }
85 else if(tsk_rt(aux)->inh_task == hp) {
86 TRACE_CUR("skipping real-time aux task %s/%d that already inherits from %s/%d\n", aux->comm, aux->pid, hp->comm, hp->pid);
87 }
88 else {
89 // aux tasks don't touch rt locks, so no nested call needed.
90 TRACE_CUR("increasing %s/%d.\n", aux->comm, aux->pid);
91 retval = litmus->__increase_prio(aux, hp);
92 }
93 }
94#endif
95
96 return retval;
97}
98
99static int aux_tasks_decrease_priority(struct task_struct *leader, struct task_struct *hp)
100{
101 int retval = 0;
102
103#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
104 struct list_head *pos;
105
106 TRACE_CUR("Decreasing priority of aux tasks in group %s/%d.\n", leader->comm, leader->pid);
107
108 list_for_each(pos, &tsk_aux(leader)->aux_tasks) {
109 struct task_struct *aux =
110 container_of(list_entry(pos, struct rt_param, aux_task_node),
111 struct task_struct, rt_param);
112
113 if (!is_realtime(aux)) {
114 TRACE_CUR("skipping non-real-time aux task %s/%d\n", aux->comm, aux->pid);
115 }
116 else {
117 TRACE_CUR("decreasing %s/%d.\n", aux->comm, aux->pid);
118 retval = litmus->__decrease_prio(aux, hp);
119 }
120 }
121#endif
122
123 return retval;
124}
125
126int aux_task_owner_increase_priority(struct task_struct *t)
127{
128 int retval = 0;
129
130#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
131 struct task_struct *leader;
132 struct task_struct *hp = NULL;
133 struct task_struct *hp_eff = NULL;
134
135 BUG_ON(!is_realtime(t));
136 BUG_ON(!tsk_rt(t)->has_aux_tasks);
137
138 leader = t->group_leader;
139
140 if (!binheap_is_in_heap(&tsk_rt(t)->aux_task_owner_node)) {
141 WARN_ON(!is_running(t));
142 TRACE_CUR("aux tasks may not inherit from %s/%d in group %s/%d\n",
143 t->comm, t->pid, leader->comm, leader->pid);
144 goto out;
145 }
146
147 TRACE_CUR("task %s/%d in group %s/%d increasing priority.\n", t->comm, t->pid, leader->comm, leader->pid);
148
149 hp = container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
150 struct task_struct, rt_param);
151 hp_eff = effective_priority(hp);
152
153 if (hp != t) { /* our position in the heap may have changed. hp is already at the root. */
154 binheap_decrease(&tsk_rt(t)->aux_task_owner_node, &tsk_aux(leader)->aux_task_owners);
155 }
156
157 hp = container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
158 struct task_struct, rt_param);
159
160 if (effective_priority(hp) != hp_eff) { /* the eff. prio. of hp has changed */
161 hp_eff = effective_priority(hp);
162 TRACE_CUR("%s/%d is new hp in group %s/%d.\n", t->comm, t->pid, leader->comm, leader->pid);
163 retval = aux_tasks_increase_priority(leader, hp_eff);
164 }
165#endif
166
167out:
168 return retval;
169}
170
171int aux_task_owner_decrease_priority(struct task_struct *t)
172{
173 int retval = 0;
174
175#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
176 struct task_struct *leader;
177 struct task_struct *hp = NULL;
178 struct task_struct *hp_eff = NULL;
179
180 BUG_ON(!is_realtime(t));
181 BUG_ON(!tsk_rt(t)->has_aux_tasks);
182
183 leader = t->group_leader;
184
185 if (!binheap_is_in_heap(&tsk_rt(t)->aux_task_owner_node)) {
186 WARN_ON(!is_running(t));
187 TRACE_CUR("aux tasks may not inherit from %s/%d in group %s/%d\n",
188 t->comm, t->pid, leader->comm, leader->pid);
189 goto out;
190 }
191
192 TRACE_CUR("task %s/%d in group %s/%d decreasing priority.\n", t->comm, t->pid, leader->comm, leader->pid);
193
194 hp = container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
195 struct task_struct, rt_param);
196 hp_eff = effective_priority(hp);
197 binheap_delete(&tsk_rt(t)->aux_task_owner_node, &tsk_aux(leader)->aux_task_owners);
198 binheap_add(&tsk_rt(t)->aux_task_owner_node, &tsk_aux(leader)->aux_task_owners,
199 struct rt_param, aux_task_owner_node);
200
201 if (hp == t) { /* t was originally the hp */
202 struct task_struct *new_hp =
203 container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
204 struct task_struct, rt_param);
205 if (effective_priority(new_hp) != hp_eff) { /* eff prio. of hp has changed */
206 hp_eff = effective_priority(new_hp);
207 TRACE_CUR("%s/%d is no longer hp in group %s/%d.\n", t->comm, t->pid, leader->comm, leader->pid);
208 retval = aux_tasks_decrease_priority(leader, hp_eff);
209 }
210 }
211#endif
212
213out:
214 return retval;
215}
216
217int make_aux_task_if_required(struct task_struct *t)
218{
219 struct task_struct *leader;
220 int retval = 0;
221
222 read_lock_irq(&tasklist_lock);
223
224 leader = t->group_leader;
225
226 if(!tsk_aux(leader)->initialized || !tsk_aux(leader)->aux_future) {
227 goto out;
228 }
229
230 TRACE_CUR("Making %s/%d in %s/%d an aux thread.\n", t->comm, t->pid, leader->comm, leader->pid);
231
232 INIT_LIST_HEAD(&tsk_rt(t)->aux_task_node);
233 INIT_BINHEAP_NODE(&tsk_rt(t)->aux_task_owner_node);
234
235 retval = admit_aux_task(t);
236 if (retval == 0) {
237 tsk_rt(t)->is_aux_task = 1;
238
239#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
240 list_add_tail(&tsk_rt(t)->aux_task_node, &tsk_aux(leader)->aux_tasks);
241
242 if (!binheap_empty(&tsk_aux(leader)->aux_task_owners)) {
243 struct task_struct *hp =
244 container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
245 struct task_struct, rt_param);
246
247 TRACE_CUR("hp in group: %s/%d\n", hp->comm, hp->pid);
248
249 retval = litmus->__increase_prio(t, (tsk_rt(hp)->inh_task)? tsk_rt(hp)->inh_task : hp);
250
251 if (retval != 0) {
252 /* don't know how to recover from bugs with prio inheritance. better just crash. */
253 read_unlock_irq(&tasklist_lock);
254 BUG();
255 }
256 }
257#endif
258 }
259
260out:
261 read_unlock_irq(&tasklist_lock);
262
263 return retval;
264}
265
266
267long enable_aux_task_owner(struct task_struct *t)
268{
269 long retval = 0;
270
271#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
272 struct task_struct *leader = t->group_leader;
273 struct task_struct *hp;
274
275 if (!tsk_rt(t)->has_aux_tasks) {
276 TRACE_CUR("task %s/%d is not an aux owner\n", t->comm, t->pid);
277 return -1;
278 }
279
280 BUG_ON(!is_realtime(t));
281
282 if (binheap_is_in_heap(&tsk_rt(t)->aux_task_owner_node)) {
283 TRACE_CUR("task %s/%d is already active\n", t->comm, t->pid);
284 goto out;
285 }
286
287 binheap_add(&tsk_rt(t)->aux_task_owner_node, &tsk_aux(leader)->aux_task_owners,
288 struct rt_param, aux_task_owner_node);
289
290 hp = container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
291 struct task_struct, rt_param);
292 if (hp == t) {
293 /* we're the new hp */
294 TRACE_CUR("%s/%d is new hp in group %s/%d.\n", t->comm, t->pid, leader->comm, leader->pid);
295
296 retval = aux_tasks_increase_priority(leader,
297 (tsk_rt(hp)->inh_task)? tsk_rt(hp)->inh_task : hp);
298 }
299#endif
300
301out:
302 return retval;
303}
304
305long disable_aux_task_owner(struct task_struct *t)
306{
307 long retval = 0;
308
309#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
310 struct task_struct *leader = t->group_leader;
311 struct task_struct *hp;
312 struct task_struct *new_hp = NULL;
313
314 if (!tsk_rt(t)->has_aux_tasks) {
315 TRACE_CUR("task %s/%d is not an aux owner\n", t->comm, t->pid);
316 return -1;
317 }
318
319 BUG_ON(!is_realtime(t));
320
321 if (!binheap_is_in_heap(&tsk_rt(t)->aux_task_owner_node)) {
322 TRACE_CUR("task %s/%d is already not active\n", t->comm, t->pid);
323 goto out;
324 }
325
326 TRACE_CUR("task %s/%d exiting from group %s/%d.\n", t->comm, t->pid, leader->comm, leader->pid);
327
328 hp = container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
329 struct task_struct, rt_param);
330 binheap_delete(&tsk_rt(t)->aux_task_owner_node, &tsk_aux(leader)->aux_task_owners);
331
332 if (!binheap_empty(&tsk_aux(leader)->aux_task_owners)) {
333 new_hp = container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
334 struct task_struct, rt_param);
335 }
336
337 if (hp == t && new_hp != t) {
338 struct task_struct *to_inh = NULL;
339
340 TRACE_CUR("%s/%d is no longer hp in group %s/%d.\n", t->comm, t->pid, leader->comm, leader->pid);
341
342 if (new_hp) {
343 to_inh = (tsk_rt(new_hp)->inh_task) ? tsk_rt(new_hp)->inh_task : new_hp;
344 }
345
346 retval = aux_tasks_decrease_priority(leader, to_inh);
347 }
348#endif
349
350out:
351 return retval;
352}
353
354
355static int aux_task_owner_max_priority_order(struct binheap_node *a,
356 struct binheap_node *b)
357{
358 struct task_struct *d_a = container_of(binheap_entry(a, struct rt_param, aux_task_owner_node),
359 struct task_struct, rt_param);
360 struct task_struct *d_b = container_of(binheap_entry(b, struct rt_param, aux_task_owner_node),
361 struct task_struct, rt_param);
362
363 BUG_ON(!d_a);
364 BUG_ON(!d_b);
365
366 return litmus->compare(d_a, d_b);
367}
368
369
370static long __do_enable_aux_tasks(int flags)
371{
372 long retval = 0;
373 struct task_struct *leader;
374 struct task_struct *t;
375 int aux_tasks_added = 0;
376
377 leader = current->group_leader;
378
379 if (!tsk_aux(leader)->initialized) {
380 INIT_LIST_HEAD(&tsk_aux(leader)->aux_tasks);
381 INIT_BINHEAP_HANDLE(&tsk_aux(leader)->aux_task_owners, aux_task_owner_max_priority_order);
382 tsk_aux(leader)->initialized = 1;
383 }
384
385 if (flags & AUX_FUTURE) {
386 tsk_aux(leader)->aux_future = 1;
387 }
388
389 t = leader;
390 do {
391 if (!tsk_rt(t)->has_aux_tasks && !tsk_rt(t)->is_aux_task) {
392 /* This may harmlessly reinit unused nodes. TODO: Don't reinit already init nodes. */
393 /* doesn't hurt to initialize both nodes */
394 INIT_LIST_HEAD(&tsk_rt(t)->aux_task_node);
395 INIT_BINHEAP_NODE(&tsk_rt(t)->aux_task_owner_node);
396 }
397
398 TRACE_CUR("Checking task in %s/%d: %s/%d = (p = %llu):\n",
399 leader->comm, leader->pid, t->comm, t->pid,
400 tsk_rt(t)->task_params.period);
401
402 /* inspect period to see if it is an rt task */
403 if (tsk_rt(t)->task_params.period == 0) {
404 if (flags & AUX_CURRENT) {
405 if (!tsk_rt(t)->is_aux_task) {
406 int admit_ret;
407
408 TRACE_CUR("AUX task in %s/%d: %s/%d:\n", leader->comm, leader->pid, t->comm, t->pid);
409
410 admit_ret = admit_aux_task(t);
411
412 if (admit_ret == 0) {
413 /* hasn't been admitted into rt. make it an aux task. */
414 tsk_rt(t)->is_aux_task = 1;
415 aux_tasks_added = 1;
416
417#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
418 list_add_tail(&tsk_rt(t)->aux_task_node, &tsk_aux(leader)->aux_tasks);
419#endif
420 }
421 }
422 else {
423 TRACE_CUR("AUX task in %s/%d is already set up: %s/%d\n", leader->comm, leader->pid, t->comm, t->pid);
424 }
425 }
426 else {
427 TRACE_CUR("Not changing thread in %s/%d to AUX task: %s/%d\n", leader->comm, leader->pid, t->comm, t->pid);
428 }
429 }
430 else if (!tsk_rt(t)->is_aux_task) { /* don't let aux tasks get aux tasks of their own */
431 if (!tsk_rt(t)->has_aux_tasks) {
432 TRACE_CUR("task in %s/%d: %s/%d:\n", leader->comm, leader->pid, t->comm, t->pid);
433 tsk_rt(t)->has_aux_tasks = 1;
434 }
435 else {
436 TRACE_CUR("task in %s/%d is already set up: %s/%d\n", leader->comm, leader->pid, t->comm, t->pid);
437 }
438 }
439
440 t = next_thread(t);
441 } while(t != leader);
442
443
444#ifdef CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE
445 if (aux_tasks_added && !binheap_empty(&tsk_aux(leader)->aux_task_owners)) {
446 struct task_struct *hp = container_of(binheap_top_entry(&tsk_aux(leader)->aux_task_owners, struct rt_param, aux_task_owner_node),
447 struct task_struct, rt_param);
448 TRACE_CUR("hp in group: %s/%d\n", hp->comm, hp->pid);
449 retval = aux_tasks_increase_priority(leader, (tsk_rt(hp)->inh_task)? tsk_rt(hp)->inh_task : hp);
450 }
451#endif
452
453 return retval;
454}
455
456static long __do_disable_aux_tasks(int flags)
457{
458 long retval = 0;
459 struct task_struct *leader;
460 struct task_struct *t;
461
462 leader = current->group_leader;
463
464 if (flags & AUX_FUTURE) {
465 tsk_aux(leader)->aux_future = 0;
466 }
467
468 if (flags & AUX_CURRENT) {
469 t = leader;
470 do {
471 if (tsk_rt(t)->is_aux_task) {
472
473 TRACE_CUR("%s/%d is an aux task.\n", t->comm, t->pid);
474
475 if (is_realtime(t)) {
476 long temp_retval;
477 struct sched_param param = { .sched_priority = 0};
478
479 TRACE_CUR("%s/%d is real-time. Changing policy to SCHED_NORMAL.\n", t->comm, t->pid);
480
481 temp_retval = sched_setscheduler_nocheck(t, SCHED_NORMAL, &param);
482
483 if (temp_retval != 0) {
484 TRACE_CUR("error changing policy of %s/%d to SCHED_NORMAL\n", t->comm, t->pid);
485 if (retval == 0) {
486 retval = temp_retval;
487 }
488 else {
489 TRACE_CUR("prior error (%d) masks new error (%d)\n", retval, temp_retval);
490 }
491 }
492 }
493
494 tsk_rt(t)->is_aux_task = 0;
495 }
496 t = next_thread(t);
497 } while(t != leader);
498 }
499
500 return retval;
501}
502
503asmlinkage long sys_set_aux_tasks(int flags)
504{
505 long retval;
506
507 read_lock_irq(&tasklist_lock);
508
509 if (flags & AUX_ENABLE) {
510 retval = __do_enable_aux_tasks(flags);
511 }
512 else {
513 retval = __do_disable_aux_tasks(flags);
514 }
515
516 read_unlock_irq(&tasklist_lock);
517
518 return retval;
519}
520
521#else
522
523asmlinkage long sys_set_aux_tasks(int flags)
524{
525 printk("Unsupported. Recompile with CONFIG_REALTIME_AUX_TASKS.\n");
526 return -EINVAL;
527}
528
529#endif
diff --git a/litmus/budget.c b/litmus/budget.c
index f7712be29adb..518174a37a3b 100644
--- a/litmus/budget.c
+++ b/litmus/budget.c
@@ -1,11 +1,13 @@
1#include <linux/sched.h> 1#include <linux/sched.h>
2#include <linux/percpu.h> 2#include <linux/percpu.h>
3#include <linux/hrtimer.h> 3#include <linux/hrtimer.h>
4#include <linux/signal.h>
4 5
5#include <litmus/litmus.h> 6#include <litmus/litmus.h>
6#include <litmus/preempt.h> 7#include <litmus/preempt.h>
7 8
8#include <litmus/budget.h> 9#include <litmus/budget.h>
10#include <litmus/signal.h>
9 11
10struct enforcement_timer { 12struct enforcement_timer {
11 /* The enforcement timer is used to accurately police 13 /* The enforcement timer is used to accurately police
@@ -64,7 +66,7 @@ static void arm_enforcement_timer(struct enforcement_timer* et,
64 66
65 /* Calling this when there is no budget left for the task 67 /* Calling this when there is no budget left for the task
66 * makes no sense, unless the task is non-preemptive. */ 68 * makes no sense, unless the task is non-preemptive. */
67 BUG_ON(budget_exhausted(t) && (!is_np(t))); 69 BUG_ON(budget_exhausted(t) && !is_np(t));
68 70
69 /* __hrtimer_start_range_ns() cancels the timer 71 /* __hrtimer_start_range_ns() cancels the timer
70 * anyway, so we don't have to check whether it is still armed */ 72 * anyway, so we don't have to check whether it is still armed */
@@ -86,7 +88,7 @@ void update_enforcement_timer(struct task_struct* t)
86{ 88{
87 struct enforcement_timer* et = &__get_cpu_var(budget_timer); 89 struct enforcement_timer* et = &__get_cpu_var(budget_timer);
88 90
89 if (t && budget_precisely_enforced(t)) { 91 if (t && budget_precisely_tracked(t) && !sigbudget_sent(t)) {
90 /* Make sure we call into the scheduler when this budget 92 /* Make sure we call into the scheduler when this budget
91 * expires. */ 93 * expires. */
92 arm_enforcement_timer(et, t); 94 arm_enforcement_timer(et, t);
@@ -96,6 +98,16 @@ void update_enforcement_timer(struct task_struct* t)
96 } 98 }
97} 99}
98 100
101void send_sigbudget(struct task_struct* t)
102{
103 if (!test_and_set_bit(RT_JOB_SIG_BUDGET_SENT, &tsk_rt(t)->job_params.flags)) {
104 /* signal has not yet been sent and we are responsible for sending
105 * since we just set the sent-bit when it was previously 0. */
106
107 TRACE_TASK(t, "SIG_BUDGET being sent!\n");
108 send_sig(SIG_BUDGET, t, 1); /* '1' denotes signal sent from kernel */
109 }
110}
99 111
100static int __init init_budget_enforcement(void) 112static int __init init_budget_enforcement(void)
101{ 113{
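For context on the new send_sigbudget() path: a task that selects signal-based budget policing would catch the signal in userspace roughly as sketched below. The actual SIG_BUDGET number comes from the LITMUS^RT headers; the value used here is only a placeholder, so treat this as a hypothetical usage sketch rather than the defined API.

    #include <signal.h>

    #ifndef SIG_BUDGET
    #define SIG_BUDGET (SIGRTMIN + 1)    /* placeholder; use the kernel's definition */
    #endif

    static volatile sig_atomic_t overruns;

    static void on_budget_exhausted(int sig)
    {
            (void)sig;
            overruns++;    /* the current job exceeded its declared exec_cost */
    }

    int main(void)
    {
            struct sigaction sa = { .sa_handler = on_budget_exhausted };

            sigaction(SIG_BUDGET, &sa, NULL);
            /* ... run real-time jobs; inspect 'overruns' between jobs ... */
            return 0;
    }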
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
index 5aca2934a7b5..441fbfddf0c2 100644
--- a/litmus/edf_common.c
+++ b/litmus/edf_common.c
@@ -12,6 +12,10 @@
12#include <litmus/sched_plugin.h> 12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h> 13#include <litmus/sched_trace.h>
14 14
15#ifdef CONFIG_LITMUS_NESTED_LOCKING
16#include <litmus/locking.h>
17#endif
18
15#include <litmus/edf_common.h> 19#include <litmus/edf_common.h>
16 20
17#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM 21#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM
@@ -45,33 +49,158 @@ static inline long edf_hash(struct task_struct *t)
45 * 49 *
46 * both first and second may be NULL 50 * both first and second may be NULL
47 */ 51 */
48int edf_higher_prio(struct task_struct* first, 52#ifdef CONFIG_LITMUS_NESTED_LOCKING
49 struct task_struct* second) 53int __edf_higher_prio(
54 struct task_struct* first, comparison_mode_t first_mode,
55 struct task_struct* second, comparison_mode_t second_mode)
56#else
57int edf_higher_prio(struct task_struct* first, struct task_struct* second)
58#endif
50{ 59{
51 struct task_struct *first_task = first; 60 struct task_struct *first_task = first;
52 struct task_struct *second_task = second; 61 struct task_struct *second_task = second;
53 62
54 /* There is no point in comparing a task to itself. */ 63 /* There is no point in comparing a task to itself. */
55 if (first && first == second) { 64 if (first && first == second) {
56 TRACE_TASK(first, 65 TRACE_CUR("WARNING: pointless edf priority comparison: %s/%d\n", first->comm, first->pid);
57 "WARNING: pointless edf priority comparison.\n"); 66 WARN_ON(1);
58 return 0; 67 return 0;
59 } 68 }
60 69
61 70
62 /* check for NULL tasks */ 71 /* check for NULL tasks */
63 if (!first || !second) 72 if (!first || !second) {
64 return first && !second; 73 return first && !second;
74 }
65 75
66#ifdef CONFIG_LITMUS_LOCKING 76 /* There is some goofy stuff in this code here. There are three subclasses
77 * within the SCHED_LITMUS scheduling class:
78 * 1) Auxiliary tasks: COTS helper threads from the application level that
79 * are forced to be real-time.
80 * 2) klmirqd interrupt threads: Litmus threaded interrupt handlers.
81 * 3) Normal Litmus tasks.
82 *
83 * At their base priorities, #3 > #2 > #1. However, #1 and #2 threads might
84 * inherit a priority from a task of #3.
85 *
86 * The code proceeds in the following manner:
87 * 1) Make aux and klmirqd threads with base-priorities have low priorities.
88 * 2) Determine effective priorities.
89 * 3) Perform priority comparison. Favor #3 over #1 and #2 in case of tie.
90 */
91
92
93#if defined(CONFIG_REALTIME_AUX_TASK_PRIORITY_BOOSTED)
94 /* run aux tasks at max priority */
95 /* TODO: Actually use prio-boosting. */
96 if (first->rt_param.is_aux_task != second->rt_param.is_aux_task)
97 {
98 return (first->rt_param.is_aux_task > second->rt_param.is_aux_task);
99 }
100 else if(first->rt_param.is_aux_task && second->rt_param.is_aux_task)
101 {
102 if(first->group_leader == second->group_leader) {
103 TRACE_CUR("aux tie break!\n"); // tie-break by BASE priority of the aux tasks
104 goto aux_tie_break;
105 }
106 first = first->group_leader;
107 second = second->group_leader;
108 }
109#elif defined(CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE)
110 {
111 int first_lo_aux = first->rt_param.is_aux_task && !first->rt_param.inh_task;
112 int second_lo_aux = second->rt_param.is_aux_task && !second->rt_param.inh_task;
113
114 /* prioritize aux tasks without inheritance below real-time tasks */
115 if (first_lo_aux || second_lo_aux) {
116 // one of these is an aux task without inheritance.
117 if(first_lo_aux && second_lo_aux) {
118 TRACE_CUR("aux tie break!\n"); // tie-break by BASE priority of the aux tasks
119 goto aux_tie_break;
120 }
121 else {
122
123 // make the aux thread lowest priority real-time task
124 int temp = 0;
125 if (first_lo_aux && is_realtime(second)) {
126// temp = 0;
127 }
128 else if(second_lo_aux && is_realtime(first)) {
129 temp = 1;
130 }
131 TRACE_CUR("%s/%d >> %s/%d --- %d\n", first->comm, first->pid, second->comm, second->pid, temp);
132 return temp;
133 }
134 }
135
136 if (first->rt_param.is_aux_task && second->rt_param.is_aux_task &&
137 first->rt_param.inh_task == second->rt_param.inh_task) {
138 // inh_task is !NULL for both tasks since neither was a lo_aux task.
139 // Both aux tasks inherit from the same task, so tie-break
140 // by base priority of the aux tasks.
141 TRACE_CUR("aux tie break!\n");
142 goto aux_tie_break;
143 }
144 }
145#endif
146
147#ifdef CONFIG_LITMUS_SOFTIRQD
148 {
149 int first_lo_klmirqd = first->rt_param.is_interrupt_thread && !first->rt_param.inh_task;
150 int second_lo_klmirqd = second->rt_param.is_interrupt_thread && !second->rt_param.inh_task;
151
152 /* prioritize klmirqd threads without inheritance below real-time tasks */
153 if (first_lo_klmirqd || second_lo_klmirqd) {
154 // one of these is an klmirqd thread without inheritance.
155 if(first_lo_klmirqd && second_lo_klmirqd) {
156 TRACE_CUR("klmirqd tie break!\n"); // tie-break by BASE priority of the klmirqd threads
157 goto klmirqd_tie_break;
158 }
159 else {
160 // make the klmirqd thread the lowest-priority real-time task
161 // but (above low-prio aux tasks and Linux tasks)
162 int temp = 0;
163 if (first_lo_klmirqd && is_realtime(second)) {
164// temp = 0;
165 }
166 else if(second_lo_klmirqd && is_realtime(first)) {
167 temp = 1;
168 }
169 TRACE_CUR("%s/%d >> %s/%d --- %d\n", first->comm, first->pid, second->comm, second->pid, temp);
170 return temp;
171 }
172 }
173
174 if (first->rt_param.is_interrupt_thread && second->rt_param.is_interrupt_thread &&
175 first->rt_param.inh_task == second->rt_param.inh_task) {
176 // inh_task is !NULL for both tasks since neither was a lo_klmirqd task.
177 // Both klmirqd tasks inherit from the same task, so tie-break
178 // by base priority of the klmirqd tasks.
179 TRACE_CUR("klmirqd tie break!\n");
180 goto klmirqd_tie_break;
181 }
182 }
183#endif
67 184
68 /* Check for inherited priorities. Change task 185
186#ifdef CONFIG_LITMUS_LOCKING
187 /* Check for EFFECTIVE priorities. Change task
69 * used for comparison in such a case. 188 * used for comparison in such a case.
70 */ 189 */
71 if (unlikely(first->rt_param.inh_task)) 190 if (unlikely(first->rt_param.inh_task)
191#ifdef CONFIG_LITMUS_NESTED_LOCKING
192 && (first_mode == EFFECTIVE)
193#endif
194 ) {
72 first_task = first->rt_param.inh_task; 195 first_task = first->rt_param.inh_task;
73 if (unlikely(second->rt_param.inh_task)) 196 }
197 if (unlikely(second->rt_param.inh_task)
198#ifdef CONFIG_LITMUS_NESTED_LOCKING
199 && (second_mode == EFFECTIVE)
200#endif
201 ) {
74 second_task = second->rt_param.inh_task; 202 second_task = second->rt_param.inh_task;
203 }
75 204
76 /* Check for priority boosting. Tie-break by start of boosting. 205 /* Check for priority boosting. Tie-break by start of boosting.
77 */ 206 */
@@ -79,17 +208,31 @@ int edf_higher_prio(struct task_struct* first,
79 /* first_task is boosted, how about second_task? */ 208 /* first_task is boosted, how about second_task? */
80 if (!is_priority_boosted(second_task) || 209 if (!is_priority_boosted(second_task) ||
81 lt_before(get_boost_start(first_task), 210 lt_before(get_boost_start(first_task),
82 get_boost_start(second_task))) 211 get_boost_start(second_task))) {
83 return 1; 212 return 1;
84 else 213 }
214 else {
85 return 0; 215 return 0;
86 } else if (unlikely(is_priority_boosted(second_task))) 216 }
217 }
218 else if (unlikely(is_priority_boosted(second_task))) {
87 /* second_task is boosted, first is not*/ 219 /* second_task is boosted, first is not*/
88 return 0; 220 return 0;
221 }
222
223#endif
89 224
225#ifdef CONFIG_REALTIME_AUX_TASKS
226aux_tie_break:
227#endif
228#ifdef CONFIG_LITMUS_SOFTIRQD
229klmirqd_tie_break:
90#endif 230#endif
91 231
92 if (earlier_deadline(first_task, second_task)) { 232 if (!is_realtime(second_task)) {
233 return 1;
234 }
235 else if (earlier_deadline(first_task, second_task)) {
93 return 1; 236 return 1;
94 } 237 }
95 else if (get_deadline(first_task) == get_deadline(second_task)) { 238 else if (get_deadline(first_task) == get_deadline(second_task)) {
@@ -98,7 +241,6 @@ int edf_higher_prio(struct task_struct* first,
98 */ 241 */
99 int pid_break; 242 int pid_break;
100 243
101
102#if defined(CONFIG_EDF_TIE_BREAK_LATENESS) 244#if defined(CONFIG_EDF_TIE_BREAK_LATENESS)
103 /* Tie break by lateness. Jobs with greater lateness get 245 /* Tie break by lateness. Jobs with greater lateness get
104 * priority. This should spread tardiness across all tasks, 246 * priority. This should spread tardiness across all tasks,
@@ -154,18 +296,104 @@ int edf_higher_prio(struct task_struct* first,
154 return 1; 296 return 1;
155 } 297 }
156 else if (first_task->pid == second_task->pid) { 298 else if (first_task->pid == second_task->pid) {
157 /* If the PIDs are the same then the task with the 299#ifdef CONFIG_LITMUS_SOFTIRQD
158 * inherited priority wins. 300 if (first_task->rt_param.is_interrupt_thread < second_task->rt_param.is_interrupt_thread) {
159 */ 301 return 1;
160 if (!second->rt_param.inh_task) { 302 }
303 else if (first_task->rt_param.is_interrupt_thread == second_task->rt_param.is_interrupt_thread) {
304#endif
305
306#if defined(CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE)
307 if (tsk_rt(first)->is_aux_task < tsk_rt(second)->is_aux_task) {
161 return 1; 308 return 1;
162 } 309 }
310 else if (tsk_rt(first)->is_aux_task == tsk_rt(second)->is_aux_task) {
311#endif
312
313 /* Something could be wrong if you get this far. */
314 if (unlikely(first->rt_param.inh_task == second->rt_param.inh_task)) {
315 /* Both tasks have the same inherited priority.
316 * Likely in a bug-condition.
317 */
318 if (first->pid < second->pid) {
319 return 1;
320 }
321 else if (first->pid == second->pid) {
322 //WARN_ON(1);
323 }
324 }
325 else {
326 /* At least one task must inherit */
327 BUG_ON(!first->rt_param.inh_task &&
328 !second->rt_param.inh_task);
329
330 /* The task withOUT the inherited priority wins. */
331 if (second->rt_param.inh_task) {
332 /*
333 * common with aux tasks.
334 TRACE_CUR("unusual comparison: "
335 "first = %s/%d first_task = %s/%d "
336 "second = %s/%d second_task = %s/%d\n",
337 first->comm, first->pid,
338 (first->rt_param.inh_task) ? first->rt_param.inh_task->comm : "(nil)",
339 (first->rt_param.inh_task) ? first->rt_param.inh_task->pid : 0,
340 second->comm, second->pid,
341 (second->rt_param.inh_task) ? second->rt_param.inh_task->comm : "(nil)",
342 (second->rt_param.inh_task) ? second->rt_param.inh_task->pid : 0);
343 */
344 return 1;
345 }
346 }
347#if defined(CONFIG_REALTIME_AUX_TASK_PRIORITY_INHERITANCE)
348 }
349#endif
350
351#ifdef CONFIG_LITMUS_SOFTIRQD
352 }
353#endif
354
163 } 355 }
164 } 356 }
165 } 357 }
358
166 return 0; /* fall-through. prio(second_task) > prio(first_task) */ 359 return 0; /* fall-through. prio(second_task) > prio(first_task) */
167} 360}
168 361
362
363#ifdef CONFIG_LITMUS_NESTED_LOCKING
364int edf_higher_prio(struct task_struct* first, struct task_struct* second)
365{
366 return __edf_higher_prio(first, EFFECTIVE, second, EFFECTIVE);
367}
368
369int edf_max_heap_order(struct binheap_node *a, struct binheap_node *b)
370{
371 struct nested_info *l_a = (struct nested_info *)binheap_entry(a, struct nested_info, hp_binheap_node);
372 struct nested_info *l_b = (struct nested_info *)binheap_entry(b, struct nested_info, hp_binheap_node);
373
374 return __edf_higher_prio(l_a->hp_waiter_eff_prio, EFFECTIVE, l_b->hp_waiter_eff_prio, EFFECTIVE);
375}
376
377int edf_min_heap_order(struct binheap_node *a, struct binheap_node *b)
378{
379 return edf_max_heap_order(b, a); // swap comparison
380}
381
382int edf_max_heap_base_priority_order(struct binheap_node *a, struct binheap_node *b)
383{
384 struct nested_info *l_a = (struct nested_info *)binheap_entry(a, struct nested_info, hp_binheap_node);
385 struct nested_info *l_b = (struct nested_info *)binheap_entry(b, struct nested_info, hp_binheap_node);
386
387 return __edf_higher_prio(l_a->hp_waiter_eff_prio, BASE, l_b->hp_waiter_eff_prio, BASE);
388}
389
390int edf_min_heap_base_priority_order(struct binheap_node *a, struct binheap_node *b)
391{
392 return edf_max_heap_base_priority_order(b, a); // swap comparison
393}
394#endif
395
396
169int edf_ready_order(struct bheap_node* a, struct bheap_node* b) 397int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
170{ 398{
171 return edf_higher_prio(bheap2task(a), bheap2task(b)); 399 return edf_higher_prio(bheap2task(a), bheap2task(b));
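The subclass handling added to edf_higher_prio() above reduces to two rules spelled out in its comment block: aux and klmirqd threads with no inherited priority rank below every normal real-time task, and otherwise the comparison uses effective (possibly inherited) deadlines. The toy, userspace-compilable comparator below restates just those two rules; it deliberately omits the priority boosting, tie-breaking, and klmirqd-versus-aux details of the real function.

    #include <stdint.h>

    struct toy_task {
            uint64_t deadline;        /* base priority: absolute deadline */
            uint64_t inh_deadline;    /* inherited deadline, 0 if none */
            int      is_aux;          /* subclass #1 in the comment above */
            int      is_klmirqd;      /* subclass #2 */
    };

    /* returns nonzero if a has higher priority than b */
    static int toy_higher_prio(const struct toy_task *a, const struct toy_task *b)
    {
            /* rule 1: helper threads without an inherited priority sink below normal tasks */
            int a_low = (a->is_aux || a->is_klmirqd) && !a->inh_deadline;
            int b_low = (b->is_aux || b->is_klmirqd) && !b->inh_deadline;
            uint64_t da, db;

            if (a_low != b_low)
                    return b_low;

            /* rule 2: compare effective deadlines (the inherited one, if any) */
            da = a->inh_deadline ? a->inh_deadline : a->deadline;
            db = b->inh_deadline ? b->inh_deadline : b->deadline;
            return da < db;
    }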
diff --git a/litmus/fdso.c b/litmus/fdso.c
index 250377d184e7..709be3cc8992 100644
--- a/litmus/fdso.c
+++ b/litmus/fdso.c
@@ -20,13 +20,28 @@
20 20
21extern struct fdso_ops generic_lock_ops; 21extern struct fdso_ops generic_lock_ops;
22 22
23#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
24extern struct fdso_ops generic_affinity_ops;
25#endif
26
23static const struct fdso_ops* fdso_ops[] = { 27static const struct fdso_ops* fdso_ops[] = {
24 &generic_lock_ops, /* FMLP_SEM */ 28 &generic_lock_ops, /* FMLP_SEM */
25 &generic_lock_ops, /* SRP_SEM */ 29 &generic_lock_ops, /* SRP_SEM */
30
26 &generic_lock_ops, /* MPCP_SEM */ 31 &generic_lock_ops, /* MPCP_SEM */
27 &generic_lock_ops, /* MPCP_VS_SEM */ 32 &generic_lock_ops, /* MPCP_VS_SEM */
28 &generic_lock_ops, /* DPCP_SEM */ 33 &generic_lock_ops, /* DPCP_SEM */
29 &generic_lock_ops, /* PCP_SEM */ 34 &generic_lock_ops, /* PCP_SEM */
35
36 &generic_lock_ops, /* RSM_MUTEX */
37 &generic_lock_ops, /* IKGLP_SEM */
38 &generic_lock_ops, /* KFMLP_SEM */
39#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
40 &generic_affinity_ops, /* IKGLP_SIMPLE_GPU_AFF_OBS */
41 &generic_affinity_ops, /* IKGLP_GPU_AFF_OBS */
42 &generic_affinity_ops, /* KFMLP_SIMPLE_GPU_AFF_OBS */
43 &generic_affinity_ops, /* KFMLP_GPU_AFF_OBS */
44#endif
30}; 45};
31 46
32static int fdso_create(void** obj_ref, obj_type_t type, void* __user config) 47static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
diff --git a/litmus/gpu_affinity.c b/litmus/gpu_affinity.c
new file mode 100644
index 000000000000..7d73105b4181
--- /dev/null
+++ b/litmus/gpu_affinity.c
@@ -0,0 +1,231 @@
1
2#ifdef CONFIG_LITMUS_NVIDIA
3
4#include <linux/sched.h>
5#include <litmus/litmus.h>
6#include <litmus/gpu_affinity.h>
7
8#include <litmus/sched_trace.h>
9
10#define OBSERVATION_CAP ((lt_t)(2e9))
11
12// reason for skew: high outliers are less
13// frequent and way out of bounds
14//#define HI_THRESHOLD 2
15//#define LO_THRESHOLD 4
16
17#define NUM_STDEV_NUM 1
18#define NUM_STDEV_DENOM 2
19
20#define MIN(a, b) ((a < b) ? a : b)
21
22static fp_t update_estimate(feedback_est_t* fb, fp_t a, fp_t b, lt_t observed)
23{
24 fp_t relative_err;
25 fp_t err, new;
26 fp_t actual = _integer_to_fp(observed);
27
28 err = _sub(actual, fb->est);
29 new = _add(_mul(a, err), _mul(b, fb->accum_err));
30
31 relative_err = _div(err, actual);
32
33 fb->est = new;
34 fb->accum_err = _add(fb->accum_err, err);
35
36 return relative_err;
37}
38
39lt_t varience(lt_t nums[], const lt_t avg, const uint16_t count)
40{
41 /* brute force: takes about as much time as incremental running methods when
42 * count < 50 (on Bonham). Brute force also less prone to overflow.
43 */
44 lt_t sqdeviations = 0;
45 uint16_t i;
46 for(i = 0; i < count; ++i)
47 {
48 lt_t temp = (int64_t)nums[i] - (int64_t)avg;
49 sqdeviations += temp * temp;
50 }
51 return sqdeviations/count;
52}
53
54lt_t isqrt(lt_t n)
55{
56 /* integer square root using babylonian method
57 * (algo taken from Wikipedia) */
58 lt_t res = 0;
59 lt_t bit = ((lt_t)1) << (sizeof(n)*8-2);
60 while (bit > n) {
61 bit >>= 2;
62 }
63
64 while (bit != 0) {
65 if (n >= res + bit) {
66 n -= res + bit;
67 res = (res >> 1) + bit;
68 }
69 else {
70 res >>= 1;
71 }
72 bit >>= 2;
73 }
74 return res;
75}
76
77void update_gpu_estimate(struct task_struct *t, lt_t observed)
78{
79 //feedback_est_t *fb = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
80 avg_est_t *est;
81 struct migration_info mig_info;
82
83 BUG_ON(tsk_rt(t)->gpu_migration > MIG_LAST);
84
85 est = &(tsk_rt(t)->gpu_migration_est[tsk_rt(t)->gpu_migration]);
86
87 if (unlikely(observed > OBSERVATION_CAP)) {
88 TRACE_TASK(t, "Crazy observation greater than cap was dropped: %llu > %llu\n",
89 observed,
90 OBSERVATION_CAP);
91 return;
92 }
93
94#if 0
95 // filter out values that are HI_THRESHOLDx or (1/LO_THRESHOLD)x out
96 // of range of the average, but only filter if enough samples
97 // have been taken.
98 if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
99 if (unlikely(observed < est->avg/LO_THRESHOLD)) {
100 TRACE_TASK(t, "Observation is too small: %llu\n",
101 observed);
102 return;
103 }
104 else if (unlikely(observed > est->avg*HI_THRESHOLD)) {
105 TRACE_TASK(t, "Observation is too large: %llu\n",
106 observed);
107 return;
108 }
109#endif
110 // filter values outside NUM_STDEVx the standard deviation,
111 // but only filter if enough samples have been taken.
112 if (likely((est->count > MIN(10, AVG_EST_WINDOW_SIZE/2)))) {
113 lt_t lower, upper;
114
115 lt_t range = (est->std*NUM_STDEV_NUM)/NUM_STDEV_DENOM;
116 lower = est->avg - MIN(range, est->avg); // no underflow.
117
118 if (unlikely(observed < lower)) {
119 TRACE_TASK(t, "Observation is too small: %llu\n", observed);
120 return;
121 }
122
123 upper = est->avg + range;
124 if (unlikely(observed > upper)) {
125 TRACE_TASK(t, "Observation is too large: %llu\n", observed);
126 return;
127 }
128 }
129
130
131
132 if (unlikely(est->count < AVG_EST_WINDOW_SIZE)) {
133 ++est->count;
134 }
135 else {
136 est->sum -= est->history[est->idx];
137 }
138
139 mig_info.observed = observed;
140 mig_info.estimated = est->avg;
141 mig_info.distance = tsk_rt(t)->gpu_migration;
142 sched_trace_migration(t, &mig_info);
143
144
145 est->history[est->idx] = observed;
146 est->sum += observed;
147 est->avg = est->sum/est->count;
148 est->std = isqrt(varience(est->history, est->avg, est->count));
149 est->idx = (est->idx + 1) % AVG_EST_WINDOW_SIZE;
150
151
152#if 0
153 if(unlikely(fb->est.val == 0)) {
154 // kludge-- cap observed values to prevent whacky estimations.
155 // whacky stuff happens during the first few jobs.
156 if(unlikely(observed > OBSERVATION_CAP)) {
157 TRACE_TASK(t, "Crazy observation was capped: %llu -> %llu\n",
158 observed, OBSERVATION_CAP);
159 observed = OBSERVATION_CAP;
160 }
161
162 // take the first observation as our estimate
163 // (initial value of 0 was bogus anyhow)
164 fb->est = _integer_to_fp(observed);
165 fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work.
166 }
167 else {
168 fp_t rel_err = update_estimate(fb,
169 tsk_rt(t)->gpu_fb_param_a[tsk_rt(t)->gpu_migration],
170 tsk_rt(t)->gpu_fb_param_b[tsk_rt(t)->gpu_migration],
171 observed);
172
173 if(unlikely(_fp_to_integer(fb->est) <= 0)) {
174 TRACE_TASK(t, "Invalid estimate. Patching.\n");
175 fb->est = _integer_to_fp(observed);
176 fb->accum_err = _div(fb->est, _integer_to_fp(2)); // ...seems to work.
177 }
178 else {
179 struct migration_info mig_info;
180
181 sched_trace_prediction_err(t,
182 &(tsk_rt(t)->gpu_migration),
183 &rel_err);
184
185 mig_info.observed = observed;
186 mig_info.estimated = get_gpu_estimate(t, tsk_rt(t)->gpu_migration);
187 mig_info.distance = tsk_rt(t)->gpu_migration;
188
189 sched_trace_migration(t, &mig_info);
190 }
191 }
192#endif
193
194 TRACE_TASK(t, "GPU est update after (dist = %d, obs = %llu): %llu\n",
195 tsk_rt(t)->gpu_migration,
196 observed,
197 est->avg);
198}
199
200gpu_migration_dist_t gpu_migration_distance(int a, int b)
201{
202 // GPUs organized in a binary hierarchy, no more than 2^MIG_FAR GPUs
203 int i;
204 int dist;
205
206 if(likely(a >= 0 && b >= 0)) {
207 for(i = 0; i <= MIG_FAR; ++i) {
208 if(a>>i == b>>i) {
209 dist = i;
210 goto out;
211 }
212 }
213 dist = MIG_NONE; // hopefully never reached.
214 TRACE_CUR("WARNING: GPU distance too far! %d -> %d\n", a, b);
215 }
216 else {
217 dist = MIG_NONE;
218 }
219
220out:
221 TRACE_CUR("Distance %d -> %d is %d\n",
222 a, b, dist);
223
224 return dist;
225}
226
227
228
229
230#endif
231
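Since update_gpu_estimate() above interleaves the window bookkeeping with tracing and #if 0 remnants of the old feedback estimator, the stripped-down, userspace-compilable restatement below shows the estimator it now implements: a sliding window whose mean is the estimate, a brute-force variance with an integer square root, and rejection of samples more than half a standard deviation from the current mean once enough history exists. The window size and names are local to this sketch.

    #include <stdint.h>

    #define WINDOW 20    /* stands in for AVG_EST_WINDOW_SIZE */

    struct est {
            uint64_t hist[WINDOW];
            uint64_t sum, avg, std;
            unsigned int count, idx;
    };

    static uint64_t isqrt64(uint64_t n)
    {
            /* Babylonian/bitwise integer square root, as in the patch */
            uint64_t res = 0, bit = (uint64_t)1 << 62;
            while (bit > n) bit >>= 2;
            while (bit) {
                    if (n >= res + bit) { n -= res + bit; res = (res >> 1) + bit; }
                    else res >>= 1;
                    bit >>= 2;
            }
            return res;
    }

    static uint64_t window_variance(const uint64_t *x, uint64_t avg, unsigned int count)
    {
            uint64_t sq = 0;
            unsigned int i;
            for (i = 0; i < count; i++) {
                    int64_t d = (int64_t)x[i] - (int64_t)avg;
                    sq += (uint64_t)(d * d);
            }
            return sq / count;
    }

    static void add_sample(struct est *e, uint64_t obs)
    {
            if (e->count > 10) {                      /* only filter once there is history */
                    uint64_t range = e->std / 2;      /* half a standard deviation */
                    uint64_t lower = e->avg - (range < e->avg ? range : e->avg);
                    if (obs < lower || obs > e->avg + range)
                            return;                   /* treat as an outlier and drop it */
            }
            if (e->count < WINDOW)
                    e->count++;
            else
                    e->sum -= e->hist[e->idx];        /* evict the oldest sample */
            e->hist[e->idx] = obs;
            e->sum += obs;
            e->avg = e->sum / e->count;
            e->std = isqrt64(window_variance(e->hist, e->avg, e->count));
            e->idx = (e->idx + 1) % WINDOW;
    }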
diff --git a/litmus/ikglp_lock.c b/litmus/ikglp_lock.c
new file mode 100644
index 000000000000..a4ae74331782
--- /dev/null
+++ b/litmus/ikglp_lock.c
@@ -0,0 +1,2976 @@
1#include <linux/slab.h>
2#include <linux/uaccess.h>
3
4#include <litmus/trace.h>
5#include <litmus/sched_plugin.h>
6#include <litmus/fdso.h>
7
8#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
9#include <litmus/gpu_affinity.h>
10#include <litmus/nvidia_info.h>
11#endif
12
13#include <litmus/ikglp_lock.h>
14
15// big signed value.
16#define IKGLP_INVAL_DISTANCE 0x7FFFFFFF
17
18int ikglp_max_heap_base_priority_order(struct binheap_node *a,
19 struct binheap_node *b)
20{
21 ikglp_heap_node_t *d_a = binheap_entry(a, ikglp_heap_node_t, node);
22 ikglp_heap_node_t *d_b = binheap_entry(b, ikglp_heap_node_t, node);
23
24 BUG_ON(!d_a);
25 BUG_ON(!d_b);
26
27 return litmus->__compare(d_a->task, BASE, d_b->task, BASE);
28}
29
30int ikglp_min_heap_base_priority_order(struct binheap_node *a,
31 struct binheap_node *b)
32{
33 ikglp_heap_node_t *d_a = binheap_entry(a, ikglp_heap_node_t, node);
34 ikglp_heap_node_t *d_b = binheap_entry(b, ikglp_heap_node_t, node);
35
36 return litmus->__compare(d_b->task, BASE, d_a->task, BASE);
37}
38
39int ikglp_donor_max_heap_base_priority_order(struct binheap_node *a,
40 struct binheap_node *b)
41{
42 ikglp_wait_state_t *d_a = binheap_entry(a, ikglp_wait_state_t, node);
43 ikglp_wait_state_t *d_b = binheap_entry(b, ikglp_wait_state_t, node);
44
45 return litmus->__compare(d_a->task, BASE, d_b->task, BASE);
46}
47
48
49int ikglp_min_heap_donee_order(struct binheap_node *a,
50 struct binheap_node *b)
51{
52 struct task_struct *prio_a, *prio_b;
53
54 ikglp_donee_heap_node_t *d_a =
55 binheap_entry(a, ikglp_donee_heap_node_t, node);
56 ikglp_donee_heap_node_t *d_b =
57 binheap_entry(b, ikglp_donee_heap_node_t, node);
58
59 if(!d_a->donor_info) {
60 prio_a = d_a->task;
61 }
62 else {
63 prio_a = d_a->donor_info->task;
64 BUG_ON(d_a->task != d_a->donor_info->donee_info->task);
65 }
66
67 if(!d_b->donor_info) {
68 prio_b = d_b->task;
69 }
70 else {
71 prio_b = d_b->donor_info->task;
72 BUG_ON(d_b->task != d_b->donor_info->donee_info->task);
73 }
74
75 // note reversed order
76 return litmus->__compare(prio_b, BASE, prio_a, BASE);
77}
78
79
80
81static inline int ikglp_get_idx(struct ikglp_semaphore *sem,
82 struct fifo_queue *queue)
83{
84 return (queue - &sem->fifo_queues[0]);
85}
86
87static inline struct fifo_queue* ikglp_get_queue(struct ikglp_semaphore *sem,
88 struct task_struct *holder)
89{
90 int i;
91 for(i = 0; i < sem->nr_replicas; ++i)
92 if(sem->fifo_queues[i].owner == holder)
93 return(&sem->fifo_queues[i]);
94 return(NULL);
95}
96
97
98
99static struct task_struct* ikglp_find_hp_waiter(struct fifo_queue *kqueue,
100 struct task_struct *skip)
101{
102 struct list_head *pos;
103 struct task_struct *queued, *found = NULL;
104
105 list_for_each(pos, &kqueue->wait.task_list) {
106 queued = (struct task_struct*) list_entry(pos,
107 wait_queue_t, task_list)->private;
108
109 /* Compare task prios, find high prio task. */
110 if(queued != skip && litmus->compare(queued, found))
111 found = queued;
112 }
113 return found;
114}
115
116static struct fifo_queue* ikglp_find_shortest(struct ikglp_semaphore *sem,
117 struct fifo_queue *search_start)
118{
119 // we start our search at search_start instead of at the beginning of the
120 // queue list to load-balance across all resources.
121 struct fifo_queue* step = search_start;
122 struct fifo_queue* shortest = sem->shortest_fifo_queue;
123
124 do {
125 step = (step+1 != &sem->fifo_queues[sem->nr_replicas]) ?
126 step+1 : &sem->fifo_queues[0];
127
128 if(step->count < shortest->count) {
129 shortest = step;
130 if(step->count == 0)
131 break; /* can't get any shorter */
132 }
133
134 }while(step != search_start);
135
136 return(shortest);
137}
138
139static inline struct task_struct* ikglp_mth_highest(struct ikglp_semaphore *sem)
140{
141 return binheap_top_entry(&sem->top_m, ikglp_heap_node_t, node)->task;
142}
143
144
145
146#if 0
147static void print_global_list(struct binheap_node* n, int depth)
148{
149 ikglp_heap_node_t *global_heap_node;
150 char padding[81] = " ";
151
152 if(n == NULL) {
153 TRACE_CUR("+-> %p\n", NULL);
154 return;
155 }
156
157 global_heap_node = binheap_entry(n, ikglp_heap_node_t, node);
158
159 if(depth*2 <= 80)
160 padding[depth*2] = '\0';
161
162 TRACE_CUR("%s+-> %s/%d\n",
163 padding,
164 global_heap_node->task->comm,
165 global_heap_node->task->pid);
166
167 if(n->left) print_global_list(n->left, depth+1);
168 if(n->right) print_global_list(n->right, depth+1);
169}
170
171static void print_donees(struct ikglp_semaphore *sem, struct binheap_node *n, int depth)
172{
173 ikglp_donee_heap_node_t *donee_node;
174 char padding[81] = " ";
175 struct task_struct* donor = NULL;
176
177 if(n == NULL) {
178 TRACE_CUR("+-> %p\n", NULL);
179 return;
180 }
181
182 donee_node = binheap_entry(n, ikglp_donee_heap_node_t, node);
183
184 if(depth*2 <= 80)
185 padding[depth*2] = '\0';
186
187 if(donee_node->donor_info) {
188 donor = donee_node->donor_info->task;
189 }
190
191 TRACE_CUR("%s+-> %s/%d (d: %s/%d) (fq: %d)\n",
192 padding,
193 donee_node->task->comm,
194 donee_node->task->pid,
195 (donor) ? donor->comm : "nil",
196 (donor) ? donor->pid : -1,
197 ikglp_get_idx(sem, donee_node->fq));
198
199 if(n->left) print_donees(sem, n->left, depth+1);
200 if(n->right) print_donees(sem, n->right, depth+1);
201}
202
203static void print_donors(struct binheap_node *n, int depth)
204{
205 ikglp_wait_state_t *donor_node;
206 char padding[81] = " ";
207
208 if(n == NULL) {
209 TRACE_CUR("+-> %p\n", NULL);
210 return;
211 }
212
213 donor_node = binheap_entry(n, ikglp_wait_state_t, node);
214
215 if(depth*2 <= 80)
216 padding[depth*2] = '\0';
217
218
219 TRACE_CUR("%s+-> %s/%d (donee: %s/%d)\n",
220 padding,
221 donor_node->task->comm,
222 donor_node->task->pid,
223 donor_node->donee_info->task->comm,
224 donor_node->donee_info->task->pid);
225
226 if(n->left) print_donors(n->left, depth+1);
227 if(n->right) print_donors(n->right, depth+1);
228}
229#endif
230
231static void ikglp_add_global_list(struct ikglp_semaphore *sem,
232 struct task_struct *t,
233 ikglp_heap_node_t *node)
234{
235
236
237 node->task = t;
238 INIT_BINHEAP_NODE(&node->node);
239
240 if(sem->top_m_size < sem->m) {
241 TRACE_CUR("Trivially adding %s/%d to top-m global list.\n",
242 t->comm, t->pid);
243// TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size);
244// print_global_list(sem->top_m.root, 1);
245
246 binheap_add(&node->node, &sem->top_m, ikglp_heap_node_t, node);
247 ++(sem->top_m_size);
248
249// TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size);
250// print_global_list(sem->top_m.root, 1);
251 }
252 else if(litmus->__compare(t, BASE, ikglp_mth_highest(sem), BASE)) {
253 ikglp_heap_node_t *evicted =
254 binheap_top_entry(&sem->top_m, ikglp_heap_node_t, node);
255
256 TRACE_CUR("Adding %s/%d to top-m and evicting %s/%d.\n",
257 t->comm, t->pid,
258 evicted->task->comm, evicted->task->pid);
259
260// TRACE_CUR("Not-Top-M Before:\n");
261// print_global_list(sem->not_top_m.root, 1);
262// TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size);
263// print_global_list(sem->top_m.root, 1);
264
265
266 binheap_delete_root(&sem->top_m, ikglp_heap_node_t, node);
267 INIT_BINHEAP_NODE(&evicted->node);
268 binheap_add(&evicted->node, &sem->not_top_m, ikglp_heap_node_t, node);
269
270 binheap_add(&node->node, &sem->top_m, ikglp_heap_node_t, node);
271
272// TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size);
273// print_global_list(sem->top_m.root, 1);
274// TRACE_CUR("Not-Top-M After:\n");
275// print_global_list(sem->not_top_m.root, 1);
276 }
277 else {
278 TRACE_CUR("Trivially adding %s/%d to not-top-m global list.\n",
279 t->comm, t->pid);
280// TRACE_CUR("Not-Top-M Before:\n");
281// print_global_list(sem->not_top_m.root, 1);
282
283 binheap_add(&node->node, &sem->not_top_m, ikglp_heap_node_t, node);
284
285// TRACE_CUR("Not-Top-M After:\n");
286// print_global_list(sem->not_top_m.root, 1);
287 }
288}
289
290
291static void ikglp_del_global_list(struct ikglp_semaphore *sem,
292 struct task_struct *t,
293 ikglp_heap_node_t *node)
294{
295 BUG_ON(!binheap_is_in_heap(&node->node));
296
297 TRACE_CUR("Removing %s/%d from global list.\n", t->comm, t->pid);
298
299 if(binheap_is_in_this_heap(&node->node, &sem->top_m)) {
300 TRACE_CUR("%s/%d is in top-m\n", t->comm, t->pid);
301
302// TRACE_CUR("Not-Top-M Before:\n");
303// print_global_list(sem->not_top_m.root, 1);
304// TRACE_CUR("Top-M Before (size = %d):\n", sem->top_m_size);
305// print_global_list(sem->top_m.root, 1);
306
307
308 binheap_delete(&node->node, &sem->top_m);
309
310 if(!binheap_empty(&sem->not_top_m)) {
311 ikglp_heap_node_t *promoted =
312 binheap_top_entry(&sem->not_top_m, ikglp_heap_node_t, node);
313
314 TRACE_CUR("Promoting %s/%d to top-m\n",
315 promoted->task->comm, promoted->task->pid);
316
317 binheap_delete_root(&sem->not_top_m, ikglp_heap_node_t, node);
318 INIT_BINHEAP_NODE(&promoted->node);
319
320 binheap_add(&promoted->node, &sem->top_m, ikglp_heap_node_t, node);
321 }
322 else {
323 TRACE_CUR("No one to promote to top-m.\n");
324 --(sem->top_m_size);
325 }
326
327// TRACE_CUR("Top-M After (size = %d):\n", sem->top_m_size);
328// print_global_list(sem->top_m.root, 1);
329// TRACE_CUR("Not-Top-M After:\n");
330// print_global_list(sem->not_top_m.root, 1);
331 }
332 else {
333 TRACE_CUR("%s/%d is in not-top-m\n", t->comm, t->pid);
334// TRACE_CUR("Not-Top-M Before:\n");
335// print_global_list(sem->not_top_m.root, 1);
336
337 binheap_delete(&node->node, &sem->not_top_m);
338
339// TRACE_CUR("Not-Top-M After:\n");
340// print_global_list(sem->not_top_m.root, 1);
341 }
342}
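/*
 * Taken together, ikglp_add_global_list() and ikglp_del_global_list()
 * maintain a split of all outstanding requests into two heaps: top_m, a
 * min-heap holding the (at most) m highest-base-priority requests, and
 * not_top_m, a max-heap holding the rest. Adding a request that beats the
 * m-th highest evicts the current top_m minimum into not_top_m; removing a
 * top_m entry promotes the not_top_m maximum in its place, so top_m_size
 * only shrinks when not_top_m is empty.
 */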
343
344
345static void ikglp_add_donees(struct ikglp_semaphore *sem,
346 struct fifo_queue *fq,
347 struct task_struct *t,
348 ikglp_donee_heap_node_t* node)
349{
350// TRACE_CUR("Adding %s/%d to donee list.\n", t->comm, t->pid);
351// TRACE_CUR("donees Before:\n");
352// print_donees(sem, sem->donees.root, 1);
353
354 node->task = t;
355 node->donor_info = NULL;
356 node->fq = fq;
357 INIT_BINHEAP_NODE(&node->node);
358
359 binheap_add(&node->node, &sem->donees, ikglp_donee_heap_node_t, node);
360
361// TRACE_CUR("donees After:\n");
362// print_donees(sem, sem->donees.root, 1);
363}
364
365
366static void ikglp_refresh_owners_prio_increase(struct task_struct *t,
367 struct fifo_queue *fq,
368 struct ikglp_semaphore *sem,
369 unsigned long flags)
370{
371 // priority of 't' has increased (note: 't' might already be hp_waiter).
372 if ((t == fq->hp_waiter) || litmus->compare(t, fq->hp_waiter)) {
373 struct task_struct *old_max_eff_prio;
374 struct task_struct *new_max_eff_prio;
375 struct task_struct *new_prio = NULL;
376 struct task_struct *owner = fq->owner;
377
378 if(fq->hp_waiter)
379 TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n",
380 fq->hp_waiter->comm, fq->hp_waiter->pid);
381 else
382 TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n");
383
384 if(owner)
385 {
386 raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);
387
388// TRACE_TASK(owner, "Heap Before:\n");
389// print_hp_waiters(tsk_rt(owner)->hp_blocked_tasks.root, 0);
390
391 old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
392
393 fq->hp_waiter = t;
394 fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter);
395
396 binheap_decrease(&fq->nest.hp_binheap_node,
397 &tsk_rt(owner)->hp_blocked_tasks);
398
399// TRACE_TASK(owner, "Heap After:\n");
400// print_hp_waiters(tsk_rt(owner)->hp_blocked_tasks.root, 0);
401
402 new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
403
404 if(new_max_eff_prio != old_max_eff_prio) {
405 TRACE_TASK(t, "is new hp_waiter.\n");
406
407 if ((effective_priority(owner) == old_max_eff_prio) ||
408 (litmus->__compare(new_max_eff_prio, BASE,
409 owner, EFFECTIVE))){
410 new_prio = new_max_eff_prio;
411 }
412 }
413 else {
414 TRACE_TASK(t, "no change in max_eff_prio of heap.\n");
415 }
416
417 if(new_prio) {
418 // set new inheritance and propagate
419 TRACE_TASK(t, "Effective priority changed for owner %s/%d to %s/%d\n",
420 owner->comm, owner->pid,
421 new_prio->comm, new_prio->pid);
422 litmus->nested_increase_prio(owner, new_prio, &sem->lock,
423 flags); // unlocks lock.
424 }
425 else {
426 TRACE_TASK(t, "No change in effective priority (is %s/%d). Propagation halted.\n",
427 new_max_eff_prio->comm, new_max_eff_prio->pid);
428 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
429 unlock_fine_irqrestore(&sem->lock, flags);
430 }
431 }
432 else {
433 fq->hp_waiter = t;
434 fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter);
435
436 TRACE_TASK(t, "no owner.\n");
437 unlock_fine_irqrestore(&sem->lock, flags);
438 }
439 }
440 else {
441 TRACE_TASK(t, "hp_waiter is unaffected.\n");
442 unlock_fine_irqrestore(&sem->lock, flags);
443 }
444}
445
446// hp_waiter has decreased
447static void ikglp_refresh_owners_prio_decrease(struct fifo_queue *fq,
448 struct ikglp_semaphore *sem,
449 unsigned long flags)
450{
451 struct task_struct *owner = fq->owner;
452
453 struct task_struct *old_max_eff_prio;
454 struct task_struct *new_max_eff_prio;
455
456 if(!owner) {
457 TRACE_CUR("No owner. Returning.\n");
458 unlock_fine_irqrestore(&sem->lock, flags);
459 return;
460 }
461
462 TRACE_CUR("ikglp_refresh_owners_prio_decrease\n");
463
464 raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);
465
466 old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
467
468 binheap_delete(&fq->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks);
	469	fq->nest.hp_waiter_eff_prio = (fq->hp_waiter) ? effective_priority(fq->hp_waiter) : NULL;
470 binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks,
471 struct nested_info, hp_binheap_node);
472
473 new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
474
475 if((old_max_eff_prio != new_max_eff_prio) &&
476 (effective_priority(owner) == old_max_eff_prio))
477 {
478 // Need to set new effective_priority for owner
479 struct task_struct *decreased_prio;
480
481 TRACE_CUR("Propagating decreased inheritance to holder of fq %d.\n",
482 ikglp_get_idx(sem, fq));
483
484 if(litmus->__compare(new_max_eff_prio, BASE, owner, BASE)) {
485 TRACE_CUR("%s/%d has greater base priority than base priority of owner (%s/%d) of fq %d.\n",
486 (new_max_eff_prio) ? new_max_eff_prio->comm : "nil",
487 (new_max_eff_prio) ? new_max_eff_prio->pid : -1,
488 owner->comm,
489 owner->pid,
490 ikglp_get_idx(sem, fq));
491
492 decreased_prio = new_max_eff_prio;
493 }
494 else {
495 TRACE_CUR("%s/%d has lesser base priority than base priority of owner (%s/%d) of fq %d.\n",
496 (new_max_eff_prio) ? new_max_eff_prio->comm : "nil",
497 (new_max_eff_prio) ? new_max_eff_prio->pid : -1,
498 owner->comm,
499 owner->pid,
500 ikglp_get_idx(sem, fq));
501
502 decreased_prio = NULL;
503 }
504
505 // beware: recursion
506 litmus->nested_decrease_prio(owner, decreased_prio, &sem->lock, flags); // will unlock mutex->lock
507 }
508 else {
509 TRACE_TASK(owner, "No need to propagate priority decrease forward.\n");
510 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
511 unlock_fine_irqrestore(&sem->lock, flags);
512 }
513}
514
515
516static void ikglp_remove_donation_from_owner(struct binheap_node *n,
517 struct fifo_queue *fq,
518 struct ikglp_semaphore *sem,
519 unsigned long flags)
520{
521 struct task_struct *owner = fq->owner;
522
523 struct task_struct *old_max_eff_prio;
524 struct task_struct *new_max_eff_prio;
525
526 BUG_ON(!owner);
527
528 raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);
529
530 old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
531
532 binheap_delete(n, &tsk_rt(owner)->hp_blocked_tasks);
533
534 new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
535
536 if((old_max_eff_prio != new_max_eff_prio) &&
537 (effective_priority(owner) == old_max_eff_prio))
538 {
539 // Need to set new effective_priority for owner
540 struct task_struct *decreased_prio;
541
542 TRACE_CUR("Propagating decreased inheritance to holder of fq %d.\n",
543 ikglp_get_idx(sem, fq));
544
545 if(litmus->__compare(new_max_eff_prio, BASE, owner, BASE)) {
546 TRACE_CUR("has greater base priority than base priority of owner of fq %d.\n",
547 ikglp_get_idx(sem, fq));
548 decreased_prio = new_max_eff_prio;
549 }
550 else {
551 TRACE_CUR("has lesser base priority than base priority of owner of fq %d.\n",
552 ikglp_get_idx(sem, fq));
553 decreased_prio = NULL;
554 }
555
556 // beware: recursion
557 litmus->nested_decrease_prio(owner, decreased_prio, &sem->lock, flags); // will unlock mutex->lock
558 }
559 else {
560 TRACE_TASK(owner, "No need to propagate priority decrease forward.\n");
561 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
562 unlock_fine_irqrestore(&sem->lock, flags);
563 }
564}
565
566static void ikglp_remove_donation_from_fq_waiter(struct task_struct *t,
567 struct binheap_node *n)
568{
569 struct task_struct *old_max_eff_prio;
570 struct task_struct *new_max_eff_prio;
571
572 raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);
573
574 old_max_eff_prio = top_priority(&tsk_rt(t)->hp_blocked_tasks);
575
576 binheap_delete(n, &tsk_rt(t)->hp_blocked_tasks);
577
578 new_max_eff_prio = top_priority(&tsk_rt(t)->hp_blocked_tasks);
579
580 if((old_max_eff_prio != new_max_eff_prio) &&
581 (effective_priority(t) == old_max_eff_prio))
582 {
583 // Need to set new effective_priority for owner
584 struct task_struct *decreased_prio;
585
586 if(litmus->__compare(new_max_eff_prio, BASE, t, BASE)) {
587 decreased_prio = new_max_eff_prio;
588 }
589 else {
590 decreased_prio = NULL;
591 }
592
593 tsk_rt(t)->inh_task = decreased_prio;
594 }
595
596 raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);
597}
598
599static void ikglp_get_immediate(struct task_struct* t,
600 struct fifo_queue *fq,
601 struct ikglp_semaphore *sem,
602 unsigned long flags)
603{
604 // resource available now
605 TRACE_CUR("queue %d: acquired immediately\n", ikglp_get_idx(sem, fq));
606
607 fq->owner = t;
608
609 raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);
610 binheap_add(&fq->nest.hp_binheap_node, &tsk_rt(t)->hp_blocked_tasks,
611 struct nested_info, hp_binheap_node);
612 raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);
613
614 ++(fq->count);
615
616 ikglp_add_global_list(sem, t, &fq->global_heap_node);
617 ikglp_add_donees(sem, fq, t, &fq->donee_heap_node);
618
619 sem->shortest_fifo_queue = ikglp_find_shortest(sem, sem->shortest_fifo_queue);
620
621#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
622 if(sem->aff_obs) {
623 sem->aff_obs->ops->notify_enqueue(sem->aff_obs, fq, t);
624 sem->aff_obs->ops->notify_acquired(sem->aff_obs, fq, t);
625 }
626#endif
627
628 unlock_fine_irqrestore(&sem->lock, flags);
629}
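/*
 * Fast path, roughly: the replica behind 'fq' is idle, so the request is
 * granted without suspension. The acquiring task is still inserted into
 * the global top-m/not-top-m lists and the donee heap so that later donors
 * can select it, its hp_blocked_tasks heap gains this queue's nest node,
 * and the cached shortest_fifo_queue hint is refreshed.
 */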
630
631
632
633
634
635static void __ikglp_enqueue_on_fq(struct ikglp_semaphore *sem,
636 struct fifo_queue* fq,
637 struct task_struct* t,
638 wait_queue_t *wait,
639 ikglp_heap_node_t *global_heap_node,
640 ikglp_donee_heap_node_t *donee_heap_node)
641{
642 /* resource is not free => must suspend and wait */
643 TRACE_TASK(t, "Enqueuing on fq %d.\n",
644 ikglp_get_idx(sem, fq));
645
646 init_waitqueue_entry(wait, t);
647
648 __add_wait_queue_tail_exclusive(&fq->wait, wait);
649
650 ++(fq->count);
651 ++(sem->nr_in_fifos);
652
653 // update global list.
654 if(likely(global_heap_node)) {
655 if(binheap_is_in_heap(&global_heap_node->node)) {
656 WARN_ON(1);
657 ikglp_del_global_list(sem, t, global_heap_node);
658 }
659 ikglp_add_global_list(sem, t, global_heap_node);
660 }
	661	// update donor eligibility list.
662 if(likely(donee_heap_node)) {
663// if(binheap_is_in_heap(&donee_heap_node->node)) {
664// WARN_ON(1);
665// }
666 ikglp_add_donees(sem, fq, t, donee_heap_node);
667 }
668
669 if(sem->shortest_fifo_queue == fq) {
670 sem->shortest_fifo_queue = ikglp_find_shortest(sem, fq);
671 }
672
673#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
674 if(sem->aff_obs) {
675 sem->aff_obs->ops->notify_enqueue(sem->aff_obs, fq, t);
676 }
677#endif
678
	679	TRACE_TASK(t, "shortest queue is now %d\n", ikglp_get_idx(sem, sem->shortest_fifo_queue));
680}
681
682
683static void ikglp_enqueue_on_fq(
684 struct ikglp_semaphore *sem,
685 struct fifo_queue *fq,
686 ikglp_wait_state_t *wait,
687 unsigned long flags)
688{
689 /* resource is not free => must suspend and wait */
690 TRACE_TASK(wait->task, "queue %d: Resource is not free => must suspend and wait.\n",
691 ikglp_get_idx(sem, fq));
692
693 INIT_BINHEAP_NODE(&wait->global_heap_node.node);
694 INIT_BINHEAP_NODE(&wait->donee_heap_node.node);
695
696 __ikglp_enqueue_on_fq(sem, fq, wait->task, &wait->fq_node,
697 &wait->global_heap_node, &wait->donee_heap_node);
698
699 ikglp_refresh_owners_prio_increase(wait->task, fq, sem, flags); // unlocks sem->lock
700}
701
702
703static void __ikglp_enqueue_on_pq(struct ikglp_semaphore *sem,
704 ikglp_wait_state_t *wait)
705{
706 TRACE_TASK(wait->task, "goes to PQ.\n");
707
708 wait->pq_node.task = wait->task; // copy over task (little redundant...)
709
710 binheap_add(&wait->pq_node.node, &sem->priority_queue,
711 ikglp_heap_node_t, node);
712}
713
714static void ikglp_enqueue_on_pq(struct ikglp_semaphore *sem,
715 ikglp_wait_state_t *wait)
716{
717 INIT_BINHEAP_NODE(&wait->global_heap_node.node);
718 INIT_BINHEAP_NODE(&wait->donee_heap_node.node);
719 INIT_BINHEAP_NODE(&wait->pq_node.node);
720
721 __ikglp_enqueue_on_pq(sem, wait);
722}
723
724static void ikglp_enqueue_on_donor(struct ikglp_semaphore *sem,
725 ikglp_wait_state_t* wait,
726 unsigned long flags)
727{
728 struct task_struct *t = wait->task;
729 ikglp_donee_heap_node_t *donee_node = NULL;
730 struct task_struct *donee;
731
732 struct task_struct *old_max_eff_prio;
733 struct task_struct *new_max_eff_prio;
734 struct task_struct *new_prio = NULL;
735
736 INIT_BINHEAP_NODE(&wait->global_heap_node.node);
737 INIT_BINHEAP_NODE(&wait->donee_heap_node.node);
738 INIT_BINHEAP_NODE(&wait->pq_node.node);
739 INIT_BINHEAP_NODE(&wait->node);
740
741// TRACE_CUR("Adding %s/%d as donor.\n", t->comm, t->pid);
742// TRACE_CUR("donors Before:\n");
743// print_donors(sem->donors.root, 1);
744
745 // Add donor to the global list.
746 ikglp_add_global_list(sem, t, &wait->global_heap_node);
747
748 // Select a donee
749#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
750 donee_node = (sem->aff_obs) ?
751 sem->aff_obs->ops->advise_donee_selection(sem->aff_obs, t) :
752 binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
753#else
754 donee_node = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
755#endif
756
757 donee = donee_node->task;
758
759 TRACE_TASK(t, "Donee selected: %s/%d\n", donee->comm, donee->pid);
760
	761	TRACE_CUR("Temporarily removing %s/%d from donee list.\n",
762 donee->comm, donee->pid);
763// TRACE_CUR("donees Before:\n");
764// print_donees(sem, sem->donees.root, 1);
765
766 //binheap_delete_root(&sem->donees, ikglp_donee_heap_node_t, node); // will re-add it shortly
767 binheap_delete(&donee_node->node, &sem->donees);
768
769// TRACE_CUR("donees After:\n");
770// print_donees(sem, sem->donees.root, 1);
771
772
773 wait->donee_info = donee_node;
774
775 // Add t to donor heap.
776 binheap_add(&wait->node, &sem->donors, ikglp_wait_state_t, node);
777
778 // Now adjust the donee's priority.
779
780 // Lock the donee's inheritance heap.
781 raw_spin_lock(&tsk_rt(donee)->hp_blocked_tasks_lock);
782
783 old_max_eff_prio = top_priority(&tsk_rt(donee)->hp_blocked_tasks);
784
785 if(donee_node->donor_info) {
786 // Steal donation relation. Evict old donor to PQ.
787
788 // Remove old donor from donor heap
789 ikglp_wait_state_t *old_wait = donee_node->donor_info;
790 struct task_struct *old_donor = old_wait->task;
791
792 TRACE_TASK(t, "Donee (%s/%d) had donor %s/%d. Moving old donor to PQ.\n",
793 donee->comm, donee->pid, old_donor->comm, old_donor->pid);
794
795 binheap_delete(&old_wait->node, &sem->donors);
796
797 // Remove donation from donee's inheritance heap.
798 binheap_delete(&old_wait->prio_donation.hp_binheap_node,
799 &tsk_rt(donee)->hp_blocked_tasks);
800 // WARNING: have not updated inh_prio!
801
802 // Add old donor to PQ.
803 __ikglp_enqueue_on_pq(sem, old_wait);
804
805 // Remove old donor from the global heap.
806 ikglp_del_global_list(sem, old_donor, &old_wait->global_heap_node);
807 }
808
809 // Add back donee's node to the donees heap with increased prio
810 donee_node->donor_info = wait;
811 INIT_BINHEAP_NODE(&donee_node->node);
812
813
814 TRACE_CUR("Adding %s/%d back to donee list.\n", donee->comm, donee->pid);
815// TRACE_CUR("donees Before:\n");
816// print_donees(sem, sem->donees.root, 1);
817
818 binheap_add(&donee_node->node, &sem->donees, ikglp_donee_heap_node_t, node);
819
820// TRACE_CUR("donees After:\n");
821// print_donees(sem, sem->donees.root, 1);
822
823 // Add an inheritance/donation to the donee's inheritance heap.
824 wait->prio_donation.lock = (struct litmus_lock*)sem;
825 wait->prio_donation.hp_waiter_eff_prio = t;
826 wait->prio_donation.hp_waiter_ptr = NULL;
827 INIT_BINHEAP_NODE(&wait->prio_donation.hp_binheap_node);
828
829 binheap_add(&wait->prio_donation.hp_binheap_node,
830 &tsk_rt(donee)->hp_blocked_tasks,
831 struct nested_info, hp_binheap_node);
832
833 new_max_eff_prio = top_priority(&tsk_rt(donee)->hp_blocked_tasks);
834
835 if(new_max_eff_prio != old_max_eff_prio) {
836 if ((effective_priority(donee) == old_max_eff_prio) ||
837 (litmus->__compare(new_max_eff_prio, BASE, donee, EFFECTIVE))){
838 TRACE_TASK(t, "Donation increases %s/%d's effective priority\n",
839 donee->comm, donee->pid);
840 new_prio = new_max_eff_prio;
841 }
842// else {
843// // should be bug. donor would not be in top-m.
844// TRACE_TASK(t, "Donation is not greater than base prio of %s/%d?\n", donee->comm, donee->pid);
845// WARN_ON(1);
846// }
847// }
848// else {
849// // should be bug. donor would not be in top-m.
850// TRACE_TASK(t, "No change in %s/%d's inheritance heap?\n", donee->comm, donee->pid);
851// WARN_ON(1);
852 }
853
854 if(new_prio) {
855 struct fifo_queue *donee_fq = donee_node->fq;
856
857 if(donee != donee_fq->owner) {
858 TRACE_TASK(t, "%s/%d is not the owner. Propagating priority to owner %s/%d.\n",
859 donee->comm, donee->pid,
860 donee_fq->owner->comm, donee_fq->owner->pid);
861
862 raw_spin_unlock(&tsk_rt(donee)->hp_blocked_tasks_lock);
863 ikglp_refresh_owners_prio_increase(donee, donee_fq, sem, flags); // unlocks sem->lock
864 }
865 else {
	866			TRACE_TASK(t, "%s/%d is the owner. Propagating priority immediately.\n",
867 donee->comm, donee->pid);
868 litmus->nested_increase_prio(donee, new_prio, &sem->lock, flags); // unlocks sem->lock and donee's heap lock
869 }
870 }
871 else {
	872		TRACE_TASK(t, "No change in effective priority (it is %s/%d). BUG?\n",
873 new_max_eff_prio->comm, new_max_eff_prio->pid);
874 raw_spin_unlock(&tsk_rt(donee)->hp_blocked_tasks_lock);
875 unlock_fine_irqrestore(&sem->lock, flags);
876 }
877
878
879// TRACE_CUR("donors After:\n");
880// print_donors(sem->donors.root, 1);
881}
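/*
 * Donor path, roughly: the new request joins the global list and the donor
 * heap, picks a donee (by default the minimum of sem->donees, or whatever
 * the affinity observer advises), displaces any donor that donee already
 * had to the PQ, and finally pushes the donated priority into the donee's
 * hp_blocked_tasks heap, propagating onward to the replica owner if the
 * donee is itself still waiting in a FIFO queue.
 */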
882
883int ikglp_lock(struct litmus_lock* l)
884{
885 struct task_struct* t = current;
886 struct ikglp_semaphore *sem = ikglp_from_lock(l);
887 unsigned long flags = 0, real_flags;
888 struct fifo_queue *fq = NULL;
889 int replica = -EINVAL;
890
891#ifdef CONFIG_LITMUS_DGL_SUPPORT
892 raw_spinlock_t *dgl_lock;
893#endif
894
895 ikglp_wait_state_t wait;
896
897 if (!is_realtime(t))
898 return -EPERM;
899
900#ifdef CONFIG_LITMUS_DGL_SUPPORT
901 dgl_lock = litmus->get_dgl_spinlock(t);
902#endif
903
904 raw_spin_lock_irqsave(&sem->real_lock, real_flags);
905
906 lock_global_irqsave(dgl_lock, flags);
907 lock_fine_irqsave(&sem->lock, flags);
908
909 if(sem->nr_in_fifos < sem->m) {
	910		// enqueue somewhere
911#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
912 fq = (sem->aff_obs) ?
913 sem->aff_obs->ops->advise_enqueue(sem->aff_obs, t) :
914 sem->shortest_fifo_queue;
915#else
916 fq = sem->shortest_fifo_queue;
917#endif
918 if(fq->count == 0) {
919 // take available resource
920 replica = ikglp_get_idx(sem, fq);
921
922 ikglp_get_immediate(t, fq, sem, flags); // unlocks sem->lock
923
924 unlock_global_irqrestore(dgl_lock, flags);
925 raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);
926 goto acquired;
927 }
928 else {
929 wait.task = t; // THIS IS CRITICALLY IMPORTANT!!!
930
931 tsk_rt(t)->blocked_lock = (struct litmus_lock*)sem; // record where we are blocked
932 mb();
933
934 /* FIXME: interruptible would be nice some day */
935 set_task_state(t, TASK_UNINTERRUPTIBLE);
936
937 ikglp_enqueue_on_fq(sem, fq, &wait, flags); // unlocks sem->lock
938 }
939 }
940 else {
941 // donor!
942 wait.task = t; // THIS IS CRITICALLY IMPORTANT!!!
943
944 tsk_rt(t)->blocked_lock = (struct litmus_lock*)sem; // record where we are blocked
945 mb();
946
947 /* FIXME: interruptible would be nice some day */
948 set_task_state(t, TASK_UNINTERRUPTIBLE);
949
950 if(litmus->__compare(ikglp_mth_highest(sem), BASE, t, BASE)) {
951 // enqueue on PQ
952 ikglp_enqueue_on_pq(sem, &wait);
953 unlock_fine_irqrestore(&sem->lock, flags);
954 }
955 else {
956 // enqueue as donor
957 ikglp_enqueue_on_donor(sem, &wait, flags); // unlocks sem->lock
958 }
959 }
960
961 unlock_global_irqrestore(dgl_lock, flags);
962 raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);
963
964 TS_LOCK_SUSPEND;
965
966 suspend_for_lock();
967
968 TS_LOCK_RESUME;
969
970 fq = ikglp_get_queue(sem, t);
971 BUG_ON(!fq);
972
973 replica = ikglp_get_idx(sem, fq);
974
975acquired:
976 TRACE_CUR("Acquired lock %d, queue %d\n",
977 l->ident, replica);
978
979#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
980 if(sem->aff_obs) {
981 return sem->aff_obs->ops->replica_to_resource(sem->aff_obs, fq);
982 }
983#endif
984
985 return replica;
986}
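/*
 * Summary of the admission logic above: a request goes to a FIFO queue as
 * long as fewer than m requests are already in FIFOs (taking the replica
 * immediately if that queue is idle); otherwise it becomes a donor if its
 * base priority exceeds the m-th highest known request, and parks in the
 * priority queue (PQ) if it does not. Illustrative sizing: with m = 4 and
 * k = 2 replicas, at most 4 requests occupy FIFO slots, nominally
 * ceil(4/2) = 2 per queue (see max_fifo_len in ikglp_new()).
 */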
987
988//int ikglp_lock(struct litmus_lock* l)
989//{
990// struct task_struct* t = current;
991// struct ikglp_semaphore *sem = ikglp_from_lock(l);
992// unsigned long flags = 0, real_flags;
993// struct fifo_queue *fq = NULL;
994// int replica = -EINVAL;
995//
996//#ifdef CONFIG_LITMUS_DGL_SUPPORT
997// raw_spinlock_t *dgl_lock;
998//#endif
999//
1000// ikglp_wait_state_t wait;
1001//
1002// if (!is_realtime(t))
1003// return -EPERM;
1004//
1005//#ifdef CONFIG_LITMUS_DGL_SUPPORT
1006// dgl_lock = litmus->get_dgl_spinlock(t);
1007//#endif
1008//
1009// raw_spin_lock_irqsave(&sem->real_lock, real_flags);
1010//
1011// lock_global_irqsave(dgl_lock, flags);
1012// lock_fine_irqsave(&sem->lock, flags);
1013//
1014//
1015//#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1016// fq = (sem->aff_obs) ?
1017// sem->aff_obs->ops->advise_enqueue(sem->aff_obs, t) :
1018// sem->shortest_fifo_queue;
1019//#else
1020// fq = sem->shortest_fifo_queue;
1021//#endif
1022//
1023// if(fq->count == 0) {
1024// // take available resource
1025// replica = ikglp_get_idx(sem, fq);
1026//
1027// ikglp_get_immediate(t, fq, sem, flags); // unlocks sem->lock
1028//
1029// unlock_global_irqrestore(dgl_lock, flags);
1030// raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);
1031// }
1032// else
1033// {
1034// // we have to suspend.
1035//
1036// wait.task = t; // THIS IS CRITICALLY IMPORTANT!!!
1037//
1038// tsk_rt(t)->blocked_lock = (struct litmus_lock*)sem; // record where we are blocked
1039// mb();
1040//
1041// /* FIXME: interruptible would be nice some day */
1042// set_task_state(t, TASK_UNINTERRUPTIBLE);
1043//
1044// if(fq->count < sem->max_fifo_len) {
1045// // enqueue on fq
1046// ikglp_enqueue_on_fq(sem, fq, &wait, flags); // unlocks sem->lock
1047// }
1048// else {
1049//
1050// TRACE_CUR("IKGLP fifo queues are full (at least they better be).\n");
1051//
1052// // no room in fifos. Go to PQ or donors.
1053//
1054// if(litmus->__compare(ikglp_mth_highest(sem), BASE, t, BASE)) {
1055// // enqueue on PQ
1056// ikglp_enqueue_on_pq(sem, &wait);
1057// unlock_fine_irqrestore(&sem->lock, flags);
1058// }
1059// else {
1060// // enqueue as donor
1061// ikglp_enqueue_on_donor(sem, &wait, flags); // unlocks sem->lock
1062// }
1063// }
1064//
1065// unlock_global_irqrestore(dgl_lock, flags);
1066// raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);
1067//
1068// TS_LOCK_SUSPEND;
1069//
1070// schedule();
1071//
1072// TS_LOCK_RESUME;
1073//
1074// fq = ikglp_get_queue(sem, t);
1075// BUG_ON(!fq);
1076//
1077// replica = ikglp_get_idx(sem, fq);
1078// }
1079//
1080// TRACE_CUR("Acquired lock %d, queue %d\n",
1081// l->ident, replica);
1082//
1083//#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1084// if(sem->aff_obs) {
1085// return sem->aff_obs->ops->replica_to_resource(sem->aff_obs, fq);
1086// }
1087//#endif
1088//
1089// return replica;
1090//}
1091
1092static void ikglp_move_donor_to_fq(struct ikglp_semaphore *sem,
1093 struct fifo_queue *fq,
1094 ikglp_wait_state_t *donor_info)
1095{
1096 struct task_struct *t = donor_info->task;
1097
1098 TRACE_CUR("Donor %s/%d being moved to fq %d\n",
1099 t->comm,
1100 t->pid,
1101 ikglp_get_idx(sem, fq));
1102
1103 binheap_delete(&donor_info->node, &sem->donors);
1104
1105 __ikglp_enqueue_on_fq(sem, fq, t,
1106 &donor_info->fq_node,
1107 NULL, // already in global_list, so pass null to prevent adding 2nd time.
1108 &donor_info->donee_heap_node);
1109
1110 // warning:
1111 // ikglp_update_owners_prio(t, fq, sem, flags) has not been called.
1112}
1113
1114static void ikglp_move_pq_to_fq(struct ikglp_semaphore *sem,
1115 struct fifo_queue *fq,
1116 ikglp_wait_state_t *wait)
1117{
1118 struct task_struct *t = wait->task;
1119
1120 TRACE_CUR("PQ request %s/%d being moved to fq %d\n",
1121 t->comm,
1122 t->pid,
1123 ikglp_get_idx(sem, fq));
1124
1125 binheap_delete(&wait->pq_node.node, &sem->priority_queue);
1126
1127 __ikglp_enqueue_on_fq(sem, fq, t,
1128 &wait->fq_node,
1129 &wait->global_heap_node,
1130 &wait->donee_heap_node);
1131 // warning:
1132 // ikglp_update_owners_prio(t, fq, sem, flags) has not been called.
1133}
1134
1135static ikglp_wait_state_t* ikglp_find_hp_waiter_to_steal(
1136 struct ikglp_semaphore* sem)
1137{
1138 /* must hold sem->lock */
1139
1140 struct fifo_queue *fq = NULL;
1141 struct list_head *pos;
1142 struct task_struct *queued;
1143 int i;
1144
1145 for(i = 0; i < sem->nr_replicas; ++i) {
1146 if( (sem->fifo_queues[i].count > 1) &&
1147 (!fq || litmus->compare(sem->fifo_queues[i].hp_waiter, fq->hp_waiter)) ) {
1148
1149 TRACE_CUR("hp_waiter on fq %d (%s/%d) has higher prio than hp_waiter on fq %d (%s/%d)\n",
1150 ikglp_get_idx(sem, &sem->fifo_queues[i]),
1151 sem->fifo_queues[i].hp_waiter->comm,
1152 sem->fifo_queues[i].hp_waiter->pid,
1153 (fq) ? ikglp_get_idx(sem, fq) : -1,
1154 (fq) ? ((fq->hp_waiter) ? fq->hp_waiter->comm : "nil") : "nilXX",
1155 (fq) ? ((fq->hp_waiter) ? fq->hp_waiter->pid : -1) : -2);
1156
1157 fq = &sem->fifo_queues[i];
1158
1159 WARN_ON(!(fq->hp_waiter));
1160 }
1161 }
1162
1163 if(fq) {
1164 struct task_struct *max_hp = fq->hp_waiter;
1165 ikglp_wait_state_t* ret = NULL;
1166
1167 TRACE_CUR("Searching for %s/%d on fq %d\n",
1168 max_hp->comm,
1169 max_hp->pid,
1170 ikglp_get_idx(sem, fq));
1171
1172 BUG_ON(!max_hp);
1173
1174 list_for_each(pos, &fq->wait.task_list) {
1175 wait_queue_t *wait = list_entry(pos, wait_queue_t, task_list);
1176
1177 queued = (struct task_struct*) wait->private;
1178
1179 TRACE_CUR("fq %d entry: %s/%d\n",
1180 ikglp_get_idx(sem, fq),
1181 queued->comm,
1182 queued->pid);
1183
1184 /* Compare task prios, find high prio task. */
1185 if (queued == max_hp) {
1186 TRACE_CUR("Found it!\n");
1187 ret = container_of(wait, ikglp_wait_state_t, fq_node);
1188 }
1189 }
1190
1191 WARN_ON(!ret);
1192 return ret;
1193 }
1194
1195 return(NULL);
1196}
1197
1198static void ikglp_steal_to_fq(struct ikglp_semaphore *sem,
1199 struct fifo_queue *fq,
1200 ikglp_wait_state_t *fq_wait)
1201{
1202 struct task_struct *t = fq_wait->task;
1203 struct fifo_queue *fq_steal = fq_wait->donee_heap_node.fq;
1204
1205 TRACE_CUR("FQ request %s/%d being moved to fq %d\n",
1206 t->comm,
1207 t->pid,
1208 ikglp_get_idx(sem, fq));
1209
1210 fq_wait->donee_heap_node.fq = fq; // just to be safe
1211
1212
1213 __remove_wait_queue(&fq_steal->wait, &fq_wait->fq_node);
1214 --(fq_steal->count);
1215
1216#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1217 if(sem->aff_obs) {
1218 sem->aff_obs->ops->notify_dequeue(sem->aff_obs, fq_steal, t);
1219 }
1220#endif
1221
1222 if(t == fq_steal->hp_waiter) {
1223 fq_steal->hp_waiter = ikglp_find_hp_waiter(fq_steal, NULL);
1224 TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n",
1225 ikglp_get_idx(sem, fq_steal),
1226 (fq_steal->hp_waiter) ? fq_steal->hp_waiter->comm : "nil",
1227 (fq_steal->hp_waiter) ? fq_steal->hp_waiter->pid : -1);
1228 }
1229
1230
1231 // Update shortest.
1232 if(fq_steal->count < sem->shortest_fifo_queue->count) {
1233 sem->shortest_fifo_queue = fq_steal;
1234 }
1235
1236 __ikglp_enqueue_on_fq(sem, fq, t,
1237 &fq_wait->fq_node,
1238 NULL,
1239 NULL);
1240
1241 // warning: We have not checked the priority inheritance of fq's owner yet.
1242}
1243
1244
1245static void ikglp_migrate_fq_to_owner_heap_nodes(struct ikglp_semaphore *sem,
1246 struct fifo_queue *fq,
1247 ikglp_wait_state_t *old_wait)
1248{
1249 struct task_struct *t = old_wait->task;
1250
1251 BUG_ON(old_wait->donee_heap_node.fq != fq);
1252
1253 TRACE_TASK(t, "Migrating wait_state to memory of queue %d.\n",
1254 ikglp_get_idx(sem, fq));
1255
1256 // need to migrate global_heap_node and donee_heap_node off of the stack
1257 // to the nodes allocated for the owner of this fq.
1258
1259 // TODO: Enhance binheap() to perform this operation in place.
1260
1261 ikglp_del_global_list(sem, t, &old_wait->global_heap_node); // remove
1262 fq->global_heap_node = old_wait->global_heap_node; // copy
1263 ikglp_add_global_list(sem, t, &fq->global_heap_node); // re-add
1264
1265 binheap_delete(&old_wait->donee_heap_node.node, &sem->donees); // remove
1266 fq->donee_heap_node = old_wait->donee_heap_node; // copy
1267
1268 if(fq->donee_heap_node.donor_info) {
1269 // let donor know that our location has changed
1270 BUG_ON(fq->donee_heap_node.donor_info->donee_info->task != t); // validate cross-link
1271 fq->donee_heap_node.donor_info->donee_info = &fq->donee_heap_node;
1272 }
1273 INIT_BINHEAP_NODE(&fq->donee_heap_node.node);
1274 binheap_add(&fq->donee_heap_node.node, &sem->donees,
1275 ikglp_donee_heap_node_t, node); // re-add
1276}
1277
1278int ikglp_unlock(struct litmus_lock* l)
1279{
1280 struct ikglp_semaphore *sem = ikglp_from_lock(l);
1281 struct task_struct *t = current;
1282 struct task_struct *donee = NULL;
1283 struct task_struct *next = NULL;
1284 struct task_struct *new_on_fq = NULL;
1285 struct fifo_queue *fq_of_new_on_fq = NULL;
1286
1287 ikglp_wait_state_t *other_donor_info = NULL;
1288 struct fifo_queue *to_steal = NULL;
1289 int need_steal_prio_reeval = 0;
1290 struct fifo_queue *fq;
1291
1292#ifdef CONFIG_LITMUS_DGL_SUPPORT
1293 raw_spinlock_t *dgl_lock;
1294#endif
1295
1296 unsigned long flags = 0, real_flags;
1297
1298 int err = 0;
1299
1300 fq = ikglp_get_queue(sem, t); // returns NULL if 't' is not owner.
1301
1302 if (!fq) {
1303 err = -EINVAL;
1304 goto out;
1305 }
1306
1307#ifdef CONFIG_LITMUS_DGL_SUPPORT
1308 dgl_lock = litmus->get_dgl_spinlock(t);
1309#endif
1310 raw_spin_lock_irqsave(&sem->real_lock, real_flags);
1311
1312 lock_global_irqsave(dgl_lock, flags); // TODO: Push this deeper
1313 lock_fine_irqsave(&sem->lock, flags);
1314
1315 TRACE_TASK(t, "Freeing replica %d.\n", ikglp_get_idx(sem, fq));
1316
1317
1318 // Remove 't' from the heaps, but data in nodes will still be good.
1319 ikglp_del_global_list(sem, t, &fq->global_heap_node);
1320 binheap_delete(&fq->donee_heap_node.node, &sem->donees);
1321
1322 fq->owner = NULL; // no longer owned!!
1323 --(fq->count);
1324 if(fq->count < sem->shortest_fifo_queue->count) {
1325 sem->shortest_fifo_queue = fq;
1326 }
1327 --(sem->nr_in_fifos);
1328
1329#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1330 if(sem->aff_obs) {
1331 sem->aff_obs->ops->notify_dequeue(sem->aff_obs, fq, t);
1332 sem->aff_obs->ops->notify_freed(sem->aff_obs, fq, t);
1333 }
1334#endif
1335
1336 // Move the next request into the FQ and update heaps as needed.
1337 // We defer re-evaluation of priorities to later in the function.
1338 if(fq->donee_heap_node.donor_info) { // move my donor to FQ
1339 ikglp_wait_state_t *donor_info = fq->donee_heap_node.donor_info;
1340
1341 new_on_fq = donor_info->task;
1342
1343 // donor moved to FQ
1344 donee = t;
1345
1346#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1347 if(sem->aff_obs && sem->aff_obs->relax_max_fifo_len) {
1348 fq_of_new_on_fq = sem->aff_obs->ops->advise_enqueue(sem->aff_obs, new_on_fq);
1349 if(fq_of_new_on_fq->count == 0) {
1350 // ignore it?
1351// fq_of_new_on_fq = fq;
1352 }
1353 }
1354 else {
1355 fq_of_new_on_fq = fq;
1356 }
1357#else
1358 fq_of_new_on_fq = fq;
1359#endif
1360
1361 TRACE_TASK(t, "Moving MY donor (%s/%d) to fq %d (non-aff wanted fq %d).\n",
1362 new_on_fq->comm, new_on_fq->pid,
1363 ikglp_get_idx(sem, fq_of_new_on_fq),
1364 ikglp_get_idx(sem, fq));
1365
1366
1367 ikglp_move_donor_to_fq(sem, fq_of_new_on_fq, donor_info);
1368 }
	1369	else if(!binheap_empty(&sem->donors)) { // this owner had no donor of its own, so move another donor to the FQ
1370 // move other donor to FQ
1371 // Select a donor
1372#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1373 other_donor_info = (sem->aff_obs) ?
1374 sem->aff_obs->ops->advise_donor_to_fq(sem->aff_obs, fq) :
1375 binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);
1376#else
1377 other_donor_info = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);
1378#endif
1379
1380 new_on_fq = other_donor_info->task;
1381 donee = other_donor_info->donee_info->task;
1382
1383 // update the donee's heap position.
1384 other_donor_info->donee_info->donor_info = NULL; // clear the cross-link
1385 binheap_decrease(&other_donor_info->donee_info->node, &sem->donees);
1386
1387#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1388 if(sem->aff_obs && sem->aff_obs->relax_max_fifo_len) {
1389 fq_of_new_on_fq = sem->aff_obs->ops->advise_enqueue(sem->aff_obs, new_on_fq);
1390 if(fq_of_new_on_fq->count == 0) {
1391 // ignore it?
1392// fq_of_new_on_fq = fq;
1393 }
1394 }
1395 else {
1396 fq_of_new_on_fq = fq;
1397 }
1398#else
1399 fq_of_new_on_fq = fq;
1400#endif
1401
1402 TRACE_TASK(t, "Moving a donor (%s/%d) to fq %d (non-aff wanted fq %d).\n",
1403 new_on_fq->comm, new_on_fq->pid,
1404 ikglp_get_idx(sem, fq_of_new_on_fq),
1405 ikglp_get_idx(sem, fq));
1406
1407 ikglp_move_donor_to_fq(sem, fq_of_new_on_fq, other_donor_info);
1408 }
1409 else if(!binheap_empty(&sem->priority_queue)) { // No donors, so move PQ
1410 ikglp_heap_node_t *pq_node = binheap_top_entry(&sem->priority_queue,
1411 ikglp_heap_node_t, node);
1412 ikglp_wait_state_t *pq_wait = container_of(pq_node, ikglp_wait_state_t,
1413 pq_node);
1414
1415 new_on_fq = pq_wait->task;
1416
1417#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1418 if(sem->aff_obs && sem->aff_obs->relax_max_fifo_len) {
1419 fq_of_new_on_fq = sem->aff_obs->ops->advise_enqueue(sem->aff_obs, new_on_fq);
1420 if(fq_of_new_on_fq->count == 0) {
1421 // ignore it?
1422// fq_of_new_on_fq = fq;
1423 }
1424 }
1425 else {
1426 fq_of_new_on_fq = fq;
1427 }
1428#else
1429 fq_of_new_on_fq = fq;
1430#endif
1431
1432 TRACE_TASK(t, "Moving a pq waiter (%s/%d) to fq %d (non-aff wanted fq %d).\n",
1433 new_on_fq->comm, new_on_fq->pid,
1434 ikglp_get_idx(sem, fq_of_new_on_fq),
1435 ikglp_get_idx(sem, fq));
1436
1437 ikglp_move_pq_to_fq(sem, fq_of_new_on_fq, pq_wait);
1438 }
	1439	else if(fq->count == 0) { // No donors or PQ waiters, and this queue is empty, so steal.
1440 ikglp_wait_state_t *fq_wait;
1441
1442 TRACE_TASK(t, "Looking to steal a request for fq %d...\n",
1443 ikglp_get_idx(sem, fq));
1444
1445#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1446 fq_wait = (sem->aff_obs) ?
1447 sem->aff_obs->ops->advise_steal(sem->aff_obs, fq) :
1448 ikglp_find_hp_waiter_to_steal(sem);
1449#else
1450 fq_wait = ikglp_find_hp_waiter_to_steal(sem);
1451#endif
1452
1453 if(fq_wait) {
1454 to_steal = fq_wait->donee_heap_node.fq;
1455
1456 new_on_fq = fq_wait->task;
1457 fq_of_new_on_fq = fq;
1458 need_steal_prio_reeval = (new_on_fq == to_steal->hp_waiter);
1459
1460 TRACE_TASK(t, "Found %s/%d of fq %d to steal for fq %d...\n",
1461 new_on_fq->comm, new_on_fq->pid,
1462 ikglp_get_idx(sem, to_steal),
1463 ikglp_get_idx(sem, fq));
1464
1465 ikglp_steal_to_fq(sem, fq, fq_wait);
1466 }
1467 else {
1468 TRACE_TASK(t, "Found nothing to steal for fq %d.\n",
1469 ikglp_get_idx(sem, fq));
1470 }
1471 }
1472 else { // move no one
1473 }
1474
1475 // 't' must drop all priority and clean up data structures before hand-off.
1476
1477 // DROP ALL INHERITANCE. IKGLP MUST BE OUTER-MOST
1478 raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);
1479 {
1480 int count = 0;
1481 while(!binheap_empty(&tsk_rt(t)->hp_blocked_tasks)) {
1482 binheap_delete_root(&tsk_rt(t)->hp_blocked_tasks,
1483 struct nested_info, hp_binheap_node);
1484 ++count;
1485 }
1486 litmus->decrease_prio(t, NULL);
	1487		WARN_ON(count > 2); // should not be greater than 2: only local fq inheritance and a donation are possible.
1488 }
1489 raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);
1490
1491
1492
1493 // Now patch up other priorities.
1494 //
1495 // At most one of the following:
1496 // if(donee && donee != t), decrease prio, propagate to owner, or onward
1497 // if(to_steal), update owner's prio (hp_waiter has already been set)
1498 //
1499
1500 BUG_ON((other_donor_info != NULL) && (to_steal != NULL));
1501
1502 if(other_donor_info) {
1503 struct fifo_queue *other_fq = other_donor_info->donee_info->fq;
1504
1505 BUG_ON(!donee);
1506 BUG_ON(donee == t);
1507
1508 TRACE_TASK(t, "Terminating donation relation of donor %s/%d to donee %s/%d!\n",
1509 other_donor_info->task->comm, other_donor_info->task->pid,
1510 donee->comm, donee->pid);
1511
1512 // need to terminate donation relation.
1513 if(donee == other_fq->owner) {
1514 TRACE_TASK(t, "Donee %s/%d is an owner of fq %d.\n",
1515 donee->comm, donee->pid,
1516 ikglp_get_idx(sem, other_fq));
1517
1518 ikglp_remove_donation_from_owner(&other_donor_info->prio_donation.hp_binheap_node, other_fq, sem, flags);
1519 lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!!
1520 }
1521 else {
	1522			TRACE_TASK(t, "Donee %s/%d is blocked in fq %d.\n",
1523 donee->comm, donee->pid,
1524 ikglp_get_idx(sem, other_fq));
1525
1526 ikglp_remove_donation_from_fq_waiter(donee, &other_donor_info->prio_donation.hp_binheap_node);
1527 if(donee == other_fq->hp_waiter) {
1528 TRACE_TASK(t, "Donee %s/%d was an hp_waiter of fq %d. Rechecking hp_waiter.\n",
1529 donee->comm, donee->pid,
1530 ikglp_get_idx(sem, other_fq));
1531
1532 other_fq->hp_waiter = ikglp_find_hp_waiter(other_fq, NULL);
1533 TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n",
1534 ikglp_get_idx(sem, other_fq),
1535 (other_fq->hp_waiter) ? other_fq->hp_waiter->comm : "nil",
1536 (other_fq->hp_waiter) ? other_fq->hp_waiter->pid : -1);
1537
1538 ikglp_refresh_owners_prio_decrease(other_fq, sem, flags); // unlocks sem->lock. reacquire it.
1539 lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!!
1540 }
1541 }
1542 }
1543 else if(to_steal) {
1544 TRACE_TASK(t, "Rechecking priority inheritance of fq %d, triggered by stealing.\n",
1545 ikglp_get_idx(sem, to_steal));
1546
1547 if(need_steal_prio_reeval) {
1548 ikglp_refresh_owners_prio_decrease(to_steal, sem, flags); // unlocks sem->lock. reacquire it.
1549 lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!!
1550 }
1551 }
1552
1553 // check for new HP waiter.
1554 if(new_on_fq) {
1555 if(fq == fq_of_new_on_fq) {
1556 // fq->owner is null, so just update the hp_waiter without locking.
1557 if(new_on_fq == fq->hp_waiter) {
	1558				TRACE_TASK(t, "new_on_fq (%s/%d) is already hp_waiter.\n",
1559 fq->hp_waiter->comm, fq->hp_waiter->pid);
1560 fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter); // set this just to be sure...
1561 }
1562 else if(litmus->compare(new_on_fq, fq->hp_waiter)) {
1563 if(fq->hp_waiter)
1564 TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n",
1565 fq->hp_waiter->comm, fq->hp_waiter->pid);
1566 else
1567 TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n");
1568
1569 fq->hp_waiter = new_on_fq;
1570 fq->nest.hp_waiter_eff_prio = effective_priority(fq->hp_waiter);
1571
1572 TRACE_TASK(t, "New hp_waiter for fq %d is %s/%d!\n",
1573 ikglp_get_idx(sem, fq),
1574 (fq->hp_waiter) ? fq->hp_waiter->comm : "nil",
1575 (fq->hp_waiter) ? fq->hp_waiter->pid : -1);
1576 }
1577 }
1578 else {
1579 ikglp_refresh_owners_prio_increase(new_on_fq, fq_of_new_on_fq, sem, flags); // unlocks sem->lock. reacquire it.
1580 lock_fine_irqsave(&sem->lock, flags); // there should be no contention!!!!
1581 }
1582 }
1583
1584wake_kludge:
1585 if(waitqueue_active(&fq->wait))
1586 {
1587 wait_queue_t *wait = list_entry(fq->wait.task_list.next, wait_queue_t, task_list);
1588 ikglp_wait_state_t *fq_wait = container_of(wait, ikglp_wait_state_t, fq_node);
1589 next = (struct task_struct*) wait->private;
1590
1591 __remove_wait_queue(&fq->wait, wait);
1592
1593 TRACE_CUR("queue %d: ASSIGNING %s/%d as owner - next\n",
1594 ikglp_get_idx(sem, fq),
1595 next->comm, next->pid);
1596
1597 // migrate wait-state to fifo-memory.
1598 ikglp_migrate_fq_to_owner_heap_nodes(sem, fq, fq_wait);
1599
	1600		/* next becomes the resource holder */
1601 fq->owner = next;
1602 tsk_rt(next)->blocked_lock = NULL;
1603
1604#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1605 if(sem->aff_obs) {
1606 sem->aff_obs->ops->notify_acquired(sem->aff_obs, fq, next);
1607 }
1608#endif
1609
1610 /* determine new hp_waiter if necessary */
1611 if (next == fq->hp_waiter) {
1612
1613 TRACE_TASK(next, "was highest-prio waiter\n");
1614 /* next has the highest priority --- it doesn't need to
1615 * inherit. However, we need to make sure that the
1616 * next-highest priority in the queue is reflected in
1617 * hp_waiter. */
1618 fq->hp_waiter = ikglp_find_hp_waiter(fq, NULL);
1619 TRACE_TASK(next, "New hp_waiter for fq %d is %s/%d!\n",
1620 ikglp_get_idx(sem, fq),
1621 (fq->hp_waiter) ? fq->hp_waiter->comm : "nil",
1622 (fq->hp_waiter) ? fq->hp_waiter->pid : -1);
1623
1624 fq->nest.hp_waiter_eff_prio = (fq->hp_waiter) ?
1625 effective_priority(fq->hp_waiter) : NULL;
1626
1627 if (fq->hp_waiter)
1628 TRACE_TASK(fq->hp_waiter, "is new highest-prio waiter\n");
1629 else
1630 TRACE("no further waiters\n");
1631
1632 raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock);
1633
1634// TRACE_TASK(next, "Heap Before:\n");
1635// print_hp_waiters(tsk_rt(next)->hp_blocked_tasks.root, 0);
1636
1637 binheap_add(&fq->nest.hp_binheap_node,
1638 &tsk_rt(next)->hp_blocked_tasks,
1639 struct nested_info,
1640 hp_binheap_node);
1641
1642// TRACE_TASK(next, "Heap After:\n");
1643// print_hp_waiters(tsk_rt(next)->hp_blocked_tasks.root, 0);
1644
1645 raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock);
1646 }
1647 else {
1648 /* Well, if 'next' is not the highest-priority waiter,
1649 * then it (probably) ought to inherit the highest-priority
1650 * waiter's priority. */
1651 TRACE_TASK(next, "is not hp_waiter of replica %d. hp_waiter is %s/%d\n",
1652 ikglp_get_idx(sem, fq),
1653 (fq->hp_waiter) ? fq->hp_waiter->comm : "nil",
1654 (fq->hp_waiter) ? fq->hp_waiter->pid : -1);
1655
1656 raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock);
1657
1658 binheap_add(&fq->nest.hp_binheap_node,
1659 &tsk_rt(next)->hp_blocked_tasks,
1660 struct nested_info,
1661 hp_binheap_node);
1662
1663 /* It is possible that 'next' *should* be the hp_waiter, but isn't
1664 * because that update hasn't yet executed (update operation is
1665 * probably blocked on mutex->lock). So only inherit if the top of
1666 * 'next's top heap node is indeed the effective prio. of hp_waiter.
1667 * (We use fq->hp_waiter_eff_prio instead of effective_priority(hp_waiter)
1668 * since the effective priority of hp_waiter can change (and the
1669 * update has not made it to this lock).)
1670 */
1671 if(likely(top_priority(&tsk_rt(next)->hp_blocked_tasks) ==
1672 fq->nest.hp_waiter_eff_prio))
1673 {
1674 if(fq->nest.hp_waiter_eff_prio)
1675 litmus->increase_prio(next, fq->nest.hp_waiter_eff_prio);
1676 else
1677 WARN_ON(1);
1678 }
1679
1680 raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock);
1681 }
1682
1683
1684 // wake up the new resource holder!
1685 wake_up_process(next);
1686 }
1687 if(fq_of_new_on_fq && fq_of_new_on_fq != fq && fq_of_new_on_fq->count == 1) {
	1688		// The task we promoted went to an empty FQ. (Why didn't stealing pick this up?)
1689 // Wake up the new guy too.
1690
1691 BUG_ON(fq_of_new_on_fq->owner != NULL);
1692
1693 fq = fq_of_new_on_fq;
1694 fq_of_new_on_fq = NULL;
1695 goto wake_kludge;
1696 }
1697
1698 unlock_fine_irqrestore(&sem->lock, flags);
1699 unlock_global_irqrestore(dgl_lock, flags);
1700
1701 raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);
1702
1703out:
1704 return err;
1705}
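/*
 * Replacement order on release, as implemented above: (1) this owner's own
 * donor, if any, moves into a FIFO queue; else (2) the highest-priority
 * donor overall; else (3) the head of the PQ; else (4) if this queue is now
 * empty, steal the highest-priority waiter from some queue holding more
 * than one request. The releasing task then drops all inheritance it
 * accumulated through this lock before the next waiter (if any) is woken
 * as the new owner.
 */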
1706
1707
1708
1709int ikglp_close(struct litmus_lock* l)
1710{
1711 struct task_struct *t = current;
1712 struct ikglp_semaphore *sem = ikglp_from_lock(l);
1713 unsigned long flags;
1714
1715 int owner = 0;
1716 int i;
1717
1718 raw_spin_lock_irqsave(&sem->real_lock, flags);
1719
1720 for(i = 0; i < sem->nr_replicas; ++i) {
1721 if(sem->fifo_queues[i].owner == t) {
1722 owner = 1;
1723 break;
1724 }
1725 }
1726
1727 raw_spin_unlock_irqrestore(&sem->real_lock, flags);
1728
1729 if (owner)
1730 ikglp_unlock(l);
1731
1732 return 0;
1733}
1734
1735void ikglp_free(struct litmus_lock* l)
1736{
1737 struct ikglp_semaphore *sem = ikglp_from_lock(l);
1738
1739 kfree(sem->fifo_queues);
1740 kfree(sem);
1741}
1742
1743
1744
1745struct litmus_lock* ikglp_new(int m,
1746 struct litmus_lock_ops* ops,
1747 void* __user arg)
1748{
1749 struct ikglp_semaphore* sem;
1750 int nr_replicas = 0;
1751 int i;
1752
1753 if(!access_ok(VERIFY_READ, arg, sizeof(nr_replicas)))
1754 {
1755 return(NULL);
1756 }
1757 if(__copy_from_user(&nr_replicas, arg, sizeof(nr_replicas)))
1758 {
1759 return(NULL);
1760 }
1761 if(nr_replicas < 1)
1762 {
1763 return(NULL);
1764 }
1765
1766 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1767 if(!sem)
1768 {
1769 return NULL;
1770 }
1771
1772 sem->fifo_queues = kmalloc(sizeof(struct fifo_queue)*nr_replicas, GFP_KERNEL);
1773 if(!sem->fifo_queues)
1774 {
1775 kfree(sem);
1776 return NULL;
1777 }
1778
1779 sem->litmus_lock.ops = ops;
1780
1781#ifdef CONFIG_DEBUG_SPINLOCK
1782 {
1783 __raw_spin_lock_init(&sem->lock, ((struct litmus_lock*)sem)->cheat_lockdep, &((struct litmus_lock*)sem)->key);
1784 }
1785#else
1786 raw_spin_lock_init(&sem->lock);
1787#endif
1788
1789 raw_spin_lock_init(&sem->real_lock);
1790
1791 sem->nr_replicas = nr_replicas;
1792 sem->m = m;
1793 sem->max_fifo_len = (sem->m/nr_replicas) + ((sem->m%nr_replicas) != 0);
1794 sem->nr_in_fifos = 0;
1795
1796 TRACE("New IKGLP Sem: m = %d, k = %d, max fifo_len = %d\n",
1797 sem->m,
1798 sem->nr_replicas,
1799 sem->max_fifo_len);
1800
1801 for(i = 0; i < nr_replicas; ++i)
1802 {
1803 struct fifo_queue* q = &(sem->fifo_queues[i]);
1804
1805 q->owner = NULL;
1806 q->hp_waiter = NULL;
1807 init_waitqueue_head(&q->wait);
1808 q->count = 0;
1809
1810 q->global_heap_node.task = NULL;
1811 INIT_BINHEAP_NODE(&q->global_heap_node.node);
1812
1813 q->donee_heap_node.task = NULL;
1814 q->donee_heap_node.donor_info = NULL;
1815 q->donee_heap_node.fq = NULL;
1816 INIT_BINHEAP_NODE(&q->donee_heap_node.node);
1817
1818 q->nest.lock = (struct litmus_lock*)sem;
1819 q->nest.hp_waiter_eff_prio = NULL;
1820 q->nest.hp_waiter_ptr = &q->hp_waiter;
1821 INIT_BINHEAP_NODE(&q->nest.hp_binheap_node);
1822 }
1823
1824 sem->shortest_fifo_queue = &sem->fifo_queues[0];
1825
1826 sem->top_m_size = 0;
1827
1828 // init heaps
1829 INIT_BINHEAP_HANDLE(&sem->top_m, ikglp_min_heap_base_priority_order);
1830 INIT_BINHEAP_HANDLE(&sem->not_top_m, ikglp_max_heap_base_priority_order);
1831 INIT_BINHEAP_HANDLE(&sem->donees, ikglp_min_heap_donee_order);
1832 INIT_BINHEAP_HANDLE(&sem->priority_queue, ikglp_max_heap_base_priority_order);
1833 INIT_BINHEAP_HANDLE(&sem->donors, ikglp_donor_max_heap_base_priority_order);
1834
1835#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1836 sem->aff_obs = NULL;
1837#endif
1838
1839 return &sem->litmus_lock;
1840}
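/*
 * Illustrative sizing: max_fifo_len above is the ceiling of m/k computed
 * with integer arithmetic, i.e. (m / k) + (m % k != 0). For m = 5 and
 * k = 2 replicas this gives 2 + 1 = 3; for m = 4 and k = 2 it gives
 * 2 + 0 = 2.
 */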
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
1871
1872static inline int __replica_to_gpu(struct ikglp_affinity* aff, int replica)
1873{
1874 int gpu = replica % aff->nr_rsrc;
1875 return gpu;
1876}
1877
1878static inline int replica_to_gpu(struct ikglp_affinity* aff, int replica)
1879{
1880 int gpu = __replica_to_gpu(aff, replica) + aff->offset;
1881 return gpu;
1882}
1883
1884static inline int gpu_to_base_replica(struct ikglp_affinity* aff, int gpu)
1885{
1886 int replica = gpu - aff->offset;
1887 return replica;
1888}
1889
1890static inline int same_gpu(struct ikglp_affinity* aff, int replica_a, int replica_b)
1891{
1892 return(replica_to_gpu(aff, replica_a) == replica_to_gpu(aff, replica_b));
1893}
1894
1895static inline int has_affinity(struct ikglp_affinity* aff, struct task_struct* t, int replica)
1896{
1897 if(tsk_rt(t)->last_gpu >= 0)
1898 {
1899 return (tsk_rt(t)->last_gpu == replica_to_gpu(aff, replica));
1900 }
1901 return 0;
1902}
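/*
 * Replica/GPU mapping example (illustrative values): with nr_replicas = 6
 * and nr_simult = 2, nr_rsrc = 3 GPUs back the 6 replicas, so replica r
 * maps to GPU (r % 3) + offset; e.g. replicas 1 and 4 both map to GPU
 * offset + 1. has_affinity() then reports whether that GPU matches the
 * task's last_gpu.
 */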
1903
1904int ikglp_aff_obs_close(struct affinity_observer* obs)
1905{
1906 return 0;
1907}
1908
1909void ikglp_aff_obs_free(struct affinity_observer* obs)
1910{
1911 struct ikglp_affinity *ikglp_aff = ikglp_aff_obs_from_aff_obs(obs);
1912
1913 // make sure the thread destroying this semaphore will not
1914 // call the exit callback on a destroyed lock.
1915 struct task_struct *t = current;
1916 if (is_realtime(t) && tsk_rt(t)->rsrc_exit_cb_args == ikglp_aff)
1917 {
1918 tsk_rt(t)->rsrc_exit_cb = NULL;
1919 tsk_rt(t)->rsrc_exit_cb_args = NULL;
1920 }
1921
1922 kfree(ikglp_aff->nr_cur_users_on_rsrc);
1923 kfree(ikglp_aff->nr_aff_on_rsrc);
1924 kfree(ikglp_aff->q_info);
1925 kfree(ikglp_aff);
1926}
1927
1928static struct affinity_observer* ikglp_aff_obs_new(struct affinity_observer_ops* ops,
1929 struct ikglp_affinity_ops* ikglp_ops,
1930 void* __user args)
1931{
1932 struct ikglp_affinity* ikglp_aff;
1933 struct gpu_affinity_observer_args aff_args;
1934 struct ikglp_semaphore* sem;
1935 int i;
1936 unsigned long flags;
1937
1938 if(!access_ok(VERIFY_READ, args, sizeof(aff_args))) {
1939 return(NULL);
1940 }
1941 if(__copy_from_user(&aff_args, args, sizeof(aff_args))) {
1942 return(NULL);
1943 }
1944
1945 sem = (struct ikglp_semaphore*) get_lock_from_od(aff_args.obs.lock_od);
1946
1947 if(sem->litmus_lock.type != IKGLP_SEM) {
1948 TRACE_CUR("Lock type not supported. Type = %d\n", sem->litmus_lock.type);
1949 return(NULL);
1950 }
1951
1952 if((aff_args.nr_simult_users <= 0) ||
1953 (sem->nr_replicas%aff_args.nr_simult_users != 0)) {
1954 TRACE_CUR("Lock %d does not support #replicas (%d) for #simult_users "
1955 "(%d) per replica. #replicas should be evenly divisible "
1956 "by #simult_users.\n",
1957 sem->litmus_lock.ident,
1958 sem->nr_replicas,
1959 aff_args.nr_simult_users);
1960 return(NULL);
1961 }
1962
1963// if(aff_args.nr_simult_users > NV_MAX_SIMULT_USERS) {
1964// TRACE_CUR("System does not support #simult_users > %d. %d requested.\n",
1965// NV_MAX_SIMULT_USERS, aff_args.nr_simult_users);
1966//// return(NULL);
1967// }
1968
1969 ikglp_aff = kmalloc(sizeof(*ikglp_aff), GFP_KERNEL);
1970 if(!ikglp_aff) {
1971 return(NULL);
1972 }
1973
1974 ikglp_aff->q_info = kmalloc(sizeof(struct ikglp_queue_info)*sem->nr_replicas, GFP_KERNEL);
1975 if(!ikglp_aff->q_info) {
1976 kfree(ikglp_aff);
1977 return(NULL);
1978 }
1979
1980 ikglp_aff->nr_cur_users_on_rsrc = kmalloc(sizeof(int)*(sem->nr_replicas / aff_args.nr_simult_users), GFP_KERNEL);
1981 if(!ikglp_aff->nr_cur_users_on_rsrc) {
1982 kfree(ikglp_aff->q_info);
1983 kfree(ikglp_aff);
1984 return(NULL);
1985 }
1986
1987 ikglp_aff->nr_aff_on_rsrc = kmalloc(sizeof(int64_t)*(sem->nr_replicas / aff_args.nr_simult_users), GFP_KERNEL);
1988 if(!ikglp_aff->nr_aff_on_rsrc) {
1989 kfree(ikglp_aff->nr_cur_users_on_rsrc);
1990 kfree(ikglp_aff->q_info);
1991 kfree(ikglp_aff);
1992 return(NULL);
1993 }
1994
1995 affinity_observer_new(&ikglp_aff->obs, ops, &aff_args.obs);
1996
1997 ikglp_aff->ops = ikglp_ops;
1998 ikglp_aff->offset = aff_args.replica_to_gpu_offset;
1999 ikglp_aff->nr_simult = aff_args.nr_simult_users;
2000 ikglp_aff->nr_rsrc = sem->nr_replicas / ikglp_aff->nr_simult;
2001 ikglp_aff->relax_max_fifo_len = (aff_args.relaxed_rules) ? 1 : 0;
2002
2003 TRACE_CUR("GPU affinity_observer: offset = %d, nr_simult = %d, "
2004 "nr_rsrc = %d, relaxed_fifo_len = %d\n",
2005 ikglp_aff->offset, ikglp_aff->nr_simult, ikglp_aff->nr_rsrc,
2006 ikglp_aff->relax_max_fifo_len);
2007
2008 memset(ikglp_aff->nr_cur_users_on_rsrc, 0, sizeof(int)*(ikglp_aff->nr_rsrc));
2009 memset(ikglp_aff->nr_aff_on_rsrc, 0, sizeof(int64_t)*(ikglp_aff->nr_rsrc));
2010
2011 for(i = 0; i < sem->nr_replicas; ++i) {
2012 ikglp_aff->q_info[i].q = &sem->fifo_queues[i];
2013 ikglp_aff->q_info[i].estimated_len = 0;
2014
2015 // multiple q_info's will point to the same resource (aka GPU) if
2016 // aff_args.nr_simult_users > 1
2017 ikglp_aff->q_info[i].nr_cur_users = &ikglp_aff->nr_cur_users_on_rsrc[__replica_to_gpu(ikglp_aff,i)];
2018 ikglp_aff->q_info[i].nr_aff_users = &ikglp_aff->nr_aff_on_rsrc[__replica_to_gpu(ikglp_aff,i)];
2019 }
2020
2021 // attach observer to the lock
2022 raw_spin_lock_irqsave(&sem->real_lock, flags);
2023 sem->aff_obs = ikglp_aff;
2024 raw_spin_unlock_irqrestore(&sem->real_lock, flags);
2025
2026 return &ikglp_aff->obs;
2027}
2028
2029
2030
2031
2032static int gpu_replica_to_resource(struct ikglp_affinity* aff,
2033 struct fifo_queue* fq) {
2034 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2035 return(replica_to_gpu(aff, ikglp_get_idx(sem, fq)));
2036}
2037
2038
2039// Smart IKGLP Affinity
2040
2041//static inline struct ikglp_queue_info* ikglp_aff_find_shortest(struct ikglp_affinity* aff)
2042//{
2043// struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2044// struct ikglp_queue_info *shortest = &aff->q_info[0];
2045// int i;
2046//
2047// for(i = 1; i < sem->nr_replicas; ++i) {
2048// if(aff->q_info[i].estimated_len < shortest->estimated_len) {
2049// shortest = &aff->q_info[i];
2050// }
2051// }
2052//
2053// return(shortest);
2054//}
2055
2056struct fifo_queue* gpu_ikglp_advise_enqueue(struct ikglp_affinity* aff, struct task_struct* t)
2057{
	2058	// advise_enqueue must be smart so as to not break IKGLP rules:
	2059	//  * No queue can be greater than ceil(m/k) in length. We may return
	2060	//    such a queue, but IKGLP will be smart enough to send requests
	2061	//    to donors or the PQ.
	2062	//  * Cannot let a queue idle if there exist waiting PQ/donors
	2063	//    -- needed to guarantee parallel progress of waiters.
	2064	//
	2065	// We may be able to relax some of these constraints, but this will have to
	2066	// be carefully evaluated.
	2067	//
	2068	// Heuristic strategy: find the shortest queue that is not full (a worked example of the cost comparison follows below).
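	// Worked example (illustrative numbers): suppose t last ran on GPU 1,
	// the GPU-1 queue has estimated_len 30us, and a remote queue has
	// estimated_len 10us. If get_gpu_estimate() returns 20us for MIG_LOCAL
	// and 35us after the needed migration, the local queue costs
	// 30 + 20 = 50us while the remote queue costs 10 + 35 = 45us, so the
	// remote queue wins despite the lost affinity. Exact ties are broken in
	// favor of affinity, then fewer affine users, then fewer current users.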
2069
2070 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2071 lt_t min_len;
2072 int min_nr_users, min_nr_aff_users;
2073 struct ikglp_queue_info *shortest, *aff_queue;
2074 struct fifo_queue *to_enqueue;
2075 int i;
2076 int affinity_gpu;
2077
2078 int max_fifo_len = (aff->relax_max_fifo_len) ?
2079 sem->m : sem->max_fifo_len;
2080
2081 // if we have no affinity, find the GPU with the least number of users
2082 // with active affinity
2083 if(unlikely(tsk_rt(t)->last_gpu < 0)) {
2084 int temp_min = aff->nr_aff_on_rsrc[0];
2085 affinity_gpu = aff->offset;
2086
2087		for(i = 1; i < aff->nr_rsrc; ++i) {
2088			if(aff->nr_aff_on_rsrc[i] < temp_min) {
				temp_min = aff->nr_aff_on_rsrc[i]; /* track the running minimum so the least-loaded GPU wins */
2089				affinity_gpu = aff->offset + i;
2090			}
2091		}
2092
2093 TRACE_CUR("no affinity. defaulting to %d with %d aff users.\n",
2094 affinity_gpu, temp_min);
2095 }
2096 else {
2097 affinity_gpu = tsk_rt(t)->last_gpu;
2098 }
2099
2100 // all things being equal, let's start with the queue with which we have
2101 // affinity. this helps us maintain affinity even when we don't have
2102	// an estimate for local-affinity execution time (i.e., 2nd time on GPU)
2103 aff_queue = &aff->q_info[gpu_to_base_replica(aff, affinity_gpu)];
2104 shortest = aff_queue;
2105
2106 // if(shortest == aff->shortest_queue) {
2107 // TRACE_CUR("special case: have affinity with shortest queue\n");
2108 // goto out;
2109 // }
2110
2111 min_len = shortest->estimated_len + get_gpu_estimate(t, MIG_LOCAL);
2112 min_nr_users = *(shortest->nr_cur_users);
2113 min_nr_aff_users = *(shortest->nr_aff_users);
2114
2115
2116 TRACE_CUR("cs is %llu on queue %d (count = %d): est len = %llu\n",
2117 get_gpu_estimate(t, MIG_LOCAL),
2118 ikglp_get_idx(sem, shortest->q),
2119 shortest->q->count,
2120 min_len);
2121
2122 for(i = 0; i < sem->nr_replicas; ++i) {
2123 if(&aff->q_info[i] != shortest) {
2124 if(aff->q_info[i].q->count < max_fifo_len) {
2125 int want = 0;
2126
2127 lt_t migration =
2128 get_gpu_estimate(t,
2129 gpu_migration_distance(tsk_rt(t)->last_gpu,
2130 replica_to_gpu(aff, i)));
2131 lt_t est_len = aff->q_info[i].estimated_len + migration;
2132
2133 // queue is smaller, or they're equal and the other has a smaller number
2134 // of total users.
2135 //
2136	// tie-break on the smallest number of simultaneous users. this only kicks in
2137	// when there is more than one empty queue.
2138
2139 // TODO: Make "est_len < min_len" a fuzzy function that allows
2140 // queues "close enough" in length to be considered equal.
2141
2142 /* NOTE: 'shortest' starts out with affinity GPU */
2143 if(unlikely(shortest->q->count >= max_fifo_len)) { /* 'shortest' is full and i-th queue is not */
2144 want = 1;
2145 }
2146 else if(est_len < min_len) {
2147 want = 1; /* i-th queue has shortest length */
2148 }
2149 else if(unlikely(est_len == min_len)) { /* equal lengths */
2150 if(!has_affinity(aff, t, ikglp_get_idx(sem, shortest->q))) { /* don't sacrifice affinity on tie */
2151 if(has_affinity(aff, t, i)) {
2152 want = 1; /* switch to maintain affinity */
2153 }
2154 else if(*(aff->q_info[i].nr_aff_users) < min_nr_aff_users) { /* favor one with less affinity load */
2155 want = 1;
2156 }
2157 else if((*(aff->q_info[i].nr_aff_users) == min_nr_aff_users) && /* equal number of affinity */
2158 (*(aff->q_info[i].nr_cur_users) < min_nr_users)) { /* favor one with current fewer users */
2159 want = 1;
2160 }
2161 }
2162 }
2163
2164 if(want) {
2165 shortest = &aff->q_info[i];
2166 min_len = est_len;
2167 min_nr_users = *(aff->q_info[i].nr_cur_users);
2168 min_nr_aff_users = *(aff->q_info[i].nr_aff_users);
2169 }
2170
2171 TRACE_CUR("cs is %llu on queue %d (count = %d): est len = %llu\n",
2172 get_gpu_estimate(t,
2173 gpu_migration_distance(tsk_rt(t)->last_gpu,
2174 replica_to_gpu(aff, i))),
2175 ikglp_get_idx(sem, aff->q_info[i].q),
2176 aff->q_info[i].q->count,
2177 est_len);
2178 }
2179 else {
2180 TRACE_CUR("queue %d is too long. ineligible for enqueue.\n",
2181 ikglp_get_idx(sem, aff->q_info[i].q));
2182 }
2183 }
2184 }
2185
2186 if(shortest->q->count >= max_fifo_len) {
2187 TRACE_CUR("selected fq %d is too long, but returning it anyway.\n",
2188 ikglp_get_idx(sem, shortest->q));
2189 }
2190
2191 to_enqueue = shortest->q;
2192 TRACE_CUR("enqueue on fq %d (count = %d) (non-aff wanted fq %d)\n",
2193 ikglp_get_idx(sem, to_enqueue),
2194 to_enqueue->count,
2195 ikglp_get_idx(sem, sem->shortest_fifo_queue));
2196
2197 return to_enqueue;
2198
2199 //return(sem->shortest_fifo_queue);
2200}
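
A minimal user-space sketch of the decision rule above, under simplifying assumptions: queues are plain structs, the migration-cost numbers are invented, and the affinity/affinity-load/current-user tie-break chain is collapsed to a single affinity check. It is meant only to illustrate "shortest estimated queue that is not full, preferring the affinity replica on ties", not to reproduce the kernel code.

/* Simplified model of gpu_ikglp_advise_enqueue(): pick the non-full replica
 * minimizing (estimated queue length + migration-scaled CS estimate).
 */
#include <stdio.h>

struct q { unsigned long long est_len; int count; int gpu; };

static unsigned long long mig_cost(int last_gpu, int gpu)
{
	/* hypothetical cost model: staying local is cheapest */
	return (last_gpu == gpu) ? 100 : 250;
}

static int advise_enqueue(struct q *qs, int n, int max_len, int last_gpu)
{
	int best = -1, i;
	unsigned long long best_len = ~0ULL;

	for (i = 0; i < n; ++i) {
		unsigned long long len;
		if (qs[i].count >= max_len)
			continue;	/* respect the ceil(m/k) bound */
		len = qs[i].est_len + mig_cost(last_gpu, qs[i].gpu);
		if (len < best_len ||
		    (len == best_len && qs[i].gpu == last_gpu)) {
			best = i;	/* shorter, or equal with affinity */
			best_len = len;
		}
	}
	return best;	/* -1: all queues full; the real code returns one anyway */
}

int main(void)
{
	struct q qs[3] = { {300, 1, 0}, {200, 1, 1}, {250, 2, 2} };
	printf("enqueue on replica %d\n", advise_enqueue(qs, 3, 2, 0));
	return 0;
}
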
2201
2202
2203
2204
2205static ikglp_wait_state_t* pick_steal(struct ikglp_affinity* aff,
2206 int dest_gpu,
2207 struct fifo_queue* fq)
2208{
2209 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2210 ikglp_wait_state_t *wait = NULL;
2211 int max_improvement = -(MIG_NONE+1);
2212 int replica = ikglp_get_idx(sem, fq);
2213
2214 if(waitqueue_active(&fq->wait)) {
2215 int this_gpu = replica_to_gpu(aff, replica);
2216 struct list_head *pos;
2217
2218 list_for_each(pos, &fq->wait.task_list) {
2219 wait_queue_t *fq_wait = list_entry(pos, wait_queue_t, task_list);
2220 ikglp_wait_state_t *tmp_wait = container_of(fq_wait, ikglp_wait_state_t, fq_node);
2221
2222 int tmp_improvement =
2223 gpu_migration_distance(this_gpu, tsk_rt(tmp_wait->task)->last_gpu) -
2224 gpu_migration_distance(dest_gpu, tsk_rt(tmp_wait->task)->last_gpu);
2225
2226 if(tmp_improvement > max_improvement) {
2227 wait = tmp_wait;
2228 max_improvement = tmp_improvement;
2229
2230 if(max_improvement >= (MIG_NONE-1)) {
2231 goto out;
2232 }
2233 }
2234 }
2235
2236 BUG_ON(!wait);
2237 }
2238 else {
2239 TRACE_CUR("fq %d is empty!\n", replica);
2240 }
2241
2242out:
2243
2244 TRACE_CUR("Candidate victim from fq %d is %s/%d. aff improvement = %d.\n",
2245 replica,
2246 (wait) ? wait->task->comm : "nil",
2247 (wait) ? wait->task->pid : -1,
2248 max_improvement);
2249
2250 return wait;
2251}
2252
2253
2254ikglp_wait_state_t* gpu_ikglp_advise_steal(struct ikglp_affinity* aff,
2255 struct fifo_queue* dst)
2256{
2257	// Heuristic strategy: Find the task with the greatest improvement in affinity (a sketch of the improvement metric follows this function).
2258 //
2259 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2260 ikglp_wait_state_t *to_steal_state = NULL;
2261// ikglp_wait_state_t *default_to_steal_state = ikglp_find_hp_waiter_to_steal(sem);
2262 int max_improvement = -(MIG_NONE+1);
2263 int replica, i;
2264 int dest_gpu;
2265
2266 replica = ikglp_get_idx(sem, dst);
2267 dest_gpu = replica_to_gpu(aff, replica);
2268
2269 for(i = 0; i < sem->nr_replicas; ++i) {
2270 ikglp_wait_state_t *tmp_to_steal_state =
2271 pick_steal(aff, dest_gpu, &sem->fifo_queues[i]);
2272
2273 if(tmp_to_steal_state) {
2274 int tmp_improvement =
2275 gpu_migration_distance(replica_to_gpu(aff, i), tsk_rt(tmp_to_steal_state->task)->last_gpu) -
2276 gpu_migration_distance(dest_gpu, tsk_rt(tmp_to_steal_state->task)->last_gpu);
2277
2278 if(tmp_improvement > max_improvement) {
2279 to_steal_state = tmp_to_steal_state;
2280 max_improvement = tmp_improvement;
2281
2282 if(max_improvement >= (MIG_NONE-1)) {
2283 goto out;
2284 }
2285 }
2286 }
2287 }
2288
2289out:
2290 if(!to_steal_state) {
2291 TRACE_CUR("Could not find anyone to steal.\n");
2292 }
2293 else {
2294 TRACE_CUR("Selected victim %s/%d on fq %d (GPU %d) for fq %d (GPU %d): improvement = %d\n",
2295 to_steal_state->task->comm, to_steal_state->task->pid,
2296 ikglp_get_idx(sem, to_steal_state->donee_heap_node.fq),
2297 replica_to_gpu(aff, ikglp_get_idx(sem, to_steal_state->donee_heap_node.fq)),
2298 ikglp_get_idx(sem, dst),
2299 dest_gpu,
2300 max_improvement);
2301
2302// TRACE_CUR("Non-aff wanted to select victim %s/%d on fq %d (GPU %d) for fq %d (GPU %d): improvement = %d\n",
2303// default_to_steal_state->task->comm, default_to_steal_state->task->pid,
2304// ikglp_get_idx(sem, default_to_steal_state->donee_heap_node.fq),
2305// replica_to_gpu(aff, ikglp_get_idx(sem, default_to_steal_state->donee_heap_node.fq)),
2306// ikglp_get_idx(sem, dst),
2307// replica_to_gpu(aff, ikglp_get_idx(sem, dst)),
2308//
2309// gpu_migration_distance(
2310// replica_to_gpu(aff, ikglp_get_idx(sem, default_to_steal_state->donee_heap_node.fq)),
2311// tsk_rt(default_to_steal_state->task)->last_gpu) -
2312// gpu_migration_distance(dest_gpu, tsk_rt(default_to_steal_state->task)->last_gpu));
2313 }
2314
2315 return(to_steal_state);
2316}
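
The improvement score used above is the difference between the victim's current migration distance and the distance it would have at the destination GPU. The sketch below restates that formula with an assumed distance ordering (MIG_LOCAL < MIG_NEAR < MIG_MED < MIG_FAR < MIG_NONE; the actual enum lives in the GPU-affinity headers and is not shown in this hunk) and shows why MIG_NONE - 1 is the natural early-exit bound.

/* improvement = dist(victim's current GPU, victim's last GPU)
 *             - dist(destination GPU,      victim's last GPU)
 */
#include <stdio.h>

enum mig { MIG_LOCAL = 0, MIG_NEAR, MIG_MED, MIG_FAR, MIG_NONE };

/* hypothetical distance model: any two distinct GPUs are equally far apart */
static int dist(int a, int b)
{
	if (a < 0 || b < 0) return MIG_NONE;
	if (a == b)         return MIG_LOCAL;
	return MIG_FAR;
}

int main(void)
{
	int victim_last = 2, victim_cur = 1, dest = 2;
	int improvement = dist(victim_cur, victim_last) - dist(dest, victim_last);

	printf("improvement = %d (max possible = %d)\n",
	       improvement, MIG_NONE - 1);
	return 0;
}
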
2317
2318
2319static inline int has_donor(wait_queue_t* fq_wait)
2320{
2321 ikglp_wait_state_t *wait = container_of(fq_wait, ikglp_wait_state_t, fq_node);
2322 return(wait->donee_heap_node.donor_info != NULL);
2323}
2324
2325static ikglp_donee_heap_node_t* pick_donee(struct ikglp_affinity* aff,
2326 struct fifo_queue* fq,
2327 int* dist_from_head)
2328{
2329 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2330 struct task_struct *donee;
2331 ikglp_donee_heap_node_t *donee_node;
2332 struct task_struct *mth_highest = ikglp_mth_highest(sem);
2333
2334// lt_t now = litmus_clock();
2335//
2336// TRACE_CUR("fq %d: mth_highest: %s/%d, deadline = %d: (donor) = ??? ",
2337// ikglp_get_idx(sem, fq),
2338// mth_highest->comm, mth_highest->pid,
2339// (int)get_deadline(mth_highest) - now);
2340
2341 if(fq->owner &&
2342 fq->donee_heap_node.donor_info == NULL &&
2343 mth_highest != fq->owner &&
2344 litmus->__compare(mth_highest, BASE, fq->owner, BASE)) {
2345 donee = fq->owner;
2346 donee_node = &(fq->donee_heap_node);
2347 *dist_from_head = 0;
2348
2349 BUG_ON(donee != donee_node->task);
2350
2351 TRACE_CUR("picked owner of fq %d as donee\n",
2352 ikglp_get_idx(sem, fq));
2353
2354 goto out;
2355 }
2356 else if(waitqueue_active(&fq->wait)) {
2357 struct list_head *pos;
2358
2359
2360// TRACE_CUR("fq %d: owner: %s/%d, deadline = %d: (donor) = %s/%d "
2361// "(mth_highest != fq->owner) = %d "
2362// "(mth_highest > fq->owner) = %d\n",
2363// ikglp_get_idx(sem, fq),
2364// (fq->owner) ? fq->owner->comm : "nil",
2365// (fq->owner) ? fq->owner->pid : -1,
2366// (fq->owner) ? (int)get_deadline(fq->owner) - now : -999,
2367// (fq->donee_heap_node.donor_info) ? fq->donee_heap_node.donor_info->task->comm : "nil",
2368// (fq->donee_heap_node.donor_info) ? fq->donee_heap_node.donor_info->task->pid : -1,
2369// (mth_highest != fq->owner),
2370// (litmus->__compare(mth_highest, BASE, fq->owner, BASE)));
2371
2372
2373 *dist_from_head = 1;
2374
2375 // iterating from the start of the queue is nice since this means
2376 // the donee will be closer to obtaining a resource.
2377 list_for_each(pos, &fq->wait.task_list) {
2378 wait_queue_t *fq_wait = list_entry(pos, wait_queue_t, task_list);
2379 ikglp_wait_state_t *wait = container_of(fq_wait, ikglp_wait_state_t, fq_node);
2380
2381// TRACE_CUR("fq %d: waiter %d: %s/%d, deadline = %d (donor) = %s/%d "
2382// "(mth_highest != wait->task) = %d "
2383// "(mth_highest > wait->task) = %d\n",
2384// ikglp_get_idx(sem, fq),
2385// dist_from_head,
2386// wait->task->comm, wait->task->pid,
2387// (int)get_deadline(wait->task) - now,
2388// (wait->donee_heap_node.donor_info) ? wait->donee_heap_node.donor_info->task->comm : "nil",
2389// (wait->donee_heap_node.donor_info) ? wait->donee_heap_node.donor_info->task->pid : -1,
2390// (mth_highest != wait->task),
2391// (litmus->__compare(mth_highest, BASE, wait->task, BASE)));
2392
2393
2394 if(!has_donor(fq_wait) &&
2395 mth_highest != wait->task &&
2396 litmus->__compare(mth_highest, BASE, wait->task, BASE)) {
2397 donee = (struct task_struct*) fq_wait->private;
2398 donee_node = &wait->donee_heap_node;
2399
2400 BUG_ON(donee != donee_node->task);
2401
2402 TRACE_CUR("picked waiter in fq %d as donee\n",
2403 ikglp_get_idx(sem, fq));
2404
2405 goto out;
2406 }
2407 ++(*dist_from_head);
2408 }
2409 }
2410
2411 donee = NULL;
2412 donee_node = NULL;
2413 //*dist_from_head = sem->max_fifo_len + 1;
2414 *dist_from_head = IKGLP_INVAL_DISTANCE;
2415
2416 TRACE_CUR("Found no one to be donee in fq %d!\n", ikglp_get_idx(sem, fq));
2417
2418out:
2419
2420 TRACE_CUR("Candidate donee for fq %d is %s/%d (dist_from_head = %d)\n",
2421 ikglp_get_idx(sem, fq),
2422 (donee) ? (donee)->comm : "nil",
2423 (donee) ? (donee)->pid : -1,
2424 *dist_from_head);
2425
2426 return donee_node;
2427}
2428
2429ikglp_donee_heap_node_t* gpu_ikglp_advise_donee_selection(
2430 struct ikglp_affinity* aff,
2431 struct task_struct* donor)
2432{
2433	// Heuristic strategy: Find the highest-priority donee that is waiting on
2434 // a queue closest to our affinity. (1) The donee CANNOT already have a
2435 // donor (exception: donee is the lowest-prio task in the donee heap).
2436 // (2) Requests in 'top_m' heap are ineligible.
2437 //
2438	// Further strategy: amongst eligible donees waiting for the same GPU, pick
2439 // the one closest to the head of the FIFO queue (including owners).
2440 //
2441 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2442 ikglp_donee_heap_node_t *donee_node;
2443 gpu_migration_dist_t distance;
2444 int start, i, j;
2445
2446 ikglp_donee_heap_node_t *default_donee;
2447 ikglp_wait_state_t *default_donee_donor_info;
2448
2449 if(tsk_rt(donor)->last_gpu < 0) {
2450 // no affinity. just return the min prio, like standard IKGLP
2451 // TODO: Find something closer to the head of the queue??
2452 donee_node = binheap_top_entry(&sem->donees,
2453 ikglp_donee_heap_node_t,
2454 node);
2455 goto out;
2456 }
2457
2458
2459	// Temporarily break any donation relation of the default donee (the lowest-
2460 // prio task in the FIFO queues) to make it eligible for selection below.
2461 //
2462 // NOTE: The original donor relation *must* be restored, even if we select
2463	// the default donee through affinity-aware selection, before returning
2464 // from this function so we don't screw up our heap ordering.
2465 // The standard IKGLP algorithm will steal the donor relationship if needed.
2466 default_donee = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
2467 default_donee_donor_info = default_donee->donor_info; // back-up donor relation
2468 default_donee->donor_info = NULL; // temporarily break any donor relation.
2469
2470 // initialize our search
2471 donee_node = NULL;
2472 distance = MIG_NONE;
2473
2474 // TODO: The below search logic may work well for locating nodes to steal
2475 // when an FQ goes idle. Validate this code and apply it to stealing.
2476
2477 // begin search with affinity GPU.
2478 start = gpu_to_base_replica(aff, tsk_rt(donor)->last_gpu);
2479 i = start;
2480 do { // "for each gpu" / "for each aff->nr_rsrc"
2481 gpu_migration_dist_t temp_distance = gpu_migration_distance(start, i);
2482
2483 // only interested in queues that will improve our distance
2484 if(temp_distance < distance || donee_node == NULL) {
2485 int dist_from_head = IKGLP_INVAL_DISTANCE;
2486
2487 TRACE_CUR("searching for donor on GPU %d", i);
2488
2489 // visit each queue and pick a donee. bail as soon as we find
2490 // one for this class.
2491
2492 for(j = 0; j < aff->nr_simult; ++j) {
2493 int temp_dist_from_head;
2494 ikglp_donee_heap_node_t *temp_donee_node;
2495 struct fifo_queue *fq;
2496
2497 fq = &(sem->fifo_queues[i + j*aff->nr_rsrc]);
2498 temp_donee_node = pick_donee(aff, fq, &temp_dist_from_head);
2499
2500 if(temp_dist_from_head < dist_from_head)
2501 {
2502 // we check all the FQs for this GPU to spread priorities
2503 // out across the queues. does this decrease jitter?
2504 donee_node = temp_donee_node;
2505 dist_from_head = temp_dist_from_head;
2506 }
2507 }
2508
2509 if(dist_from_head != IKGLP_INVAL_DISTANCE) {
2510 TRACE_CUR("found donee %s/%d and is the %d-th waiter.\n",
2511 donee_node->task->comm, donee_node->task->pid,
2512 dist_from_head);
2513 }
2514 else {
2515 TRACE_CUR("found no eligible donors from GPU %d\n", i);
2516 }
2517 }
2518 else {
2519 TRACE_CUR("skipping GPU %d (distance = %d, best donor "
2520 "distance = %d)\n", i, temp_distance, distance);
2521 }
2522
2523 i = (i+1 < aff->nr_rsrc) ? i+1 : 0; // increment with wrap-around
2524 } while (i != start);
2525
2526
2527 // restore old donor info state.
2528 default_donee->donor_info = default_donee_donor_info;
2529
2530 if(!donee_node) {
2531 donee_node = default_donee;
2532
2533 TRACE_CUR("Could not find a donee. We have to steal one.\n");
2534 WARN_ON(default_donee->donor_info == NULL);
2535 }
2536
2537out:
2538
2539 TRACE_CUR("Selected donee %s/%d on fq %d (GPU %d) for %s/%d with affinity for GPU %d\n",
2540 donee_node->task->comm, donee_node->task->pid,
2541 ikglp_get_idx(sem, donee_node->fq),
2542 replica_to_gpu(aff, ikglp_get_idx(sem, donee_node->fq)),
2543 donor->comm, donor->pid, tsk_rt(donor)->last_gpu);
2544
2545 return(donee_node);
2546}
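
The donee search above walks GPUs in wrap-around order starting from the donor's last GPU, and for each GPU inspects the nr_simult replicas that map to it via the i + j*nr_rsrc striding. The sketch below only prints that visitation order for made-up parameters; the real code additionally prunes GPUs whose migration distance cannot beat the best donee found so far.

/* Visitation order of the do/while loop above: GPUs starting at the
 * affinity GPU with wrap-around, then the replicas striped onto each GPU.
 */
#include <stdio.h>

int main(void)
{
	int nr_rsrc = 4, nr_simult = 2, start = 2;	/* affinity GPU = 2 */
	int i = start, j;

	do {
		printf("GPU %d: replicas", i);
		for (j = 0; j < nr_simult; ++j)
			printf(" %d", i + j * nr_rsrc);
		printf("\n");
		i = (i + 1 < nr_rsrc) ? i + 1 : 0;	/* increment with wrap-around */
	} while (i != start);
	return 0;
}
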
2547
2548
2549
2550static void __find_closest_donor(int target_gpu,
2551 struct binheap_node* donor_node,
2552 ikglp_wait_state_t** cur_closest,
2553 int* cur_dist)
2554{
2555 ikglp_wait_state_t *this_donor =
2556 binheap_entry(donor_node, ikglp_wait_state_t, node);
2557
2558 int this_dist =
2559 gpu_migration_distance(target_gpu, tsk_rt(this_donor->task)->last_gpu);
2560
2561// TRACE_CUR("%s/%d: dist from target = %d\n",
2562// this_donor->task->comm,
2563// this_donor->task->pid,
2564// this_dist);
2565
2566 if(this_dist < *cur_dist) {
2567 // take this donor
2568 *cur_dist = this_dist;
2569 *cur_closest = this_donor;
2570 }
2571 else if(this_dist == *cur_dist) {
2572 // priority tie-break. Even though this is a pre-order traversal,
2573	// this is a heap, not a binary search tree, so we still need to do a priority
2574	// comparison.
2575 if(!(*cur_closest) ||
2576 litmus->compare(this_donor->task, (*cur_closest)->task)) {
2577 *cur_dist = this_dist;
2578 *cur_closest = this_donor;
2579 }
2580 }
2581
2582 if(donor_node->left) __find_closest_donor(target_gpu, donor_node->left, cur_closest, cur_dist);
2583 if(donor_node->right) __find_closest_donor(target_gpu, donor_node->right, cur_closest, cur_dist);
2584}
2585
2586ikglp_wait_state_t* gpu_ikglp_advise_donor_to_fq(struct ikglp_affinity* aff, struct fifo_queue* fq)
2587{
2588	// Heuristic strategy: Find the donor with the closest affinity to fq.
2589 // Tie-break on priority.
2590
2591	// We need to iterate over all the donors to do this.  Unfortunately,
2592	// our donors are organized in a heap.  We'll visit each node with a
2593	// recursive call.  This is relatively safe since there are only sem->m
2594	// donors, at most.  We won't recurse deeply enough to have to worry about
2595	// our stack (even with 128 CPUs, the nesting depth is at most 7; a sketch of the bound follows this function).
2596
2597 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2598 ikglp_wait_state_t *donor = NULL;
2599 int distance = MIG_NONE;
2600 int gpu = replica_to_gpu(aff, ikglp_get_idx(sem, fq));
2601
2602#ifdef CONFIG_SCHED_DEBUG_TRACE
2603 ikglp_wait_state_t* default_donor = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);
2604#endif
2605
2606 __find_closest_donor(gpu, sem->donors.root, &donor, &distance);
2607
2608 TRACE_CUR("Selected donor %s/%d (distance = %d) to move to fq %d "
2609 "(non-aff wanted %s/%d). differs = %d\n",
2610 donor->task->comm, donor->task->pid,
2611 distance,
2612 ikglp_get_idx(sem, fq),
2613 default_donor->task->comm, default_donor->task->pid,
2614 (donor->task != default_donor->task)
2615 );
2616
2617 return(donor);
2618}
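
A quick sanity check of the stack-depth argument above: the donor heap holds at most sem->m entries, so the recursive walk nests at most one frame per heap level, i.e. roughly floor(log2(m)) + 1 frames. The snippet below computes that bound for m = 128 (the figure used in the comment); it is a back-of-the-envelope check, not kernel code.

#include <stdio.h>

static int heap_levels(int m)
{
	int h = 0;
	while (m > 0) {	/* number of levels in a complete binary heap of m nodes */
		m >>= 1;
		++h;
	}
	return h;
}

int main(void)
{
	printf("m = 128 -> at most %d nested calls\n", heap_levels(128));
	return 0;
}
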
2619
2620
2621
2622void gpu_ikglp_notify_enqueue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
2623{
2624 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2625 int replica = ikglp_get_idx(sem, fq);
2626 int gpu = replica_to_gpu(aff, replica);
2627 struct ikglp_queue_info *info = &aff->q_info[replica];
2628 lt_t est_time;
2629 lt_t est_len_before;
2630
2631 if(current == t) {
2632 tsk_rt(t)->suspend_gpu_tracker_on_block = 1;
2633 }
2634
2635 est_len_before = info->estimated_len;
2636 est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));
2637 info->estimated_len += est_time;
2638
2639 TRACE_CUR("fq %d: q_len (%llu) + est_cs (%llu) = %llu\n",
2640 ikglp_get_idx(sem, info->q),
2641 est_len_before, est_time,
2642 info->estimated_len);
2643
2644 // if(aff->shortest_queue == info) {
2645 // // we may no longer be the shortest
2646 // aff->shortest_queue = ikglp_aff_find_shortest(aff);
2647 //
2648 // TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n",
2649 // ikglp_get_idx(sem, aff->shortest_queue->q),
2650 // aff->shortest_queue->q->count,
2651 // aff->shortest_queue->estimated_len);
2652 // }
2653}
2654
2655void gpu_ikglp_notify_dequeue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
2656{
2657 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2658 int replica = ikglp_get_idx(sem, fq);
2659 int gpu = replica_to_gpu(aff, replica);
2660 struct ikglp_queue_info *info = &aff->q_info[replica];
2661 lt_t est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));
2662
2663 if(est_time > info->estimated_len) {
2664 WARN_ON(1);
2665 info->estimated_len = 0;
2666 }
2667 else {
2668 info->estimated_len -= est_time;
2669 }
2670
2671 TRACE_CUR("fq %d est len is now %llu\n",
2672 ikglp_get_idx(sem, info->q),
2673 info->estimated_len);
2674
2675 // check to see if we're the shortest queue now.
2676 // if((aff->shortest_queue != info) &&
2677 // (aff->shortest_queue->estimated_len > info->estimated_len)) {
2678 //
2679 // aff->shortest_queue = info;
2680 //
2681 // TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n",
2682 // ikglp_get_idx(sem, info->q),
2683 // info->q->count,
2684 // info->estimated_len);
2685 // }
2686}
2687
2688int gpu_ikglp_notify_exit(struct ikglp_affinity* aff, struct task_struct* t)
2689{
2690 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2691 unsigned long flags = 0, real_flags;
2692 int aff_rsrc;
2693#ifdef CONFIG_LITMUS_DGL_SUPPORT
2694 raw_spinlock_t *dgl_lock;
2695
2696 dgl_lock = litmus->get_dgl_spinlock(t);
2697#endif
2698
2699 if (tsk_rt(t)->last_gpu < 0)
2700 return 0;
2701
2702 raw_spin_lock_irqsave(&sem->real_lock, real_flags);
2703 lock_global_irqsave(dgl_lock, flags);
2704 lock_fine_irqsave(&sem->lock, flags);
2705
2706 // decrement affinity count on old GPU
2707 aff_rsrc = tsk_rt(t)->last_gpu - aff->offset;
2708 --(aff->nr_aff_on_rsrc[aff_rsrc]);
2709// aff->nr_aff_on_rsrc[aff_rsrc] -= ((uint64_t)1e9)/get_rt_period(t);
2710
2711 if(unlikely(aff->nr_aff_on_rsrc[aff_rsrc] < 0)) {
2712 WARN_ON(aff->nr_aff_on_rsrc[aff_rsrc] < 0);
2713 aff->nr_aff_on_rsrc[aff_rsrc] = 0;
2714 }
2715
2716 unlock_fine_irqrestore(&sem->lock, flags);
2717 unlock_global_irqrestore(dgl_lock, flags);
2718 raw_spin_unlock_irqrestore(&sem->real_lock, real_flags);
2719
2720 return 0;
2721}
2722
2723int gpu_ikglp_notify_exit_trampoline(struct task_struct* t)
2724{
2725 struct ikglp_affinity* aff = (struct ikglp_affinity*)tsk_rt(t)->rsrc_exit_cb_args;
2726 if(likely(aff)) {
2727 return gpu_ikglp_notify_exit(aff, t);
2728 }
2729 else {
2730 return -1;
2731 }
2732}
2733
2734void gpu_ikglp_notify_acquired(struct ikglp_affinity* aff,
2735 struct fifo_queue* fq,
2736 struct task_struct* t)
2737{
2738 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2739 int replica = ikglp_get_idx(sem, fq);
2740 int gpu = replica_to_gpu(aff, replica);
2741 int last_gpu = tsk_rt(t)->last_gpu;
2742
2743 tsk_rt(t)->gpu_migration = gpu_migration_distance(last_gpu, gpu); // record the type of migration
2744
2745 TRACE_CUR("%s/%d acquired gpu %d (prev = %d). migration type = %d\n",
2746 t->comm, t->pid, gpu, last_gpu, tsk_rt(t)->gpu_migration);
2747
2748	// count the number of resource holders
2749 ++(*(aff->q_info[replica].nr_cur_users));
2750
2751 if(gpu != last_gpu) {
2752 if(last_gpu >= 0) {
2753 int old_rsrc = last_gpu - aff->offset;
2754 --(aff->nr_aff_on_rsrc[old_rsrc]);
2755// aff->nr_aff_on_rsrc[old_rsrc] -= ((uint64_t)(1e9)/get_rt_period(t));
2756 }
2757
2758 // increment affinity count on new GPU
2759 ++(aff->nr_aff_on_rsrc[gpu - aff->offset]);
2760// aff->nr_aff_on_rsrc[gpu - aff->offset] += ((uint64_t)(1e9)/get_rt_period(t));
2761 tsk_rt(t)->rsrc_exit_cb_args = aff;
2762 tsk_rt(t)->rsrc_exit_cb = gpu_ikglp_notify_exit_trampoline;
2763 }
2764
2765 reg_nv_device(gpu, 1, t); // register
2766
2767 tsk_rt(t)->suspend_gpu_tracker_on_block = 0;
2768 reset_gpu_tracker(t);
2769 start_gpu_tracker(t);
2770}
2771
2772void gpu_ikglp_notify_freed(struct ikglp_affinity* aff,
2773 struct fifo_queue* fq,
2774 struct task_struct* t)
2775{
2776 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2777 int replica = ikglp_get_idx(sem, fq);
2778 int gpu = replica_to_gpu(aff, replica);
2779 lt_t est_time;
2780
2781 stop_gpu_tracker(t); // stop the tracker before we do anything else.
2782
2783 est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));
2784
2785	// count the number of resource holders
2786 --(*(aff->q_info[replica].nr_cur_users));
2787
2788 reg_nv_device(gpu, 0, t); // unregister
2789
2790 // update estimates
2791 update_gpu_estimate(t, get_gpu_time(t));
2792
2793 TRACE_CUR("%s/%d freed gpu %d (prev = %d). mig type = %d. actual time was %llu. "
2794 "estimated was %llu. diff is %d\n",
2795 t->comm, t->pid, gpu, tsk_rt(t)->last_gpu,
2796 tsk_rt(t)->gpu_migration,
2797 get_gpu_time(t),
2798 est_time,
2799 (long long)get_gpu_time(t) - (long long)est_time);
2800
2801 tsk_rt(t)->last_gpu = gpu;
2802}
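
update_gpu_estimate() and get_gpu_time() are implemented in litmus/gpu_affinity.c and are not part of this hunk. The sketch below is only a plausible stand-in for the feedback step visible here -- fold the observed GPU hold time into a per-migration-distance estimate so later advise/notify calls see better queue lengths -- using a simple exponential moving average; the real estimator may use a different filter.

/* Hypothetical per-migration-class estimator (EMA with alpha = 1/4). */
#include <stdio.h>

#define NUM_DIST 5	/* assumed classes: LOCAL, NEAR, MED, FAR, NONE */

struct gpu_est { unsigned long long avg[NUM_DIST]; };

static void update_estimate(struct gpu_est *e, int d, unsigned long long observed)
{
	long long old = (long long)e->avg[d];
	long long diff = (long long)observed - old;
	e->avg[d] = (unsigned long long)(old + diff / 4);	/* new = old + (obs - old)/4 */
}

int main(void)
{
	struct gpu_est e = { { 1000, 1500, 2000, 3000, 0 } };
	update_estimate(&e, 1, 2300);	/* observed a NEAR-migration section */
	printf("new NEAR estimate = %llu\n", e.avg[1]);
	return 0;
}
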
2803
2804struct ikglp_affinity_ops gpu_ikglp_affinity =
2805{
2806 .advise_enqueue = gpu_ikglp_advise_enqueue,
2807 .advise_steal = gpu_ikglp_advise_steal,
2808 .advise_donee_selection = gpu_ikglp_advise_donee_selection,
2809 .advise_donor_to_fq = gpu_ikglp_advise_donor_to_fq,
2810
2811 .notify_enqueue = gpu_ikglp_notify_enqueue,
2812 .notify_dequeue = gpu_ikglp_notify_dequeue,
2813 .notify_acquired = gpu_ikglp_notify_acquired,
2814 .notify_freed = gpu_ikglp_notify_freed,
2815
2816 .notify_exit = gpu_ikglp_notify_exit,
2817
2818 .replica_to_resource = gpu_replica_to_resource,
2819};
2820
2821struct affinity_observer* ikglp_gpu_aff_obs_new(struct affinity_observer_ops* ops,
2822 void* __user args)
2823{
2824 return ikglp_aff_obs_new(ops, &gpu_ikglp_affinity, args);
2825}
2826
2827
2828
2829
2830
2831
2832
2833
2834// Simple ikglp Affinity (standard ikglp with auto-gpu registration)
2835
2836struct fifo_queue* simple_gpu_ikglp_advise_enqueue(struct ikglp_affinity* aff, struct task_struct* t)
2837{
2838 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2839 int min_count;
2840 int min_nr_users;
2841 struct ikglp_queue_info *shortest;
2842 struct fifo_queue *to_enqueue;
2843 int i;
2844
2845 // TRACE_CUR("Simple GPU ikglp advise_enqueue invoked\n");
2846
2847 shortest = &aff->q_info[0];
2848 min_count = shortest->q->count;
2849 min_nr_users = *(shortest->nr_cur_users);
2850
2851 TRACE_CUR("queue %d: waiters = %d, total holders = %d\n",
2852 ikglp_get_idx(sem, shortest->q),
2853 shortest->q->count,
2854 min_nr_users);
2855
2856 for(i = 1; i < sem->nr_replicas; ++i) {
2857 int len = aff->q_info[i].q->count;
2858
2859 // queue is smaller, or they're equal and the other has a smaller number
2860 // of total users.
2861 //
2862	// tie-break on the smallest number of simultaneous users. this only kicks in
2863	// when there is more than one empty queue.
2864 if((len < min_count) ||
2865 ((len == min_count) && (*(aff->q_info[i].nr_cur_users) < min_nr_users))) {
2866 shortest = &aff->q_info[i];
2867 min_count = shortest->q->count;
2868 min_nr_users = *(aff->q_info[i].nr_cur_users);
2869 }
2870
2871 TRACE_CUR("queue %d: waiters = %d, total holders = %d\n",
2872 ikglp_get_idx(sem, aff->q_info[i].q),
2873 aff->q_info[i].q->count,
2874 *(aff->q_info[i].nr_cur_users));
2875 }
2876
2877 to_enqueue = shortest->q;
2878 TRACE_CUR("enqueue on fq %d (non-aff wanted fq %d)\n",
2879 ikglp_get_idx(sem, to_enqueue),
2880 ikglp_get_idx(sem, sem->shortest_fifo_queue));
2881
2882 return to_enqueue;
2883}
2884
2885ikglp_wait_state_t* simple_gpu_ikglp_advise_steal(struct ikglp_affinity* aff,
2886 struct fifo_queue* dst)
2887{
2888 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2889 // TRACE_CUR("Simple GPU ikglp advise_steal invoked\n");
2890 return ikglp_find_hp_waiter_to_steal(sem);
2891}
2892
2893ikglp_donee_heap_node_t* simple_gpu_ikglp_advise_donee_selection(struct ikglp_affinity* aff, struct task_struct* donor)
2894{
2895 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2896 ikglp_donee_heap_node_t *donee = binheap_top_entry(&sem->donees, ikglp_donee_heap_node_t, node);
2897 return(donee);
2898}
2899
2900ikglp_wait_state_t* simple_gpu_ikglp_advise_donor_to_fq(struct ikglp_affinity* aff, struct fifo_queue* fq)
2901{
2902 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2903 ikglp_wait_state_t* donor = binheap_top_entry(&sem->donors, ikglp_wait_state_t, node);
2904 return(donor);
2905}
2906
2907void simple_gpu_ikglp_notify_enqueue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
2908{
2909 // TRACE_CUR("Simple GPU ikglp notify_enqueue invoked\n");
2910}
2911
2912void simple_gpu_ikglp_notify_dequeue(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
2913{
2914 // TRACE_CUR("Simple GPU ikglp notify_dequeue invoked\n");
2915}
2916
2917void simple_gpu_ikglp_notify_acquired(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
2918{
2919 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2920 int replica = ikglp_get_idx(sem, fq);
2921 int gpu = replica_to_gpu(aff, replica);
2922
2923 // TRACE_CUR("Simple GPU ikglp notify_acquired invoked\n");
2924
2925	// count the number of resource holders
2926 ++(*(aff->q_info[replica].nr_cur_users));
2927
2928 reg_nv_device(gpu, 1, t); // register
2929}
2930
2931void simple_gpu_ikglp_notify_freed(struct ikglp_affinity* aff, struct fifo_queue* fq, struct task_struct* t)
2932{
2933 struct ikglp_semaphore *sem = ikglp_from_lock(aff->obs.lock);
2934 int replica = ikglp_get_idx(sem, fq);
2935 int gpu = replica_to_gpu(aff, replica);
2936
2937 // TRACE_CUR("Simple GPU ikglp notify_freed invoked\n");
2938	// count the number of resource holders
2939 --(*(aff->q_info[replica].nr_cur_users));
2940
2941 reg_nv_device(gpu, 0, t); // unregister
2942}
2943
2944struct ikglp_affinity_ops simple_gpu_ikglp_affinity =
2945{
2946 .advise_enqueue = simple_gpu_ikglp_advise_enqueue,
2947 .advise_steal = simple_gpu_ikglp_advise_steal,
2948 .advise_donee_selection = simple_gpu_ikglp_advise_donee_selection,
2949 .advise_donor_to_fq = simple_gpu_ikglp_advise_donor_to_fq,
2950
2951 .notify_enqueue = simple_gpu_ikglp_notify_enqueue,
2952 .notify_dequeue = simple_gpu_ikglp_notify_dequeue,
2953 .notify_acquired = simple_gpu_ikglp_notify_acquired,
2954 .notify_freed = simple_gpu_ikglp_notify_freed,
2955
2956 .notify_exit = NULL,
2957
2958 .replica_to_resource = gpu_replica_to_resource,
2959};
2960
2961struct affinity_observer* ikglp_simple_gpu_aff_obs_new(struct affinity_observer_ops* ops,
2962 void* __user args)
2963{
2964 return ikglp_aff_obs_new(ops, &simple_gpu_ikglp_affinity, args);
2965}
2966
2967#endif
2968
2969
2970
2971
2972
2973
2974
2975
2976
diff --git a/litmus/jobs.c b/litmus/jobs.c
index 13a4ed4c9e93..e25854e1d143 100644
--- a/litmus/jobs.c
+++ b/litmus/jobs.c
@@ -13,6 +13,8 @@ static inline void setup_release(struct task_struct *t, lt_t release)
13 t->rt_param.job_params.deadline = release + get_rt_relative_deadline(t); 13 t->rt_param.job_params.deadline = release + get_rt_relative_deadline(t);
14 t->rt_param.job_params.exec_time = 0; 14 t->rt_param.job_params.exec_time = 0;
15 15
16 clear_bit(RT_JOB_SIG_BUDGET_SENT, &t->rt_param.job_params.flags);
17
16 /* update job sequence number */ 18 /* update job sequence number */
17 t->rt_param.job_params.job_no++; 19 t->rt_param.job_params.job_no++;
18 20
diff --git a/litmus/kexclu_affinity.c b/litmus/kexclu_affinity.c
new file mode 100644
index 000000000000..5ef5e54d600d
--- /dev/null
+++ b/litmus/kexclu_affinity.c
@@ -0,0 +1,92 @@
1#include <litmus/fdso.h>
2#include <litmus/sched_plugin.h>
3#include <litmus/trace.h>
4#include <litmus/litmus.h>
5#include <litmus/locking.h>
6
7#include <litmus/kexclu_affinity.h>
8
9static int create_generic_aff_obs(void** obj_ref, obj_type_t type, void* __user arg);
10static int open_generic_aff_obs(struct od_table_entry* entry, void* __user arg);
11static int close_generic_aff_obs(struct od_table_entry* entry);
12static void destroy_generic_aff_obs(obj_type_t type, void* sem);
13
14struct fdso_ops generic_affinity_ops = {
15 .create = create_generic_aff_obs,
16 .open = open_generic_aff_obs,
17 .close = close_generic_aff_obs,
18 .destroy = destroy_generic_aff_obs
19};
20
21static atomic_t aff_obs_id_gen = ATOMIC_INIT(0);
22
23static inline bool is_affinity_observer(struct od_table_entry *entry)
24{
25 return (entry->class == &generic_affinity_ops);
26}
27
28static inline struct affinity_observer* get_affinity_observer(struct od_table_entry* entry)
29{
30 BUG_ON(!is_affinity_observer(entry));
31 return (struct affinity_observer*) entry->obj->obj;
32}
33
34static int create_generic_aff_obs(void** obj_ref, obj_type_t type, void* __user arg)
35{
36 struct affinity_observer* aff_obs;
37 int err;
38
39 err = litmus->allocate_aff_obs(&aff_obs, type, arg);
40 if (err == 0) {
41 BUG_ON(!aff_obs->lock);
42 aff_obs->type = type;
43 *obj_ref = aff_obs;
44 }
45 return err;
46}
47
48static int open_generic_aff_obs(struct od_table_entry* entry, void* __user arg)
49{
50 struct affinity_observer* aff_obs = get_affinity_observer(entry);
51 if (aff_obs->ops->open)
52 return aff_obs->ops->open(aff_obs, arg);
53 else
54 return 0; /* default: any task can open it */
55}
56
57static int close_generic_aff_obs(struct od_table_entry* entry)
58{
59 struct affinity_observer* aff_obs = get_affinity_observer(entry);
60 if (aff_obs->ops->close)
61 return aff_obs->ops->close(aff_obs);
62 else
63 return 0; /* default: closing succeeds */
64}
65
66static void destroy_generic_aff_obs(obj_type_t type, void* obj)
67{
68 struct affinity_observer* aff_obs = (struct affinity_observer*) obj;
69 aff_obs->ops->deallocate(aff_obs);
70}
71
72
73struct litmus_lock* get_lock_from_od(int od)
74{
75 extern struct fdso_ops generic_lock_ops;
76
77 struct od_table_entry *entry = get_entry_for_od(od);
78
79 if(entry && entry->class == &generic_lock_ops) {
80 return (struct litmus_lock*) entry->obj->obj;
81 }
82 return NULL;
83}
84
85void affinity_observer_new(struct affinity_observer* aff,
86 struct affinity_observer_ops* ops,
87 struct affinity_observer_args* args)
88{
89 aff->ops = ops;
90 aff->lock = get_lock_from_od(args->lock_od);
91 aff->ident = atomic_inc_return(&aff_obs_id_gen);
92} \ No newline at end of file
diff --git a/litmus/kfmlp_lock.c b/litmus/kfmlp_lock.c
new file mode 100644
index 000000000000..785a095275e6
--- /dev/null
+++ b/litmus/kfmlp_lock.c
@@ -0,0 +1,1003 @@
1#include <linux/slab.h>
2#include <linux/uaccess.h>
3
4#include <litmus/trace.h>
5#include <litmus/sched_plugin.h>
6#include <litmus/fdso.h>
7
8#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
9#include <litmus/gpu_affinity.h>
10#include <litmus/nvidia_info.h>
11#endif
12
13#include <litmus/kfmlp_lock.h>
14
15static inline int kfmlp_get_idx(struct kfmlp_semaphore* sem,
16 struct kfmlp_queue* queue)
17{
18 return (queue - &sem->queues[0]);
19}
20
21static inline struct kfmlp_queue* kfmlp_get_queue(struct kfmlp_semaphore* sem,
22 struct task_struct* holder)
23{
24 int i;
25 for(i = 0; i < sem->num_resources; ++i)
26 if(sem->queues[i].owner == holder)
27 return(&sem->queues[i]);
28 return(NULL);
29}
30
31/* caller is responsible for locking */
32static struct task_struct* kfmlp_find_hp_waiter(struct kfmlp_queue *kqueue,
33 struct task_struct *skip)
34{
35 struct list_head *pos;
36 struct task_struct *queued, *found = NULL;
37
38 list_for_each(pos, &kqueue->wait.task_list) {
39 queued = (struct task_struct*) list_entry(pos, wait_queue_t,
40 task_list)->private;
41
42 /* Compare task prios, find high prio task. */
43 //if (queued != skip && edf_higher_prio(queued, found))
44 if (queued != skip && litmus->compare(queued, found))
45 found = queued;
46 }
47 return found;
48}
49
50static inline struct kfmlp_queue* kfmlp_find_shortest(struct kfmlp_semaphore* sem,
51 struct kfmlp_queue* search_start)
52{
53 // we start our search at search_start instead of at the beginning of the
54 // queue list to load-balance across all resources.
55 struct kfmlp_queue* step = search_start;
56 struct kfmlp_queue* shortest = sem->shortest_queue;
57
58 do
59 {
60 step = (step+1 != &sem->queues[sem->num_resources]) ?
61 step+1 : &sem->queues[0];
62
63 if(step->count < shortest->count)
64 {
65 shortest = step;
66 if(step->count == 0)
67 break; /* can't get any shorter */
68 }
69
70 }while(step != search_start);
71
72 return(shortest);
73}
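
A standalone restatement of the circular search above: scan all queues once starting just past search_start, keep the shortest seen so far, and stop early on an empty queue. Starting the scan at different points is what spreads otherwise-equal choices across the replicas. The values below are arbitrary.

#include <stdio.h>

static int find_shortest(const int *count, int n, int start, int cur_shortest)
{
	int step = start, shortest = cur_shortest;

	do {
		step = (step + 1 < n) ? step + 1 : 0;	/* wrap-around */
		if (count[step] < count[shortest]) {
			shortest = step;
			if (count[step] == 0)
				break;	/* can't get any shorter */
		}
	} while (step != start);

	return shortest;
}

int main(void)
{
	int count[4] = { 2, 1, 0, 3 };
	printf("shortest = %d\n", find_shortest(count, 4, 1, 1));
	return 0;
}
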
74
75
76static struct task_struct* kfmlp_select_hp_steal(struct kfmlp_semaphore* sem,
77 wait_queue_t** to_steal,
78 struct kfmlp_queue** to_steal_from)
79{
80 /* must hold sem->lock */
81
82 int i;
83
84 *to_steal = NULL;
85 *to_steal_from = NULL;
86
87 for(i = 0; i < sem->num_resources; ++i)
88 {
89 if( (sem->queues[i].count > 1) &&
90 ((*to_steal_from == NULL) ||
91 //(edf_higher_prio(sem->queues[i].hp_waiter, my_queue->hp_waiter))) )
92 (litmus->compare(sem->queues[i].hp_waiter, (*to_steal_from)->hp_waiter))) )
93 {
94 *to_steal_from = &sem->queues[i];
95 }
96 }
97
98 if(*to_steal_from)
99 {
100 struct list_head *pos;
101 struct task_struct *target = (*to_steal_from)->hp_waiter;
102
103 TRACE_CUR("want to steal hp_waiter (%s/%d) from queue %d\n",
104 target->comm,
105 target->pid,
106 kfmlp_get_idx(sem, *to_steal_from));
107
108 list_for_each(pos, &(*to_steal_from)->wait.task_list)
109 {
110 wait_queue_t *node = list_entry(pos, wait_queue_t, task_list);
111 struct task_struct *queued = (struct task_struct*) node->private;
112 /* Compare task prios, find high prio task. */
113 if (queued == target)
114 {
115 *to_steal = node;
116
117 TRACE_CUR("steal: selected %s/%d from queue %d\n",
118 queued->comm, queued->pid,
119 kfmlp_get_idx(sem, *to_steal_from));
120
121 return queued;
122 }
123 }
124
125 TRACE_CUR("Could not find %s/%d in queue %d!!! THIS IS A BUG!\n",
126 target->comm,
127 target->pid,
128 kfmlp_get_idx(sem, *to_steal_from));
129 }
130
131 return NULL;
132}
133
134static void kfmlp_steal_node(struct kfmlp_semaphore *sem,
135 struct kfmlp_queue *dst,
136 wait_queue_t *wait,
137 struct kfmlp_queue *src)
138{
139 struct task_struct* t = (struct task_struct*) wait->private;
140
141 __remove_wait_queue(&src->wait, wait);
142 --(src->count);
143
144 if(t == src->hp_waiter) {
145 src->hp_waiter = kfmlp_find_hp_waiter(src, NULL);
146
147 TRACE_CUR("queue %d: %s/%d is new hp_waiter\n",
148 kfmlp_get_idx(sem, src),
149 (src->hp_waiter) ? src->hp_waiter->comm : "nil",
150 (src->hp_waiter) ? src->hp_waiter->pid : -1);
151
152 if(src->owner && tsk_rt(src->owner)->inh_task == t) {
153 litmus->decrease_prio(src->owner, src->hp_waiter);
154 }
155 }
156
157 if(sem->shortest_queue->count > src->count) {
158 sem->shortest_queue = src;
159 TRACE_CUR("queue %d is the shortest\n", kfmlp_get_idx(sem, sem->shortest_queue));
160 }
161
162#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
163 if(sem->aff_obs) {
164 sem->aff_obs->ops->notify_dequeue(sem->aff_obs, src, t);
165 }
166#endif
167
168 init_waitqueue_entry(wait, t);
169 __add_wait_queue_tail_exclusive(&dst->wait, wait);
170 ++(dst->count);
171
172 if(litmus->compare(t, dst->hp_waiter)) {
173 dst->hp_waiter = t;
174
175 TRACE_CUR("queue %d: %s/%d is new hp_waiter\n",
176 kfmlp_get_idx(sem, dst),
177 t->comm, t->pid);
178
179 if(dst->owner && litmus->compare(t, dst->owner))
180 {
181 litmus->increase_prio(dst->owner, t);
182 }
183 }
184
185#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
186 if(sem->aff_obs) {
187 sem->aff_obs->ops->notify_enqueue(sem->aff_obs, dst, t);
188 }
189#endif
190}
191
192
193int kfmlp_lock(struct litmus_lock* l)
194{
195 struct task_struct* t = current;
196 struct kfmlp_semaphore *sem = kfmlp_from_lock(l);
197 struct kfmlp_queue* my_queue = NULL;
198 wait_queue_t wait;
199 unsigned long flags;
200
201 if (!is_realtime(t))
202 return -EPERM;
203
204 spin_lock_irqsave(&sem->lock, flags);
205
206#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
207 if(sem->aff_obs) {
208 my_queue = sem->aff_obs->ops->advise_enqueue(sem->aff_obs, t);
209 }
210 if(!my_queue) {
211 my_queue = sem->shortest_queue;
212 }
213#else
214 my_queue = sem->shortest_queue;
215#endif
216
217 if (my_queue->owner) {
218 /* resource is not free => must suspend and wait */
219 TRACE_CUR("queue %d: Resource is not free => must suspend and wait. (queue size = %d)\n",
220 kfmlp_get_idx(sem, my_queue),
221 my_queue->count);
222
223 init_waitqueue_entry(&wait, t);
224
225 /* FIXME: interruptible would be nice some day */
226 set_task_state(t, TASK_UNINTERRUPTIBLE);
227
228 __add_wait_queue_tail_exclusive(&my_queue->wait, &wait);
229
230 TRACE_CUR("queue %d: hp_waiter is currently %s/%d\n",
231 kfmlp_get_idx(sem, my_queue),
232 (my_queue->hp_waiter) ? my_queue->hp_waiter->comm : "nil",
233 (my_queue->hp_waiter) ? my_queue->hp_waiter->pid : -1);
234
235 /* check if we need to activate priority inheritance */
236 //if (edf_higher_prio(t, my_queue->hp_waiter))
237 if (litmus->compare(t, my_queue->hp_waiter)) {
238 my_queue->hp_waiter = t;
239 TRACE_CUR("queue %d: %s/%d is new hp_waiter\n",
240 kfmlp_get_idx(sem, my_queue),
241 t->comm, t->pid);
242
243 //if (edf_higher_prio(t, my_queue->owner))
244 if (litmus->compare(t, my_queue->owner)) {
245 litmus->increase_prio(my_queue->owner, my_queue->hp_waiter);
246 }
247 }
248
249 ++(my_queue->count);
250
251 if(my_queue == sem->shortest_queue) {
252 sem->shortest_queue = kfmlp_find_shortest(sem, my_queue);
253 TRACE_CUR("queue %d is the shortest\n",
254 kfmlp_get_idx(sem, sem->shortest_queue));
255 }
256
257#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
258 if(sem->aff_obs) {
259 sem->aff_obs->ops->notify_enqueue(sem->aff_obs, my_queue, t);
260 }
261#endif
262
263 /* release lock before sleeping */
264 spin_unlock_irqrestore(&sem->lock, flags);
265
266 /* We depend on the FIFO order. Thus, we don't need to recheck
267 * when we wake up; we are guaranteed to have the lock since
268 * there is only one wake up per release (or steal).
269 */
270 suspend_for_lock();
271
272
273 if(my_queue->owner == t) {
274 TRACE_CUR("queue %d: acquired through waiting\n",
275 kfmlp_get_idx(sem, my_queue));
276 }
277 else {
278 /* this case may happen if our wait entry was stolen
279 between queues. record where we went. */
280 my_queue = kfmlp_get_queue(sem, t);
281
282 BUG_ON(!my_queue);
283 TRACE_CUR("queue %d: acquired through stealing\n",
284 kfmlp_get_idx(sem, my_queue));
285 }
286 }
287 else {
288 TRACE_CUR("queue %d: acquired immediately\n",
289 kfmlp_get_idx(sem, my_queue));
290
291 my_queue->owner = t;
292
293 ++(my_queue->count);
294
295 if(my_queue == sem->shortest_queue) {
296 sem->shortest_queue = kfmlp_find_shortest(sem, my_queue);
297 TRACE_CUR("queue %d is the shortest\n",
298 kfmlp_get_idx(sem, sem->shortest_queue));
299 }
300
301#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
302 if(sem->aff_obs) {
303 sem->aff_obs->ops->notify_enqueue(sem->aff_obs, my_queue, t);
304 sem->aff_obs->ops->notify_acquired(sem->aff_obs, my_queue, t);
305 }
306#endif
307
308 spin_unlock_irqrestore(&sem->lock, flags);
309 }
310
311
312#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
313 if(sem->aff_obs) {
314 return sem->aff_obs->ops->replica_to_resource(sem->aff_obs, my_queue);
315 }
316#endif
317 return kfmlp_get_idx(sem, my_queue);
318}
319
320
321int kfmlp_unlock(struct litmus_lock* l)
322{
323 struct task_struct *t = current, *next;
324 struct kfmlp_semaphore *sem = kfmlp_from_lock(l);
325 struct kfmlp_queue *my_queue, *to_steal_from;
326 unsigned long flags;
327 int err = 0;
328
329 my_queue = kfmlp_get_queue(sem, t);
330
331 if (!my_queue) {
332 err = -EINVAL;
333 goto out;
334 }
335
336 spin_lock_irqsave(&sem->lock, flags);
337
338 TRACE_CUR("queue %d: unlocking\n", kfmlp_get_idx(sem, my_queue));
339
340 my_queue->owner = NULL; // clear ownership
341 --(my_queue->count);
342
343 if(my_queue->count < sem->shortest_queue->count)
344 {
345 sem->shortest_queue = my_queue;
346 TRACE_CUR("queue %d is the shortest\n",
347 kfmlp_get_idx(sem, sem->shortest_queue));
348 }
349
350#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
351 if(sem->aff_obs) {
352 sem->aff_obs->ops->notify_dequeue(sem->aff_obs, my_queue, t);
353 sem->aff_obs->ops->notify_freed(sem->aff_obs, my_queue, t);
354 }
355#endif
356
357 /* we lose the benefit of priority inheritance (if any) */
358 if (tsk_rt(t)->inh_task)
359 litmus->decrease_prio(t, NULL);
360
361
362 /* check if there are jobs waiting for this resource */
363RETRY:
364 next = __waitqueue_remove_first(&my_queue->wait);
365 if (next) {
366		/* next becomes the resource holder */
367 my_queue->owner = next;
368
369#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
370 if(sem->aff_obs) {
371 sem->aff_obs->ops->notify_acquired(sem->aff_obs, my_queue, next);
372 }
373#endif
374
375 TRACE_CUR("queue %d: lock ownership passed to %s/%d\n",
376 kfmlp_get_idx(sem, my_queue), next->comm, next->pid);
377
378 /* determine new hp_waiter if necessary */
379 if (next == my_queue->hp_waiter) {
380 TRACE_TASK(next, "was highest-prio waiter\n");
381 my_queue->hp_waiter = kfmlp_find_hp_waiter(my_queue, next);
382 if (my_queue->hp_waiter)
383 TRACE_TASK(my_queue->hp_waiter, "queue %d: is new highest-prio waiter\n", kfmlp_get_idx(sem, my_queue));
384 else
385 TRACE("queue %d: no further waiters\n", kfmlp_get_idx(sem, my_queue));
386 } else {
387 /* Well, if next is not the highest-priority waiter,
388 * then it ought to inherit the highest-priority
389 * waiter's priority. */
390 litmus->increase_prio(next, my_queue->hp_waiter);
391 }
392
393 /* wake up next */
394 wake_up_process(next);
395 }
396 else {
397 // TODO: put this stealing logic before we attempt to release
398		// our resource. (simplifies the code and gets rid of the ugly goto RETRY.)
399 wait_queue_t *wait;
400
401 TRACE_CUR("queue %d: looking to steal someone...\n",
402 kfmlp_get_idx(sem, my_queue));
403
404#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
405 next = (sem->aff_obs) ?
406 sem->aff_obs->ops->advise_steal(sem->aff_obs, &wait, &to_steal_from) :
407 kfmlp_select_hp_steal(sem, &wait, &to_steal_from);
408#else
409 next = kfmlp_select_hp_steal(sem, &wait, &to_steal_from);
410#endif
411
412 if(next) {
413 TRACE_CUR("queue %d: stealing %s/%d from queue %d\n",
414 kfmlp_get_idx(sem, my_queue),
415 next->comm, next->pid,
416 kfmlp_get_idx(sem, to_steal_from));
417
418 kfmlp_steal_node(sem, my_queue, wait, to_steal_from);
419
420 goto RETRY; // will succeed this time.
421 }
422 else {
423 TRACE_CUR("queue %d: no one to steal.\n",
424 kfmlp_get_idx(sem, my_queue));
425 }
426 }
427
428 spin_unlock_irqrestore(&sem->lock, flags);
429
430out:
431 return err;
432}
433
434int kfmlp_close(struct litmus_lock* l)
435{
436 struct task_struct *t = current;
437 struct kfmlp_semaphore *sem = kfmlp_from_lock(l);
438 struct kfmlp_queue *my_queue;
439 unsigned long flags;
440
441 int owner;
442
443 spin_lock_irqsave(&sem->lock, flags);
444
445 my_queue = kfmlp_get_queue(sem, t);
446 owner = (my_queue) ? (my_queue->owner == t) : 0;
447
448 spin_unlock_irqrestore(&sem->lock, flags);
449
450 if (owner)
451 kfmlp_unlock(l);
452
453 return 0;
454}
455
456void kfmlp_free(struct litmus_lock* l)
457{
458 struct kfmlp_semaphore *sem = kfmlp_from_lock(l);
459 kfree(sem->queues);
460 kfree(sem);
461}
462
463
464
465struct litmus_lock* kfmlp_new(struct litmus_lock_ops* ops, void* __user args)
466{
467 struct kfmlp_semaphore* sem;
468 int num_resources = 0;
469 int i;
470
471 if(!access_ok(VERIFY_READ, args, sizeof(num_resources)))
472 {
473 return(NULL);
474 }
475 if(__copy_from_user(&num_resources, args, sizeof(num_resources)))
476 {
477 return(NULL);
478 }
479 if(num_resources < 1)
480 {
481 return(NULL);
482 }
483
484 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
485 if(!sem)
486 {
487 return(NULL);
488 }
489
490 sem->queues = kmalloc(sizeof(struct kfmlp_queue)*num_resources, GFP_KERNEL);
491 if(!sem->queues)
492 {
493 kfree(sem);
494 return(NULL);
495 }
496
497 sem->litmus_lock.ops = ops;
498 spin_lock_init(&sem->lock);
499 sem->num_resources = num_resources;
500
501 for(i = 0; i < num_resources; ++i)
502 {
503 sem->queues[i].owner = NULL;
504 sem->queues[i].hp_waiter = NULL;
505 init_waitqueue_head(&sem->queues[i].wait);
506 sem->queues[i].count = 0;
507 }
508
509 sem->shortest_queue = &sem->queues[0];
510
511#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
512 sem->aff_obs = NULL;
513#endif
514
515 return &sem->litmus_lock;
516}
517
518
519
520
521#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
522
523static inline int __replica_to_gpu(struct kfmlp_affinity* aff, int replica)
524{
525 int gpu = replica % aff->nr_rsrc;
526 return gpu;
527}
528
529static inline int replica_to_gpu(struct kfmlp_affinity* aff, int replica)
530{
531 int gpu = __replica_to_gpu(aff, replica) + aff->offset;
532 return gpu;
533}
534
535static inline int gpu_to_base_replica(struct kfmlp_affinity* aff, int gpu)
536{
537 int replica = gpu - aff->offset;
538 return replica;
539}
540
541
542int kfmlp_aff_obs_close(struct affinity_observer* obs)
543{
544 return 0;
545}
546
547void kfmlp_aff_obs_free(struct affinity_observer* obs)
548{
549 struct kfmlp_affinity *kfmlp_aff = kfmlp_aff_obs_from_aff_obs(obs);
550 kfree(kfmlp_aff->nr_cur_users_on_rsrc);
551 kfree(kfmlp_aff->q_info);
552 kfree(kfmlp_aff);
553}
554
555static struct affinity_observer* kfmlp_aff_obs_new(struct affinity_observer_ops* ops,
556 struct kfmlp_affinity_ops* kfmlp_ops,
557 void* __user args)
558{
559 struct kfmlp_affinity* kfmlp_aff;
560 struct gpu_affinity_observer_args aff_args;
561 struct kfmlp_semaphore* sem;
562 int i;
563 unsigned long flags;
564
565 if(!access_ok(VERIFY_READ, args, sizeof(aff_args))) {
566 return(NULL);
567 }
568 if(__copy_from_user(&aff_args, args, sizeof(aff_args))) {
569 return(NULL);
570 }
571
572 sem = (struct kfmlp_semaphore*) get_lock_from_od(aff_args.obs.lock_od);
573
574 if(sem->litmus_lock.type != KFMLP_SEM) {
575 TRACE_CUR("Lock type not supported. Type = %d\n", sem->litmus_lock.type);
576 return(NULL);
577 }
578
579 if((aff_args.nr_simult_users <= 0) ||
580 (sem->num_resources%aff_args.nr_simult_users != 0)) {
581 TRACE_CUR("Lock %d does not support #replicas (%d) for #simult_users "
582 "(%d) per replica. #replicas should be evenly divisible "
583 "by #simult_users.\n",
584 sem->litmus_lock.ident,
585 sem->num_resources,
586 aff_args.nr_simult_users);
587 return(NULL);
588 }
589
590// if(aff_args.nr_simult_users > NV_MAX_SIMULT_USERS) {
591// TRACE_CUR("System does not support #simult_users > %d. %d requested.\n",
592// NV_MAX_SIMULT_USERS, aff_args.nr_simult_users);
593//// return(NULL);
594// }
595
596 kfmlp_aff = kmalloc(sizeof(*kfmlp_aff), GFP_KERNEL);
597 if(!kfmlp_aff) {
598 return(NULL);
599 }
600
601 kfmlp_aff->q_info = kmalloc(sizeof(struct kfmlp_queue_info)*sem->num_resources, GFP_KERNEL);
602 if(!kfmlp_aff->q_info) {
603 kfree(kfmlp_aff);
604 return(NULL);
605 }
606
607 kfmlp_aff->nr_cur_users_on_rsrc = kmalloc(sizeof(int)*(sem->num_resources / aff_args.nr_simult_users), GFP_KERNEL);
608 if(!kfmlp_aff->nr_cur_users_on_rsrc) {
609 kfree(kfmlp_aff->q_info);
610 kfree(kfmlp_aff);
611 return(NULL);
612 }
613
614 affinity_observer_new(&kfmlp_aff->obs, ops, &aff_args.obs);
615
616 kfmlp_aff->ops = kfmlp_ops;
617 kfmlp_aff->offset = aff_args.replica_to_gpu_offset;
618 kfmlp_aff->nr_simult = aff_args.nr_simult_users;
619 kfmlp_aff->nr_rsrc = sem->num_resources / kfmlp_aff->nr_simult;
620
621	memset(kfmlp_aff->nr_cur_users_on_rsrc, 0, sizeof(int)*(kfmlp_aff->nr_rsrc)); /* one counter per physical GPU, matching the allocation above */
622
623 for(i = 0; i < sem->num_resources; ++i) {
624 kfmlp_aff->q_info[i].q = &sem->queues[i];
625 kfmlp_aff->q_info[i].estimated_len = 0;
626
627 // multiple q_info's will point to the same resource (aka GPU) if
628 // aff_args.nr_simult_users > 1
629 kfmlp_aff->q_info[i].nr_cur_users = &kfmlp_aff->nr_cur_users_on_rsrc[__replica_to_gpu(kfmlp_aff,i)];
630 }
631
632 // attach observer to the lock
633 spin_lock_irqsave(&sem->lock, flags);
634 sem->aff_obs = kfmlp_aff;
635 spin_unlock_irqrestore(&sem->lock, flags);
636
637 return &kfmlp_aff->obs;
638}
639
640
641
642
643static int gpu_replica_to_resource(struct kfmlp_affinity* aff,
644 struct kfmlp_queue* fq) {
645 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
646 return(replica_to_gpu(aff, kfmlp_get_idx(sem, fq)));
647}
648
649
650// Smart KFMLP Affinity
651
652//static inline struct kfmlp_queue_info* kfmlp_aff_find_shortest(struct kfmlp_affinity* aff)
653//{
654// struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
655// struct kfmlp_queue_info *shortest = &aff->q_info[0];
656// int i;
657//
658// for(i = 1; i < sem->num_resources; ++i) {
659// if(aff->q_info[i].estimated_len < shortest->estimated_len) {
660// shortest = &aff->q_info[i];
661// }
662// }
663//
664// return(shortest);
665//}
666
667struct kfmlp_queue* gpu_kfmlp_advise_enqueue(struct kfmlp_affinity* aff, struct task_struct* t)
668{
669 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
670 lt_t min_len;
671 int min_nr_users;
672 struct kfmlp_queue_info *shortest;
673 struct kfmlp_queue *to_enqueue;
674 int i;
675 int affinity_gpu;
676
677	// simply pick the shortest queue if we have no affinity, or if we have
678	// affinity with the shortest
679 if(unlikely(tsk_rt(t)->last_gpu < 0)) {
680 affinity_gpu = aff->offset; // first gpu
681 TRACE_CUR("no affinity\n");
682 }
683 else {
684 affinity_gpu = tsk_rt(t)->last_gpu;
685 }
686
687 // all things being equal, let's start with the queue with which we have
688 // affinity. this helps us maintain affinity even when we don't have
689	// an estimate for local-affinity execution time (i.e., 2nd time on GPU)
690 shortest = &aff->q_info[gpu_to_base_replica(aff, affinity_gpu)];
691
692// if(shortest == aff->shortest_queue) {
693// TRACE_CUR("special case: have affinity with shortest queue\n");
694// goto out;
695// }
696
697 min_len = shortest->estimated_len + get_gpu_estimate(t, MIG_LOCAL);
698 min_nr_users = *(shortest->nr_cur_users);
699
700 TRACE_CUR("cs is %llu on queue %d: est len = %llu\n",
701 get_gpu_estimate(t, MIG_LOCAL),
702 kfmlp_get_idx(sem, shortest->q),
703 min_len);
704
705 for(i = 0; i < sem->num_resources; ++i) {
706 if(&aff->q_info[i] != shortest) {
707
708 lt_t est_len =
709 aff->q_info[i].estimated_len +
710 get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, replica_to_gpu(aff, i)));
711
712 // queue is smaller, or they're equal and the other has a smaller number
713 // of total users.
714 //
715	// tie-break on the smallest number of simultaneous users. this only kicks in
716	// when there is more than one empty queue.
717 if((est_len < min_len) ||
718 ((est_len == min_len) && (*(aff->q_info[i].nr_cur_users) < min_nr_users))) {
719 shortest = &aff->q_info[i];
720 min_len = est_len;
721 min_nr_users = *(aff->q_info[i].nr_cur_users);
722 }
723
724 TRACE_CUR("cs is %llu on queue %d: est len = %llu\n",
725 get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, replica_to_gpu(aff, i))),
726 kfmlp_get_idx(sem, aff->q_info[i].q),
727 est_len);
728 }
729 }
730
731 to_enqueue = shortest->q;
732 TRACE_CUR("enqueue on fq %d (non-aff wanted fq %d)\n",
733 kfmlp_get_idx(sem, to_enqueue),
734 kfmlp_get_idx(sem, sem->shortest_queue));
735
736 return to_enqueue;
737}
738
739struct task_struct* gpu_kfmlp_advise_steal(struct kfmlp_affinity* aff, wait_queue_t** to_steal, struct kfmlp_queue** to_steal_from)
740{
741 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
742
743 // For now, just steal highest priority waiter
744 // TODO: Implement affinity-aware stealing.
745
746 return kfmlp_select_hp_steal(sem, to_steal, to_steal_from);
747}
748
749
750void gpu_kfmlp_notify_enqueue(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
751{
752 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
753 int replica = kfmlp_get_idx(sem, fq);
754 int gpu = replica_to_gpu(aff, replica);
755 struct kfmlp_queue_info *info = &aff->q_info[replica];
756 lt_t est_time;
757 lt_t est_len_before;
758
759 if(current == t) {
760 tsk_rt(t)->suspend_gpu_tracker_on_block = 1;
761 }
762
763 est_len_before = info->estimated_len;
764 est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));
765 info->estimated_len += est_time;
766
767 TRACE_CUR("fq %d: q_len (%llu) + est_cs (%llu) = %llu\n",
768 kfmlp_get_idx(sem, info->q),
769 est_len_before, est_time,
770 info->estimated_len);
771
772// if(aff->shortest_queue == info) {
773// // we may no longer be the shortest
774// aff->shortest_queue = kfmlp_aff_find_shortest(aff);
775//
776// TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n",
777// kfmlp_get_idx(sem, aff->shortest_queue->q),
778// aff->shortest_queue->q->count,
779// aff->shortest_queue->estimated_len);
780// }
781}
782
783void gpu_kfmlp_notify_dequeue(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
784{
785 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
786 int replica = kfmlp_get_idx(sem, fq);
787 int gpu = replica_to_gpu(aff, replica);
788 struct kfmlp_queue_info *info = &aff->q_info[replica];
789 lt_t est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));
790
791 if(est_time > info->estimated_len) {
792 WARN_ON(1);
793 info->estimated_len = 0;
794 }
795 else {
796 info->estimated_len -= est_time;
797 }
798
799 TRACE_CUR("fq %d est len is now %llu\n",
800 kfmlp_get_idx(sem, info->q),
801 info->estimated_len);
802
803 // check to see if we're the shortest queue now.
804// if((aff->shortest_queue != info) &&
805// (aff->shortest_queue->estimated_len > info->estimated_len)) {
806//
807// aff->shortest_queue = info;
808//
809// TRACE_CUR("shortest queue is fq %d (with %d in queue) has est len %llu\n",
810// kfmlp_get_idx(sem, info->q),
811// info->q->count,
812// info->estimated_len);
813// }
814}
815
816void gpu_kfmlp_notify_acquired(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
817{
818 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
819 int replica = kfmlp_get_idx(sem, fq);
820 int gpu = replica_to_gpu(aff, replica);
821
822 tsk_rt(t)->gpu_migration = gpu_migration_distance(tsk_rt(t)->last_gpu, gpu); // record the type of migration
823
824 TRACE_CUR("%s/%d acquired gpu %d. migration type = %d\n",
825 t->comm, t->pid, gpu, tsk_rt(t)->gpu_migration);
826
827	// count the number of resource holders
828 ++(*(aff->q_info[replica].nr_cur_users));
829
830 reg_nv_device(gpu, 1, t); // register
831
832
833 tsk_rt(t)->suspend_gpu_tracker_on_block = 0;
834 reset_gpu_tracker(t);
835 start_gpu_tracker(t);
836}
837
838void gpu_kfmlp_notify_freed(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
839{
840 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
841 int replica = kfmlp_get_idx(sem, fq);
842 int gpu = replica_to_gpu(aff, replica);
843 lt_t est_time;
844
845 stop_gpu_tracker(t); // stop the tracker before we do anything else.
846
847 est_time = get_gpu_estimate(t, gpu_migration_distance(tsk_rt(t)->last_gpu, gpu));
848
849 tsk_rt(t)->last_gpu = gpu;
850
851	// count the number of resource holders
852 --(*(aff->q_info[replica].nr_cur_users));
853
854 reg_nv_device(gpu, 0, t); // unregister
855
856 // update estimates
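	// feed the measured hold time back into this task's execution-time
	// estimator for the migration type recorded in notify_acquired()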
857 update_gpu_estimate(t, get_gpu_time(t));
858
859 TRACE_CUR("%s/%d freed gpu %d. actual time was %llu. estimated was %llu. diff is %d\n",
860 t->comm, t->pid, gpu,
861 get_gpu_time(t),
862 est_time,
863 (long long)get_gpu_time(t) - (long long)est_time);
864}
865
866struct kfmlp_affinity_ops gpu_kfmlp_affinity =
867{
868 .advise_enqueue = gpu_kfmlp_advise_enqueue,
869 .advise_steal = gpu_kfmlp_advise_steal,
870 .notify_enqueue = gpu_kfmlp_notify_enqueue,
871 .notify_dequeue = gpu_kfmlp_notify_dequeue,
872 .notify_acquired = gpu_kfmlp_notify_acquired,
873 .notify_freed = gpu_kfmlp_notify_freed,
874 .replica_to_resource = gpu_replica_to_resource,
875};
876
877struct affinity_observer* kfmlp_gpu_aff_obs_new(struct affinity_observer_ops* ops,
878 void* __user args)
879{
880 return kfmlp_aff_obs_new(ops, &gpu_kfmlp_affinity, args);
881}
882
883
884
885
886
887
888
889
890// Simple KFMLP Affinity (standard KFMLP with auto-gpu registration)
891
892struct kfmlp_queue* simple_gpu_kfmlp_advise_enqueue(struct kfmlp_affinity* aff, struct task_struct* t)
893{
894 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
895 int min_count;
896 int min_nr_users;
897 struct kfmlp_queue_info *shortest;
898 struct kfmlp_queue *to_enqueue;
899 int i;
900
901// TRACE_CUR("Simple GPU KFMLP advise_enqueue invoked\n");
902
903 shortest = &aff->q_info[0];
904 min_count = shortest->q->count;
905 min_nr_users = *(shortest->nr_cur_users);
906
907 TRACE_CUR("queue %d: waiters = %d, total holders = %d\n",
908 kfmlp_get_idx(sem, shortest->q),
909 shortest->q->count,
910 min_nr_users);
911
912 for(i = 1; i < sem->num_resources; ++i) {
913 int len = aff->q_info[i].q->count;
914
915	// prefer this queue if it is shorter, or if the lengths are equal and it
916	// has fewer total users.
917	//
918	// tie-break on the smallest number of simultaneous users. this only kicks
919	// in when there is more than one empty queue.
920 if((len < min_count) ||
921 ((len == min_count) && (*(aff->q_info[i].nr_cur_users) < min_nr_users))) {
922 shortest = &aff->q_info[i];
923 min_count = shortest->q->count;
924 min_nr_users = *(aff->q_info[i].nr_cur_users);
925 }
926
927 TRACE_CUR("queue %d: waiters = %d, total holders = %d\n",
928 kfmlp_get_idx(sem, aff->q_info[i].q),
929 aff->q_info[i].q->count,
930 *(aff->q_info[i].nr_cur_users));
931 }
932
933 to_enqueue = shortest->q;
934 TRACE_CUR("enqueue on fq %d (non-aff wanted fq %d)\n",
935 kfmlp_get_idx(sem, to_enqueue),
936 kfmlp_get_idx(sem, sem->shortest_queue));
937
938 return to_enqueue;
939}
940
941struct task_struct* simple_gpu_kfmlp_advise_steal(struct kfmlp_affinity* aff, wait_queue_t** to_steal, struct kfmlp_queue** to_steal_from)
942{
943 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
944// TRACE_CUR("Simple GPU KFMLP advise_steal invoked\n");
945 return kfmlp_select_hp_steal(sem, to_steal, to_steal_from);
946}
947
948void simple_gpu_kfmlp_notify_enqueue(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
949{
950// TRACE_CUR("Simple GPU KFMLP notify_enqueue invoked\n");
951}
952
953void simple_gpu_kfmlp_notify_dequeue(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
954{
955// TRACE_CUR("Simple GPU KFMLP notify_dequeue invoked\n");
956}
957
958void simple_gpu_kfmlp_notify_acquired(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
959{
960 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
961 int replica = kfmlp_get_idx(sem, fq);
962 int gpu = replica_to_gpu(aff, replica);
963
964// TRACE_CUR("Simple GPU KFMLP notify_acquired invoked\n");
965
966	// count the number of resource holders
967 ++(*(aff->q_info[replica].nr_cur_users));
968
969 reg_nv_device(gpu, 1, t); // register
970}
971
972void simple_gpu_kfmlp_notify_freed(struct kfmlp_affinity* aff, struct kfmlp_queue* fq, struct task_struct* t)
973{
974 struct kfmlp_semaphore *sem = kfmlp_from_lock(aff->obs.lock);
975 int replica = kfmlp_get_idx(sem, fq);
976 int gpu = replica_to_gpu(aff, replica);
977
978// TRACE_CUR("Simple GPU KFMLP notify_freed invoked\n");
979	// count the number of resource holders
980 --(*(aff->q_info[replica].nr_cur_users));
981
982 reg_nv_device(gpu, 0, t); // unregister
983}
984
985struct kfmlp_affinity_ops simple_gpu_kfmlp_affinity =
986{
987 .advise_enqueue = simple_gpu_kfmlp_advise_enqueue,
988 .advise_steal = simple_gpu_kfmlp_advise_steal,
989 .notify_enqueue = simple_gpu_kfmlp_notify_enqueue,
990 .notify_dequeue = simple_gpu_kfmlp_notify_dequeue,
991 .notify_acquired = simple_gpu_kfmlp_notify_acquired,
992 .notify_freed = simple_gpu_kfmlp_notify_freed,
993 .replica_to_resource = gpu_replica_to_resource,
994};
995
996struct affinity_observer* kfmlp_simple_gpu_aff_obs_new(struct affinity_observer_ops* ops,
997 void* __user args)
998{
999 return kfmlp_aff_obs_new(ops, &simple_gpu_kfmlp_affinity, args);
1000}
1001
1002#endif
1003
diff --git a/litmus/litmus.c b/litmus/litmus.c
index dc94be71bfb6..2911e7ec7029 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -23,6 +23,14 @@
23#include <litmus/affinity.h> 23#include <litmus/affinity.h>
24#endif 24#endif
25 25
26#ifdef CONFIG_LITMUS_NVIDIA
27#include <litmus/nvidia_info.h>
28#endif
29
30#ifdef CONFIG_REALTIME_AUX_TASKS
31#include <litmus/aux_tasks.h>
32#endif
33
26/* Number of RT tasks that exist in the system */ 34/* Number of RT tasks that exist in the system */
27atomic_t rt_task_count = ATOMIC_INIT(0); 35atomic_t rt_task_count = ATOMIC_INIT(0);
28 36
@@ -135,6 +143,16 @@ asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
135 pid, tp.budget_policy); 143 pid, tp.budget_policy);
136 goto out_unlock; 144 goto out_unlock;
137 } 145 }
146 if (tp.budget_signal_policy != NO_SIGNALS &&
147 tp.budget_signal_policy != QUANTUM_SIGNALS &&
148 tp.budget_signal_policy != PRECISE_SIGNALS)
149 {
150 printk(KERN_INFO "litmus: real-time task %d rejected "
151 "because unsupported budget signalling policy "
152 "specified (%d)\n",
153 pid, tp.budget_signal_policy);
154 goto out_unlock;
155 }
138 156
139 target->rt_param.task_params = tp; 157 target->rt_param.task_params = tp;
140 158
@@ -272,6 +290,7 @@ asmlinkage long sys_query_job_no(unsigned int __user *job)
272 return retval; 290 return retval;
273} 291}
274 292
293
275/* sys_null_call() is only used for determining raw system call 294/* sys_null_call() is only used for determining raw system call
276 * overheads (kernel entry, kernel exit). It has no useful side effects. 295 * overheads (kernel entry, kernel exit). It has no useful side effects.
277 * If ts is non-NULL, then the current Feather-Trace time is recorded. 296 * If ts is non-NULL, then the current Feather-Trace time is recorded.
@@ -289,12 +308,117 @@ asmlinkage long sys_null_call(cycles_t __user *ts)
289 return ret; 308 return ret;
290} 309}
291 310
311
312asmlinkage long sys_sched_trace_event(int event, struct st_inject_args __user *__args)
313{
314 long retval = 0;
315 struct task_struct* t = current;
316
317 struct st_inject_args args;
318
319 if (is_realtime(t)) {
320 printk(KERN_WARNING "Only non-real-time tasks may inject sched_trace events.\n");
321 retval = -EINVAL;
322 goto out;
323 }
324
325 if (__args && copy_from_user(&args, __args, sizeof(args))) {
326 retval = -EFAULT;
327 goto out;
328 }
329
330 switch(event) {
331 /*************************************/
332 /* events that don't need parameters */
333 /*************************************/
334 case ST_INJECT_NAME:
335 sched_trace_task_name(t);
336 break;
337 case ST_INJECT_PARAM:
338 /* presumes sporadic_task_ns() has already been called
339 * and valid data has been initialized even if the calling
340 * task is SCHED_NORMAL. */
341 sched_trace_task_param(t);
342 break;
343
344 /*******************************/
345 /* events that need parameters */
346 /*******************************/
347 case ST_INJECT_COMPLETION:
348 if (!__args) {
349 retval = -EINVAL;
350 goto out;
351 }
352
353 /* slam in the data */
354 t->rt_param.job_params.job_no = args.job_no;
355
356 sched_trace_task_completion(t, 0);
357 break;
358 case ST_INJECT_RELEASE:
359 if (!__args) {
360 retval = -EINVAL;
361 goto out;
362 }
363
364 /* slam in the data */
365 tsk_rt(t)->job_params.release = args.release;
366 tsk_rt(t)->job_params.deadline = args.deadline;
367
368 sched_trace_task_release(t);
369 break;
370
371 /**********************/
372 /* unsupported events */
373 /**********************/
374 default:
375 retval = -EINVAL;
376 break;
377 }
378
379out:
380 return retval;
381}
382
383
384#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_AFFINITY_LOCKING)
385void init_gpu_affinity_state(struct task_struct* p)
386{
387 // under-damped
388 //p->rt_param.gpu_fb_param_a = _frac(14008, 10000);
389 //p->rt_param.gpu_fb_param_b = _frac(16024, 10000);
390
391#if 0
392	// empirical
393 p->rt_param.gpu_fb_param_a[0] = _frac(7550, 10000);
394 p->rt_param.gpu_fb_param_b[0] = _frac(45800, 10000);
395
396 p->rt_param.gpu_fb_param_a[1] = _frac(8600, 10000);
397 p->rt_param.gpu_fb_param_b[1] = _frac(40000, 10000);
398
399 p->rt_param.gpu_fb_param_a[2] = _frac(6890, 10000);
400 p->rt_param.gpu_fb_param_b[2] = _frac(40000, 10000);
401
402 p->rt_param.gpu_fb_param_a[3] = _frac(7580, 10000);
403 p->rt_param.gpu_fb_param_b[3] = _frac(34590, 10000);
404#endif
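	/* defaults: no migration in flight and no GPU held yet;
	 * last_gpu == -1 means there is no affinity to preserve */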
405 p->rt_param.gpu_migration = MIG_NONE;
406 p->rt_param.last_gpu = -1;
407}
408#endif
409
292/* p is a real-time task. Re-init its state as a best-effort task. */ 410/* p is a real-time task. Re-init its state as a best-effort task. */
293static void reinit_litmus_state(struct task_struct* p, int restore) 411static void reinit_litmus_state(struct task_struct* p, int restore)
294{ 412{
295 struct rt_task user_config = {}; 413 struct rt_task user_config = {};
296 void* ctrl_page = NULL; 414 void* ctrl_page = NULL;
297 415
416#ifdef CONFIG_LITMUS_NESTED_LOCKING
417 binheap_order_t prio_order = NULL;
418#endif
419
420 TRACE_TASK(p, "reinit_litmus_state: restore = %d\n", restore);
421
298 if (restore) { 422 if (restore) {
299	/* Save user-space provided configuration data. 423	/* Save user-space provided configuration data.
300	 * and allocated page. */ 424	 * and allocated page. */
@@ -302,48 +426,57 @@ static void reinit_litmus_state(struct task_struct* p, int restore)
302 ctrl_page = p->rt_param.ctrl_page; 426 ctrl_page = p->rt_param.ctrl_page;
303 } 427 }
304 428
429#ifdef CONFIG_LITMUS_NVIDIA
430 WARN_ON(p->rt_param.held_gpus != 0);
431#endif
432
433#ifdef CONFIG_LITMUS_LOCKING
305 /* We probably should not be inheriting any task's priority 434 /* We probably should not be inheriting any task's priority
306 * at this point in time. 435 * at this point in time.
307 */ 436 */
308 WARN_ON(p->rt_param.inh_task); 437 WARN_ON(p->rt_param.inh_task);
438#endif
439
440#ifdef CONFIG_LITMUS_NESTED_LOCKING
441 prio_order = p->rt_param.hp_blocked_tasks.compare;
442#endif
309 443
310 /* Cleanup everything else. */ 444 /* Cleanup everything else. */
311 memset(&p->rt_param, 0, sizeof(p->rt_param)); 445 memset(&p->rt_param, 0, sizeof(p->rt_param));
312 446
447#ifdef CONFIG_REALTIME_AUX_TASKS
448 /* also clear out the aux_data. the !restore case is only called on
449 * fork (initial thread creation). */
450 if (!restore) {
451 memset(&p->aux_data, 0, sizeof(p->aux_data));
452 }
453#endif
454
313 /* Restore preserved fields. */ 455 /* Restore preserved fields. */
314 if (restore) { 456 if (restore) {
315 p->rt_param.task_params = user_config; 457 p->rt_param.task_params = user_config;
316 p->rt_param.ctrl_page = ctrl_page; 458 p->rt_param.ctrl_page = ctrl_page;
317 } 459 }
318}
319 460
320long litmus_admit_task(struct task_struct* tsk) 461#ifdef CONFIG_LITMUS_NVIDIA
321{ 462 INIT_BINHEAP_NODE(&p->rt_param.gpu_owner_node);
322 long retval = 0; 463#endif
323 464
324 BUG_ON(is_realtime(tsk)); 465#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_AFFINITY_LOCKING)
466 init_gpu_affinity_state(p);
467#endif
325 468
326 tsk_rt(tsk)->heap_node = NULL; 469#ifdef CONFIG_LITMUS_NESTED_LOCKING
327 tsk_rt(tsk)->rel_heap = NULL; 470 INIT_BINHEAP_HANDLE(&p->rt_param.hp_blocked_tasks, prio_order);
471 raw_spin_lock_init(&p->rt_param.hp_blocked_tasks_lock);
472#endif
473}
328 474
329 if (get_rt_relative_deadline(tsk) == 0 ||
330 get_exec_cost(tsk) >
331 min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
332 TRACE_TASK(tsk,
333 "litmus admit: invalid task parameters "
334 "(e = %lu, p = %lu, d = %lu)\n",
335 get_exec_cost(tsk), get_rt_period(tsk),
336 get_rt_relative_deadline(tsk));
337 retval = -EINVAL;
338 goto out;
339 }
340 475
341 if (!cpu_online(get_partition(tsk))) { 476
342 TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n", 477long __litmus_admit_task(struct task_struct* tsk)
343 get_partition(tsk)); 478{
344 retval = -EINVAL; 479 long retval = 0;
345 goto out;
346 }
347 480
348 INIT_LIST_HEAD(&tsk_rt(tsk)->list); 481 INIT_LIST_HEAD(&tsk_rt(tsk)->list);
349 482
@@ -360,6 +493,17 @@ long litmus_admit_task(struct task_struct* tsk)
360 bheap_node_init(&tsk_rt(tsk)->heap_node, tsk); 493 bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
361 } 494 }
362 495
496#ifdef CONFIG_LITMUS_NVIDIA
497 atomic_set(&tsk_rt(tsk)->nv_int_count, 0);
498#endif
499#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_AFFINITY_LOCKING)
500 init_gpu_affinity_state(tsk);
501#endif
502#ifdef CONFIG_LITMUS_NESTED_LOCKING
503 tsk_rt(tsk)->blocked_lock = NULL;
504 raw_spin_lock_init(&tsk_rt(tsk)->hp_blocked_tasks_lock);
505#endif
506
363 preempt_disable(); 507 preempt_disable();
364 508
365 retval = litmus->admit_task(tsk); 509 retval = litmus->admit_task(tsk);
@@ -372,14 +516,56 @@ long litmus_admit_task(struct task_struct* tsk)
372 516
373 preempt_enable(); 517 preempt_enable();
374 518
375out:
376 if (retval) { 519 if (retval) {
377 bheap_node_free(tsk_rt(tsk)->heap_node); 520 bheap_node_free(tsk_rt(tsk)->heap_node);
378 release_heap_free(tsk_rt(tsk)->rel_heap); 521 release_heap_free(tsk_rt(tsk)->rel_heap);
379 } 522 }
523
524out:
525 return retval;
526}
527
528long litmus_admit_task(struct task_struct* tsk)
529{
530 long retval = 0;
531
532 BUG_ON(is_realtime(tsk));
533
534 if (get_rt_relative_deadline(tsk) == 0 ||
535 get_exec_cost(tsk) >
536 min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
537 TRACE_TASK(tsk,
538 "litmus admit: invalid task parameters "
539 "(e = %lu, p = %lu, d = %lu)\n",
540 get_exec_cost(tsk), get_rt_period(tsk),
541 get_rt_relative_deadline(tsk));
542 retval = -EINVAL;
543 goto out;
544 }
545
546 if (!cpu_online(get_partition(tsk))) {
547 TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
548 get_partition(tsk));
549 retval = -EINVAL;
550 goto out;
551 }
552
553 retval = __litmus_admit_task(tsk);
554
555out:
380 return retval; 556 return retval;
381} 557}
382 558
559void litmus_pre_exit_task(struct task_struct* tsk)
560{
561 if (is_realtime(tsk)) {
562 if (tsk_rt(tsk)->rsrc_exit_cb) {
563 int ret = tsk_rt(tsk)->rsrc_exit_cb(tsk);
564 WARN_ON(ret != 0);
565 }
566 }
567}
568
383void litmus_exit_task(struct task_struct* tsk) 569void litmus_exit_task(struct task_struct* tsk)
384{ 570{
385 if (is_realtime(tsk)) { 571 if (is_realtime(tsk)) {
@@ -388,7 +574,7 @@ void litmus_exit_task(struct task_struct* tsk)
388 litmus->task_exit(tsk); 574 litmus->task_exit(tsk);
389 575
390 BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node)); 576 BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
391 bheap_node_free(tsk_rt(tsk)->heap_node); 577 bheap_node_free(tsk_rt(tsk)->heap_node);
392 release_heap_free(tsk_rt(tsk)->rel_heap); 578 release_heap_free(tsk_rt(tsk)->rel_heap);
393 579
394 atomic_dec(&rt_task_count); 580 atomic_dec(&rt_task_count);
@@ -406,14 +592,19 @@ static int do_plugin_switch(void *_plugin)
406 ret = litmus->deactivate_plugin(); 592 ret = litmus->deactivate_plugin();
407 if (0 != ret) 593 if (0 != ret)
408 goto out; 594 goto out;
409 ret = plugin->activate_plugin(); 595
596 litmus = plugin; /* optimistic switch */
597 mb();
598
599 ret = litmus->activate_plugin();
410 if (0 != ret) { 600 if (0 != ret) {
411 printk(KERN_INFO "Can't activate %s (%d).\n", 601 printk(KERN_INFO "Can't activate %s (%d).\n",
412 plugin->plugin_name, ret); 602 litmus->plugin_name, ret);
413 plugin = &linux_sched_plugin; 603 litmus = &linux_sched_plugin; /* fail to Linux */
604 ret = litmus->activate_plugin();
605 BUG_ON(ret);
414 } 606 }
415 printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name); 607 printk(KERN_INFO "Switched to LITMUS^RT plugin %s.\n", litmus->plugin_name);
416 litmus = plugin;
417 } else 608 } else
418 ret = -EBUSY; 609 ret = -EBUSY;
419out: 610out:
@@ -429,6 +620,12 @@ int switch_sched_plugin(struct sched_plugin* plugin)
429{ 620{
430 BUG_ON(!plugin); 621 BUG_ON(!plugin);
431 622
623#ifdef CONFIG_LITMUS_SOFTIRQD
624 if (!klmirqd_is_dead()) {
625 kill_klmirqd();
626 }
627#endif
628
432 if (atomic_read(&rt_task_count) == 0) 629 if (atomic_read(&rt_task_count) == 0)
433 return stop_machine(do_plugin_switch, plugin, NULL); 630 return stop_machine(do_plugin_switch, plugin, NULL);
434 else 631 else
@@ -441,18 +638,33 @@ int switch_sched_plugin(struct sched_plugin* plugin)
441void litmus_fork(struct task_struct* p) 638void litmus_fork(struct task_struct* p)
442{ 639{
443 if (is_realtime(p)) { 640 if (is_realtime(p)) {
641 TRACE_TASK(p, "fork, is real-time\n");
642
444 /* clean out any litmus related state, don't preserve anything */ 643 /* clean out any litmus related state, don't preserve anything */
445 reinit_litmus_state(p, 0); 644 reinit_litmus_state(p, 0);
645
446 /* Don't let the child be a real-time task. */ 646 /* Don't let the child be a real-time task. */
447 p->sched_reset_on_fork = 1; 647 p->sched_reset_on_fork = 1;
448 } else 648
649 } else {
449 /* non-rt tasks might have ctrl_page set */ 650 /* non-rt tasks might have ctrl_page set */
450 tsk_rt(p)->ctrl_page = NULL; 651 tsk_rt(p)->ctrl_page = NULL;
451 652
653 reinit_litmus_state(p, 0);
654 }
655
452 /* od tables are never inherited across a fork */ 656 /* od tables are never inherited across a fork */
453 p->od_table = NULL; 657 p->od_table = NULL;
454} 658}
455 659
660/* Called right before copy_process() returns a forked thread. */
661void litmus_post_fork_thread(struct task_struct* p)
662{
663#ifdef CONFIG_REALTIME_AUX_TASKS
664 make_aux_task_if_required(p);
665#endif
666}
667
456/* Called upon execve(). 668/* Called upon execve().
457 * current is doing the exec. 669 * current is doing the exec.
458 * Don't let address space specific stuff leak. 670 * Don't let address space specific stuff leak.
@@ -486,8 +698,10 @@ void exit_litmus(struct task_struct *dead_tsk)
486 } 698 }
487 699
488 /* main cleanup only for RT tasks */ 700 /* main cleanup only for RT tasks */
489 if (is_realtime(dead_tsk)) 701 if (is_realtime(dead_tsk)) {
702 litmus_pre_exit_task(dead_tsk); /* todo: double check that no Linux rq lock is held */
490 litmus_exit_task(dead_tsk); 703 litmus_exit_task(dead_tsk);
704 }
491} 705}
492 706
493 707
diff --git a/litmus/litmus_pai_softirq.c b/litmus/litmus_pai_softirq.c
new file mode 100644
index 000000000000..300571a81bbd
--- /dev/null
+++ b/litmus/litmus_pai_softirq.c
@@ -0,0 +1,64 @@
1#include <linux/interrupt.h>
2#include <linux/percpu.h>
3#include <linux/cpu.h>
4#include <linux/kthread.h>
5#include <linux/ftrace.h>
6#include <linux/smp.h>
7#include <linux/slab.h>
8#include <linux/mutex.h>
9
10#include <linux/sched.h>
11#include <linux/cpuset.h>
12
13#include <litmus/litmus.h>
14#include <litmus/sched_trace.h>
15#include <litmus/jobs.h>
16#include <litmus/sched_plugin.h>
17#include <litmus/litmus_softirq.h>
18
19
20
21int __litmus_tasklet_schedule(struct tasklet_struct *t, unsigned int k_id)
22{
23 int ret = 0; /* assume failure */
24 if(unlikely((t->owner == NULL) || !is_realtime(t->owner)))
25 {
26 TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__);
27 BUG();
28 }
29
30 ret = litmus->enqueue_pai_tasklet(t);
31
32 return(ret);
33}
34
35EXPORT_SYMBOL(__litmus_tasklet_schedule);
36
37
38
39// failure causes default Linux handling.
40int __litmus_tasklet_hi_schedule(struct tasklet_struct *t, unsigned int k_id)
41{
42 int ret = 0; /* assume failure */
43 return(ret);
44}
45EXPORT_SYMBOL(__litmus_tasklet_hi_schedule);
46
47
48// failure causes default Linux handling.
49int __litmus_tasklet_hi_schedule_first(struct tasklet_struct *t, unsigned int k_id)
50{
51 int ret = 0; /* assume failure */
52 return(ret);
53}
54EXPORT_SYMBOL(__litmus_tasklet_hi_schedule_first);
55
56
57// failure causes default Linux handling.
58int __litmus_schedule_work(struct work_struct *w, unsigned int k_id)
59{
60 int ret = 0; /* assume failure */
61 return(ret);
62}
63EXPORT_SYMBOL(__litmus_schedule_work);
64
diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c
index 4bf725a36c9c..136fecfb0b8b 100644
--- a/litmus/litmus_proc.c
+++ b/litmus/litmus_proc.c
@@ -20,11 +20,18 @@ static struct proc_dir_entry *litmus_dir = NULL,
20#ifdef CONFIG_RELEASE_MASTER 20#ifdef CONFIG_RELEASE_MASTER
21 *release_master_file = NULL, 21 *release_master_file = NULL,
22#endif 22#endif
23#ifdef CONFIG_LITMUS_SOFTIRQD
24 *klmirqd_file = NULL,
25#endif
23 *plugs_file = NULL; 26 *plugs_file = NULL;
24 27
25/* in litmus/sync.c */ 28/* in litmus/sync.c */
26int count_tasks_waiting_for_release(void); 29int count_tasks_waiting_for_release(void);
27 30
31extern int proc_read_klmirqd_stats(char *page, char **start,
32 off_t off, int count,
33 int *eof, void *data);
34
28static int proc_read_stats(char *page, char **start, 35static int proc_read_stats(char *page, char **start,
29 off_t off, int count, 36 off_t off, int count,
30 int *eof, void *data) 37 int *eof, void *data)
@@ -161,6 +168,12 @@ int __init init_litmus_proc(void)
161 release_master_file->write_proc = proc_write_release_master; 168 release_master_file->write_proc = proc_write_release_master;
162#endif 169#endif
163 170
171#ifdef CONFIG_LITMUS_SOFTIRQD
172 klmirqd_file =
173 create_proc_read_entry("klmirqd_stats", 0444, litmus_dir,
174 proc_read_klmirqd_stats, NULL);
175#endif
176
164 stat_file = create_proc_read_entry("stats", 0444, litmus_dir, 177 stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
165 proc_read_stats, NULL); 178 proc_read_stats, NULL);
166 179
@@ -187,6 +200,10 @@ void exit_litmus_proc(void)
187 remove_proc_entry("stats", litmus_dir); 200 remove_proc_entry("stats", litmus_dir);
188 if (curr_file) 201 if (curr_file)
189 remove_proc_entry("active_plugin", litmus_dir); 202 remove_proc_entry("active_plugin", litmus_dir);
203#ifdef CONFIG_LITMUS_SOFTIRQD
204 if (klmirqd_file)
205 remove_proc_entry("klmirqd_stats", litmus_dir);
206#endif
190#ifdef CONFIG_RELEASE_MASTER 207#ifdef CONFIG_RELEASE_MASTER
191 if (release_master_file) 208 if (release_master_file)
192 remove_proc_entry("release_master", litmus_dir); 209 remove_proc_entry("release_master", litmus_dir);
diff --git a/litmus/litmus_softirq.c b/litmus/litmus_softirq.c
new file mode 100644
index 000000000000..464a78d780ad
--- /dev/null
+++ b/litmus/litmus_softirq.c
@@ -0,0 +1,1205 @@
1#include <linux/interrupt.h>
2#include <linux/percpu.h>
3#include <linux/cpu.h>
4#include <linux/kthread.h>
5#include <linux/ftrace.h>
6#include <linux/smp.h>
7#include <linux/slab.h>
8#include <linux/mutex.h>
9
10#include <linux/sched.h>
11#include <linux/cpuset.h>
12
13#include <litmus/litmus.h>
14#include <litmus/sched_trace.h>
15#include <litmus/jobs.h>
16#include <litmus/sched_plugin.h>
17#include <litmus/litmus_softirq.h>
18
19/* TODO: Remove unneeded mb() and other barriers. */
20
21enum pending_flags
22{
23 LIT_TASKLET_LOW = 0x1,
24 LIT_TASKLET_HI = LIT_TASKLET_LOW<<1,
25 LIT_WORK = LIT_TASKLET_HI<<1
26};
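/* these bits are OR'd into klmirqd_info::pending to mark which of a thread's
 * queues (low/hi tasklets, work items) currently hold work */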
27
28struct klmirqd_registration
29{
30 raw_spinlock_t lock;
31 u32 nr_threads;
32 unsigned int initialized:1;
33 unsigned int shuttingdown:1;
34 struct list_head threads;
35};
36
37static atomic_t klmirqd_id_gen = ATOMIC_INIT(-1);
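/* starts at -1 so that the first atomic_inc_return() hands out id 0 */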
38
39static struct klmirqd_registration klmirqd_state;
40
41
42
43void init_klmirqd(void)
44{
45 raw_spin_lock_init(&klmirqd_state.lock);
46
47 klmirqd_state.nr_threads = 0;
48 klmirqd_state.initialized = 1;
49 klmirqd_state.shuttingdown = 0;
50 INIT_LIST_HEAD(&klmirqd_state.threads);
51}
52
53static int __klmirqd_is_ready(void)
54{
55 return (klmirqd_state.initialized == 1 && klmirqd_state.shuttingdown == 0);
56}
57
58int klmirqd_is_ready(void)
59{
60 unsigned long flags;
61 int ret;
62
63 raw_spin_lock_irqsave(&klmirqd_state.lock, flags);
64 ret = __klmirqd_is_ready();
65 raw_spin_unlock_irqrestore(&klmirqd_state.lock, flags);
66
67 return ret;
68}
69
70int klmirqd_is_dead(void)
71{
72 return(!klmirqd_is_ready());
73}
74
75
76void kill_klmirqd(void)
77{
78 if(!klmirqd_is_dead())
79 {
80 unsigned long flags;
81 struct list_head *pos;
82 struct list_head *q;
83
84 raw_spin_lock_irqsave(&klmirqd_state.lock, flags);
85
86 TRACE("%s: Killing all klmirqd threads! (%d of them)\n", __FUNCTION__, klmirqd_state.nr_threads);
87
88 klmirqd_state.shuttingdown = 1;
89
90 list_for_each_safe(pos, q, &klmirqd_state.threads) {
91 struct klmirqd_info* info = list_entry(pos, struct klmirqd_info, klmirqd_reg);
92
93 if(info->terminating != 1)
94 {
95 info->terminating = 1;
96 mb(); /* just to be sure? */
97 flush_pending(info->klmirqd);
98
99 /* signal termination */
100 raw_spin_unlock_irqrestore(&klmirqd_state.lock, flags);
101 kthread_stop(info->klmirqd);
102 raw_spin_lock_irqsave(&klmirqd_state.lock, flags);
103 }
104 }
105
106 raw_spin_unlock_irqrestore(&klmirqd_state.lock, flags);
107 }
108}
109
110
111
112void kill_klmirqd_thread(struct task_struct* klmirqd_thread)
113{
114 unsigned long flags;
115 struct klmirqd_info* info;
116
117 if (!tsk_rt(klmirqd_thread)->is_interrupt_thread) {
118 TRACE("%s/%d is not a klmirqd thread\n", klmirqd_thread->comm, klmirqd_thread->pid);
119 return;
120 }
121
122 TRACE("%s: Killing klmirqd thread %s/%d\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
123
124 raw_spin_lock_irqsave(&klmirqd_state.lock, flags);
125
126 info = tsk_rt(klmirqd_thread)->klmirqd_info;
127
128 if(info->terminating != 1) {
129 info->terminating = 1;
130 mb();
131
132 flush_pending(klmirqd_thread);
133 kthread_stop(klmirqd_thread);
134 }
135
136 raw_spin_unlock_irqrestore(&klmirqd_state.lock, flags);
137}
138
139struct klmirqd_launch_data
140{
141 int cpu_affinity;
142 klmirqd_callback_t* cb;
143 char name[MAX_KLMIRQD_NAME_LEN+1];
144 struct work_struct work;
145};
146
147static int run_klmirqd(void* callback);
148
149
150/* executed by a kworker from workqueues */
151static void __launch_klmirqd_thread(struct work_struct *work)
152{
153 int id;
154 struct task_struct* thread = NULL;
155 struct klmirqd_launch_data* launch_data =
156 container_of(work, struct klmirqd_launch_data, work);
157
158 TRACE("Creating klmirqd thread\n");
159
160
161
162 if (launch_data->cpu_affinity != -1) {
163 if (launch_data->name[0] == '\0') {
164 id = atomic_inc_return(&klmirqd_id_gen);
165 TRACE("Launching klmirqd_th%d/%d\n", id, launch_data->cpu_affinity);
166
167 thread = kthread_create(
168 run_klmirqd,
169	/* the callback is passed as the kthread's data argument; run_klmirqd() casts it back */
170 (void*)launch_data->cb,
171 "klmirqd_th%d/%d",
172 id,
173 launch_data->cpu_affinity);
174 }
175 else {
176 TRACE("Launching %s/%d\n", launch_data->name, launch_data->cpu_affinity);
177
178 thread = kthread_create(
179 run_klmirqd,
180	/* the callback is passed as the kthread's data argument; run_klmirqd() casts it back */
181 (void*)launch_data->cb,
182 "%s/%d",
183 launch_data->name,
184 launch_data->cpu_affinity);
185 }
186
187	/* litmus will put us in the right cluster. */
188 kthread_bind(thread, launch_data->cpu_affinity);
189 }
190 else {
191 if (launch_data->name[0] == '\0') {
192 id = atomic_inc_return(&klmirqd_id_gen);
193 TRACE("Launching klmirqd_th%d\n", id);
194
195 thread = kthread_create(
196 run_klmirqd,
197	/* the callback is passed as the kthread's data argument; run_klmirqd() casts it back */
198 (void*)launch_data->cb,
199 "klmirqd_th%d",
200 id);
201
202 }
203 else {
204 TRACE("Launching %s\n", launch_data->name);
205
206 thread = kthread_create(
207 run_klmirqd,
208	/* the callback is passed as the kthread's data argument; run_klmirqd() casts it back */
209 (void*)launch_data->cb,
210 launch_data->name);
211 }
212
213
214 }
215
216 if (thread) {
217 wake_up_process(thread);
218 }
219 else {
220 TRACE("Could not create thread!\n");
221 }
222
223 kfree(launch_data);
224}
225
226
227int launch_klmirqd_thread(char* name, int cpu, klmirqd_callback_t* cb)
228{
229 struct klmirqd_launch_data* delayed_launch;
230
231 if (!klmirqd_is_ready()) {
232 TRACE("klmirqd is not ready. Check that it was initialized!\n");
233 return -1;
234 }
235
236	/* hand thread creation off to a workqueue: we can't make scheduling
237	   calls here since we may be in an atomic context. */
238	delayed_launch = kmalloc(sizeof(struct klmirqd_launch_data), GFP_ATOMIC);
	if (!delayed_launch)
		return -ENOMEM; /* GFP_ATOMIC allocations can fail */
239 delayed_launch->cpu_affinity = cpu;
240 delayed_launch->cb = cb;
241 INIT_WORK(&delayed_launch->work, __launch_klmirqd_thread);
242
243 if(name) {
244 snprintf(delayed_launch->name, MAX_KLMIRQD_NAME_LEN+1, "%s", name);
245 }
246 else {
247 delayed_launch->name[0] = '\0';
248 }
249
250 schedule_work(&delayed_launch->work);
251
252 return 0;
253}
254
255
256
257
258#define KLMIRQD_SLICE_NR_JIFFIES 1
259#define KLMIRQD_SLICE_NS ((NSEC_PER_SEC / HZ) * KLMIRQD_SLICE_NR_JIFFIES)
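/* one scheduler tick, expressed in nanoseconds; used below as the dummy
 * period/budget for klmirqd daemon threads admitted to LITMUS^RT */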
260
261static int become_litmus_daemon(struct task_struct* tsk)
262{
263 int ret = 0;
264
265 struct rt_task tp = {
266		.period = KLMIRQD_SLICE_NS, /* dummy one-jiffy period */
267 .relative_deadline = KLMIRQD_SLICE_NS,
268 .exec_cost = KLMIRQD_SLICE_NS,
269 .phase = 0,
270 .cpu = task_cpu(current),
271 .budget_policy = NO_ENFORCEMENT,
272 .budget_signal_policy = NO_SIGNALS,
273 .cls = RT_CLASS_BEST_EFFORT
274 };
275
276 struct sched_param param = { .sched_priority = 0};
277
278 TRACE_CUR("Setting %s/%d as daemon thread.\n", tsk->comm, tsk->pid);
279
280 /* set task params */
281 tsk_rt(tsk)->task_params = tp;
282 tsk_rt(tsk)->is_interrupt_thread = 1;
283
284 /* inform the OS we're SCHED_LITMUS --
285 sched_setscheduler_nocheck() calls litmus_admit_task(). */
286 sched_setscheduler_nocheck(tsk, SCHED_LITMUS, &param);
287
288 return ret;
289}
290
291static int become_normal_daemon(struct task_struct* tsk)
292{
293 int ret = 0;
294
295 struct sched_param param = { .sched_priority = 0};
296 sched_setscheduler_nocheck(tsk, SCHED_NORMAL, &param);
297
298 return ret;
299}
300
301static int register_klmirqd(struct task_struct* tsk)
302{
303 int retval = 0;
304 unsigned long flags;
305 struct klmirqd_info *info = NULL;
306
307 if (!tsk_rt(tsk)->is_interrupt_thread) {
308 TRACE("Only proxy threads already running in Litmus may become klmirqd threads!\n");
309 WARN_ON(1);
310 retval = -1;
311 goto out;
312 }
313
314 raw_spin_lock_irqsave(&klmirqd_state.lock, flags);
315
316 if (!__klmirqd_is_ready()) {
317 TRACE("klmirqd is not ready! Did you forget to initialize it?\n");
318 WARN_ON(1);
319 retval = -1;
320 goto out_unlock;
321 }
322
323 /* allocate and initialize klmirqd data for the thread */
324 info = kmalloc(sizeof(struct klmirqd_info), GFP_KERNEL);
325 if (!info) {
326 TRACE("Failed to allocate klmirqd_info struct!\n");
327 retval = -1; /* todo: pick better code */
328 goto out_unlock;
329 }
330 memset(info, 0, sizeof(struct klmirqd_info));
331 info->klmirqd = tsk;
332 info->pending_tasklets_hi.tail = &info->pending_tasklets_hi.head;
333 info->pending_tasklets.tail = &info->pending_tasklets.head;
334 INIT_LIST_HEAD(&info->worklist);
335 INIT_LIST_HEAD(&info->klmirqd_reg);
336 raw_spin_lock_init(&info->lock);
337
338
339 /* now register with klmirqd */
340 list_add_tail(&info->klmirqd_reg, &klmirqd_state.threads);
341 ++klmirqd_state.nr_threads;
342
343 /* update the task struct to point to klmirqd info */
344 tsk_rt(tsk)->klmirqd_info = info;
345
346out_unlock:
347 raw_spin_unlock_irqrestore(&klmirqd_state.lock, flags);
348
349out:
350 return retval;
351}
352
353static int unregister_klmirqd(struct task_struct* tsk)
354{
355 int retval = 0;
356 unsigned long flags;
357 struct klmirqd_info *info = tsk_rt(tsk)->klmirqd_info;
358
359 if (!tsk_rt(tsk)->is_interrupt_thread || !info) {
360 TRACE("%s/%d is not a klmirqd thread!\n", tsk->comm, tsk->pid);
361 WARN_ON(1);
362 retval = -1;
363 goto out;
364 }
365
366 raw_spin_lock_irqsave(&klmirqd_state.lock, flags);
367
368 /* remove the entry in the klmirqd thread list */
369 list_del(&info->klmirqd_reg);
370 mb();
371 --klmirqd_state.nr_threads;
372
373 /* remove link to klmirqd info from thread */
374 tsk_rt(tsk)->klmirqd_info = NULL;
375
376 /* clean up memory */
377 kfree(info);
378
379 raw_spin_unlock_irqrestore(&klmirqd_state.lock, flags);
380
381out:
382 return retval;
383}
384
385
386
387
388
389
390int proc_read_klmirqd_stats(char *page, char **start,
391 off_t off, int count,
392 int *eof, void *data)
393{
394 unsigned long flags;
395 int len;
396
397 raw_spin_lock_irqsave(&klmirqd_state.lock, flags);
398
399 if (klmirqd_state.initialized) {
400 if (!klmirqd_state.shuttingdown) {
401 struct list_head *pos;
402
403 len = snprintf(page, PAGE_SIZE,
404 "num ready klmirqds: %d\n\n",
405 klmirqd_state.nr_threads);
406
407 list_for_each(pos, &klmirqd_state.threads) {
408 struct klmirqd_info* info = list_entry(pos, struct klmirqd_info, klmirqd_reg);
409
410 len +=
411 snprintf(page + len - 1, PAGE_SIZE, /* -1 to strip off \0 */
412 "klmirqd_thread: %s/%d\n"
413 "\tcurrent_owner: %s/%d\n"
414 "\tpending: %x\n"
415 "\tnum hi: %d\n"
416 "\tnum low: %d\n"
417 "\tnum work: %d\n\n",
418 info->klmirqd->comm, info->klmirqd->pid,
419 (info->current_owner != NULL) ?
420 info->current_owner->comm : "(null)",
421 (info->current_owner != NULL) ?
422 info->current_owner->pid : 0,
423 info->pending,
424 atomic_read(&info->num_hi_pending),
425 atomic_read(&info->num_low_pending),
426 atomic_read(&info->num_work_pending));
427 }
428 }
429 else {
430 len = snprintf(page, PAGE_SIZE, "klmirqd is shutting down\n");
431 }
432 }
433 else {
434 len = snprintf(page, PAGE_SIZE, "klmirqd is not initialized!\n");
435 }
436
437 raw_spin_unlock_irqrestore(&klmirqd_state.lock, flags);
438
439 return(len);
440}
441
442
443
444
445
446#if 0
447static atomic_t dump_id = ATOMIC_INIT(0);
448
449static void __dump_state(struct klmirqd_info* which, const char* caller)
450{
451 struct tasklet_struct* list;
452
453 int id = atomic_inc_return(&dump_id);
454
455 //if(in_interrupt())
456 {
457 if(which->current_owner)
458 {
459 TRACE("(id: %d caller: %s)\n"
460 "klmirqd: %s/%d\n"
461 "current owner: %s/%d\n"
462 "pending: %x\n",
463 id, caller,
464 which->klmirqd->comm, which->klmirqd->pid,
465 which->current_owner->comm, which->current_owner->pid,
466 which->pending);
467 }
468 else
469 {
470 TRACE("(id: %d caller: %s)\n"
471 "klmirqd: %s/%d\n"
472 "current owner: %p\n"
473 "pending: %x\n",
474 id, caller,
475 which->klmirqd->comm, which->klmirqd->pid,
476 NULL,
477 which->pending);
478 }
479
480 list = which->pending_tasklets.head;
481 while(list)
482 {
483 struct tasklet_struct *t = list;
484 list = list->next; /* advance */
485 if(t->owner)
486 TRACE("(id: %d caller: %s) Tasklet: %x, Owner = %s/%d\n", id, caller, t, t->owner->comm, t->owner->pid);
487 else
488 TRACE("(id: %d caller: %s) Tasklet: %x, Owner = %p\n", id, caller, t, NULL);
489 }
490 }
491}
492
493static void dump_state(struct klmirqd_info* which, const char* caller)
494{
495 unsigned long flags;
496
497 raw_spin_lock_irqsave(&which->lock, flags);
498 __dump_state(which, caller);
499 raw_spin_unlock_irqrestore(&which->lock, flags);
500}
501#endif
502
503
504
505
506
507
508
509
510
511
512
513/* forward declarations */
514static void ___litmus_tasklet_schedule(struct tasklet_struct *t,
515 struct klmirqd_info *which,
516 int wakeup);
517static void ___litmus_tasklet_hi_schedule(struct tasklet_struct *t,
518 struct klmirqd_info *which,
519 int wakeup);
520static void ___litmus_schedule_work(struct work_struct *w,
521 struct klmirqd_info *which,
522 int wakeup);
523
524
525inline static u32 litirq_pending_hi_irqoff(struct klmirqd_info* which)
526{
527 return (which->pending & LIT_TASKLET_HI);
528}
529
530inline static u32 litirq_pending_low_irqoff(struct klmirqd_info* which)
531{
532 return (which->pending & LIT_TASKLET_LOW);
533}
534
535inline static u32 litirq_pending_work_irqoff(struct klmirqd_info* which)
536{
537 return (which->pending & LIT_WORK);
538}
539
540inline static u32 litirq_pending_irqoff(struct klmirqd_info* which)
541{
542 return(which->pending);
543}
544
545
546inline static u32 litirq_pending(struct klmirqd_info* which)
547{
548 unsigned long flags;
549 u32 pending;
550
551 raw_spin_lock_irqsave(&which->lock, flags);
552 pending = litirq_pending_irqoff(which);
553 raw_spin_unlock_irqrestore(&which->lock, flags);
554
555 return pending;
556};
557
558static void wakeup_litirqd_locked(struct klmirqd_info* which)
559{
560 /* Interrupts are disabled: no need to stop preemption */
561 if (which && which->klmirqd)
562 {
563 if(which->klmirqd->state != TASK_RUNNING)
564 {
565 TRACE("%s: Waking up klmirqd: %s/%d\n", __FUNCTION__,
566 which->klmirqd->comm, which->klmirqd->pid);
567
568 wake_up_process(which->klmirqd);
569 }
570 }
571}
572
573
574static void do_lit_tasklet(struct klmirqd_info* which,
575 struct tasklet_head* pending_tasklets)
576{
577 unsigned long flags;
578 struct tasklet_struct *list;
579 atomic_t* count;
580
581 raw_spin_lock_irqsave(&which->lock, flags);
582
583 //__dump_state(which, "do_lit_tasklet: before steal");
584
585 /* copy out the tasklets for our private use. */
586 list = pending_tasklets->head;
587 pending_tasklets->head = NULL;
588 pending_tasklets->tail = &pending_tasklets->head;
589
590 /* remove pending flag */
591 which->pending &= (pending_tasklets == &which->pending_tasklets) ?
592 ~LIT_TASKLET_LOW :
593 ~LIT_TASKLET_HI;
594
595 count = (pending_tasklets == &which->pending_tasklets) ?
596 &which->num_low_pending:
597 &which->num_hi_pending;
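	/* 'count' mirrors how many tasklets of this class are still queued;
	 * it is decremented below as each tasklet is successfully run */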
598
599 //__dump_state(which, "do_lit_tasklet: after steal");
600
601 raw_spin_unlock_irqrestore(&which->lock, flags);
602
603
604 while(list)
605 {
606 struct tasklet_struct *t = list;
607
608 /* advance, lest we forget */
609 list = list->next;
610
611 /* execute tasklet if it has my priority and is free */
612 if (tasklet_trylock(t)) {
613 if (!atomic_read(&t->count)) {
614
615 sched_trace_tasklet_begin(t->owner);
616
617 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
618 {
619 BUG();
620 }
621 TRACE_CUR("%s: Invoking tasklet.\n", __FUNCTION__);
622 t->func(t->data);
623 tasklet_unlock(t);
624
625 atomic_dec(count);
626
627 sched_trace_tasklet_end(t->owner, 0ul);
628
629 continue; /* process more tasklets */
630 }
631 tasklet_unlock(t);
632 }
633
634 TRACE_CUR("%s: Could not invoke tasklet. Requeuing.\n", __FUNCTION__);
635
636 /* couldn't process tasklet. put it back at the end of the queue. */
637 if(pending_tasklets == &which->pending_tasklets)
638 ___litmus_tasklet_schedule(t, which, 0);
639 else
640 ___litmus_tasklet_hi_schedule(t, which, 0);
641 }
642}
643
644
645// Drain this thread's pending HI and LOW tasklet queues. Must be called
646// from the klmirqd thread itself and never from interrupt context.
647static void do_litirq(struct klmirqd_info* which)
648{
649 u32 pending;
650
651 if(in_interrupt())
652 {
653 TRACE("%s: exiting early: in interrupt context!\n", __FUNCTION__);
654 return;
655 }
656
657 if(which->klmirqd != current)
658 {
659 TRACE_CUR("%s: exiting early: thread/info mismatch! Running %s/%d but given %s/%d.\n",
660 __FUNCTION__, current->comm, current->pid,
661 which->klmirqd->comm, which->klmirqd->pid);
662 return;
663 }
664
665 if(!is_realtime(current))
666 {
667 TRACE_CUR("%s: exiting early: klmirqd is not real-time. Sched Policy = %d\n",
668 __FUNCTION__, current->policy);
669 return;
670 }
671
672
673 /* We only handle tasklets & work objects, no need for RCU triggers? */
674
675 pending = litirq_pending(which);
676 if(pending) {
677 /* extract the work to do and do it! */
678 if(pending & LIT_TASKLET_HI) {
679 TRACE_CUR("%s: Invoking HI tasklets.\n", __FUNCTION__);
680 do_lit_tasklet(which, &which->pending_tasklets_hi);
681 }
682
683 if(pending & LIT_TASKLET_LOW) {
684 TRACE_CUR("%s: Invoking LOW tasklets.\n", __FUNCTION__);
685 do_lit_tasklet(which, &which->pending_tasklets);
686 }
687 }
688}
689
690
691static void do_work(struct klmirqd_info* which)
692{
693 unsigned long flags;
694 struct work_struct* work;
695 work_func_t f;
696
697 // only execute one work-queue item to yield to tasklets.
698 // ...is this a good idea, or should we just batch them?
699 raw_spin_lock_irqsave(&which->lock, flags);
700
701 if(!litirq_pending_work_irqoff(which))
702 {
703 raw_spin_unlock_irqrestore(&which->lock, flags);
704 goto no_work;
705 }
706
707 work = list_first_entry(&which->worklist, struct work_struct, entry);
708 list_del_init(&work->entry);
709
710 if(list_empty(&which->worklist))
711 {
712 which->pending &= ~LIT_WORK;
713 }
714
715 raw_spin_unlock_irqrestore(&which->lock, flags);
716
717
718 TRACE_CUR("%s: Invoking work object.\n", __FUNCTION__);
719 // do the work!
720 work_clear_pending(work);
721 f = work->func;
722 f(work); /* can't touch 'work' after this point,
723 the user may have freed it. */
724
725 atomic_dec(&which->num_work_pending);
726
727no_work:
728 return;
729}
730
731
732
733/* main loop for a klmirqd thread */
734static int run_klmirqd(void* callback)
735{
736 int retval = 0;
737 struct klmirqd_info* info = NULL;
738 klmirqd_callback_t* cb = (klmirqd_callback_t*)(callback);
739
740 retval = become_litmus_daemon(current);
741 if (retval != 0) {
742 TRACE_CUR("%s: Failed to transition to rt-task.\n", __FUNCTION__);
743 goto failed;
744 }
745
746 retval = register_klmirqd(current);
747 if (retval != 0) {
748 TRACE_CUR("%s: Failed to become a klmirqd thread.\n", __FUNCTION__);
749 goto failed_sched_normal;
750 }
751
752 if (cb && cb->func) {
753 retval = cb->func(cb->arg);
754 if (retval != 0) {
755 TRACE_CUR("%s: klmirqd callback reported failure. retval = %d\n", __FUNCTION__, retval);
756 goto failed_unregister;
757 }
758 }
759
760 /* enter the interrupt handling workloop */
761
762 info = tsk_rt(current)->klmirqd_info;
763
764 set_current_state(TASK_INTERRUPTIBLE);
765
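	/* standard kthread sleep/wake pattern: we are marked TASK_INTERRUPTIBLE
	 * before each check for pending work, so a wakeup from
	 * wakeup_litirqd_locked() cannot be lost between the check and the call
	 * to schedule() */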
766 while (!kthread_should_stop())
767 {
768 preempt_disable();
769 if (!litirq_pending(info))
770 {
771 /* sleep for work */
772 TRACE_CUR("%s: No more tasklets or work objects. Going to sleep.\n",
773 __FUNCTION__);
774 preempt_enable_no_resched();
775 schedule();
776
777 if(kthread_should_stop()) /* bail out */
778 {
779 TRACE_CUR("%s:%d: Signaled to terminate.\n", __FUNCTION__, __LINE__);
780 continue;
781 }
782
783 preempt_disable();
784 }
785
786 __set_current_state(TASK_RUNNING);
787
788 while (litirq_pending(info))
789 {
790 preempt_enable_no_resched();
791
792 if(kthread_should_stop())
793 {
794 TRACE_CUR("%s:%d: Signaled to terminate.\n", __FUNCTION__, __LINE__);
795 break;
796 }
797
798 preempt_disable();
799
800 /* Double check that there's still pending work and the owner hasn't
801 * changed. Pending items may have been flushed while we were sleeping.
802 */
803 if(litirq_pending(info))
804 {
805 TRACE_CUR("%s: Executing tasklets and/or work objects.\n",
806 __FUNCTION__);
807
808 do_litirq(info);
809
810 preempt_enable_no_resched();
811
812 // work objects are preemptible.
813 do_work(info);
814 }
815 else
816 {
817 TRACE_CUR("%s: Pending work was flushed!\n", __FUNCTION__);
818
819 preempt_enable_no_resched();
820 }
821
822 cond_resched();
823 preempt_disable();
824 }
825 preempt_enable();
826 set_current_state(TASK_INTERRUPTIBLE);
827 }
828 __set_current_state(TASK_RUNNING);
829
830failed_unregister:
831 /* remove our registration from klmirqd */
832 unregister_klmirqd(current);
833
834failed_sched_normal:
835 become_normal_daemon(current);
836
837failed:
838 return retval;
839}
840
841
842void flush_pending(struct task_struct* tsk)
843{
844 unsigned long flags;
845 struct tasklet_struct *list;
846 u32 work_flushed = 0;
847
848 struct klmirqd_info *which;
849
850 if (!tsk_rt(tsk)->is_interrupt_thread) {
851 TRACE("%s/%d is not a proxy thread\n", tsk->comm, tsk->pid);
852 WARN_ON(1);
853 return;
854 }
855
856 which = tsk_rt(tsk)->klmirqd_info;
857 if (!which) {
858 TRACE("%s/%d is not a klmirqd thread!\n", tsk->comm, tsk->pid);
859 WARN_ON(1);
860 return;
861 }
862
863
864 raw_spin_lock_irqsave(&which->lock, flags);
865
866 //__dump_state(which, "flush_pending: before");
867
868 // flush hi tasklets.
869 if(litirq_pending_hi_irqoff(which))
870 {
871 which->pending &= ~LIT_TASKLET_HI;
872
873 list = which->pending_tasklets_hi.head;
874 which->pending_tasklets_hi.head = NULL;
875 which->pending_tasklets_hi.tail = &which->pending_tasklets_hi.head;
876
877 TRACE("%s: Handing HI tasklets back to Linux.\n", __FUNCTION__);
878
879 while(list)
880 {
881 struct tasklet_struct *t = list;
882 list = list->next;
883
884 if(unlikely(!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)))
885 {
886 BUG();
887 }
888
889 work_flushed |= LIT_TASKLET_HI;
890
891 t->owner = NULL;
892
893			// hand the tasklet back to Linux: re-set the SCHED bit (it was just cleared above, so this should never fail) and requeue it with the stock hi-tasklet machinery
894 if(!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
895 {
896 atomic_dec(&which->num_hi_pending);
897 ___tasklet_hi_schedule(t);
898 }
899 else
900 {
901 TRACE("%s: dropped hi tasklet??\n", __FUNCTION__);
902 BUG();
903 }
904
905 }
906 }
907
908 // flush low tasklets.
909 if(litirq_pending_low_irqoff(which))
910 {
911 which->pending &= ~LIT_TASKLET_LOW;
912
913 list = which->pending_tasklets.head;
914 which->pending_tasklets.head = NULL;
915 which->pending_tasklets.tail = &which->pending_tasklets.head;
916
917 TRACE("%s: Handing LOW tasklets back to Linux.\n", __FUNCTION__);
918
919 while(list)
920 {
921 struct tasklet_struct *t = list;
922 list = list->next;
923
924 if(unlikely(!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)))
925 {
926 BUG();
927 }
928
929 work_flushed |= LIT_TASKLET_LOW;
930
931 t->owner = NULL;
932// sched_trace_tasklet_end(owner, 1ul);
933
934 if(!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
935 {
936 atomic_dec(&which->num_low_pending);
937 ___tasklet_schedule(t);
938 }
939 else
940 {
941 TRACE("%s: dropped tasklet??\n", __FUNCTION__);
942 BUG();
943 }
944 }
945 }
946
947 // flush work objects
948 if(litirq_pending_work_irqoff(which))
949 {
950 which->pending &= ~LIT_WORK;
951
952 TRACE("%s: Handing work objects back to Linux.\n", __FUNCTION__);
953
954 while(!list_empty(&which->worklist))
955 {
956 struct work_struct* work =
957 list_first_entry(&which->worklist, struct work_struct, entry);
958 list_del_init(&work->entry);
959
960 work_flushed |= LIT_WORK;
961 atomic_dec(&which->num_work_pending);
962
963 work->owner = NULL;
964// sched_trace_work_end(owner, current, 1ul);
965 __schedule_work(work);
966 }
967 }
968
969 //__dump_state(which, "flush_pending: after (before reeval prio)");
970
971
972 mb(); /* commit changes to pending flags */
973
974 raw_spin_unlock_irqrestore(&which->lock, flags);
975}
976
977
978
979
980static void ___litmus_tasklet_schedule(struct tasklet_struct *t,
981 struct klmirqd_info *which,
982 int wakeup)
983{
984 unsigned long flags;
985 u32 old_pending;
986
987 t->next = NULL;
988
989 raw_spin_lock_irqsave(&which->lock, flags);
990
991 //__dump_state(which, "___litmus_tasklet_schedule: before queuing");
992
993 *(which->pending_tasklets.tail) = t;
994 which->pending_tasklets.tail = &t->next;
995
996 old_pending = which->pending;
997 which->pending |= LIT_TASKLET_LOW;
998
999 atomic_inc(&which->num_low_pending);
1000
1001 mb();
1002
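	/* only wake the thread on a 0 -> non-zero transition of the pending
	 * mask; if work was already pending, the klmirqd thread is either
	 * running or will re-check before sleeping */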
1003 if(!old_pending && wakeup)
1004 {
1005 wakeup_litirqd_locked(which); /* wake up the klmirqd */
1006 }
1007
1008 //__dump_state(which, "___litmus_tasklet_schedule: after queuing");
1009
1010 raw_spin_unlock_irqrestore(&which->lock, flags);
1011}
1012
1013
1014int __litmus_tasklet_schedule(struct tasklet_struct *t, struct task_struct* klmirqd_thread)
1015{
1016 int ret = 0; /* assume failure */
1017 struct klmirqd_info* info;
1018
1019 if (unlikely(!is_realtime(klmirqd_thread) ||
1020 !tsk_rt(klmirqd_thread)->is_interrupt_thread ||
1021 !tsk_rt(klmirqd_thread)->klmirqd_info)) {
1022		TRACE("%s: %s/%d can't handle tasklets\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1023 return ret;
1024 }
1025
1026 info = tsk_rt(klmirqd_thread)->klmirqd_info;
1027
1028 if (likely(!info->terminating)) {
1029 ret = 1;
1030 ___litmus_tasklet_schedule(t, info, 1);
1031 }
1032 else {
1033		TRACE("%s: Tasklet rejected because %s/%d is terminating\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1034 }
1035 return(ret);
1036}
1037
1038EXPORT_SYMBOL(__litmus_tasklet_schedule);
1039
1040
1041static void ___litmus_tasklet_hi_schedule(struct tasklet_struct *t,
1042 struct klmirqd_info *which,
1043 int wakeup)
1044{
1045 unsigned long flags;
1046 u32 old_pending;
1047
1048 t->next = NULL;
1049
1050 raw_spin_lock_irqsave(&which->lock, flags);
1051
1052 *(which->pending_tasklets_hi.tail) = t;
1053 which->pending_tasklets_hi.tail = &t->next;
1054
1055 old_pending = which->pending;
1056 which->pending |= LIT_TASKLET_HI;
1057
1058 atomic_inc(&which->num_hi_pending);
1059
1060 mb();
1061
1062 if(!old_pending && wakeup)
1063 {
1064 wakeup_litirqd_locked(which); /* wake up the klmirqd */
1065 }
1066
1067 raw_spin_unlock_irqrestore(&which->lock, flags);
1068}
1069
1070int __litmus_tasklet_hi_schedule(struct tasklet_struct *t, struct task_struct* klmirqd_thread)
1071{
1072 int ret = 0; /* assume failure */
1073 struct klmirqd_info* info;
1074
1075 if (unlikely(!is_realtime(klmirqd_thread) ||
1076 !tsk_rt(klmirqd_thread)->is_interrupt_thread ||
1077 !tsk_rt(klmirqd_thread)->klmirqd_info)) {
1078		TRACE("%s: %s/%d can't handle tasklets\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1079 return ret;
1080 }
1081
1082 info = tsk_rt(klmirqd_thread)->klmirqd_info;
1083
1084 if (likely(!info->terminating)) {
1085 ret = 1;
1086 ___litmus_tasklet_hi_schedule(t, info, 1);
1087 }
1088 else {
1089		TRACE("%s: Tasklet rejected because %s/%d is terminating\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1090 }
1091
1092 return(ret);
1093}
1094
1095EXPORT_SYMBOL(__litmus_tasklet_hi_schedule);
1096
1097
1098int __litmus_tasklet_hi_schedule_first(struct tasklet_struct *t, struct task_struct* klmirqd_thread)
1099{
1100 int ret = 0; /* assume failure */
1101 u32 old_pending;
1102 struct klmirqd_info* info;
1103
1104 BUG_ON(!irqs_disabled());
1105
1106 if (unlikely(!is_realtime(klmirqd_thread) ||
1107 !tsk_rt(klmirqd_thread)->is_interrupt_thread ||
1108 !tsk_rt(klmirqd_thread)->klmirqd_info)) {
1109		TRACE("%s: %s/%d can't handle tasklets\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1110 return ret;
1111 }
1112
1113 info = tsk_rt(klmirqd_thread)->klmirqd_info;
1114
1115 if (likely(!info->terminating)) {
1116
1117 raw_spin_lock(&info->lock);
1118
1119 ret = 1; // success!
1120
1121 t->next = info->pending_tasklets_hi.head;
1122 info->pending_tasklets_hi.head = t;
1123
1124 old_pending = info->pending;
1125 info->pending |= LIT_TASKLET_HI;
1126
1127 atomic_inc(&info->num_hi_pending);
1128
1129 mb();
1130
1131 if(!old_pending) {
1132 wakeup_litirqd_locked(info); /* wake up the klmirqd */
1133 }
1134
1135 raw_spin_unlock(&info->lock);
1136 }
1137 else {
1138		TRACE("%s: Tasklet rejected because %s/%d is terminating\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1139 }
1140
1141 return(ret);
1142}
1143
1144EXPORT_SYMBOL(__litmus_tasklet_hi_schedule_first);
1145
1146
1147
1148static void ___litmus_schedule_work(struct work_struct *w,
1149 struct klmirqd_info *which,
1150 int wakeup)
1151{
1152 unsigned long flags;
1153 u32 old_pending;
1154
1155 raw_spin_lock_irqsave(&which->lock, flags);
1156
1157 work_pending(w);
1158 list_add_tail(&w->entry, &which->worklist);
1159
1160 old_pending = which->pending;
1161 which->pending |= LIT_WORK;
1162
1163 atomic_inc(&which->num_work_pending);
1164
1165 mb();
1166
1167 if(!old_pending && wakeup)
1168 {
1169 wakeup_litirqd_locked(which); /* wakeup the klmirqd */
1170 }
1171
1172 raw_spin_unlock_irqrestore(&which->lock, flags);
1173}
1174
1175int __litmus_schedule_work(struct work_struct *w, struct task_struct* klmirqd_thread)
1176{
1177 int ret = 1; /* assume success */
1178 struct klmirqd_info* info;
1179
1180 if (unlikely(!is_realtime(klmirqd_thread) ||
1181 !tsk_rt(klmirqd_thread)->is_interrupt_thread ||
1182 !tsk_rt(klmirqd_thread)->klmirqd_info)) {
1183		TRACE("%s: %s/%d can't handle work items\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1184 return ret;
1185 }
1186
1187 info = tsk_rt(klmirqd_thread)->klmirqd_info;
1188
1189
1190 if (likely(!info->terminating)) {
1191 ___litmus_schedule_work(w, info, 1);
1192 }
1193 else {
1194		TRACE("%s: Work rejected because %s/%d is terminating\n", __FUNCTION__, klmirqd_thread->comm, klmirqd_thread->pid);
1195 ret = 0;
1196 }
1197
1198 return(ret);
1199}
1200EXPORT_SYMBOL(__litmus_schedule_work);
1201
1202
1203
1204
1205
diff --git a/litmus/locking.c b/litmus/locking.c
index 43d9aece2e74..c21ec1ae36d7 100644
--- a/litmus/locking.c
+++ b/litmus/locking.c
@@ -8,8 +8,17 @@
8#include <litmus/litmus.h> 8#include <litmus/litmus.h>
9#include <litmus/sched_plugin.h> 9#include <litmus/sched_plugin.h>
10#include <litmus/trace.h> 10#include <litmus/trace.h>
11#include <litmus/litmus.h>
11#include <litmus/wait.h> 12#include <litmus/wait.h>
12 13
14#ifdef CONFIG_LITMUS_DGL_SUPPORT
15#include <linux/uaccess.h>
16#endif
17
18#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
19#include <litmus/gpu_affinity.h>
20#endif
21
13static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg); 22static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
14static int open_generic_lock(struct od_table_entry* entry, void* __user arg); 23static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
15static int close_generic_lock(struct od_table_entry* entry); 24static int close_generic_lock(struct od_table_entry* entry);
@@ -22,6 +31,9 @@ struct fdso_ops generic_lock_ops = {
22 .destroy = destroy_generic_lock 31 .destroy = destroy_generic_lock
23}; 32};
24 33
34static atomic_t lock_id_gen = ATOMIC_INIT(0);
35
36
25static inline bool is_lock(struct od_table_entry* entry) 37static inline bool is_lock(struct od_table_entry* entry)
26{ 38{
27 return entry->class == &generic_lock_ops; 39 return entry->class == &generic_lock_ops;
@@ -39,8 +51,21 @@ static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user ar
39 int err; 51 int err;
40 52
41 err = litmus->allocate_lock(&lock, type, arg); 53 err = litmus->allocate_lock(&lock, type, arg);
42 if (err == 0) 54 if (err == 0) {
55#ifdef CONFIG_LITMUS_NESTED_LOCKING
56 lock->nest.lock = lock;
57 lock->nest.hp_waiter_eff_prio = NULL;
58
59 INIT_BINHEAP_NODE(&lock->nest.hp_binheap_node);
60 if(!lock->nest.hp_waiter_ptr) {
61 TRACE_CUR("BEWARE: hp_waiter_ptr should probably not be NULL in "
62 "most uses. (exception: IKGLP donors)\n");
63 }
64#endif
65 lock->type = type;
66 lock->ident = atomic_inc_return(&lock_id_gen);
43 *obj_ref = lock; 67 *obj_ref = lock;
68 }
44 return err; 69 return err;
45} 70}
46 71
@@ -83,7 +108,8 @@ asmlinkage long sys_litmus_lock(int lock_od)
83 entry = get_entry_for_od(lock_od); 108 entry = get_entry_for_od(lock_od);
84 if (entry && is_lock(entry)) { 109 if (entry && is_lock(entry)) {
85 l = get_lock(entry); 110 l = get_lock(entry);
86 TRACE_CUR("attempts to lock 0x%p\n", l); 111 //TRACE_CUR("attempts to lock 0x%p\n", l);
112 TRACE_CUR("attempts to lock %d\n", l->ident);
87 err = l->ops->lock(l); 113 err = l->ops->lock(l);
88 } 114 }
89 115
@@ -111,7 +137,8 @@ asmlinkage long sys_litmus_unlock(int lock_od)
111 entry = get_entry_for_od(lock_od); 137 entry = get_entry_for_od(lock_od);
112 if (entry && is_lock(entry)) { 138 if (entry && is_lock(entry)) {
113 l = get_lock(entry); 139 l = get_lock(entry);
114 TRACE_CUR("attempts to unlock 0x%p\n", l); 140 //TRACE_CUR("attempts to unlock 0x%p\n", l);
141 TRACE_CUR("attempts to unlock %d\n", l->ident);
115 err = l->ops->unlock(l); 142 err = l->ops->unlock(l);
116 } 143 }
117 144
@@ -138,6 +165,365 @@ struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
138 return(t); 165 return(t);
139} 166}
140 167
168#ifdef CONFIG_LITMUS_NESTED_LOCKING
169
170void print_hp_waiters(struct binheap_node* n, int depth)
171{
172 struct litmus_lock *l;
173 struct nested_info *nest;
174 char padding[81] = " ";
175 struct task_struct *hp = NULL;
176 struct task_struct *hp_eff = NULL;
177 struct task_struct *node_prio = NULL;
178
179
180 if(n == NULL) {
181 TRACE("+-> %p\n", NULL);
182 return;
183 }
184
185 nest = binheap_entry(n, struct nested_info, hp_binheap_node);
186 l = nest->lock;
187
188 if(depth*2 <= 80)
189 padding[depth*2] = '\0';
190
191 if(nest->hp_waiter_ptr && *(nest->hp_waiter_ptr)) {
192 hp = *(nest->hp_waiter_ptr);
193
194 if(tsk_rt(hp)->inh_task) {
195 hp_eff = tsk_rt(hp)->inh_task;
196 }
197 }
198
199 node_prio = nest->hp_waiter_eff_prio;
200
201 TRACE("%s+-> %s/%d [waiter = %s/%d] [waiter's inh = %s/%d] (lock = %d)\n",
202 padding,
203 (node_prio) ? node_prio->comm : "nil",
204 (node_prio) ? node_prio->pid : -1,
205 (hp) ? hp->comm : "nil",
206 (hp) ? hp->pid : -1,
207 (hp_eff) ? hp_eff->comm : "nil",
208 (hp_eff) ? hp_eff->pid : -1,
209 l->ident);
210
211 if(n->left) print_hp_waiters(n->left, depth+1);
212 if(n->right) print_hp_waiters(n->right, depth+1);
213}
214#endif
215
216
217#ifdef CONFIG_LITMUS_DGL_SUPPORT
218
219void select_next_lock(dgl_wait_state_t* dgl_wait /*, struct litmus_lock* prev_lock*/)
220{
221 /*
222 We pick the next lock in reverse order. This causes inheritance propagation
223 from locks received earlier to flow in the same direction as regular nested
224 locking. This might make fine-grain DGL easier in the future.
225 */
226
227 BUG_ON(tsk_rt(dgl_wait->task)->blocked_lock);
228
229 //WARN_ON(dgl_wait->locks[dgl_wait->last_primary] != prev_lock);
230
231 // note reverse order
232 for(dgl_wait->last_primary = dgl_wait->last_primary - 1;
233 dgl_wait->last_primary >= 0;
234 --(dgl_wait->last_primary)){
235 if(!dgl_wait->locks[dgl_wait->last_primary]->ops->is_owner(
236 dgl_wait->locks[dgl_wait->last_primary], dgl_wait->task)) {
237
238 tsk_rt(dgl_wait->task)->blocked_lock =
239 dgl_wait->locks[dgl_wait->last_primary];
240 mb();
241
242 TRACE_CUR("New blocked lock is %d\n",
243 dgl_wait->locks[dgl_wait->last_primary]->ident);
244
245 break;
246 }
247 }
248}
249
250int dgl_wake_up(wait_queue_t *wq_node, unsigned mode, int sync, void *key)
251{
252 // should never be called.
253 BUG();
254 return 1;
255}
256
257void __waitqueue_dgl_remove_first(wait_queue_head_t *wq,
258 dgl_wait_state_t** dgl_wait,
259 struct task_struct **task)
260{
261 wait_queue_t *q;
262
263 *dgl_wait = NULL;
264 *task = NULL;
265
266 if (waitqueue_active(wq)) {
267 q = list_entry(wq->task_list.next,
268 wait_queue_t, task_list);
269
270 if(q->func == dgl_wake_up) {
271 *dgl_wait = (dgl_wait_state_t*) q->private;
272 }
273 else {
274 *task = (struct task_struct*) q->private;
275 }
276
277 __remove_wait_queue(wq, q);
278 }
279}
280
281void init_dgl_waitqueue_entry(wait_queue_t *wq_node, dgl_wait_state_t* dgl_wait)
282{
283 init_waitqueue_entry(wq_node, dgl_wait->task);
284 wq_node->private = dgl_wait;
285 wq_node->func = dgl_wake_up;
286}
287
288
289static long do_litmus_dgl_lock(dgl_wait_state_t *dgl_wait)
290{
291 int i;
292 unsigned long irqflags; //, dummyflags;
293 raw_spinlock_t *dgl_lock = litmus->get_dgl_spinlock(dgl_wait->task);
294
295 BUG_ON(dgl_wait->task != current);
296
297 raw_spin_lock_irqsave(dgl_lock, irqflags);
298
299
300 dgl_wait->nr_remaining = dgl_wait->size;
301
302 TRACE_CUR("Locking DGL with size %d\n", dgl_wait->size);
303
304 // try to acquire each lock. enqueue (non-blocking) if it is unavailable.
305 for(i = 0; i < dgl_wait->size; ++i) {
306 struct litmus_lock *l = dgl_wait->locks[i];
307
308 // dgl_lock() must set task state to TASK_UNINTERRUPTIBLE if task blocks.
309
310 if(l->ops->dgl_lock(l, dgl_wait, &dgl_wait->wq_nodes[i])) {
311 --(dgl_wait->nr_remaining);
312			TRACE_CUR("Acquired lock %d immediately.\n", l->ident);
313 }
314 }
315
316 if(dgl_wait->nr_remaining == 0) {
317		// acquired entire group immediately
318		TRACE_CUR("Acquired all locks in DGL immediately!\n");
319 }
320 else {
321
322 TRACE_CUR("As many as %d locks in DGL are pending. Suspending.\n",
323 dgl_wait->nr_remaining);
324
325#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
326		// KLUDGE: don't count this suspension as time in the
327		// gpu critical section
328 if(tsk_rt(dgl_wait->task)->held_gpus) {
329 tsk_rt(dgl_wait->task)->suspend_gpu_tracker_on_block = 1;
330 }
331#endif
332
333 // note reverse order. see comments in select_next_lock for reason.
334 for(i = dgl_wait->size - 1; i >= 0; --i) {
335 struct litmus_lock *l = dgl_wait->locks[i];
336 if(!l->ops->is_owner(l, dgl_wait->task)) { // double-check to be thread safe
337
338 TRACE_CUR("Activating priority inheritance on lock %d\n",
339 l->ident);
340
341 TS_DGL_LOCK_SUSPEND;
342
343 l->ops->enable_priority(l, dgl_wait);
344 dgl_wait->last_primary = i;
345
346 TRACE_CUR("Suspending for lock %d\n", l->ident);
347
348 raw_spin_unlock_irqrestore(dgl_lock, irqflags); // free dgl_lock before suspending
349
350 schedule(); // suspend!!!
351
352 TS_DGL_LOCK_RESUME;
353
354 TRACE_CUR("Woken up from DGL suspension.\n");
355
356 goto all_acquired; // we should hold all locks when we wake up.
357 }
358 }
359
360 TRACE_CUR("Didn't have to suspend after all, but calling schedule() anyway.\n");
361 //BUG();
362 }
363
364 raw_spin_unlock_irqrestore(dgl_lock, irqflags);
365
366all_acquired:
367
368 // FOR SANITY CHECK FOR TESTING
369// for(i = 0; i < dgl_wait->size; ++i) {
370// struct litmus_lock *l = dgl_wait->locks[i];
371// BUG_ON(!l->ops->is_owner(l, dgl_wait->task));
372// }
373
374 TRACE_CUR("Acquired entire DGL\n");
375
376 return 0;
377}
378
379static int supports_dgl(struct litmus_lock *l)
380{
381 struct litmus_lock_ops* ops = l->ops;
382
383 return (ops->dgl_lock &&
384 ops->is_owner &&
385 ops->enable_priority);
386}
387
388asmlinkage long sys_litmus_dgl_lock(void* __user usr_dgl_ods, int dgl_size)
389{
390 struct task_struct *t = current;
391 long err = -EINVAL;
392 int dgl_ods[MAX_DGL_SIZE];
393 int i;
394
395 dgl_wait_state_t dgl_wait_state; // lives on the stack until all resources in DGL are held.
396
397 if(dgl_size > MAX_DGL_SIZE || dgl_size < 1)
398 goto out;
399
400 if(!access_ok(VERIFY_READ, usr_dgl_ods, dgl_size*(sizeof(int))))
401 goto out;
402
403 if(__copy_from_user(&dgl_ods, usr_dgl_ods, dgl_size*(sizeof(int))))
404 goto out;
405
406 if (!is_realtime(t)) {
407 err = -EPERM;
408 goto out;
409 }
410
411 for(i = 0; i < dgl_size; ++i) {
412 struct od_table_entry *entry = get_entry_for_od(dgl_ods[i]);
413 if(entry && is_lock(entry)) {
414 dgl_wait_state.locks[i] = get_lock(entry);
415 if(!supports_dgl(dgl_wait_state.locks[i])) {
416 TRACE_CUR("Lock %d does not support all required DGL operations.\n",
417 dgl_wait_state.locks[i]->ident);
418 goto out;
419 }
420 }
421 else {
422 TRACE_CUR("Invalid lock identifier\n");
423 goto out;
424 }
425 }
426
427 dgl_wait_state.task = t;
428 dgl_wait_state.size = dgl_size;
429
430 TS_DGL_LOCK_START;
431 err = do_litmus_dgl_lock(&dgl_wait_state);
432
433	/* Note: task may have been suspended or preempted in between! Take
434 * this into account when computing overheads. */
435 TS_DGL_LOCK_END;
436
437out:
438 return err;
439}
440
441static long do_litmus_dgl_unlock(struct litmus_lock* dgl_locks[], int dgl_size)
442{
443 int i;
444 long err = 0;
445
446	TRACE_CUR("Unlocking a DGL of size %d\n", dgl_size);
447
448 for(i = dgl_size - 1; i >= 0; --i) { // unlock in reverse order
449
450 struct litmus_lock *l = dgl_locks[i];
451 long tmp_err;
452
453 TRACE_CUR("Unlocking lock %d of DGL.\n", l->ident);
454
455 tmp_err = l->ops->unlock(l);
456
457 if(tmp_err) {
458			TRACE_CUR("There was an error unlocking %d: %ld.\n", l->ident, tmp_err);
459 err = tmp_err;
460 }
461 }
462
463	TRACE_CUR("DGL unlocked. err = %ld\n", err);
464
465 return err;
466}
467
468asmlinkage long sys_litmus_dgl_unlock(void* __user usr_dgl_ods, int dgl_size)
469{
470 long err = -EINVAL;
471 int dgl_ods[MAX_DGL_SIZE];
472 struct od_table_entry* entry;
473 int i;
474
475 struct litmus_lock* dgl_locks[MAX_DGL_SIZE];
476
477 if(dgl_size > MAX_DGL_SIZE || dgl_size < 1)
478 goto out;
479
480 if(!access_ok(VERIFY_READ, usr_dgl_ods, dgl_size*(sizeof(int))))
481 goto out;
482
483 if(__copy_from_user(&dgl_ods, usr_dgl_ods, dgl_size*(sizeof(int))))
484 goto out;
485
486 for(i = 0; i < dgl_size; ++i) {
487 entry = get_entry_for_od(dgl_ods[i]);
488 if(entry && is_lock(entry)) {
489 dgl_locks[i] = get_lock(entry);
490 if(!supports_dgl(dgl_locks[i])) {
491 TRACE_CUR("Lock %d does not support all required DGL operations.\n",
492 dgl_locks[i]->ident);
493 goto out;
494 }
495 }
496 else {
497 TRACE_CUR("Invalid lock identifier\n");
498 goto out;
499 }
500 }
501
502 TS_DGL_UNLOCK_START;
503 err = do_litmus_dgl_unlock(dgl_locks, dgl_size);
504
505	/* Note: task may have been suspended or preempted in between! Take
506 * this into account when computing overheads. */
507 TS_DGL_UNLOCK_END;
508
509out:
510 return err;
511}
512
513#else // CONFIG_LITMUS_DGL_SUPPORT
514
515asmlinkage long sys_litmus_dgl_lock(void* __user usr_dgl_ods, int dgl_size)
516{
517 return -ENOSYS;
518}
519
520asmlinkage long sys_litmus_dgl_unlock(void* __user usr_dgl_ods, int dgl_size)
521{
522 return -ENOSYS;
523}
524
525#endif
526
141unsigned int __add_wait_queue_prio_exclusive( 527unsigned int __add_wait_queue_prio_exclusive(
142 wait_queue_head_t* head, 528 wait_queue_head_t* head,
143 prio_wait_queue_t *new) 529 prio_wait_queue_t *new)
@@ -171,7 +557,60 @@ out:
171} 557}
172 558
173 559
174#else 560void suspend_for_lock(void)
561{
562#if defined(CONFIG_REALTIME_AUX_TASKS) || defined(CONFIG_LITMUS_NVIDIA)
563 struct task_struct *t = current;
564#endif
565
566#ifdef CONFIG_REALTIME_AUX_TASKS
567 unsigned int aux_restore = 0;
568 unsigned int aux_hide;
569#endif
570
571#ifdef CONFIG_LITMUS_NVIDIA
572 unsigned int gpu_restore = 0;
573 unsigned int gpu_hide;
574#endif
575
576//#ifdef CONFIG_REALTIME_AUX_TASKS
577// if (tsk_rt(t)->has_aux_tasks) {
578// /* hide from aux tasks so they can't inherit our priority when we block
579// * for a litmus lock. inheritance is already going to a litmus lock
580// * holder. */
581// aux_hide = tsk_rt(t)->hide_from_aux_tasks;
582// aux_restore = 1;
583// tsk_rt(t)->hide_from_aux_tasks = 1;
584// }
585//#endif
586
587#ifdef CONFIG_LITMUS_NVIDIA
588 if (tsk_rt(t)->held_gpus) {
589 gpu_hide = tsk_rt(t)->hide_from_gpu;
590 gpu_restore = 1;
591 tsk_rt(t)->hide_from_gpu = 1;
592 }
593#endif
594
595 schedule();
596
597#ifdef CONFIG_LITMUS_NVIDIA
598 if (gpu_restore) {
599 /* restore our state */
600 tsk_rt(t)->hide_from_gpu = gpu_hide;
601 }
602#endif
603
604#ifdef CONFIG_REALTIME_AUX_TASKS
605 if (aux_restore) {
606 /* restore our state */
607 tsk_rt(t)->hide_from_aux_tasks = aux_hide;
608 }
609#endif
610}
611
612
613#else // CONFIG_LITMUS_LOCKING
175 614
176struct fdso_ops generic_lock_ops = {}; 615struct fdso_ops generic_lock_ops = {};
177 616
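From user space, the two DGL system calls above take a pointer to an array of lock object descriptors plus the array length (1 to MAX_DGL_SIZE); every descriptor must refer to a lock whose ops implement dgl_lock, is_owner, and enable_priority, or the call fails with -EINVAL. A rough sketch of the intended calling pattern, where litmus_dgl_lock()/litmus_dgl_unlock() stand in for whatever userspace wrappers issue sys_litmus_dgl_lock()/sys_litmus_dgl_unlock(), and od_a/od_b/od_c are descriptors obtained when the locks were opened (all of these names are assumptions):

	int dgl[3] = { od_a, od_b, od_c };

	if (litmus_dgl_lock(dgl, 3) == 0) {	/* assumed wrapper for sys_litmus_dgl_lock() */
		/* all three resources are held here, acquired as one atomic request */
		litmus_dgl_unlock(dgl, 3);	/* assumed wrapper for sys_litmus_dgl_unlock() */
	}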
diff --git a/litmus/nvidia_info.c b/litmus/nvidia_info.c
new file mode 100644
index 000000000000..5a63fb732e8b
--- /dev/null
+++ b/litmus/nvidia_info.c
@@ -0,0 +1,1137 @@
1#include <linux/module.h>
2#include <linux/semaphore.h>
3#include <linux/pci.h>
4
5#include <litmus/sched_trace.h>
6#include <litmus/nvidia_info.h>
7#include <litmus/litmus.h>
8
9#include <litmus/sched_plugin.h>
10
11#include <litmus/binheap.h>
12
13#ifdef CONFIG_LITMUS_SOFTIRQD
14#include <litmus/litmus_softirq.h>
15#endif
16
17typedef unsigned char NvV8; /* "void": enumerated or multiple fields */
18typedef unsigned short NvV16; /* "void": enumerated or multiple fields */
19typedef unsigned char NvU8; /* 0 to 255 */
20typedef unsigned short NvU16; /* 0 to 65535 */
21typedef signed char NvS8; /* -128 to 127 */
22typedef signed short NvS16; /* -32768 to 32767 */
23typedef float NvF32; /* IEEE Single Precision (S1E8M23) */
24typedef double NvF64; /* IEEE Double Precision (S1E11M52) */
25typedef unsigned int NvV32; /* "void": enumerated or multiple fields */
26typedef unsigned int NvU32; /* 0 to 4294967295 */
27typedef unsigned long long NvU64; /* 0 to 18446744073709551615 */
28typedef union
29{
30 volatile NvV8 Reg008[1];
31 volatile NvV16 Reg016[1];
32 volatile NvV32 Reg032[1];
33} litmus_nv_hwreg_t, * litmus_nv_phwreg_t;
34
35typedef struct
36{
37 NvU64 address;
38#ifdef CONFIG_CUDA_5_0
39 NvU64 strapped_size;
40#endif
41 NvU64 size;
42 NvU32 offset;
43 NvU32 *map;
44 litmus_nv_phwreg_t map_u;
45} litmus_nv_aperture_t;
46
47typedef struct
48{
49 void *priv; /* private data */
50 void *os_state; /* os-specific device state */
51
52#ifndef CONFIG_CUDA_5_0
53 int rmInitialized;
54#endif
55 int flags;
56
57 /* PCI config info */
58 NvU32 domain;
59 NvU16 bus;
60 NvU16 slot;
61 NvU16 vendor_id;
62 NvU16 device_id;
63 NvU16 subsystem_id;
64 NvU32 gpu_id;
65 void *handle;
66
67 NvU32 pci_cfg_space[16];
68
69 /* physical characteristics */
70 litmus_nv_aperture_t bars[3];
71 litmus_nv_aperture_t *regs;
72 litmus_nv_aperture_t *fb, ud;
73 litmus_nv_aperture_t agp;
74
75 NvU32 interrupt_line;
76
77 NvU32 agp_config;
78 NvU32 agp_status;
79
80 NvU32 primary_vga;
81
82 NvU32 sim_env;
83
84 NvU32 rc_timer_enabled;
85
86 /* list of events allocated for this device */
87 void *event_list;
88
89 void *kern_mappings;
90
91} litmus_nv_state_t;
92
93typedef struct work_struct litmus_nv_task_t;
94
95typedef struct litmus_nv_work_s {
96 litmus_nv_task_t task;
97 void *data;
98} litmus_nv_work_t;
99
100typedef struct litmus_nv_linux_state_s {
101 litmus_nv_state_t nv_state;
102 atomic_t usage_count;
103
104 struct pci_dev *dev;
105 void *agp_bridge;
106 void *alloc_queue;
107
108 void *timer_sp;
109 void *isr_sp;
110 void *pci_cfgchk_sp;
111 void *isr_bh_sp;
112
113#if defined(CONFIG_CUDA_4_0) || defined(CONFIG_CUDA_5_0)
114 char registry_keys[512];
115#endif
116
117	/* keep track of any pending bottom halves */
118 struct tasklet_struct tasklet;
119 litmus_nv_work_t work;
120
121 /* get a timer callback every second */
122 struct timer_list rc_timer;
123
124 /* lock for linux-specific data, not used by core rm */
125 struct semaphore ldata_lock;
126
127 /* lock for linux-specific alloc queue */
128 struct semaphore at_lock;
129
130#if 0
131#if defined(NV_USER_MAP)
132 /* list of user mappings */
133 struct nv_usermap_s *usermap_list;
134
135 /* lock for VMware-specific mapping list */
136 struct semaphore mt_lock;
137#endif /* defined(NV_USER_MAP) */
138#if defined(NV_PM_SUPPORT_OLD_STYLE_APM)
139 void *apm_nv_dev;
140#endif
141#endif
142
143 NvU32 device_num;
144 struct litmus_nv_linux_state_s *next;
145} litmus_nv_linux_state_t;
146
147void dump_nvidia_info(const struct tasklet_struct *t)
148{
149 litmus_nv_state_t* nvstate = NULL;
150 litmus_nv_linux_state_t* linuxstate = NULL;
151 struct pci_dev* pci = NULL;
152
153 nvstate = (litmus_nv_state_t*)(t->data);
154
155 if(nvstate)
156 {
157 TRACE("NV State:\n"
158 "\ttasklet ptr = %p\n"
159 "\tstate ptr = %p\n"
160 "\tprivate data ptr = %p\n"
161 "\tos state ptr = %p\n"
162 "\tdomain = %u\n"
163 "\tbus = %u\n"
164 "\tslot = %u\n"
165			"\tvendor_id = %u\n"
166 "\tdevice_id = %u\n"
167 "\tsubsystem_id = %u\n"
168 "\tgpu_id = %u\n"
169 "\tinterrupt_line = %u\n",
170 t,
171 nvstate,
172 nvstate->priv,
173 nvstate->os_state,
174 nvstate->domain,
175 nvstate->bus,
176 nvstate->slot,
177 nvstate->vendor_id,
178 nvstate->device_id,
179 nvstate->subsystem_id,
180 nvstate->gpu_id,
181 nvstate->interrupt_line);
182
183 linuxstate = container_of(nvstate, litmus_nv_linux_state_t, nv_state);
184 }
185 else
186 {
187 TRACE("INVALID NVSTATE????\n");
188 }
189
190 if(linuxstate)
191 {
192 int ls_offset = (void*)(&(linuxstate->device_num)) - (void*)(linuxstate);
193 int ns_offset_raw = (void*)(&(linuxstate->device_num)) - (void*)(&(linuxstate->nv_state));
194 int ns_offset_desired = (void*)(&(linuxstate->device_num)) - (void*)(nvstate);
195
196
197 TRACE("LINUX NV State:\n"
198 "\tlinux nv state ptr: %p\n"
199 "\taddress of tasklet: %p\n"
200 "\taddress of work: %p\n"
201 "\tusage_count: %d\n"
202 "\tdevice_num: %u\n"
203 "\ttasklet addr == this tasklet: %d\n"
204 "\tpci: %p\n",
205 linuxstate,
206 &(linuxstate->tasklet),
207 &(linuxstate->work),
208 atomic_read(&(linuxstate->usage_count)),
209 linuxstate->device_num,
210 (t == &(linuxstate->tasklet)),
211 linuxstate->dev);
212
213 pci = linuxstate->dev;
214
215 TRACE("Offsets:\n"
216 "\tOffset from LinuxState: %d, %x\n"
217 "\tOffset from NVState: %d, %x\n"
218 "\tOffset from parameter: %d, %x\n"
219 "\tdevice_num: %u\n",
220 ls_offset, ls_offset,
221 ns_offset_raw, ns_offset_raw,
222 ns_offset_desired, ns_offset_desired,
223 *((u32*)((void*)nvstate + ns_offset_desired)));
224 }
225 else
226 {
227 TRACE("INVALID LINUXNVSTATE?????\n");
228 }
229
230#if 0
231 if(pci)
232 {
233 TRACE("PCI DEV Info:\n"
234 "pci device ptr: %p\n"
235 "\tdevfn = %d\n"
236 "\tvendor = %d\n"
237 "\tdevice = %d\n"
238 "\tsubsystem_vendor = %d\n"
239 "\tsubsystem_device = %d\n"
240 "\tslot # = %d\n",
241 pci,
242 pci->devfn,
243 pci->vendor,
244 pci->device,
245 pci->subsystem_vendor,
246 pci->subsystem_device,
247 pci->slot->number);
248 }
249 else
250 {
251 TRACE("INVALID PCIDEV PTR?????\n");
252 }
253#endif
254}
255
256
257
258static struct module* nvidia_mod = NULL;
259
260
261
262
263#if 0
264static int nvidia_ready_module_notify(struct notifier_block *self,
265 unsigned long val, void *data)
266{
267 mutex_lock(&module_mutex);
268 nvidia_mod = find_module("nvidia");
269 mutex_unlock(&module_mutex);
270
271 if(nvidia_mod != NULL)
272 {
273 TRACE("%s : Found NVIDIA module. Core Code: %p to %p\n", __FUNCTION__,
274 (void*)(nvidia_mod->module_core),
275 (void*)(nvidia_mod->module_core) + nvidia_mod->core_size);
276 init_nv_device_reg();
277 return(0);
278 }
279 else
280 {
281 TRACE("%s : Could not find NVIDIA module! Loaded?\n", __FUNCTION__);
282 }
283}
284
285static int nvidia_going_module_notify(struct notifier_block *self,
286 unsigned long val, void *data)
287{
288 nvidia_mod = NULL;
289 mb();
290
291 return 0;
292}
293
294static struct notifier_block nvidia_ready = {
295 .notifier_call = nvidia_ready_module_notify,
296 .priority = 1,
297};
298
299static struct notifier_block nvidia_going = {
300 .notifier_call = nvidia_going_module_notify,
301 .priority = 1,
302};
303#endif
304
305
306
307static int init_nv_device_reg(void);
308static int shutdown_nv_device_reg(void);
309
310
311int init_nvidia_info(void)
312{
313 mutex_lock(&module_mutex);
314 nvidia_mod = find_module("nvidia");
315 mutex_unlock(&module_mutex);
316 if(nvidia_mod != NULL)
317 {
318 TRACE("%s : Found NVIDIA module. Core Code: %p to %p\n", __FUNCTION__,
319 (void*)(nvidia_mod->module_core),
320 (void*)(nvidia_mod->module_core) + nvidia_mod->core_size);
321 init_nv_device_reg();
322 return(0);
323 }
324 else
325 {
326 TRACE("%s : Could not find NVIDIA module! Loaded?\n", __FUNCTION__);
327
328 init_nv_device_reg();
329 return(0);
330// return(-1);
331 }
332}
333
334void shutdown_nvidia_info(void)
335{
336 nvidia_mod = NULL;
337 mb();
338
339 shutdown_nv_device_reg();
340}
341
342/* works with pointers to static data inside the module too. */
343int is_nvidia_func(void* func_addr)
344{
345 int ret = 0;
346 if(nvidia_mod)
347 {
348 ret = within_module_core((long unsigned int)func_addr, nvidia_mod);
349 /*
350 if(ret)
351 {
352 TRACE("%s : %p is in NVIDIA module: %d\n",
353 __FUNCTION__, func_addr, ret);
354 }*/
355 }
356
357 return(ret);
358}
359
360u32 get_tasklet_nv_device_num(const struct tasklet_struct *t)
361{
362 // life is too short to use hard-coded offsets. update this later.
363 litmus_nv_state_t* nvstate = (litmus_nv_state_t*)(t->data);
364 litmus_nv_linux_state_t* linuxstate = container_of(nvstate, litmus_nv_linux_state_t, nv_state);
365
366 BUG_ON(linuxstate->device_num >= NV_DEVICE_NUM);
367
368 return(linuxstate->device_num);
369}
370
371u32 get_work_nv_device_num(const struct work_struct *t)
372{
373	// offset determined through observed behavior of the NV driver.
374 const int DEVICE_NUM_OFFSET = sizeof(struct work_struct);
375 void* state = (void*)(t);
376 void** device_num_ptr = state + DEVICE_NUM_OFFSET;
377 return(*((u32*)(*device_num_ptr)));
378}
379
380
381///////////////////////////////////////////////////////////////////////////////
382///////////////////////////////////////////////////////////////////////////////
383///////////////////////////////////////////////////////////////////////////////
384
385
386typedef struct {
387	raw_spinlock_t	lock;  /* not needed if GPU not shared between scheduling domains */
388 struct binheap owners;
389
390#ifdef CONFIG_LITMUS_SOFTIRQD
391 klmirqd_callback_t callback;
392 struct task_struct* thread;
393 int ready:1; /* todo: make threads check for the ready flag */
394#endif
395
396#ifdef CONFIG_LITMUS_NV_KLMIRQD_DEBUG
397 struct tasklet_struct nv_klmirqd_dbg_tasklet;
398#endif
399} nv_device_registry_t;
400
401
402static nv_device_registry_t NV_DEVICE_REG[NV_DEVICE_NUM];
403
404
405
406#ifdef CONFIG_LITMUS_SOFTIRQD
407static int nvidia_klmirqd_cb(void *arg)
408{
409 unsigned long flags;
410 int reg_device_id = (int)(long long)(arg);
411 nv_device_registry_t *reg = &NV_DEVICE_REG[reg_device_id];
412
413 TRACE("nv klmirqd callback for GPU %d\n", reg_device_id);
414
415 raw_spin_lock_irqsave(&reg->lock, flags);
416 reg->thread = current;
417 reg->ready = 1;
418 raw_spin_unlock_irqrestore(&reg->lock, flags);
419
420 return 0;
421}
422#endif
423
424#ifdef CONFIG_LITMUS_NV_KLMIRQD_DEBUG
425struct nv_klmirqd_dbg_timer_struct
426{
427 struct hrtimer timer;
428};
429
430static struct nv_klmirqd_dbg_timer_struct nv_klmirqd_dbg_timer;
431
432static void nv_klmirqd_arm_dbg_timer(lt_t relative_time)
433{
434 lt_t when_to_fire = litmus_clock() + relative_time;
435
436	TRACE("next nv tasklet in %llu ns\n", relative_time);
437
438 __hrtimer_start_range_ns(&nv_klmirqd_dbg_timer.timer,
439 ns_to_ktime(when_to_fire),
440 0,
441 HRTIMER_MODE_ABS_PINNED,
442 0);
443}
444
445static void nv_klmirqd_dbg_tasklet_func(unsigned long arg)
446{
447 lt_t now = litmus_clock();
448 nv_device_registry_t *reg = (nv_device_registry_t*)arg;
449 int gpunum = reg - &NV_DEVICE_REG[0];
450
451 TRACE("nv klmirqd routine invoked for GPU %d!\n", gpunum);
452
453 /* set up the next timer */
454 nv_klmirqd_arm_dbg_timer(now % (NSEC_PER_MSEC * 10)); // within the next 10ms.
455}
456
457
458static enum hrtimer_restart nvklmirqd_timer_func(struct hrtimer *timer)
459{
460 lt_t now = litmus_clock();
461 int gpu = (int)(now % num_online_gpus());
462 nv_device_registry_t *reg;
463
464 TRACE("nvklmirqd_timer invoked!\n");
465
466 reg = &NV_DEVICE_REG[gpu];
467
468 if (reg->thread && reg->ready) {
469 TRACE("Adding a tasklet for GPU %d\n", gpu);
470 litmus_tasklet_schedule(&reg->nv_klmirqd_dbg_tasklet, reg->thread);
471 }
472 else {
473 TRACE("nv klmirqd is not ready!\n");
474 nv_klmirqd_arm_dbg_timer(now % (NSEC_PER_MSEC * 10)); // within the next 10ms.
475 }
476
477 return HRTIMER_NORESTART;
478}
479#endif
480
481
482static int gpu_owner_max_priority_order(struct binheap_node *a,
483 struct binheap_node *b)
484{
485 struct task_struct *d_a = container_of(binheap_entry(a, struct rt_param, gpu_owner_node),
486 struct task_struct, rt_param);
487 struct task_struct *d_b = container_of(binheap_entry(b, struct rt_param, gpu_owner_node),
488 struct task_struct, rt_param);
489
490 BUG_ON(!d_a);
491 BUG_ON(!d_b);
492
493 return litmus->compare(d_a, d_b);
494}
495
496static int init_nv_device_reg(void)
497{
498 int i;
499 char name[MAX_KLMIRQD_NAME_LEN+1];
500
501#ifdef CONFIG_LITMUS_SOFTIRQD
502 if (!klmirqd_is_ready()) {
503 TRACE("klmirqd is not ready!\n");
504 return 0;
505 }
506#endif
507
508 memset(NV_DEVICE_REG, 0, sizeof(NV_DEVICE_REG));
509 mb();
510
511
512 for(i = 0; i < num_online_gpus(); ++i) {
513 raw_spin_lock_init(&NV_DEVICE_REG[i].lock);
514 INIT_BINHEAP_HANDLE(&NV_DEVICE_REG[i].owners, gpu_owner_max_priority_order);
515
516#ifdef CONFIG_LITMUS_NV_KLMIRQD_DEBUG
517 tasklet_init(&NV_DEVICE_REG[i].nv_klmirqd_dbg_tasklet, nv_klmirqd_dbg_tasklet_func, (unsigned long)&NV_DEVICE_REG[i]);
518#endif
519
520#ifdef CONFIG_LITMUS_SOFTIRQD
521 {
522 int default_cpu = litmus->map_gpu_to_cpu(i);
523
524 snprintf(name, MAX_KLMIRQD_NAME_LEN, "nvklmirqd%d", i);
525
526 NV_DEVICE_REG[i].callback.func = nvidia_klmirqd_cb;
527 NV_DEVICE_REG[i].callback.arg = (void*)(long long)(i);
528 mb();
529
530 if(launch_klmirqd_thread(name, default_cpu, &NV_DEVICE_REG[i].callback) != 0) {
531 TRACE("Failed to create klmirqd thread for GPU %d\n", i);
532 }
533 }
534#endif
535 }
536
537#ifdef CONFIG_LITMUS_NV_KLMIRQD_DEBUG
538 hrtimer_init(&nv_klmirqd_dbg_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
539 nv_klmirqd_dbg_timer.timer.function = nvklmirqd_timer_func;
540 nv_klmirqd_arm_dbg_timer(NSEC_PER_MSEC * 1000);
541#endif
542
543 return(1);
544}
545
546
547/* The following code is full of nasty race conditions... */
548/* spawning of klmirqd threads can race with init_nv_device_reg()!!!! */
549static int shutdown_nv_device_reg(void)
550{
551 TRACE("Shutting down nv device registration.\n");
552
553#ifdef CONFIG_LITMUS_SOFTIRQD
554 {
555 int i;
556 nv_device_registry_t *reg;
557
558 for (i = 0; i < num_online_gpus(); ++i) {
559
560 TRACE("Shutting down GPU %d.\n", i);
561
562 reg = &NV_DEVICE_REG[i];
563
564 if (reg->thread && reg->ready) {
565 kill_klmirqd_thread(reg->thread);
566
567 /* assume that all goes according to plan... */
568 reg->thread = NULL;
569 reg->ready = 0;
570 }
571
572 while (!binheap_empty(&reg->owners)) {
573 binheap_delete_root(&reg->owners, struct rt_param, gpu_owner_node);
574 }
575 }
576 }
577#endif
578
579 return(1);
580}
581
582
583/* use to get the owner of nv_device_id. */
584struct task_struct* get_nv_max_device_owner(u32 target_device_id)
585{
586 struct task_struct *owner = NULL;
587 nv_device_registry_t *reg;
588
589 BUG_ON(target_device_id >= NV_DEVICE_NUM);
590
591 reg = &NV_DEVICE_REG[target_device_id];
592
593 if (!binheap_empty(&reg->owners)) {
594 struct task_struct *hp = container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
595 struct task_struct, rt_param);
596		owner = hp; TRACE_CUR("hp: %s/%d\n", hp->comm, hp->pid);
597 }
598
599 return(owner);
600}
601
602#ifdef CONFIG_LITMUS_SOFTIRQD
603struct task_struct* get_nv_klmirqd_thread(u32 target_device_id)
604{
605 struct task_struct *klmirqd = NULL;
606 nv_device_registry_t *reg;
607
608 BUG_ON(target_device_id >= NV_DEVICE_NUM);
609
610 reg = &NV_DEVICE_REG[target_device_id];
611
612 if(likely(reg->ready)) {
613 klmirqd = reg->thread;
614 }
615
616 return klmirqd;
617}
618#endif
619
620
621
622
623
624#ifdef CONFIG_LITMUS_SOFTIRQD
625static int gpu_klmirqd_increase_priority(struct task_struct *klmirqd, struct task_struct *hp)
626{
627 int retval = 0;
628
629 TRACE_CUR("Increasing priority of nv klmirqd: %s/%d.\n", klmirqd->comm, klmirqd->pid);
630
631	/* the klmirqd thread should never attempt to hold a litmus-level real-time lock,
632 * so nested support is not required */
633 retval = litmus->__increase_prio(klmirqd, hp);
634
635 return retval;
636}
637
638static int gpu_klmirqd_decrease_priority(struct task_struct *klmirqd, struct task_struct *hp)
639{
640 int retval = 0;
641
642 TRACE_CUR("Decreasing priority of nv klmirqd: %s/%d.\n", klmirqd->comm, klmirqd->pid);
643
644	/* the klmirqd thread should never attempt to hold a litmus-level real-time lock,
645 * so nested support is not required */
646 retval = litmus->__decrease_prio(klmirqd, hp);
647
648 return retval;
649}
650#endif
651
652
653
654
655/* call when a gpu owner becomes real-time */
656long enable_gpu_owner(struct task_struct *t)
657{
658 long retval = 0;
659// unsigned long flags;
660 int gpu;
661 nv_device_registry_t *reg;
662
663#ifdef CONFIG_LITMUS_SOFTIRQD
664 struct task_struct *hp;
665#endif
666
667 if (!tsk_rt(t)->held_gpus) {
668 TRACE_CUR("task %s/%d does not hold any GPUs\n", t->comm, t->pid);
669 return -1;
670 }
671
672 BUG_ON(!is_realtime(t));
673
674 gpu = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
675
676 if (binheap_is_in_heap(&tsk_rt(t)->gpu_owner_node)) {
677 TRACE_CUR("task %s/%d is already active on GPU %d\n", t->comm, t->pid, gpu);
678 goto out;
679 }
680
681 /* update the registration (and maybe klmirqd) */
682 reg = &NV_DEVICE_REG[gpu];
683
684// raw_spin_lock_irqsave(&reg->lock, flags);
685
686 binheap_add(&tsk_rt(t)->gpu_owner_node, &reg->owners,
687 struct rt_param, gpu_owner_node);
688
689
690#ifdef CONFIG_LITMUS_SOFTIRQD
691 hp = container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
692 struct task_struct, rt_param);
693
694 if (hp == t) {
695 /* we're the new hp */
696 TRACE_CUR("%s/%d is new hp on GPU %d.\n", t->comm, t->pid, gpu);
697
698 retval = gpu_klmirqd_increase_priority(reg->thread, (tsk_rt(hp)->inh_task)? tsk_rt(hp)->inh_task : hp);
699 }
700#endif
701
702// raw_spin_unlock_irqsave(&reg->lock, flags);
703
704out:
705 return retval;
706}
707
708/* call when a gpu owner exits real-time */
709long disable_gpu_owner(struct task_struct *t)
710{
711 long retval = 0;
712// unsigned long flags;
713 int gpu;
714 nv_device_registry_t *reg;
715
716#ifdef CONFIG_LITMUS_SOFTIRQD
717 struct task_struct *hp;
718 struct task_struct *new_hp = NULL;
719#endif
720
721 if (!tsk_rt(t)->held_gpus) {
722 TRACE_CUR("task %s/%d does not hold any GPUs\n", t->comm, t->pid);
723 return -1;
724 }
725
726 BUG_ON(!is_realtime(t));
727
728 gpu = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
729
730 if (!binheap_is_in_heap(&tsk_rt(t)->gpu_owner_node)) {
731 TRACE_CUR("task %s/%d is not active on GPU %d\n", t->comm, t->pid, gpu);
732 goto out;
733 }
734
735 TRACE_CUR("task %s/%d exiting from GPU %d.\n", t->comm, t->pid, gpu);
736
737
738 reg = &NV_DEVICE_REG[gpu];
739
740// raw_spin_lock_irqsave(&reg->lock, flags);
741
742
743#ifdef CONFIG_LITMUS_SOFTIRQD
744 hp = container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
745 struct task_struct, rt_param);
746
747 binheap_delete(&tsk_rt(t)->gpu_owner_node, &reg->owners);
748
749
750 if (!binheap_empty(&reg->owners)) {
751 new_hp = container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
752 struct task_struct, rt_param);
753 }
754
755 if (hp == t && new_hp != t) {
756 struct task_struct *to_inh = NULL;
757
758 TRACE_CUR("%s/%d is no longer hp on GPU %d.\n", t->comm, t->pid, gpu);
759
760 if (new_hp) {
761 to_inh = (tsk_rt(new_hp)->inh_task) ? tsk_rt(new_hp)->inh_task : new_hp;
762 }
763
764 retval = gpu_klmirqd_decrease_priority(reg->thread, to_inh);
765 }
766#else
767 binheap_delete(&tsk_rt(t)->gpu_owner_node, &reg->owners);
768#endif
769
770// raw_spin_unlock_irqsave(&reg->lock, flags);
771
772
773out:
774 return retval;
775}
776
777
778
779
780
781
782
783
784
785
786int gpu_owner_increase_priority(struct task_struct *t)
787{
788 int retval = 0;
789 int gpu;
790 nv_device_registry_t *reg;
791
792 struct task_struct *hp = NULL;
793 struct task_struct *hp_eff = NULL;
794
795 BUG_ON(!is_realtime(t));
796 BUG_ON(!tsk_rt(t)->held_gpus);
797
798 gpu = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
799
800 if (!binheap_is_in_heap(&tsk_rt(t)->gpu_owner_node)) {
801 WARN_ON(!is_running(t));
802 TRACE_CUR("gpu klmirqd may not inherit from %s/%d on GPU %d\n",
803 t->comm, t->pid, gpu);
804 goto out;
805 }
806
807
808
809
810 TRACE_CUR("task %s/%d on GPU %d increasing priority.\n", t->comm, t->pid, gpu);
811 reg = &NV_DEVICE_REG[gpu];
812
813 hp = container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
814 struct task_struct, rt_param);
815 hp_eff = effective_priority(hp);
816
817 if (hp != t) { /* our position in the heap may have changed. hp is already at the root. */
818 binheap_decrease(&tsk_rt(t)->gpu_owner_node, &reg->owners);
819 }
820
821 hp = container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
822 struct task_struct, rt_param);
823
824 if (effective_priority(hp) != hp_eff) { /* the eff. prio. of hp has changed */
825 hp_eff = effective_priority(hp);
826 TRACE_CUR("%s/%d is new hp on GPU %d.\n", t->comm, t->pid, gpu);
827
828 retval = gpu_klmirqd_increase_priority(reg->thread, hp_eff);
829 }
830
831out:
832 return retval;
833}
834
835
836int gpu_owner_decrease_priority(struct task_struct *t)
837{
838 int retval = 0;
839 int gpu;
840 nv_device_registry_t *reg;
841
842 struct task_struct *hp = NULL;
843 struct task_struct *hp_eff = NULL;
844
845 BUG_ON(!is_realtime(t));
846 BUG_ON(!tsk_rt(t)->held_gpus);
847
848 gpu = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
849
850 if (!binheap_is_in_heap(&tsk_rt(t)->gpu_owner_node)) {
851 WARN_ON(!is_running(t));
852 TRACE_CUR("nv klmirqd may not inherit from %s/%d on GPU %d\n",
853 t->comm, t->pid, gpu);
854 goto out;
855 }
856
857	TRACE_CUR("task %s/%d on GPU %d decreasing priority.\n", t->comm, t->pid, gpu);
858 reg = &NV_DEVICE_REG[gpu];
859
860 hp = container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
861 struct task_struct, rt_param);
862 hp_eff = effective_priority(hp);
863 binheap_delete(&tsk_rt(t)->gpu_owner_node, &reg->owners);
864 binheap_add(&tsk_rt(t)->gpu_owner_node, &reg->owners,
865 struct rt_param, gpu_owner_node);
866
867 if (hp == t) { /* t was originally the hp */
868 struct task_struct *new_hp =
869 container_of(binheap_top_entry(&reg->owners, struct rt_param, gpu_owner_node),
870 struct task_struct, rt_param);
871 if (effective_priority(new_hp) != hp_eff) { /* eff prio. of hp has changed */
872 hp_eff = effective_priority(new_hp);
873 TRACE_CUR("%s/%d is no longer hp on GPU %d.\n", t->comm, t->pid, gpu);
874 retval = gpu_klmirqd_decrease_priority(reg->thread, hp_eff);
875 }
876 }
877
878out:
879 return retval;
880}
881
882
883
884
885
886
887
888
889
890static int __reg_nv_device(int reg_device_id, struct task_struct *t)
891{
892 __set_bit(reg_device_id, &tsk_rt(t)->held_gpus);
893
894 return(0);
895}
896
897static int __clear_reg_nv_device(int de_reg_device_id, struct task_struct *t)
898{
899 __clear_bit(de_reg_device_id, &tsk_rt(t)->held_gpus);
900
901 return(0);
902}
903
904
905int reg_nv_device(int reg_device_id, int reg_action, struct task_struct *t)
906{
907 int ret;
908
909 if((reg_device_id < num_online_gpus()) && (reg_device_id >= 0))
910 {
911 if(reg_action)
912 ret = __reg_nv_device(reg_device_id, t);
913 else
914 ret = __clear_reg_nv_device(reg_device_id, t);
915 }
916 else
917 {
918 ret = -ENODEV;
919 }
920
921 return(ret);
922}
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
947//void pai_check_priority_increase(struct task_struct *t, int reg_device_id)
948//{
949// unsigned long flags;
950// nv_device_registry_t *reg = &NV_DEVICE_REG[reg_device_id];
951//
952//
953//
954// if(reg->max_prio_owner != t) {
955//
956// raw_spin_lock_irqsave(&reg->lock, flags);
957//
958// if(reg->max_prio_owner != t) {
959// if(litmus->compare(t, reg->max_prio_owner)) {
960// litmus->change_prio_pai_tasklet(reg->max_prio_owner, t);
961// reg->max_prio_owner = t;
962// }
963// }
964//
965// raw_spin_unlock_irqrestore(&reg->lock, flags);
966// }
967//}
968//
969//
970//void pai_check_priority_decrease(struct task_struct *t, int reg_device_id)
971//{
972// unsigned long flags;
973// nv_device_registry_t *reg = &NV_DEVICE_REG[reg_device_id];
974//
975// if(reg->max_prio_owner == t) {
976//
977// raw_spin_lock_irqsave(&reg->lock, flags);
978//
979// if(reg->max_prio_owner == t) {
980// reg->max_prio_owner = find_hp_owner(reg, NULL);
981// if(reg->max_prio_owner != t) {
982// litmus->change_prio_pai_tasklet(t, reg->max_prio_owner);
983// }
984// }
985//
986// raw_spin_unlock_irqrestore(&reg->lock, flags);
987// }
988//}
989#endif
990
991
992
993
994
995//static int __reg_nv_device(int reg_device_id, struct task_struct *t)
996//{
997// int ret = 0;
998// int i;
999// struct task_struct *old_max = NULL;
1000//
1001//
1002// raw_spin_lock_irqsave(&reg->lock, flags);
1003//
1004// if(reg->nr_owners < NV_MAX_SIMULT_USERS) {
1005// TRACE_TASK(t, "registers GPU %d\n", reg_device_id);
1006// for(i = 0; i < NV_MAX_SIMULT_USERS; ++i) {
1007// if(reg->owners[i] == NULL) {
1008// reg->owners[i] = t;
1009//
1010// //if(edf_higher_prio(t, reg->max_prio_owner)) {
1011// if(litmus->compare(t, reg->max_prio_owner)) {
1012// old_max = reg->max_prio_owner;
1013// reg->max_prio_owner = t;
1014//
1015//#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1016// litmus->change_prio_pai_tasklet(old_max, t);
1017//#endif
1018// }
1019//
1020//#ifdef CONFIG_LITMUS_SOFTIRQD
1021// down_and_set_stat(t, HELD, &tsk_rt(t)->klmirqd_sem);
1022//#endif
1023// ++(reg->nr_owners);
1024//
1025// break;
1026// }
1027// }
1028// }
1029// else
1030// {
1031// TRACE_CUR("%s: device %d is already in use!\n", __FUNCTION__, reg_device_id);
1032// //ret = -EBUSY;
1033// }
1034//
1035// raw_spin_unlock_irqrestore(&reg->lock, flags);
1036//
1037// __set_bit(reg_device_id, &tsk_rt(t)->held_gpus);
1038//
1039// return(ret);
1040//}
1041//
1042//static int __clear_reg_nv_device(int de_reg_device_id, struct task_struct *t)
1043//{
1044// int ret = 0;
1045// int i;
1046// unsigned long flags;
1047// nv_device_registry_t *reg = &NV_DEVICE_REG[de_reg_device_id];
1048//
1049//#ifdef CONFIG_LITMUS_SOFTIRQD
1050// struct task_struct* klmirqd_th = get_klmirqd(de_reg_device_id);
1051//#endif
1052//
1053// if(!test_bit(de_reg_device_id, &tsk_rt(t)->held_gpus)) {
1054// return ret;
1055// }
1056//
1057// raw_spin_lock_irqsave(&reg->lock, flags);
1058//
1059// TRACE_TASK(t, "unregisters GPU %d\n", de_reg_device_id);
1060//
1061// for(i = 0; i < NV_MAX_SIMULT_USERS; ++i) {
1062// if(reg->owners[i] == t) {
1063//#ifdef CONFIG_LITMUS_SOFTIRQD
1064// flush_pending(klmirqd_th, t);
1065//#endif
1066// if(reg->max_prio_owner == t) {
1067// reg->max_prio_owner = find_hp_owner(reg, t);
1068//#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1069// litmus->change_prio_pai_tasklet(t, reg->max_prio_owner);
1070//#endif
1071// }
1072//
1073//#ifdef CONFIG_LITMUS_SOFTIRQD
1074// up_and_set_stat(t, NOT_HELD, &tsk_rt(t)->klmirqd_sem);
1075//#endif
1076//
1077// reg->owners[i] = NULL;
1078// --(reg->nr_owners);
1079//
1080// break;
1081// }
1082// }
1083//
1084// raw_spin_unlock_irqrestore(&reg->lock, flags);
1085//
1086// __clear_bit(de_reg_device_id, &tsk_rt(t)->held_gpus);
1087//
1088// return(ret);
1089//}
1090//
1091//
1092//int reg_nv_device(int reg_device_id, int reg_action, struct task_struct *t)
1093//{
1094// int ret;
1095//
1096// if((reg_device_id < NV_DEVICE_NUM) && (reg_device_id >= 0))
1097// {
1098// if(reg_action)
1099// ret = __reg_nv_device(reg_device_id, t);
1100// else
1101// ret = __clear_reg_nv_device(reg_device_id, t);
1102// }
1103// else
1104// {
1105// ret = -ENODEV;
1106// }
1107//
1108// return(ret);
1109//}
1110
1111
1112
1113//void lock_nv_registry(u32 target_device_id, unsigned long* flags)
1114//{
1115// BUG_ON(target_device_id >= NV_DEVICE_NUM);
1116//
1117// if(in_interrupt())
1118// TRACE("Locking registry for %d.\n", target_device_id);
1119// else
1120// TRACE_CUR("Locking registry for %d.\n", target_device_id);
1121//
1122// raw_spin_lock_irqsave(&NV_DEVICE_REG[target_device_id].lock, *flags);
1123//}
1124//
1125//void unlock_nv_registry(u32 target_device_id, unsigned long* flags)
1126//{
1127// BUG_ON(target_device_id >= NV_DEVICE_NUM);
1128//
1129// if(in_interrupt())
1130// TRACE("Unlocking registry for %d.\n", target_device_id);
1131// else
1132// TRACE_CUR("Unlocking registry for %d.\n", target_device_id);
1133//
1134// raw_spin_unlock_irqrestore(&NV_DEVICE_REG[target_device_id].lock, *flags);
1135//}
1136
1137
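The registry above relies on its callers to keep the per-GPU owner heap consistent with task state: enable_gpu_owner()/disable_gpu_owner() when a GPU-holding task enters or leaves real-time mode, and gpu_owner_increase_priority()/gpu_owner_decrease_priority() whenever such a task's effective priority changes, so the klmirqd thread always inherits from the highest-priority owner. A condensed sketch of the expected call site in a plugin's priority-change path (the hook name plugin_gpu_prio_changed() and its surrounding locking are assumptions):

	#include <litmus/litmus.h>
	#include <litmus/nvidia_info.h>

	/* hypothetical hook: t's effective priority was just raised or lowered */
	static void plugin_gpu_prio_changed(struct task_struct *t, int increased)
	{
		if (!tsk_rt(t)->held_gpus)
			return;		/* t owns no GPU; nothing to propagate */

		/* re-position t in its GPU's owner heap and, if the heap's maximum
		 * effective priority changed, update the klmirqd inheritance */
		if (increased)
			gpu_owner_increase_priority(t);
		else
			gpu_owner_decrease_priority(t);
	}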
diff --git a/litmus/preempt.c b/litmus/preempt.c
index 6be2f26728b8..86ad2efb591a 100644
--- a/litmus/preempt.c
+++ b/litmus/preempt.c
@@ -27,10 +27,12 @@ void sched_state_will_schedule(struct task_struct* tsk)
27 set_sched_state(PICKED_WRONG_TASK); 27 set_sched_state(PICKED_WRONG_TASK);
28 else 28 else
29 set_sched_state(WILL_SCHEDULE); 29 set_sched_state(WILL_SCHEDULE);
30 } else 30 } else {
31 /* Litmus tasks should never be subject to a remote 31 /* Litmus tasks should never be subject to a remote
32 * set_tsk_need_resched(). */ 32 * set_tsk_need_resched(). */
33 BUG_ON(is_realtime(tsk)); 33 //BUG_ON(is_realtime(tsk));
34 }
35
34#ifdef CONFIG_PREEMPT_STATE_TRACE 36#ifdef CONFIG_PREEMPT_STATE_TRACE
35 TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n", 37 TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
36 __builtin_return_address(0)); 38 __builtin_return_address(0));
@@ -46,14 +48,18 @@ void sched_state_ipi(void)
46 /* Cause scheduler to be invoked. 48 /* Cause scheduler to be invoked.
47 * This will cause a transition to WILL_SCHEDULE. */ 49 * This will cause a transition to WILL_SCHEDULE. */
48 set_tsk_need_resched(current); 50 set_tsk_need_resched(current);
51 /*
49 TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n", 52 TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
50 current->comm, current->pid); 53 current->comm, current->pid);
54 */
51 TS_SEND_RESCHED_END; 55 TS_SEND_RESCHED_END;
52 } else { 56 } else {
53 /* ignore */ 57 /* ignore */
58 /*
54 TRACE_STATE("ignoring IPI in state %x (%s)\n", 59 TRACE_STATE("ignoring IPI in state %x (%s)\n",
55 get_sched_state(), 60 get_sched_state(),
56 sched_state_name(get_sched_state())); 61 sched_state_name(get_sched_state()));
62 */
57 } 63 }
58} 64}
59 65
@@ -70,23 +76,34 @@ void litmus_reschedule(int cpu)
70 * is not aware of the need to reschedule at this point. */ 76 * is not aware of the need to reschedule at this point. */
71 77
72 /* is a context switch in progress? */ 78 /* is a context switch in progress? */
73 if (cpu_is_in_sched_state(cpu, TASK_PICKED)) 79 if (cpu_is_in_sched_state(cpu, TASK_PICKED)) {
74 picked_transition_ok = sched_state_transition_on( 80 picked_transition_ok = sched_state_transition_on(
75 cpu, TASK_PICKED, PICKED_WRONG_TASK); 81 cpu, TASK_PICKED, PICKED_WRONG_TASK);
76 82
83 TRACE_CUR("cpu %d: picked_transition_ok = %d\n", cpu, picked_transition_ok);
84 }
85 else {
86 TRACE_CUR("cpu %d: picked_transition_ok = 0 (static)\n", cpu);
87 }
88
77 if (!picked_transition_ok && 89 if (!picked_transition_ok &&
78 cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) { 90 cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) {
79 /* We either raced with the end of the context switch, or the 91 /* We either raced with the end of the context switch, or the
80 * CPU was in TASK_SCHEDULED anyway. */ 92 * CPU was in TASK_SCHEDULED anyway. */
81 scheduled_transition_ok = sched_state_transition_on( 93 scheduled_transition_ok = sched_state_transition_on(
82 cpu, TASK_SCHEDULED, SHOULD_SCHEDULE); 94 cpu, TASK_SCHEDULED, SHOULD_SCHEDULE);
95 TRACE_CUR("cpu %d: scheduled_transition_ok = %d\n", cpu, scheduled_transition_ok);
96 }
97 else {
98 TRACE_CUR("cpu %d: scheduled_transition_ok = 0 (static)\n", cpu);
83 } 99 }
84 100
85 /* If the CPU was in state TASK_SCHEDULED, then we need to cause the 101 /* If the CPU was in state TASK_SCHEDULED, then we need to cause the
86 * scheduler to be invoked. */ 102 * scheduler to be invoked. */
87 if (scheduled_transition_ok) { 103 if (scheduled_transition_ok) {
88 if (smp_processor_id() == cpu) 104 if (smp_processor_id() == cpu) {
89 set_tsk_need_resched(current); 105 set_tsk_need_resched(current);
106 }
90 else { 107 else {
91 TS_SEND_RESCHED_START(cpu); 108 TS_SEND_RESCHED_START(cpu);
92 smp_send_reschedule(cpu); 109 smp_send_reschedule(cpu);
@@ -101,11 +118,16 @@ void litmus_reschedule(int cpu)
101 118
102void litmus_reschedule_local(void) 119void litmus_reschedule_local(void)
103{ 120{
104 if (is_in_sched_state(TASK_PICKED)) 121 if (is_in_sched_state(TASK_PICKED)) {
105 set_sched_state(PICKED_WRONG_TASK); 122 set_sched_state(PICKED_WRONG_TASK);
123
124 TRACE_CUR("cpu %d: transitioned to PICKED_WRONG_TASK\n", smp_processor_id());
125 }
106 else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) { 126 else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) {
107 set_sched_state(WILL_SCHEDULE); 127 set_sched_state(WILL_SCHEDULE);
108 set_tsk_need_resched(current); 128 set_tsk_need_resched(current);
129
130 TRACE_CUR("cpu %d: transitioned to WILL_SCHEDULE\n", smp_processor_id());
109 } 131 }
110} 132}
111 133
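The transition logic above is what makes litmus_reschedule() cheap to call from plugin code: the IPI is only sent when the target CPU has already finished picking a task (TASK_SCHEDULED); otherwise the in-progress scheduling decision is merely flagged as stale. A minimal sketch of the intended call site after a higher-priority job arrives (the helper check_for_preemption() and its arguments are assumptions; litmus->compare() is the plugin's priority comparison used elsewhere in this patch):

	#include <linux/smp.h>
	#include <litmus/preempt.h>
	#include <litmus/sched_plugin.h>

	/* hypothetical plugin helper: a new job was released; preempt if needed */
	static void check_for_preemption(int cpu, struct task_struct *released,
					 struct task_struct *scheduled)
	{
		if (!scheduled || litmus->compare(released, scheduled)) {
			if (cpu == smp_processor_id())
				litmus_reschedule_local();
			else
				litmus_reschedule(cpu);
		}
	}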
diff --git a/litmus/rsm_lock.c b/litmus/rsm_lock.c
new file mode 100644
index 000000000000..3dfd8ae9d221
--- /dev/null
+++ b/litmus/rsm_lock.c
@@ -0,0 +1,796 @@
1#include <linux/slab.h>
2#include <linux/uaccess.h>
3
4#include <litmus/trace.h>
5#include <litmus/sched_plugin.h>
6#include <litmus/rsm_lock.h>
7
8//#include <litmus/edf_common.h>
9
10#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
11#include <litmus/gpu_affinity.h>
12#endif
13
14
15/* caller is responsible for locking */
16static struct task_struct* rsm_mutex_find_hp_waiter(struct rsm_mutex *mutex,
17 struct task_struct* skip)
18{
19 wait_queue_t *q;
20 struct list_head *pos;
21 struct task_struct *queued = NULL, *found = NULL;
22
23#ifdef CONFIG_LITMUS_DGL_SUPPORT
24 dgl_wait_state_t *dgl_wait = NULL;
25#endif
26
27 list_for_each(pos, &mutex->wait.task_list) {
28 q = list_entry(pos, wait_queue_t, task_list);
29
30#ifdef CONFIG_LITMUS_DGL_SUPPORT
31 if(q->func == dgl_wake_up) {
32 dgl_wait = (dgl_wait_state_t*) q->private;
33 if(tsk_rt(dgl_wait->task)->blocked_lock == &mutex->litmus_lock) {
34 queued = dgl_wait->task;
35 }
36 else {
37 queued = NULL; // skip it.
38 }
39 }
40 else {
41 queued = (struct task_struct*) q->private;
42 }
43#else
44 queued = (struct task_struct*) q->private;
45#endif
46
47 /* Compare task prios, find high prio task. */
48 //if (queued && queued != skip && edf_higher_prio(queued, found)) {
49 if (queued && queued != skip && litmus->compare(queued, found)) {
50 found = queued;
51 }
52 }
53 return found;
54}
55
56
57#ifdef CONFIG_LITMUS_DGL_SUPPORT
58
59int rsm_mutex_is_owner(struct litmus_lock *l, struct task_struct *t)
60{
61 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
62 return(mutex->owner == t);
63}
64
65// return 1 if resource was immediately acquired.
66// Assumes mutex->lock is held.
67// Must set task state to TASK_UNINTERRUPTIBLE if task blocks.
68int rsm_mutex_dgl_lock(struct litmus_lock *l, dgl_wait_state_t* dgl_wait,
69 wait_queue_t* wq_node)
70{
71 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
72 struct task_struct *t = dgl_wait->task;
73
74	int acquired_immediately = 0;
75
76 BUG_ON(t != current);
77
78 if (mutex->owner) {
79 TRACE_TASK(t, "Enqueuing on lock %d.\n", l->ident);
80
81 init_dgl_waitqueue_entry(wq_node, dgl_wait);
82
83 set_task_state(t, TASK_UNINTERRUPTIBLE);
84 __add_wait_queue_tail_exclusive(&mutex->wait, wq_node);
85 } else {
86 TRACE_TASK(t, "Acquired lock %d with no blocking.\n", l->ident);
87
88 /* it's ours now */
89 mutex->owner = t;
90
91 raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);
92 binheap_add(&l->nest.hp_binheap_node, &tsk_rt(t)->hp_blocked_tasks,
93 struct nested_info, hp_binheap_node);
94 raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);
95
96		acquired_immediately = 1;
97 }
98
99	return acquired_immediately;
100}
101
102void rsm_mutex_enable_priority(struct litmus_lock *l,
103 dgl_wait_state_t* dgl_wait)
104{
105 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
106 struct task_struct *t = dgl_wait->task;
107 struct task_struct *owner = mutex->owner;
108 unsigned long flags = 0; // these are unused under DGL coarse-grain locking
109
110 BUG_ON(owner == t);
111
112 tsk_rt(t)->blocked_lock = l;
113 mb();
114
115 //if (edf_higher_prio(t, mutex->hp_waiter)) {
116 if (litmus->compare(t, mutex->hp_waiter)) {
117
118 struct task_struct *old_max_eff_prio;
119 struct task_struct *new_max_eff_prio;
120 struct task_struct *new_prio = NULL;
121
122 if(mutex->hp_waiter)
123 TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n",
124 mutex->hp_waiter->comm, mutex->hp_waiter->pid);
125 else
126 TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n");
127
128 raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);
129
130 old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
131 mutex->hp_waiter = t;
132 l->nest.hp_waiter_eff_prio = effective_priority(mutex->hp_waiter);
133 binheap_decrease(&l->nest.hp_binheap_node,
134 &tsk_rt(owner)->hp_blocked_tasks);
135 new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
136
137 if(new_max_eff_prio != old_max_eff_prio) {
138 TRACE_TASK(t, "is new hp_waiter.\n");
139
140 if ((effective_priority(owner) == old_max_eff_prio) ||
141 //(__edf_higher_prio(new_max_eff_prio, BASE, owner, EFFECTIVE))){
142 (litmus->__compare(new_max_eff_prio, BASE, owner, EFFECTIVE))){
143 new_prio = new_max_eff_prio;
144 }
145 }
146 else {
147 TRACE_TASK(t, "no change in max_eff_prio of heap.\n");
148 }
149
150 if(new_prio) {
151 litmus->nested_increase_prio(owner, new_prio,
152 &mutex->lock, flags); // unlocks lock.
153 }
154 else {
155 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
156 unlock_fine_irqrestore(&mutex->lock, flags);
157 }
158 }
159 else {
160 TRACE_TASK(t, "no change in hp_waiter.\n");
161 unlock_fine_irqrestore(&mutex->lock, flags);
162 }
163}
164
165static void select_next_lock_if_primary(struct litmus_lock *l,
166 dgl_wait_state_t *dgl_wait)
167{
168 if(tsk_rt(dgl_wait->task)->blocked_lock == l) {
169 TRACE_CUR("Lock %d in DGL was primary for %s/%d.\n",
170 l->ident, dgl_wait->task->comm, dgl_wait->task->pid);
171 tsk_rt(dgl_wait->task)->blocked_lock = NULL;
172 mb();
173 select_next_lock(dgl_wait /*, l*/); // pick the next lock to be blocked on
174 }
175 else {
176 TRACE_CUR("Got lock early! Lock %d in DGL was NOT primary for %s/%d.\n",
177 l->ident, dgl_wait->task->comm, dgl_wait->task->pid);
178 }
179}
180#endif
181
182
183
184
185int rsm_mutex_lock(struct litmus_lock* l)
186{
187 struct task_struct *t = current;
188 struct task_struct *owner;
189 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
190 wait_queue_t wait;
191 unsigned long flags;
192
193#ifdef CONFIG_LITMUS_DGL_SUPPORT
194 raw_spinlock_t *dgl_lock;
195#endif
196
197 if (!is_realtime(t))
198 return -EPERM;
199
200#ifdef CONFIG_LITMUS_DGL_SUPPORT
201 dgl_lock = litmus->get_dgl_spinlock(t);
202#endif
203
204 lock_global_irqsave(dgl_lock, flags);
205 lock_fine_irqsave(&mutex->lock, flags);
206
207 if (mutex->owner) {
208 TRACE_TASK(t, "Blocking on lock %d.\n", l->ident);
209
210#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
211		// KLUDGE: don't count this suspension as time in the
212		// gpu critical section
213 if(tsk_rt(t)->held_gpus) {
214 tsk_rt(t)->suspend_gpu_tracker_on_block = 1;
215 }
216#endif
217
218 /* resource is not free => must suspend and wait */
219
220 owner = mutex->owner;
221
222 init_waitqueue_entry(&wait, t);
223
224 tsk_rt(t)->blocked_lock = l; /* record where we are blocked */
225 mb(); // needed?
226
227 /* FIXME: interruptible would be nice some day */
228 set_task_state(t, TASK_UNINTERRUPTIBLE);
229
230 __add_wait_queue_tail_exclusive(&mutex->wait, &wait);
231
232 /* check if we need to activate priority inheritance */
233 //if (edf_higher_prio(t, mutex->hp_waiter)) {
234 if (litmus->compare(t, mutex->hp_waiter)) {
235
236 struct task_struct *old_max_eff_prio;
237 struct task_struct *new_max_eff_prio;
238 struct task_struct *new_prio = NULL;
239
240 if(mutex->hp_waiter)
241 TRACE_TASK(t, "has higher prio than hp_waiter (%s/%d).\n",
242 mutex->hp_waiter->comm, mutex->hp_waiter->pid);
243 else
244 TRACE_TASK(t, "has higher prio than hp_waiter (NIL).\n");
245
246 raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);
247
248 old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
249 mutex->hp_waiter = t;
250 l->nest.hp_waiter_eff_prio = effective_priority(mutex->hp_waiter);
251 binheap_decrease(&l->nest.hp_binheap_node,
252 &tsk_rt(owner)->hp_blocked_tasks);
253 new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
254
255 if(new_max_eff_prio != old_max_eff_prio) {
256 TRACE_TASK(t, "is new hp_waiter.\n");
257
258 if ((effective_priority(owner) == old_max_eff_prio) ||
259 //(__edf_higher_prio(new_max_eff_prio, BASE, owner, EFFECTIVE))){
260 (litmus->__compare(new_max_eff_prio, BASE, owner, EFFECTIVE))){
261 new_prio = new_max_eff_prio;
262 }
263 }
264 else {
265 TRACE_TASK(t, "no change in max_eff_prio of heap.\n");
266 }
267
268 if(new_prio) {
269 litmus->nested_increase_prio(owner, new_prio, &mutex->lock,
270 flags); // unlocks lock.
271 }
272 else {
273 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
274 unlock_fine_irqrestore(&mutex->lock, flags);
275 }
276 }
277 else {
278 TRACE_TASK(t, "no change in hp_waiter.\n");
279
280 unlock_fine_irqrestore(&mutex->lock, flags);
281 }
282
283 unlock_global_irqrestore(dgl_lock, flags);
284
285 TS_LOCK_SUSPEND;
286
287 /* We depend on the FIFO order. Thus, we don't need to recheck
288 * when we wake up; we are guaranteed to have the lock since
289 * there is only one wake up per release.
290 */
291
292 suspend_for_lock();
293
294 TS_LOCK_RESUME;
295
296 /* Since we hold the lock, no other task will change
297 * ->owner. We can thus check it without acquiring the spin
298 * lock. */
299 BUG_ON(mutex->owner != t);
300
301 TRACE_TASK(t, "Acquired lock %d.\n", l->ident);
302
303 } else {
304 TRACE_TASK(t, "Acquired lock %d with no blocking.\n", l->ident);
305
306 /* it's ours now */
307 mutex->owner = t;
308
309 raw_spin_lock(&tsk_rt(mutex->owner)->hp_blocked_tasks_lock);
310 binheap_add(&l->nest.hp_binheap_node, &tsk_rt(t)->hp_blocked_tasks,
311 struct nested_info, hp_binheap_node);
312 raw_spin_unlock(&tsk_rt(mutex->owner)->hp_blocked_tasks_lock);
313
314
315 unlock_fine_irqrestore(&mutex->lock, flags);
316 unlock_global_irqrestore(dgl_lock, flags);
317 }
318
319 return 0;
320}
321
322
323
324int rsm_mutex_unlock(struct litmus_lock* l)
325{
326 struct task_struct *t = current, *next = NULL;
327 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
328 unsigned long flags;
329
330 struct task_struct *old_max_eff_prio;
331
332 int wake_up_task = 1;
333
334#ifdef CONFIG_LITMUS_DGL_SUPPORT
335 dgl_wait_state_t *dgl_wait = NULL;
336 raw_spinlock_t *dgl_lock = litmus->get_dgl_spinlock(t);
337#endif
338
339 int err = 0;
340
341 if (mutex->owner != t) {
342 err = -EINVAL;
343 return err;
344 }
345
346 lock_global_irqsave(dgl_lock, flags);
347 lock_fine_irqsave(&mutex->lock, flags);
348
349 raw_spin_lock(&tsk_rt(t)->hp_blocked_tasks_lock);
350
351 TRACE_TASK(t, "Freeing lock %d\n", l->ident);
352
353 old_max_eff_prio = top_priority(&tsk_rt(t)->hp_blocked_tasks);
354 binheap_delete(&l->nest.hp_binheap_node, &tsk_rt(t)->hp_blocked_tasks);
355
356 if(tsk_rt(t)->inh_task){
357 struct task_struct *new_max_eff_prio =
358 top_priority(&tsk_rt(t)->hp_blocked_tasks);
359
360 if((new_max_eff_prio == NULL) ||
361 /* there was a change in eff prio */
362 ( (new_max_eff_prio != old_max_eff_prio) &&
363 /* and owner had the old eff prio */
364 (effective_priority(t) == old_max_eff_prio)) )
365 {
366 // old_max_eff_prio > new_max_eff_prio
367
368 //if(__edf_higher_prio(new_max_eff_prio, BASE, t, EFFECTIVE)) {
369 if(litmus->__compare(new_max_eff_prio, BASE, t, EFFECTIVE)) {
370 TRACE_TASK(t, "new_max_eff_prio > task's eff_prio-- new_max_eff_prio: %s/%d task: %s/%d [%s/%d]\n",
371 new_max_eff_prio->comm, new_max_eff_prio->pid,
372 t->comm, t->pid, tsk_rt(t)->inh_task->comm,
373 tsk_rt(t)->inh_task->pid);
374 WARN_ON(1);
375 }
376
377 litmus->decrease_prio(t, new_max_eff_prio);
378 }
379 }
380
381 if(binheap_empty(&tsk_rt(t)->hp_blocked_tasks) &&
382 tsk_rt(t)->inh_task != NULL)
383 {
384 WARN_ON(tsk_rt(t)->inh_task != NULL);
385 TRACE_TASK(t, "No more locks are held, but eff_prio = %s/%d\n",
386 tsk_rt(t)->inh_task->comm, tsk_rt(t)->inh_task->pid);
387 }
388
389 raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock);
390
391
392 /* check if there are jobs waiting for this resource */
393#ifdef CONFIG_LITMUS_DGL_SUPPORT
394 __waitqueue_dgl_remove_first(&mutex->wait, &dgl_wait, &next);
395 if(dgl_wait) {
396 next = dgl_wait->task;
397 //select_next_lock_if_primary(l, dgl_wait);
398 }
399#else
400 next = __waitqueue_remove_first(&mutex->wait);
401#endif
402 if (next) {
403		/* next becomes the resource holder */
404 mutex->owner = next;
405 TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid);
406
407 /* determine new hp_waiter if necessary */
408 if (next == mutex->hp_waiter) {
409
410 TRACE_TASK(next, "was highest-prio waiter\n");
411 /* next has the highest priority --- it doesn't need to
412 * inherit. However, we need to make sure that the
413 * next-highest priority in the queue is reflected in
414 * hp_waiter. */
415 mutex->hp_waiter = rsm_mutex_find_hp_waiter(mutex, next);
416 l->nest.hp_waiter_eff_prio = (mutex->hp_waiter) ?
417 effective_priority(mutex->hp_waiter) :
418 NULL;
419
420 if (mutex->hp_waiter)
421 TRACE_TASK(mutex->hp_waiter, "is new highest-prio waiter\n");
422 else
423 TRACE("no further waiters\n");
424
425 raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock);
426
427 binheap_add(&l->nest.hp_binheap_node,
428 &tsk_rt(next)->hp_blocked_tasks,
429 struct nested_info, hp_binheap_node);
430
431#ifdef CONFIG_LITMUS_DGL_SUPPORT
432 if(dgl_wait) {
433 select_next_lock_if_primary(l, dgl_wait);
434 //wake_up_task = atomic_dec_and_test(&dgl_wait->nr_remaining);
435 --(dgl_wait->nr_remaining);
436 wake_up_task = (dgl_wait->nr_remaining == 0);
437 }
438#endif
439 raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock);
440 }
441 else {
442 /* Well, if 'next' is not the highest-priority waiter,
443 * then it (probably) ought to inherit the highest-priority
444 * waiter's priority. */
445 TRACE_TASK(next, "is not hp_waiter of lock %d.\n", l->ident);
446
447 raw_spin_lock(&tsk_rt(next)->hp_blocked_tasks_lock);
448
449 binheap_add(&l->nest.hp_binheap_node,
450 &tsk_rt(next)->hp_blocked_tasks,
451 struct nested_info, hp_binheap_node);
452
453#ifdef CONFIG_LITMUS_DGL_SUPPORT
454 if(dgl_wait) {
455 select_next_lock_if_primary(l, dgl_wait);
456 --(dgl_wait->nr_remaining);
457 wake_up_task = (dgl_wait->nr_remaining == 0);
458 }
459#endif
460
461 /* It is possible that 'next' *should* be the hp_waiter, but isn't
462 * because that update hasn't yet executed (update operation is
463			 * probably blocked on mutex->lock). So only inherit if the top of
464			 * 'next's blocked-tasks heap is indeed the effective prio. of hp_waiter.
465 * (We use l->hp_waiter_eff_prio instead of effective_priority(hp_waiter)
466 * since the effective priority of hp_waiter can change (and the
467 * update has not made it to this lock).)
468 */
469#ifdef CONFIG_LITMUS_DGL_SUPPORT
470 if((l->nest.hp_waiter_eff_prio != NULL) &&
471 (top_priority(&tsk_rt(next)->hp_blocked_tasks) ==
472 l->nest.hp_waiter_eff_prio))
473 {
474 if(dgl_wait && tsk_rt(next)->blocked_lock) {
475 BUG_ON(wake_up_task);
476 //if(__edf_higher_prio(l->nest.hp_waiter_eff_prio, BASE, next, EFFECTIVE)) {
477 if(litmus->__compare(l->nest.hp_waiter_eff_prio, BASE, next, EFFECTIVE)) {
478 litmus->nested_increase_prio(next,
479 l->nest.hp_waiter_eff_prio, &mutex->lock, flags); // unlocks lock && hp_blocked_tasks_lock.
480 goto out; // all spinlocks are released. bail out now.
481 }
482 }
483 else {
484 litmus->increase_prio(next, l->nest.hp_waiter_eff_prio);
485 }
486 }
487
488 raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock);
489#else
490 if(likely(top_priority(&tsk_rt(next)->hp_blocked_tasks) ==
491 l->nest.hp_waiter_eff_prio))
492 {
493 litmus->increase_prio(next, l->nest.hp_waiter_eff_prio);
494 }
495 raw_spin_unlock(&tsk_rt(next)->hp_blocked_tasks_lock);
496#endif
497 }
498
499 if(wake_up_task) {
500 TRACE_TASK(next, "waking up since it is no longer blocked.\n");
501
502 tsk_rt(next)->blocked_lock = NULL;
503 mb();
504
505#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
506 // re-enable tracking
507 if(tsk_rt(next)->held_gpus) {
508 tsk_rt(next)->suspend_gpu_tracker_on_block = 0;
509 }
510#endif
511
512 wake_up_process(next);
513 }
514 else {
515 TRACE_TASK(next, "is still blocked.\n");
516 }
517 }
518 else {
519 /* becomes available */
520 mutex->owner = NULL;
521 }
522
523 unlock_fine_irqrestore(&mutex->lock, flags);
524
525#ifdef CONFIG_LITMUS_DGL_SUPPORT
526out:
527#endif
528 unlock_global_irqrestore(dgl_lock, flags);
529
530 return err;
531}
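
The unlock path relies on the FIFO wait queue: the releasing task picks the next owner, installs it in mutex->owner, and issues exactly one wake-up, so the woken task never re-contends for the lock. Below is a minimal userspace analogue of that hand-off discipline, assuming POSIX threads; the names (ho_lock, ho_unlock, struct waiter) are made up for illustration:

/* Sketch only: direct FIFO hand-off. The releaser designates the next owner
 * before waking it, so the woken thread holds the lock without re-checking. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct waiter { pthread_cond_t cv; int granted; struct waiter *next; };

struct handoff_lock {
	pthread_mutex_t m;             /* protects all fields below */
	int held;
	struct waiter *head, *tail;
};

static void ho_lock(struct handoff_lock *l)
{
	pthread_mutex_lock(&l->m);
	if (!l->held) {
		l->held = 1;           /* uncontended fast path */
	} else {
		struct waiter w = { PTHREAD_COND_INITIALIZER, 0, NULL };
		if (l->tail) l->tail->next = &w; else l->head = &w;
		l->tail = &w;
		while (!w.granted)     /* woken only once ownership is ours */
			pthread_cond_wait(&w.cv, &l->m);
	}
	pthread_mutex_unlock(&l->m);
}

static void ho_unlock(struct handoff_lock *l)
{
	pthread_mutex_lock(&l->m);
	if (l->head) {
		struct waiter *w = l->head;
		l->head = w->next;
		if (!l->head) l->tail = NULL;
		w->granted = 1;                /* pass ownership directly */
		pthread_cond_signal(&w->cv);   /* one wake-up per release */
	} else {
		l->held = 0;
	}
	pthread_mutex_unlock(&l->m);
}

static struct handoff_lock L = { PTHREAD_MUTEX_INITIALIZER, 0, NULL, NULL };

static void *worker(void *arg)
{
	ho_lock(&L);
	printf("thread %ld owns the lock\n", (long)arg);
	usleep(1000);
	ho_unlock(&L);
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	for (long i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}
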
532
533
534void rsm_mutex_propagate_increase_inheritance(struct litmus_lock* l,
535 struct task_struct* t,
536 raw_spinlock_t* to_unlock,
537 unsigned long irqflags)
538{
539 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
540
541 // relay-style locking
542 lock_fine(&mutex->lock);
543 unlock_fine(to_unlock);
544
545	if(tsk_rt(t)->blocked_lock == l) { // prevent race on tsk_rt(t)->blocked_lock
546 struct task_struct *owner = mutex->owner;
547
548 struct task_struct *old_max_eff_prio;
549 struct task_struct *new_max_eff_prio;
550
551 raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);
552
553 old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
554
555 //if((t != mutex->hp_waiter) && edf_higher_prio(t, mutex->hp_waiter)) {
556 if((t != mutex->hp_waiter) && litmus->compare(t, mutex->hp_waiter)) {
557 TRACE_TASK(t, "is new highest-prio waiter by propagation.\n");
558 mutex->hp_waiter = t;
559 }
560 if(t == mutex->hp_waiter) {
561 // reflect the decreased priority in the heap node.
562 l->nest.hp_waiter_eff_prio = effective_priority(mutex->hp_waiter);
563
564 BUG_ON(!binheap_is_in_heap(&l->nest.hp_binheap_node));
565 BUG_ON(!binheap_is_in_this_heap(&l->nest.hp_binheap_node,
566 &tsk_rt(owner)->hp_blocked_tasks));
567
568 binheap_decrease(&l->nest.hp_binheap_node,
569 &tsk_rt(owner)->hp_blocked_tasks);
570 }
571
572 new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
573
574
575 if(new_max_eff_prio != old_max_eff_prio) {
576 // new_max_eff_prio > old_max_eff_prio holds.
577 if ((effective_priority(owner) == old_max_eff_prio) ||
578 //(__edf_higher_prio(new_max_eff_prio, BASE, owner, EFFECTIVE))) {
579 (litmus->__compare(new_max_eff_prio, BASE, owner, EFFECTIVE))) {
580 TRACE_CUR("Propagating inheritance to holder of lock %d.\n",
581 l->ident);
582
583 // beware: recursion
584 litmus->nested_increase_prio(owner, new_max_eff_prio,
585 &mutex->lock, irqflags); // unlocks mutex->lock
586 }
587 else {
588 TRACE_CUR("Lower priority than holder %s/%d. No propagation.\n",
589 owner->comm, owner->pid);
590 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
591 unlock_fine_irqrestore(&mutex->lock, irqflags);
592 }
593 }
594 else {
595			TRACE_TASK(mutex->owner, "No change in maximum effective priority.\n");
596 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
597 unlock_fine_irqrestore(&mutex->lock, irqflags);
598 }
599 }
600 else {
601 struct litmus_lock *still_blocked = tsk_rt(t)->blocked_lock;
602
603 TRACE_TASK(t, "is not blocked on lock %d.\n", l->ident);
604 if(still_blocked) {
605 TRACE_TASK(t, "is still blocked on a lock though (lock %d).\n",
606 still_blocked->ident);
607 if(still_blocked->ops->propagate_increase_inheritance) {
608 /* due to relay-style nesting of spinlocks (acq. A, acq. B, free A, free B)
609 we know that task 't' has not released any locks behind us in this
610 chain. Propagation just needs to catch up with task 't'. */
611 still_blocked->ops->propagate_increase_inheritance(still_blocked,
612 t,
613 &mutex->lock,
614 irqflags);
615 }
616 else {
617 TRACE_TASK(t,
618 "Inheritor is blocked on lock (%p) that does not "
619 "support nesting!\n",
620 still_blocked);
621 unlock_fine_irqrestore(&mutex->lock, irqflags);
622 }
623 }
624 else {
625 unlock_fine_irqrestore(&mutex->lock, irqflags);
626 }
627 }
628}
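
Both propagate paths use relay-style ("hand-over-hand") fine-grained locking: the next lock in the blocking chain is acquired before the previous one is released, so the chain cannot change behind the traversal. A compact userspace sketch of the pattern, with hypothetical names and pthread spinlocks standing in for the kernel's fine-grained locks:

/* Sketch only: relay-style locking along a blocking chain. At every step the
 * next lock is taken before the previous one is dropped. */
#include <pthread.h>
#include <stdio.h>

struct chain_node {
	pthread_spinlock_t lock;
	int prio;                     /* value being propagated; smaller = higher */
	struct chain_node *next;      /* lock this task is blocked on, if any */
};

static void propagate(struct chain_node *start, int new_prio)
{
	pthread_spin_lock(&start->lock);
	struct chain_node *cur = start;

	while (cur) {
		if (new_prio < cur->prio)
			cur->prio = new_prio;
		struct chain_node *nxt = cur->next;
		if (nxt)
			pthread_spin_lock(&nxt->lock);   /* take the next hop first... */
		pthread_spin_unlock(&cur->lock);     /* ...then release the current one */
		cur = nxt;
	}
}

int main(void)
{
	struct chain_node c = { .prio = 40, .next = NULL };
	struct chain_node b = { .prio = 30, .next = &c };
	struct chain_node a = { .prio = 50, .next = &b };
	pthread_spin_init(&a.lock, 0);
	pthread_spin_init(&b.lock, 0);
	pthread_spin_init(&c.lock, 0);

	propagate(&a, 10);   /* a new high-priority waiter arrives at 'a' */
	printf("%d %d %d\n", a.prio, b.prio, c.prio);   /* 10 10 10 */
	return 0;
}
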
629
630
631void rsm_mutex_propagate_decrease_inheritance(struct litmus_lock* l,
632 struct task_struct* t,
633 raw_spinlock_t* to_unlock,
634 unsigned long irqflags)
635{
636 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
637
638 // relay-style locking
639 lock_fine(&mutex->lock);
640 unlock_fine(to_unlock);
641
642	if(tsk_rt(t)->blocked_lock == l) { // prevent race on tsk_rt(t)->blocked_lock
643 if(t == mutex->hp_waiter) {
644 struct task_struct *owner = mutex->owner;
645
646 struct task_struct *old_max_eff_prio;
647 struct task_struct *new_max_eff_prio;
648
649 raw_spin_lock(&tsk_rt(owner)->hp_blocked_tasks_lock);
650
651 old_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
652
653 binheap_delete(&l->nest.hp_binheap_node, &tsk_rt(owner)->hp_blocked_tasks);
654 mutex->hp_waiter = rsm_mutex_find_hp_waiter(mutex, NULL);
655 l->nest.hp_waiter_eff_prio = (mutex->hp_waiter) ?
656 effective_priority(mutex->hp_waiter) : NULL;
657 binheap_add(&l->nest.hp_binheap_node,
658 &tsk_rt(owner)->hp_blocked_tasks,
659 struct nested_info, hp_binheap_node);
660
661 new_max_eff_prio = top_priority(&tsk_rt(owner)->hp_blocked_tasks);
662
663 if((old_max_eff_prio != new_max_eff_prio) &&
664 (effective_priority(owner) == old_max_eff_prio))
665 {
666 // Need to set new effective_priority for owner
667
668 struct task_struct *decreased_prio;
669
670 TRACE_CUR("Propagating decreased inheritance to holder of lock %d.\n",
671 l->ident);
672
673 //if(__edf_higher_prio(new_max_eff_prio, BASE, owner, BASE)) {
674 if(litmus->__compare(new_max_eff_prio, BASE, owner, BASE)) {
675 TRACE_CUR("%s/%d has greater base priority than base priority of owner (%s/%d) of lock %d.\n",
676 (new_max_eff_prio) ? new_max_eff_prio->comm : "nil",
677 (new_max_eff_prio) ? new_max_eff_prio->pid : -1,
678 owner->comm,
679 owner->pid,
680 l->ident);
681
682 decreased_prio = new_max_eff_prio;
683 }
684 else {
685 TRACE_CUR("%s/%d has lesser base priority than base priority of owner (%s/%d) of lock %d.\n",
686 (new_max_eff_prio) ? new_max_eff_prio->comm : "nil",
687 (new_max_eff_prio) ? new_max_eff_prio->pid : -1,
688 owner->comm,
689 owner->pid,
690 l->ident);
691
692 decreased_prio = NULL;
693 }
694
695 // beware: recursion
696 litmus->nested_decrease_prio(owner, decreased_prio, &mutex->lock, irqflags); // will unlock mutex->lock
697 }
698 else {
699 raw_spin_unlock(&tsk_rt(owner)->hp_blocked_tasks_lock);
700 unlock_fine_irqrestore(&mutex->lock, irqflags);
701 }
702 }
703 else {
704 TRACE_TASK(t, "is not hp_waiter. No propagation.\n");
705 unlock_fine_irqrestore(&mutex->lock, irqflags);
706 }
707 }
708 else {
709 struct litmus_lock *still_blocked = tsk_rt(t)->blocked_lock;
710
711 TRACE_TASK(t, "is not blocked on lock %d.\n", l->ident);
712 if(still_blocked) {
713 TRACE_TASK(t, "is still blocked on a lock though (lock %d).\n",
714 still_blocked->ident);
715 if(still_blocked->ops->propagate_decrease_inheritance) {
716 /* due to linked nesting of spinlocks (acq. A, acq. B, free A, free B)
717 we know that task 't' has not released any locks behind us in this
718 chain. propagation just needs to catch up with task 't' */
719 still_blocked->ops->propagate_decrease_inheritance(still_blocked,
720 t,
721 &mutex->lock,
722 irqflags);
723 }
724 else {
725 TRACE_TASK(t, "Inheritor is blocked on lock (%p) that does not support nesting!\n",
726 still_blocked);
727 unlock_fine_irqrestore(&mutex->lock, irqflags);
728 }
729 }
730 else {
731 unlock_fine_irqrestore(&mutex->lock, irqflags);
732 }
733 }
734}
735
736
737int rsm_mutex_close(struct litmus_lock* l)
738{
739 struct task_struct *t = current;
740 struct rsm_mutex *mutex = rsm_mutex_from_lock(l);
741 unsigned long flags;
742
743 int owner;
744
745#ifdef CONFIG_LITMUS_DGL_SUPPORT
746 raw_spinlock_t *dgl_lock = litmus->get_dgl_spinlock(t);
747#endif
748
749 lock_global_irqsave(dgl_lock, flags);
750 lock_fine_irqsave(&mutex->lock, flags);
751
752 owner = (mutex->owner == t);
753
754 unlock_fine_irqrestore(&mutex->lock, flags);
755 unlock_global_irqrestore(dgl_lock, flags);
756
757 if (owner)
758 rsm_mutex_unlock(l);
759
760 return 0;
761}
762
763void rsm_mutex_free(struct litmus_lock* lock)
764{
765 kfree(rsm_mutex_from_lock(lock));
766}
767
768struct litmus_lock* rsm_mutex_new(struct litmus_lock_ops* ops)
769{
770 struct rsm_mutex* mutex;
771
772 mutex = kmalloc(sizeof(*mutex), GFP_KERNEL);
773 if (!mutex)
774 return NULL;
775
776 mutex->litmus_lock.ops = ops;
777 mutex->owner = NULL;
778 mutex->hp_waiter = NULL;
779 init_waitqueue_head(&mutex->wait);
780
781
782#ifdef CONFIG_DEBUG_SPINLOCK
783 {
784 __raw_spin_lock_init(&mutex->lock,
785 ((struct litmus_lock*)mutex)->cheat_lockdep,
786 &((struct litmus_lock*)mutex)->key);
787 }
788#else
789 raw_spin_lock_init(&mutex->lock);
790#endif
791
792 ((struct litmus_lock*)mutex)->nest.hp_waiter_ptr = &mutex->hp_waiter;
793
794 return &mutex->litmus_lock;
795}
796
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
index 1683d3847560..54322e278a1e 100644
--- a/litmus/rt_domain.c
+++ b/litmus/rt_domain.c
@@ -300,10 +300,15 @@ void rt_domain_init(rt_domain_t *rt,
300 */ 300 */
301void __add_ready(rt_domain_t* rt, struct task_struct *new) 301void __add_ready(rt_domain_t* rt, struct task_struct *new)
302{ 302{
303 TRACE("rt: adding %s/%d (%llu, %llu, %llu) rel=%llu " 303 TRACE("rt: adding %s/%d (%llu, %llu, %llu) "
304	 "to ready queue at %llu\n", 304	 "[inh_task: %s/%d (%llu, %llu, %llu)] "
305 new->comm, new->pid, 305 "rel=%llu to ready queue at %llu\n",
306 get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new), 306 new->comm, new->pid, get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new),
307 (tsk_rt(new)->inh_task) ? tsk_rt(new)->inh_task->comm : "(nil)",
308 (tsk_rt(new)->inh_task) ? tsk_rt(new)->inh_task->pid : 0,
309 (tsk_rt(new)->inh_task) ? get_exec_cost(tsk_rt(new)->inh_task) : 0,
310 (tsk_rt(new)->inh_task) ? get_rt_period(tsk_rt(new)->inh_task) : 0,
311 (tsk_rt(new)->inh_task) ? get_rt_relative_deadline(tsk_rt(new)->inh_task) : 0,
307 get_release(new), litmus_clock()); 312 get_release(new), litmus_clock());
308 313
309 BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node)); 314 BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
index b45b46fc4fca..db47f4413329 100644
--- a/litmus/sched_cedf.c
+++ b/litmus/sched_cedf.c
@@ -29,7 +29,7 @@
29#include <linux/percpu.h> 29#include <linux/percpu.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32 32#include <linux/uaccess.h>
33#include <linux/module.h> 33#include <linux/module.h>
34 34
35#include <litmus/litmus.h> 35#include <litmus/litmus.h>
@@ -43,14 +43,48 @@
43#include <litmus/clustered.h> 43#include <litmus/clustered.h>
44 44
45#include <litmus/bheap.h> 45#include <litmus/bheap.h>
46#include <litmus/binheap.h>
47#include <litmus/trace.h>
48
49#ifdef CONFIG_LITMUS_LOCKING
50#include <litmus/kfmlp_lock.h>
51#endif
52
53#ifdef CONFIG_LITMUS_NESTED_LOCKING
54#include <litmus/rsm_lock.h>
55#include <litmus/ikglp_lock.h>
56#endif
46 57
47#ifdef CONFIG_SCHED_CPU_AFFINITY 58#ifdef CONFIG_SCHED_CPU_AFFINITY
48#include <litmus/affinity.h> 59#include <litmus/affinity.h>
49#endif 60#endif
50 61
62#ifdef CONFIG_REALTIME_AUX_TASKS
63#include <litmus/aux_tasks.h>
64#endif
65
51/* to configure the cluster size */ 66/* to configure the cluster size */
52#include <litmus/litmus_proc.h> 67#include <litmus/litmus_proc.h>
53#include <linux/uaccess.h> 68
69#ifdef CONFIG_SCHED_CPU_AFFINITY
70#include <litmus/affinity.h>
71#endif
72
73#ifdef CONFIG_LITMUS_SOFTIRQD
74#include <litmus/litmus_softirq.h>
75#endif
76
77#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
78#include <linux/interrupt.h>
79#endif
80
81#ifdef CONFIG_LITMUS_NVIDIA
82#include <litmus/nvidia_info.h>
83#endif
84
85#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
86#include <litmus/gpu_affinity.h>
87#endif
54 88
55/* Reference configuration variable. Determines which cache level is used to 89/* Reference configuration variable. Determines which cache level is used to
56 * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that 90 * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that
@@ -71,7 +105,7 @@ typedef struct {
71 struct task_struct* linked; /* only RT tasks */ 105 struct task_struct* linked; /* only RT tasks */
72 struct task_struct* scheduled; /* only RT tasks */ 106 struct task_struct* scheduled; /* only RT tasks */
73 atomic_t will_schedule; /* prevent unneeded IPIs */ 107 atomic_t will_schedule; /* prevent unneeded IPIs */
74 struct bheap_node* hn; 108 struct binheap_node hn;
75} cpu_entry_t; 109} cpu_entry_t;
76 110
77/* one cpu_entry_t per CPU */ 111/* one cpu_entry_t per CPU */
@@ -97,10 +131,17 @@ typedef struct clusterdomain {
97 /* map of this cluster cpus */ 131 /* map of this cluster cpus */
98 cpumask_var_t cpu_map; 132 cpumask_var_t cpu_map;
99 /* the cpus queue themselves according to priority in here */ 133 /* the cpus queue themselves according to priority in here */
100 struct bheap_node *heap_node; 134 struct binheap cpu_heap;
101 struct bheap cpu_heap;
102 /* lock for this cluster */ 135 /* lock for this cluster */
103#define cluster_lock domain.ready_lock 136#define cluster_lock domain.ready_lock
137
138#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
139 struct tasklet_head pending_tasklets;
140#endif
141
142#ifdef CONFIG_LITMUS_DGL_SUPPORT
143 raw_spinlock_t dgl_lock;
144#endif
104} cedf_domain_t; 145} cedf_domain_t;
105 146
106/* a cedf_domain per cluster; allocation is done at init/activation time */ 147/* a cedf_domain per cluster; allocation is done at init/activation time */
@@ -109,6 +150,29 @@ cedf_domain_t *cedf;
109#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster) 150#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
110#define task_cpu_cluster(task) remote_cluster(get_partition(task)) 151#define task_cpu_cluster(task) remote_cluster(get_partition(task))
111 152
153/* total number of clusters */
154static int num_clusters;
155/* we do not support clusters of different sizes */
156static unsigned int cluster_size;
157
158static int clusters_allocated = 0;
159
160
161#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
162static int num_gpu_clusters;
163static unsigned int gpu_cluster_size;
164#endif
165
166
167#ifdef CONFIG_LITMUS_DGL_SUPPORT
168static raw_spinlock_t* cedf_get_dgl_spinlock(struct task_struct *t)
169{
170 cedf_domain_t *cluster = task_cpu_cluster(t);
171 return(&cluster->dgl_lock);
172}
173#endif
174
175
112/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling 176/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
113 * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose 177 * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose
114 * information during the initialization of the plugin (e.g., topology) 178 * information during the initialization of the plugin (e.g., topology)
@@ -116,11 +180,11 @@ cedf_domain_t *cedf;
116 */ 180 */
117#define VERBOSE_INIT 181#define VERBOSE_INIT
118 182
119static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) 183static int cpu_lower_prio(struct binheap_node *_a, struct binheap_node *_b)
120{ 184{
121 cpu_entry_t *a, *b; 185 cpu_entry_t *a = binheap_entry(_a, cpu_entry_t, hn);
122 a = _a->value; 186 cpu_entry_t *b = binheap_entry(_b, cpu_entry_t, hn);
123 b = _b->value; 187
124 /* Note that a and b are inverted: we want the lowest-priority CPU at 188 /* Note that a and b are inverted: we want the lowest-priority CPU at
125 * the top of the heap. 189 * the top of the heap.
126 */ 190 */
@@ -134,20 +198,17 @@ static void update_cpu_position(cpu_entry_t *entry)
134{ 198{
135 cedf_domain_t *cluster = entry->cluster; 199 cedf_domain_t *cluster = entry->cluster;
136 200
137 if (likely(bheap_node_in_heap(entry->hn))) 201 if (likely(binheap_is_in_heap(&entry->hn))) {
138 bheap_delete(cpu_lower_prio, 202 binheap_delete(&entry->hn, &cluster->cpu_heap);
139 &cluster->cpu_heap, 203 }
140 entry->hn);
141 204
142 bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn); 205 binheap_add(&entry->hn, &cluster->cpu_heap, cpu_entry_t, hn);
143} 206}
144 207
145/* caller must hold cedf lock */ 208/* caller must hold cedf lock */
146static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster) 209static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
147{ 210{
148 struct bheap_node* hn; 211 return binheap_top_entry(&cluster->cpu_heap, cpu_entry_t, hn);
149 hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
150 return hn->value;
151} 212}
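
The comparator above is deliberately inverted so that the CPU running the lowest-priority job (or no job at all) ends up at the top of cpu_heap and is the natural preemption target. The sketch below shows only that ordering idea; a linear scan stands in for the binheap, priorities are plain integers (smaller = more urgent), an idle CPU is modelled with the worst possible key, and all names are hypothetical:

/* Sketch only: pick the CPU that every other CPU "beats", i.e. the one
 * running the least urgent work, as the preemption victim. */
#include <stdio.h>
#include <limits.h>

#define NCPU 4

struct cpu_entry { int cpu; int linked_prio; };   /* smaller prio = more urgent */

/* inverted comparison: a belongs above b if a is LESS urgent than b */
static int cpu_lower_prio(const struct cpu_entry *a, const struct cpu_entry *b)
{
	return a->linked_prio > b->linked_prio;
}

static struct cpu_entry *lowest_prio_cpu(struct cpu_entry *e, int n)
{
	struct cpu_entry *top = &e[0];
	for (int i = 1; i < n; i++)
		if (cpu_lower_prio(&e[i], top))
			top = &e[i];
	return top;
}

int main(void)
{
	struct cpu_entry e[NCPU] = {
		{ 0, 10 }, { 1, 40 }, { 2, INT_MAX /* idle */ }, { 3, 25 },
	};
	struct cpu_entry *victim = lowest_prio_cpu(e, NCPU);
	printf("preempt CPU %d (key %d)\n", victim->cpu, victim->linked_prio); /* CPU 2 */
	return 0;
}
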
152 213
153 214
@@ -209,7 +270,7 @@ static noinline void link_task_to_cpu(struct task_struct* linked,
209} 270}
210 271
211/* unlink - Make sure a task is not linked any longer to an entry 272/* unlink - Make sure a task is not linked any longer to an entry
212 * where it was linked before. Must hold cedf_lock. 273 * where it was linked before. Must hold cluster_lock.
213 */ 274 */
214static noinline void unlink(struct task_struct* t) 275static noinline void unlink(struct task_struct* t)
215{ 276{
@@ -245,7 +306,7 @@ static void preempt(cpu_entry_t *entry)
245} 306}
246 307
247/* requeue - Put an unlinked task into gsn-edf domain. 308/* requeue - Put an unlinked task into gsn-edf domain.
248 * Caller must hold cedf_lock. 309 * Caller must hold cluster_lock.
249 */ 310 */
250static noinline void requeue(struct task_struct* task) 311static noinline void requeue(struct task_struct* task)
251{ 312{
@@ -255,7 +316,15 @@ static noinline void requeue(struct task_struct* task)
255 BUG_ON(is_queued(task)); 316 BUG_ON(is_queued(task));
256 317
257 if (is_released(task, litmus_clock())) 318 if (is_released(task, litmus_clock()))
258 __add_ready(&cluster->domain, task); 319#ifdef CONFIG_REALTIME_AUX_TASKS
320 if (unlikely(tsk_rt(task)->is_aux_task && !is_running(task))) {
321 /* aux_task probably transitioned to real-time while it was blocked */
322 TRACE_CUR("aux task %s/%d is not ready!\n", task->comm, task->pid);
323 unlink(task); /* really needed? */
324 }
325 else
326#endif
327 __add_ready(&cluster->domain, task);
259 else { 328 else {
260 /* it has got to wait */ 329 /* it has got to wait */
261 add_release(&cluster->domain, task); 330 add_release(&cluster->domain, task);
@@ -340,13 +409,17 @@ static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
340 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); 409 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
341} 410}
342 411
343/* caller holds cedf_lock */ 412/* caller holds cluster_lock */
344static noinline void job_completion(struct task_struct *t, int forced) 413static noinline void job_completion(struct task_struct *t, int forced)
345{ 414{
346 BUG_ON(!t); 415 BUG_ON(!t);
347 416
348 sched_trace_task_completion(t, forced); 417 sched_trace_task_completion(t, forced);
349 418
419#ifdef CONFIG_LITMUS_NVIDIA
420 atomic_set(&tsk_rt(t)->nv_int_count, 0);
421#endif
422
350 TRACE_TASK(t, "job_completion().\n"); 423 TRACE_TASK(t, "job_completion().\n");
351 424
352 /* set flags */ 425 /* set flags */
@@ -371,25 +444,341 @@ static noinline void job_completion(struct task_struct *t, int forced)
371 */ 444 */
372static void cedf_tick(struct task_struct* t) 445static void cedf_tick(struct task_struct* t)
373{ 446{
374 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { 447 if (is_realtime(t) && budget_exhausted(t))
375 if (!is_np(t)) { 448 {
376 /* np tasks will be preempted when they become 449 if (budget_signalled(t) && !sigbudget_sent(t)) {
377 * preemptable again 450 /* signal exhaustion */
378 */ 451 send_sigbudget(t);
379 litmus_reschedule_local(); 452 }
380 set_will_schedule(); 453
381 TRACE("cedf_scheduler_tick: " 454 if (budget_enforced(t)) {
382 "%d is preemptable " 455 if (!is_np(t)) {
383 " => FORCE_RESCHED\n", t->pid); 456 /* np tasks will be preempted when they become
384 } else if (is_user_np(t)) { 457 * preemptable again
385 TRACE("cedf_scheduler_tick: " 458 */
386 "%d is non-preemptable, " 459 litmus_reschedule_local();
387 "preemption delayed.\n", t->pid); 460 set_will_schedule();
388 request_exit_np(t); 461 TRACE("cedf_scheduler_tick: "
462 "%d is preemptable "
463 " => FORCE_RESCHED\n", t->pid);
464 } else if (is_user_np(t)) {
465 TRACE("cedf_scheduler_tick: "
466 "%d is non-preemptable, "
467 "preemption delayed.\n", t->pid);
468 request_exit_np(t);
469 }
389 } 470 }
390 } 471 }
391} 472}
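
The reworked tick handler separates budget signalling from budget enforcement: the signal is sent at most once per exhausted budget, and a reschedule is forced only if enforcement is enabled and the task is preemptable. A small userspace sketch of that decision logic, with made-up flag names:

/* Sketch only: two independent reactions to budget exhaustion. */
#include <stdbool.h>
#include <stdio.h>

struct task_state {
	bool budget_exhausted;
	bool budget_signalled;   /* user asked for a budget signal */
	bool sig_sent;           /* signal already delivered for this job */
	bool budget_enforced;    /* scheduler should preempt on exhaustion */
	bool non_preemptive;     /* inside an np-section */
};

static void on_tick(struct task_state *t)
{
	if (!t->budget_exhausted)
		return;

	if (t->budget_signalled && !t->sig_sent) {
		printf("send budget signal (once per exhaustion)\n");
		t->sig_sent = true;
	}

	if (t->budget_enforced) {
		if (!t->non_preemptive)
			printf("force reschedule now\n");
		else
			printf("request exit from np-section; preemption delayed\n");
	}
}

int main(void)
{
	struct task_state t = { true, true, false, true, false };
	on_tick(&t);   /* signals and forces a reschedule */
	on_tick(&t);   /* signal not repeated; reschedule requested again */
	return 0;
}
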
392 473
474
475
476
477
478
479
480
481
482
483
484
485
486#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
487
488
489static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed)
490{
491 if (!atomic_read(&tasklet->count)) {
492 if(tasklet->owner) {
493 sched_trace_tasklet_begin(tasklet->owner);
494 }
495
496 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state))
497 {
498 BUG();
499 }
500 TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n",
501 __FUNCTION__,
502 (tasklet->owner) ? tasklet->owner->pid : -1,
503 (tasklet->owner) ? 0 : 1);
504 tasklet->func(tasklet->data);
505 tasklet_unlock(tasklet);
506
507 if(tasklet->owner) {
508 sched_trace_tasklet_end(tasklet->owner, flushed);
509 }
510 }
511 else {
512 BUG();
513 }
514}
515
516
517static void do_lit_tasklets(cedf_domain_t* cluster, struct task_struct* sched_task)
518{
519 int work_to_do = 1;
520 struct tasklet_struct *tasklet = NULL;
521 unsigned long flags;
522
523 while(work_to_do) {
524
525 TS_NV_SCHED_BOTISR_START;
526
527 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
528
529 if(cluster->pending_tasklets.head != NULL) {
530 // remove tasklet at head.
531 struct tasklet_struct *prev = NULL;
532 tasklet = cluster->pending_tasklets.head;
533
534 // find a tasklet with prio to execute; skip ones where
535 // sched_task has a higher priority.
536			// We use the '!edf' test instead of swapping function arguments since
537 // both sched_task and owner could be NULL. In this case, we want to
538 // still execute the tasklet.
539 while(tasklet && !edf_higher_prio(tasklet->owner, sched_task)) {
540 prev = tasklet;
541 tasklet = tasklet->next;
542 }
543
544			if(tasklet) { // found something to execute
545 // remove the tasklet from the queue
546 if(prev) {
547 prev->next = tasklet->next;
548 if(prev->next == NULL) {
549 TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid);
550						cluster->pending_tasklets.tail = &(prev->next);
551 }
552 }
553 else {
554 cluster->pending_tasklets.head = tasklet->next;
555 if(tasklet->next == NULL) {
556 TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid);
557 cluster->pending_tasklets.tail = &(cluster->pending_tasklets.head);
558 }
559 }
560 }
561 else {
562 TRACE("%s: No tasklets with eligible priority.\n", __FUNCTION__);
563 }
564 }
565 else {
566 TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__);
567 }
568
569 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
570
571 if(tasklet) {
572 __do_lit_tasklet(tasklet, 0ul);
573 tasklet = NULL;
574 }
575 else {
576 work_to_do = 0;
577 }
578
579 TS_NV_SCHED_BOTISR_END;
580 }
581}
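
do_lit_tasklets() follows the usual drain pattern: dequeue one eligible tasklet while holding the cluster lock, drop the lock, run the tasklet, and loop until nothing eligible remains. A stripped-down userspace sketch of that pattern (hypothetical names, a pthread mutex in place of the cluster lock):

/* Sketch only: pop under the lock, execute outside the lock, repeat. */
#include <pthread.h>
#include <stdio.h>

struct work { void (*fn)(int); int arg; struct work *next; };

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work *queue_head;

static void drain(void)
{
	for (;;) {
		pthread_mutex_lock(&queue_lock);
		struct work *w = queue_head;          /* take the head, if any */
		if (w)
			queue_head = w->next;
		pthread_mutex_unlock(&queue_lock);    /* never run work under the lock */

		if (!w)
			break;
		w->fn(w->arg);                        /* may itself take other locks */
	}
}

static void print_item(int x) { printf("ran item %d\n", x); }

int main(void)
{
	struct work b = { print_item, 2, NULL };
	struct work a = { print_item, 1, &b };
	queue_head = &a;
	drain();
	return 0;
}
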
582
583static void __add_pai_tasklet(struct tasklet_struct* tasklet, cedf_domain_t* cluster)
584{
585 struct tasklet_struct* step;
586
587 tasklet->next = NULL; // make sure there are no old values floating around
588
589 step = cluster->pending_tasklets.head;
590 if(step == NULL) {
591 TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid);
592 // insert at tail.
593 *(cluster->pending_tasklets.tail) = tasklet;
594 cluster->pending_tasklets.tail = &(tasklet->next);
595 }
596 else if((*(cluster->pending_tasklets.tail) != NULL) &&
597 edf_higher_prio((*(cluster->pending_tasklets.tail))->owner, tasklet->owner)) {
598 // insert at tail.
599 TRACE("%s: tasklet belongs at end. inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid);
600
601 *(cluster->pending_tasklets.tail) = tasklet;
602 cluster->pending_tasklets.tail = &(tasklet->next);
603 }
604 else {
605
606 // insert the tasklet somewhere in the middle.
607
608 TRACE("%s: tasklet belongs somewhere in the middle.\n", __FUNCTION__);
609
610 while(step->next && edf_higher_prio(step->next->owner, tasklet->owner)) {
611 step = step->next;
612 }
613
614 // insert tasklet right before step->next.
615
616 TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__,
617 tasklet->owner->pid,
618 (step->owner) ?
619 step->owner->pid :
620 -1,
621 (step->next) ?
622 ((step->next->owner) ?
623 step->next->owner->pid :
624 -1) :
625 -1);
626
627 tasklet->next = step->next;
628 step->next = tasklet;
629
630 // patch up the head if needed.
631 if(cluster->pending_tasklets.head == step)
632 {
633 TRACE("%s: %d is the new tasklet queue head.\n", __FUNCTION__, tasklet->owner->pid);
634 cluster->pending_tasklets.head = tasklet;
635 }
636 }
637}
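
__add_pai_tasklet() keeps the pending list sorted by owner priority and maintains a pointer to the last element's next field, so the empty-list, middle, and append cases reduce to the same link manipulation plus a tail fix-up. A self-contained sketch of the same insertion discipline, using plain integers as priorities (smaller = higher) and hypothetical names:

/* Sketch only: priority-ordered insertion into a singly linked list with a
 * pointer-to-pointer tail. */
#include <stdio.h>
#include <stddef.h>

struct item { int prio; struct item *next; };

struct plist {
	struct item *head;
	struct item **tail;    /* points at the last 'next' field (or at head) */
};

static void plist_init(struct plist *q) { q->head = NULL; q->tail = &q->head; }

static void plist_insert(struct plist *q, struct item *it)
{
	struct item **link = &q->head;

	/* walk past entries with equal or better (smaller) priority */
	while (*link && (*link)->prio <= it->prio)
		link = &(*link)->next;

	it->next = *link;
	*link = it;
	if (!it->next)              /* inserted at the end: fix up the tail */
		q->tail = &it->next;
}

int main(void)
{
	struct plist q; plist_init(&q);
	struct item a = { 30 }, b = { 10 }, c = { 20 };
	plist_insert(&q, &a);
	plist_insert(&q, &b);
	plist_insert(&q, &c);
	for (struct item *i = q.head; i; i = i->next)
		printf("%d ", i->prio);   /* 10 20 30 */
	printf("\n");
	return 0;
}
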
638
639static void cedf_run_tasklets(struct task_struct* sched_task)
640{
641 cedf_domain_t* cluster;
642
643 preempt_disable();
644
645 cluster = (is_realtime(sched_task)) ?
646 task_cpu_cluster(sched_task) :
647 remote_cluster(smp_processor_id());
648
649 if(cluster && cluster->pending_tasklets.head != NULL) {
650 TRACE("%s: There are tasklets to process.\n", __FUNCTION__);
651 do_lit_tasklets(cluster, sched_task);
652 }
653
654 preempt_enable_no_resched();
655}
656
657
658
659static int cedf_enqueue_pai_tasklet(struct tasklet_struct* tasklet)
660{
661#if 0
662 cedf_domain_t *cluster = NULL;
663 cpu_entry_t *targetCPU = NULL;
664 int thisCPU;
665 int runLocal = 0;
666 int runNow = 0;
667 unsigned long flags;
668
669 if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner)))
670 {
671 TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__);
672 return 0;
673 }
674
675 cluster = task_cpu_cluster(tasklet->owner);
676
677 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
678
679 thisCPU = smp_processor_id();
680
681#ifdef CONFIG_SCHED_CPU_AFFINITY
682 {
683 cpu_entry_t* affinity = NULL;
684
685 // use this CPU if it is in our cluster and isn't running any RT work.
686 if(cpu_isset(thisCPU, *cluster->cpu_map) && (__get_cpu_var(cedf_cpu_entries).linked == NULL)) {
687 affinity = &(__get_cpu_var(cedf_cpu_entries));
688 }
689 else {
690 // this CPU is busy or shouldn't run tasklet in this cluster.
691 // look for available near by CPUs.
692 // NOTE: Affinity towards owner and not this CPU. Is this right?
693 affinity =
694 cedf_get_nearest_available_cpu(cluster,
695 &per_cpu(cedf_cpu_entries, task_cpu(tasklet->owner)));
696 }
697
698 targetCPU = affinity;
699 }
700#endif
701
702 if (targetCPU == NULL) {
703 targetCPU = lowest_prio_cpu(cluster);
704 }
705
706 if (edf_higher_prio(tasklet->owner, targetCPU->linked)) {
707 if (thisCPU == targetCPU->cpu) {
708 TRACE("%s: Run tasklet locally (and now).\n", __FUNCTION__);
709 runLocal = 1;
710 runNow = 1;
711 }
712 else {
713 TRACE("%s: Run tasklet remotely (and now).\n", __FUNCTION__);
714 runLocal = 0;
715 runNow = 1;
716 }
717 }
718 else {
719 runLocal = 0;
720 runNow = 0;
721 }
722
723 if(!runLocal) {
724 // enqueue the tasklet
725 __add_pai_tasklet(tasklet, cluster);
726 }
727
728 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
729
730
731 if (runLocal /*&& runNow */) { // runNow == 1 is implied
732 TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__);
733 __do_lit_tasklet(tasklet, 0ul);
734 }
735 else if (runNow /*&& !runLocal */) { // runLocal == 0 is implied
736 TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, targetCPU->cpu);
737 preempt(targetCPU); // need to be protected by cluster_lock?
738 }
739 else {
740 TRACE("%s: Scheduling of tasklet was deferred.\n", __FUNCTION__);
741 }
742#else
743 TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__);
744 __do_lit_tasklet(tasklet, 0ul);
745#endif
746 return(1); // success
747}
748
749static void cedf_change_prio_pai_tasklet(struct task_struct *old_prio,
750 struct task_struct *new_prio)
751{
752 struct tasklet_struct* step;
753 unsigned long flags;
754 cedf_domain_t *cluster;
755 struct task_struct *probe;
756
757 // identify the cluster by the assignment of these tasks. one should
758 // be non-NULL.
759 probe = (old_prio) ? old_prio : new_prio;
760
761 if(probe) {
762 cluster = task_cpu_cluster(probe);
763
764 if(cluster->pending_tasklets.head != NULL) {
765 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
766 for(step = cluster->pending_tasklets.head; step != NULL; step = step->next) {
767 if(step->owner == old_prio) {
768 TRACE("%s: Found tasklet to change: %d\n", __FUNCTION__, step->owner->pid);
769 step->owner = new_prio;
770 }
771 }
772 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
773 }
774 }
775 else {
776		TRACE("%s: Both priorities were NULL\n", __FUNCTION__);
777 }
778}
779
780#endif // PAI
781
393/* Getting schedule() right is a bit tricky. schedule() may not make any 782/* Getting schedule() right is a bit tricky. schedule() may not make any
394 * assumptions on the state of the current task since it may be called for a 783 * assumptions on the state of the current task since it may be called for a
395 * number of reasons. The reasons include a scheduler_tick() determined that it 784 * number of reasons. The reasons include a scheduler_tick() determined that it
@@ -415,7 +804,7 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
415{ 804{
416 cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries); 805 cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
417 cedf_domain_t *cluster = entry->cluster; 806 cedf_domain_t *cluster = entry->cluster;
418 int out_of_time, sleep, preempt, np, exists, blocks; 807 int out_of_time, signal_budget, sleep, preempt, np, exists, blocks;
419 struct task_struct* next = NULL; 808 struct task_struct* next = NULL;
420 809
421#ifdef CONFIG_RELEASE_MASTER 810#ifdef CONFIG_RELEASE_MASTER
@@ -442,6 +831,10 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
442 out_of_time = exists && 831 out_of_time = exists &&
443 budget_enforced(entry->scheduled) && 832 budget_enforced(entry->scheduled) &&
444 budget_exhausted(entry->scheduled); 833 budget_exhausted(entry->scheduled);
834 signal_budget = exists &&
835 budget_signalled(entry->scheduled) &&
836 budget_exhausted(entry->scheduled) &&
837 !sigbudget_sent(entry->scheduled);
445 np = exists && is_np(entry->scheduled); 838 np = exists && is_np(entry->scheduled);
446 sleep = exists && is_completed(entry->scheduled); 839 sleep = exists && is_completed(entry->scheduled);
447 preempt = entry->scheduled != entry->linked; 840 preempt = entry->scheduled != entry->linked;
@@ -460,12 +853,28 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
460 TRACE_TASK(prev, "will be preempted by %s/%d\n", 853 TRACE_TASK(prev, "will be preempted by %s/%d\n",
461 entry->linked->comm, entry->linked->pid); 854 entry->linked->comm, entry->linked->pid);
462 855
856 /* Send the signal that the budget has been exhausted */
857 if (signal_budget)
858 send_sigbudget(entry->scheduled);
463 859
464 /* If a task blocks we have no choice but to reschedule. 860 /* If a task blocks we have no choice but to reschedule.
465 */ 861 */
466 if (blocks) 862 if (blocks)
467 unlink(entry->scheduled); 863 unlink(entry->scheduled);
468 864
865#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_AFFINITY_LOCKING)
866 if(exists && is_realtime(entry->scheduled) && tsk_rt(entry->scheduled)->held_gpus) {
867 if(!blocks || tsk_rt(entry->scheduled)->suspend_gpu_tracker_on_block) {
868 // don't track preemptions or locking protocol suspensions.
869 TRACE_TASK(entry->scheduled, "stopping GPU tracker.\n");
870 stop_gpu_tracker(entry->scheduled);
871 }
872 else if(blocks && !tsk_rt(entry->scheduled)->suspend_gpu_tracker_on_block) {
873 TRACE_TASK(entry->scheduled, "GPU tracker remains on during suspension.\n");
874 }
875 }
876#endif
877
469 /* Request a sys_exit_np() call if we would like to preempt but cannot. 878 /* Request a sys_exit_np() call if we would like to preempt but cannot.
470 * We need to make sure to update the link structure anyway in case 879 * We need to make sure to update the link structure anyway in case
471 * that we are still linked. Multiple calls to request_exit_np() don't 880 * that we are still linked. Multiple calls to request_exit_np() don't
@@ -515,7 +924,7 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
515 raw_spin_unlock(&cluster->cluster_lock); 924 raw_spin_unlock(&cluster->cluster_lock);
516 925
517#ifdef WANT_ALL_SCHED_EVENTS 926#ifdef WANT_ALL_SCHED_EVENTS
518 TRACE("cedf_lock released, next=0x%p\n", next); 927 TRACE("cluster_lock released, next=0x%p\n", next);
519 928
520 if (next) 929 if (next)
521 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); 930 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
@@ -523,7 +932,6 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
523 TRACE("becomes idle at %llu.\n", litmus_clock()); 932 TRACE("becomes idle at %llu.\n", litmus_clock());
524#endif 933#endif
525 934
526
527 return next; 935 return next;
528} 936}
529 937
@@ -549,7 +957,7 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running)
549 cpu_entry_t* entry; 957 cpu_entry_t* entry;
550 cedf_domain_t* cluster; 958 cedf_domain_t* cluster;
551 959
552 TRACE("gsn edf: task new %d\n", t->pid); 960 TRACE("c-edf: task new %d\n", t->pid);
553 961
554 /* the cluster doesn't change even if t is running */ 962 /* the cluster doesn't change even if t is running */
555 cluster = task_cpu_cluster(t); 963 cluster = task_cpu_cluster(t);
@@ -587,7 +995,7 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running)
587static void cedf_task_wake_up(struct task_struct *task) 995static void cedf_task_wake_up(struct task_struct *task)
588{ 996{
589 unsigned long flags; 997 unsigned long flags;
590 lt_t now; 998 //lt_t now;
591 cedf_domain_t *cluster; 999 cedf_domain_t *cluster;
592 1000
593 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); 1001 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
@@ -595,6 +1003,9 @@ static void cedf_task_wake_up(struct task_struct *task)
595 cluster = task_cpu_cluster(task); 1003 cluster = task_cpu_cluster(task);
596 1004
597 raw_spin_lock_irqsave(&cluster->cluster_lock, flags); 1005 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
1006
1007#if 0
1008 /* sporadic task model. will increment job numbers automatically */
598 now = litmus_clock(); 1009 now = litmus_clock();
599 if (is_tardy(task, now)) { 1010 if (is_tardy(task, now)) {
600 /* new sporadic release */ 1011 /* new sporadic release */
@@ -608,6 +1019,26 @@ static void cedf_task_wake_up(struct task_struct *task)
608 tsk_rt(task)->completed = 0; 1019 tsk_rt(task)->completed = 0;
609 } 1020 }
610 } 1021 }
1022#else
1023 /* periodic task model. don't force job to end.
1024 * rely on user to say when jobs complete or when budget expires. */
1025 tsk_rt(task)->completed = 0;
1026#endif
1027
1028#ifdef CONFIG_REALTIME_AUX_TASKS
1029 if (tsk_rt(task)->has_aux_tasks && !tsk_rt(task)->hide_from_aux_tasks) {
1030 TRACE_CUR("%s/%d is ready so aux tasks may not inherit.\n", task->comm, task->pid);
1031 disable_aux_task_owner(task);
1032 }
1033#endif
1034
1035#ifdef CONFIG_LITMUS_NVIDIA
1036 if (tsk_rt(task)->held_gpus && !tsk_rt(task)->hide_from_gpu) {
1037 TRACE_CUR("%s/%d is ready so gpu klmirqd tasks may not inherit.\n", task->comm, task->pid);
1038 disable_gpu_owner(task);
1039 }
1040#endif
1041
611 cedf_job_arrival(task); 1042 cedf_job_arrival(task);
612 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); 1043 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
613} 1044}
@@ -623,7 +1054,25 @@ static void cedf_task_block(struct task_struct *t)
623 1054
624 /* unlink if necessary */ 1055 /* unlink if necessary */
625 raw_spin_lock_irqsave(&cluster->cluster_lock, flags); 1056 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
1057
626 unlink(t); 1058 unlink(t);
1059
1060#ifdef CONFIG_REALTIME_AUX_TASKS
1061 if (tsk_rt(t)->has_aux_tasks && !tsk_rt(t)->hide_from_aux_tasks) {
1062
1063 TRACE_CUR("%s/%d is blocked so aux tasks may inherit.\n", t->comm, t->pid);
1064 enable_aux_task_owner(t);
1065 }
1066#endif
1067
1068#ifdef CONFIG_LITMUS_NVIDIA
1069 if (tsk_rt(t)->held_gpus && !tsk_rt(t)->hide_from_gpu) {
1070
1071		TRACE_CUR("%s/%d is blocked so gpu klmirqd tasks may inherit.\n", t->comm, t->pid);
1072 enable_gpu_owner(t);
1073 }
1074#endif
1075
627 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); 1076 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
628 1077
629 BUG_ON(!is_realtime(t)); 1078 BUG_ON(!is_realtime(t));
@@ -635,8 +1084,30 @@ static void cedf_task_exit(struct task_struct * t)
635 unsigned long flags; 1084 unsigned long flags;
636 cedf_domain_t *cluster = task_cpu_cluster(t); 1085 cedf_domain_t *cluster = task_cpu_cluster(t);
637 1086
1087#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1088 cedf_change_prio_pai_tasklet(t, NULL);
1089#endif
1090
638 /* unlink if necessary */ 1091 /* unlink if necessary */
639 raw_spin_lock_irqsave(&cluster->cluster_lock, flags); 1092 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
1093
1094#ifdef CONFIG_REALTIME_AUX_TASKS
1095 /* make sure we clean up on our way out */
1096 if (unlikely(tsk_rt(t)->is_aux_task)) {
1097 exit_aux_task(t);
1098 }
1099 else if(tsk_rt(t)->has_aux_tasks) {
1100 disable_aux_task_owner(t);
1101 }
1102#endif
1103
1104#ifdef CONFIG_LITMUS_NVIDIA
1105 /* make sure we clean up on our way out */
1106 if(tsk_rt(t)->held_gpus) {
1107 disable_gpu_owner(t);
1108 }
1109#endif
1110
640 unlink(t); 1111 unlink(t);
641 if (tsk_rt(t)->scheduled_on != NO_CPU) { 1112 if (tsk_rt(t)->scheduled_on != NO_CPU) {
642 cpu_entry_t *cpu; 1113 cpu_entry_t *cpu;
@@ -652,13 +1123,505 @@ static void cedf_task_exit(struct task_struct * t)
652 1123
653static long cedf_admit_task(struct task_struct* tsk) 1124static long cedf_admit_task(struct task_struct* tsk)
654{ 1125{
1126#ifdef CONFIG_LITMUS_NESTED_LOCKING
1127 INIT_BINHEAP_HANDLE(&tsk_rt(tsk)->hp_blocked_tasks,
1128 edf_max_heap_base_priority_order);
1129#endif
1130
655 return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; 1131 return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
656} 1132}
657 1133
658/* total number of cluster */ 1134
659static int num_clusters; 1135
660/* we do not support cluster of different sizes */ 1136#ifdef CONFIG_LITMUS_LOCKING
661static unsigned int cluster_size; 1137
1138#include <litmus/fdso.h>
1139
1140
1141
1142/* called with IRQs off */
1143static int __increase_priority_inheritance(struct task_struct* t,
1144 struct task_struct* prio_inh)
1145{
1146 int success = 1;
1147 int linked_on;
1148 int check_preempt = 0;
1149 cedf_domain_t* cluster;
1150
1151 if (prio_inh && prio_inh == effective_priority(t)) {
1152 /* relationship already established. */
1153 TRACE_TASK(t, "already has effective priority of %s/%d\n",
1154 prio_inh->comm, prio_inh->pid);
1155 goto out;
1156 }
1157
1158 cluster = task_cpu_cluster(t);
1159
1160#ifdef CONFIG_LITMUS_NESTED_LOCKING
1161 /* this sanity check allows for weaker locking in protocols */
1162 /* TODO (klmirqd): Skip this check if 't' is a proxy thread (???) */
1163 if(__edf_higher_prio(prio_inh, BASE, t, EFFECTIVE)) {
1164#endif
1165 TRACE_TASK(t, "inherits priority from %s/%d\n",
1166 prio_inh->comm, prio_inh->pid);
1167 tsk_rt(t)->inh_task = prio_inh;
1168
1169 linked_on = tsk_rt(t)->linked_on;
1170
1171 /* If it is scheduled, then we need to reorder the CPU heap. */
1172 if (linked_on != NO_CPU) {
1173 TRACE_TASK(t, "%s: linked on %d\n",
1174 __FUNCTION__, linked_on);
1175 /* Holder is scheduled; need to re-order CPUs.
1176 * We can't use heap_decrease() here since
1177 * the cpu_heap is ordered in reverse direction, so
1178 * it is actually an increase. */
1179 binheap_delete(&per_cpu(cedf_cpu_entries, linked_on).hn,
1180 &cluster->cpu_heap);
1181 binheap_add(&per_cpu(cedf_cpu_entries, linked_on).hn,
1182 &cluster->cpu_heap, cpu_entry_t, hn);
1183
1184 } else {
1185 /* holder may be queued: first stop queue changes */
1186 raw_spin_lock(&cluster->domain.release_lock);
1187 if (is_queued(t)) {
1188 TRACE_TASK(t, "%s: is queued\n",
1189 __FUNCTION__);
1190 /* We need to update the position of holder in some
1191			 * heap. Note that this could be a release heap if
1192 * budget enforcement is used and this job overran. */
1193 check_preempt =
1194 !bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node);
1195 } else {
1196 /* Nothing to do: if it is not queued and not linked
1197 * then it is either sleeping or currently being moved
1198 * by other code (e.g., a timer interrupt handler) that
1199 * will use the correct priority when enqueuing the
1200 * task. */
1201 TRACE_TASK(t, "%s: is NOT queued => Done.\n",
1202 __FUNCTION__);
1203 }
1204 raw_spin_unlock(&cluster->domain.release_lock);
1205
1206 /* If holder was enqueued in a release heap, then the following
1207 * preemption check is pointless, but we can't easily detect
1208 * that case. If you want to fix this, then consider that
1209 * simply adding a state flag requires O(n) time to update when
1210 * releasing n tasks, which conflicts with the goal to have
1211 * O(log n) merges. */
1212 if (check_preempt) {
1213 /* heap_decrease() hit the top level of the heap: make
1214 * sure preemption checks get the right task, not the
1215 * potentially stale cache. */
1216 bheap_uncache_min(edf_ready_order,
1217 &cluster->domain.ready_queue);
1218 check_for_preemptions(cluster);
1219 }
1220
1221#ifdef CONFIG_REALTIME_AUX_TASKS
1222 /* propagate to aux tasks */
1223 if (tsk_rt(t)->has_aux_tasks) {
1224 aux_task_owner_increase_priority(t);
1225 }
1226#endif
1227
1228#ifdef CONFIG_LITMUS_NVIDIA
1229 /* propagate to gpu klmirqd */
1230 if (tsk_rt(t)->held_gpus) {
1231 gpu_owner_increase_priority(t);
1232 }
1233#endif
1234 }
1235#ifdef CONFIG_LITMUS_NESTED_LOCKING
1236 }
1237 else {
1238 TRACE_TASK(t, "Spurious invalid priority increase. "
1239 "Inheritance request: %s/%d [eff_prio = %s/%d] to inherit from %s/%d\n"
1240			   "Occurrence is likely okay: probably due to (hopefully safe) concurrent priority updates.\n",
1241 t->comm, t->pid,
1242 effective_priority(t)->comm, effective_priority(t)->pid,
1243 (prio_inh) ? prio_inh->comm : "nil",
1244 (prio_inh) ? prio_inh->pid : -1);
1245 WARN_ON(!prio_inh);
1246 success = 0;
1247 }
1248#endif
1249
1250out:
1251 return success;
1252}
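
__increase_priority_inheritance() only applies a request that is not already in effect and that actually raises the task's effective priority; anything else is logged as a (presumed harmless) spurious request. The guarded-update idea in isolation, as a userspace sketch with hypothetical names and integer priorities (smaller = higher):

/* Sketch only: apply an inheritance request only if it raises priority. */
#include <stdio.h>

struct rt_task { int base_prio; int inh_prio; };   /* inh_prio == 0: none */

static int effective_prio(const struct rt_task *t)
{
	return (t->inh_prio && t->inh_prio < t->base_prio) ? t->inh_prio
	                                                   : t->base_prio;
}

/* Returns 1 on success, 0 if the request was spurious. */
static int increase_inheritance(struct rt_task *t, int prio_inh)
{
	if (prio_inh == effective_prio(t))
		return 1;                       /* relationship already established */
	if (prio_inh >= effective_prio(t))
		return 0;                       /* would not raise priority: spurious */
	t->inh_prio = prio_inh;             /* record and act on the new priority */
	return 1;
}

int main(void)
{
	struct rt_task t = { .base_prio = 50, .inh_prio = 0 };
	printf("%d eff=%d\n", increase_inheritance(&t, 20), effective_prio(&t)); /* 1 eff=20 */
	printf("%d eff=%d\n", increase_inheritance(&t, 30), effective_prio(&t)); /* 0 eff=20 */
	return 0;
}
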
1253
1254/* called with IRQs off */
1255static void increase_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh)
1256{
1257 cedf_domain_t* cluster = task_cpu_cluster(t);
1258
1259 raw_spin_lock(&cluster->cluster_lock);
1260
1261 __increase_priority_inheritance(t, prio_inh);
1262
1263 raw_spin_unlock(&cluster->cluster_lock);
1264
1265#if defined(CONFIG_LITMUS_PAI_SOFTIRQD) && defined(CONFIG_LITMUS_NVIDIA)
1266 if(tsk_rt(t)->held_gpus) {
1267 int i;
1268 for(i = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
1269 i < NV_DEVICE_NUM;
1270 i = find_next_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus), i+1)) {
1271 pai_check_priority_increase(t, i);
1272 }
1273 }
1274#endif
1275}
1276
1277/* called with IRQs off */
1278static int __decrease_priority_inheritance(struct task_struct* t,
1279 struct task_struct* prio_inh)
1280{
1281 int success = 1;
1282
1283 if (prio_inh == tsk_rt(t)->inh_task) {
1284 /* relationship already established. */
1285 TRACE_TASK(t, "already inherits priority from %s/%d\n",
1286 (prio_inh) ? prio_inh->comm : "(nil)",
1287 (prio_inh) ? prio_inh->pid : 0);
1288 goto out;
1289 }
1290
1291#ifdef CONFIG_LITMUS_NESTED_LOCKING
1292 if(__edf_higher_prio(t, EFFECTIVE, prio_inh, BASE)) {
1293#endif
1294 /* A job only stops inheriting a priority when it releases a
1295 * resource. Thus we can make the following assumption.*/
1296 if(prio_inh)
1297 TRACE_TASK(t, "EFFECTIVE priority decreased to %s/%d\n",
1298 prio_inh->comm, prio_inh->pid);
1299 else
1300 TRACE_TASK(t, "base priority restored.\n");
1301
1302 tsk_rt(t)->inh_task = prio_inh;
1303
1304 if(tsk_rt(t)->scheduled_on != NO_CPU) {
1305 TRACE_TASK(t, "is scheduled.\n");
1306
1307 /* Check if rescheduling is necessary. We can't use heap_decrease()
1308 * since the priority was effectively lowered. */
1309 unlink(t);
1310 cedf_job_arrival(t);
1311 }
1312 else {
1313 cedf_domain_t* cluster = task_cpu_cluster(t);
1314 /* task is queued */
1315 raw_spin_lock(&cluster->domain.release_lock);
1316 if (is_queued(t)) {
1317 TRACE_TASK(t, "is queued.\n");
1318
1319 /* decrease in priority, so we have to re-add to binomial heap */
1320 unlink(t);
1321 cedf_job_arrival(t);
1322 }
1323 else {
1324 TRACE_TASK(t, "is not in scheduler. Probably on wait queue somewhere.\n");
1325 }
1326 raw_spin_unlock(&cluster->domain.release_lock);
1327 }
1328
1329#ifdef CONFIG_REALTIME_AUX_TASKS
1330 /* propagate to aux tasks */
1331 if (tsk_rt(t)->has_aux_tasks) {
1332 aux_task_owner_decrease_priority(t);
1333 }
1334#endif
1335
1336#ifdef CONFIG_LITMUS_NVIDIA
1337 /* propagate to gpu */
1338 if (tsk_rt(t)->held_gpus) {
1339 gpu_owner_decrease_priority(t);
1340 }
1341#endif
1342
1343#ifdef CONFIG_LITMUS_NESTED_LOCKING
1344 }
1345 else {
1346 TRACE_TASK(t, "Spurious invalid priority decrease. "
1347 "Inheritance request: %s/%d [eff_prio = %s/%d] to inherit from %s/%d\n"
1348			   "Occurrence is likely okay: probably due to (hopefully safe) concurrent priority updates.\n",
1349 t->comm, t->pid,
1350 effective_priority(t)->comm, effective_priority(t)->pid,
1351 (prio_inh) ? prio_inh->comm : "nil",
1352 (prio_inh) ? prio_inh->pid : -1);
1353 success = 0;
1354 }
1355#endif
1356
1357out:
1358 return success;
1359}
1360
1361static void decrease_priority_inheritance(struct task_struct* t,
1362 struct task_struct* prio_inh)
1363{
1364 cedf_domain_t* cluster = task_cpu_cluster(t);
1365
1366 raw_spin_lock(&cluster->cluster_lock);
1367 __decrease_priority_inheritance(t, prio_inh);
1368
1369 raw_spin_unlock(&cluster->cluster_lock);
1370
1371#if defined(CONFIG_LITMUS_PAI_SOFTIRQD) && defined(CONFIG_LITMUS_NVIDIA)
1372 if(tsk_rt(t)->held_gpus) {
1373 int i;
1374 for(i = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
1375 i < NV_DEVICE_NUM;
1376 i = find_next_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus), i+1)) {
1377 pai_check_priority_decrease(t, i);
1378 }
1379 }
1380#endif
1381}
1382
1383
1384#ifdef CONFIG_LITMUS_NESTED_LOCKING
1385
1386/* called with IRQs off */
1387/* preconditions:
1388 (1) The 'hp_blocked_tasks_lock' of task 't' is held.
1389 (2) The lock 'to_unlock' is held.
1390 */
1391static void nested_increase_priority_inheritance(struct task_struct* t,
1392 struct task_struct* prio_inh,
1393 raw_spinlock_t *to_unlock,
1394 unsigned long irqflags)
1395{
1396 struct litmus_lock *blocked_lock = tsk_rt(t)->blocked_lock;
1397
1398	if(tsk_rt(t)->inh_task != prio_inh) { // shield redundant calls.
1399 increase_priority_inheritance(t, prio_inh); // increase our prio.
1400 }
1401
1402	raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock); // unlock t's heap.
1403
1404
1405 if(blocked_lock) {
1406 if(blocked_lock->ops->propagate_increase_inheritance) {
1407 TRACE_TASK(t, "Inheritor is blocked (...perhaps). Checking lock %d.\n",
1408 blocked_lock->ident);
1409
1410 // beware: recursion
1411 blocked_lock->ops->propagate_increase_inheritance(blocked_lock,
1412 t, to_unlock,
1413 irqflags);
1414 }
1415 else {
1416 TRACE_TASK(t, "Inheritor is blocked on lock (%d) that does not support nesting!\n",
1417 blocked_lock->ident);
1418 unlock_fine_irqrestore(to_unlock, irqflags);
1419 }
1420 }
1421 else {
1422 TRACE_TASK(t, "is not blocked. No propagation.\n");
1423 unlock_fine_irqrestore(to_unlock, irqflags);
1424 }
1425}
1426
1427/* called with IRQs off */
1428/* preconditions:
1429 (1) The 'hp_blocked_tasks_lock' of task 't' is held.
1430 (2) The lock 'to_unlock' is held.
1431 */
1432static void nested_decrease_priority_inheritance(struct task_struct* t,
1433 struct task_struct* prio_inh,
1434 raw_spinlock_t *to_unlock,
1435 unsigned long irqflags)
1436{
1437 struct litmus_lock *blocked_lock = tsk_rt(t)->blocked_lock;
1438 decrease_priority_inheritance(t, prio_inh);
1439
1440	raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock); // unlock t's heap.
1441
1442 if(blocked_lock) {
1443 if(blocked_lock->ops->propagate_decrease_inheritance) {
1444 TRACE_TASK(t, "Inheritor is blocked (...perhaps). Checking lock %d.\n",
1445 blocked_lock->ident);
1446
1447 // beware: recursion
1448 blocked_lock->ops->propagate_decrease_inheritance(blocked_lock, t,
1449 to_unlock,
1450 irqflags);
1451 }
1452 else {
1453 TRACE_TASK(t, "Inheritor is blocked on lock (%p) that does not support nesting!\n",
1454 blocked_lock);
1455 unlock_fine_irqrestore(to_unlock, irqflags);
1456 }
1457 }
1458 else {
1459 TRACE_TASK(t, "is not blocked. No propagation.\n");
1460 unlock_fine_irqrestore(to_unlock, irqflags);
1461 }
1462}
1463
1464
1465/* ******************** RSM MUTEX ********************** */
1466
1467static struct litmus_lock_ops cedf_rsm_mutex_lock_ops = {
1468 .lock = rsm_mutex_lock,
1469 .unlock = rsm_mutex_unlock,
1470 .close = rsm_mutex_close,
1471 .deallocate = rsm_mutex_free,
1472
1473 .propagate_increase_inheritance = rsm_mutex_propagate_increase_inheritance,
1474 .propagate_decrease_inheritance = rsm_mutex_propagate_decrease_inheritance,
1475
1476#ifdef CONFIG_LITMUS_DGL_SUPPORT
1477 .dgl_lock = rsm_mutex_dgl_lock,
1478 .is_owner = rsm_mutex_is_owner,
1479 .enable_priority = rsm_mutex_enable_priority,
1480#endif
1481};
1482
1483static struct litmus_lock* cedf_new_rsm_mutex(void)
1484{
1485 return rsm_mutex_new(&cedf_rsm_mutex_lock_ops);
1486}
1487
1488/* ******************** IKGLP ********************** */
1489
1490static struct litmus_lock_ops cedf_ikglp_lock_ops = {
1491 .lock = ikglp_lock,
1492 .unlock = ikglp_unlock,
1493 .close = ikglp_close,
1494 .deallocate = ikglp_free,
1495
1496 // ikglp can only be an outer-most lock.
1497 .propagate_increase_inheritance = NULL,
1498 .propagate_decrease_inheritance = NULL,
1499};
1500
1501static struct litmus_lock* cedf_new_ikglp(void* __user arg)
1502{
1503 // assumes clusters of uniform size.
1504 return ikglp_new(cluster_size/num_clusters, &cedf_ikglp_lock_ops, arg);
1505}
1506
1507#endif /* CONFIG_LITMUS_NESTED_LOCKING */
1508
1509
1510
1511
1512/* ******************** KFMLP support ********************** */
1513
1514static struct litmus_lock_ops cedf_kfmlp_lock_ops = {
1515 .lock = kfmlp_lock,
1516 .unlock = kfmlp_unlock,
1517 .close = kfmlp_close,
1518 .deallocate = kfmlp_free,
1519
1520 // kfmlp can only be an outer-most lock.
1521 .propagate_increase_inheritance = NULL,
1522 .propagate_decrease_inheritance = NULL,
1523};
1524
1525
1526static struct litmus_lock* cedf_new_kfmlp(void* __user arg)
1527{
1528 return kfmlp_new(&cedf_kfmlp_lock_ops, arg);
1529}
1530
1531
1532/* **** lock constructor **** */
1533
1534static long cedf_allocate_lock(struct litmus_lock **lock, int type,
1535 void* __user args)
1536{
1537 int err;
1538
1539 switch (type) {
1540#ifdef CONFIG_LITMUS_NESTED_LOCKING
1541 case RSM_MUTEX:
1542 *lock = cedf_new_rsm_mutex();
1543 break;
1544
1545 case IKGLP_SEM:
1546 *lock = cedf_new_ikglp(args);
1547 break;
1548#endif
1549 case KFMLP_SEM:
1550 *lock = cedf_new_kfmlp(args);
1551 break;
1552
1553 default:
1554 err = -ENXIO;
1555 goto UNSUPPORTED_LOCK;
1556 };
1557
1558 if (*lock)
1559 err = 0;
1560 else
1561 err = -ENOMEM;
1562
1563UNSUPPORTED_LOCK:
1564 return err;
1565}
1566
1567#endif // CONFIG_LITMUS_LOCKING
1568
1569
1570#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1571static struct affinity_observer_ops cedf_kfmlp_affinity_ops = {
1572 .close = kfmlp_aff_obs_close,
1573 .deallocate = kfmlp_aff_obs_free,
1574};
1575
1576#ifdef CONFIG_LITMUS_NESTED_LOCKING
1577static struct affinity_observer_ops cedf_ikglp_affinity_ops = {
1578 .close = ikglp_aff_obs_close,
1579 .deallocate = ikglp_aff_obs_free,
1580};
1581#endif
1582
1583static long cedf_allocate_affinity_observer(struct affinity_observer **aff_obs,
1584 int type,
1585 void* __user args)
1586{
1587 int err;
1588
1589 switch (type) {
1590
1591 case KFMLP_SIMPLE_GPU_AFF_OBS:
1592 *aff_obs = kfmlp_simple_gpu_aff_obs_new(&cedf_kfmlp_affinity_ops, args);
1593 break;
1594
1595 case KFMLP_GPU_AFF_OBS:
1596 *aff_obs = kfmlp_gpu_aff_obs_new(&cedf_kfmlp_affinity_ops, args);
1597 break;
1598
1599#ifdef CONFIG_LITMUS_NESTED_LOCKING
1600 case IKGLP_SIMPLE_GPU_AFF_OBS:
1601 *aff_obs = ikglp_simple_gpu_aff_obs_new(&cedf_ikglp_affinity_ops, args);
1602 break;
1603
1604 case IKGLP_GPU_AFF_OBS:
1605 *aff_obs = ikglp_gpu_aff_obs_new(&cedf_ikglp_affinity_ops, args);
1606 break;
1607#endif
1608 default:
1609 err = -ENXIO;
1610 goto UNSUPPORTED_AFF_OBS;
1611 };
1612
1613 if (*aff_obs)
1614 err = 0;
1615 else
1616 err = -ENOMEM;
1617
1618UNSUPPORTED_AFF_OBS:
1619 return err;
1620}
1621#endif
1622
1623
1624
662 1625
663#ifdef VERBOSE_INIT 1626#ifdef VERBOSE_INIT
664static void print_cluster_topology(cpumask_var_t mask, int cpu) 1627static void print_cluster_topology(cpumask_var_t mask, int cpu)
@@ -673,16 +1636,17 @@ static void print_cluster_topology(cpumask_var_t mask, int cpu)
673} 1636}
674#endif 1637#endif
675 1638
676static int clusters_allocated = 0;
677
678static void cleanup_cedf(void) 1639static void cleanup_cedf(void)
679{ 1640{
680 int i; 1641 int i;
681 1642
1643#ifdef CONFIG_LITMUS_NVIDIA
1644 shutdown_nvidia_info();
1645#endif
1646
682 if (clusters_allocated) { 1647 if (clusters_allocated) {
683 for (i = 0; i < num_clusters; i++) { 1648 for (i = 0; i < num_clusters; i++) {
684 kfree(cedf[i].cpus); 1649 kfree(cedf[i].cpus);
685 kfree(cedf[i].heap_node);
686 free_cpumask_var(cedf[i].cpu_map); 1650 free_cpumask_var(cedf[i].cpu_map);
687 } 1651 }
688 1652
@@ -690,6 +1654,18 @@ static void cleanup_cedf(void)
690 } 1654 }
691} 1655}
692 1656
1657#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
1658static int cedf_map_gpu_to_cpu(int gpu)
1659{
1660 int cpu_cluster = gpu / gpu_cluster_size;
1661 int default_cpu = cedf[cpu_cluster].cpus[0]->cpu; // first CPU in given cluster
1662
1663 TRACE("CPU %d is default for GPU %d interrupt threads.\n", default_cpu, gpu);
1664
1665 return default_cpu;
1666}
1667#endif
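
The mapping above assigns GPUs to CPU clusters by integer division, so GPU interrupt (klmirqd) threads default to a CPU in the cluster that shares their GPUs. For example, with 8 online GPUs and 4 clusters, gpu_cluster_size is 2 and GPU 5 maps to cluster 2. A standalone illustration of the arithmetic (not part of the patch):

/* Illustration only: which CPU cluster services a given GPU's interrupt thread. */
static int example_gpu_to_cluster(int gpu, int num_gpus, int num_clusters)
{
	int gpu_cluster_size = num_gpus / num_clusters;	/* e.g., 8 / 4 = 2 */
	return gpu / gpu_cluster_size;			/* e.g., GPU 5 -> cluster 2 */
}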
1668
693static long cedf_activate_plugin(void) 1669static long cedf_activate_plugin(void)
694{ 1670{
695 int i, j, cpu, ccpu, cpu_count; 1671 int i, j, cpu, ccpu, cpu_count;
@@ -736,18 +1712,33 @@ static long cedf_activate_plugin(void)
736 printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n", 1712 printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
737 num_clusters, cluster_size); 1713 num_clusters, cluster_size);
738 1714
1715
1716#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
1717 num_gpu_clusters = min(num_clusters, num_online_gpus());
1718 gpu_cluster_size = num_online_gpus() / num_gpu_clusters;
1719
1720 if (((num_online_gpus() % gpu_cluster_size) != 0) ||
1721 (num_gpu_clusters != num_clusters)) {
1722 printk(KERN_WARNING "C-EDF: GPUs not uniformly distributed among CPU clusters.\n");
1723 }
1724#endif
1725
739 /* initialize clusters */ 1726 /* initialize clusters */
740 cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC); 1727 cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC);
741 for (i = 0; i < num_clusters; i++) { 1728 for (i = 0; i < num_clusters; i++) {
742 1729
743 cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t), 1730 cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t),
744 GFP_ATOMIC); 1731 GFP_ATOMIC);
745 cedf[i].heap_node = kmalloc( 1732 INIT_BINHEAP_HANDLE(&(cedf[i].cpu_heap), cpu_lower_prio);
746 cluster_size * sizeof(struct bheap_node),
747 GFP_ATOMIC);
748 bheap_init(&(cedf[i].cpu_heap));
749 edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs); 1733 edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
750 1734
1735
1736#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1737 cedf[i].pending_tasklets.head = NULL;
1738 cedf[i].pending_tasklets.tail = &(cedf[i].pending_tasklets.head);
1739#endif
1740
1741
751 if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) 1742 if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
752 return -ENOMEM; 1743 return -ENOMEM;
753#ifdef CONFIG_RELEASE_MASTER 1744#ifdef CONFIG_RELEASE_MASTER
@@ -758,6 +1749,10 @@ static long cedf_activate_plugin(void)
758 /* cycle through cluster and add cpus to them */ 1749 /* cycle through cluster and add cpus to them */
759 for (i = 0; i < num_clusters; i++) { 1750 for (i = 0; i < num_clusters; i++) {
760 1751
1752#ifdef CONFIG_LITMUS_DGL_SUPPORT
1753 raw_spin_lock_init(&cedf[i].dgl_lock);
1754#endif
1755
761 for_each_online_cpu(cpu) { 1756 for_each_online_cpu(cpu) {
762 /* check if the cpu is already in a cluster */ 1757 /* check if the cpu is already in a cluster */
763 for (j = 0; j < num_clusters; j++) 1758 for (j = 0; j < num_clusters; j++)
@@ -788,8 +1783,8 @@ static long cedf_activate_plugin(void)
788 atomic_set(&entry->will_schedule, 0); 1783 atomic_set(&entry->will_schedule, 0);
789 entry->cpu = ccpu; 1784 entry->cpu = ccpu;
790 entry->cluster = &cedf[i]; 1785 entry->cluster = &cedf[i];
791 entry->hn = &(cedf[i].heap_node[cpu_count]); 1786
792 bheap_node_init(&entry->hn, entry); 1787 INIT_BINHEAP_NODE(&entry->hn);
793 1788
794 cpu_count++; 1789 cpu_count++;
795 1790
@@ -806,6 +1801,14 @@ static long cedf_activate_plugin(void)
806 } 1801 }
807 } 1802 }
808 1803
1804#ifdef CONFIG_LITMUS_SOFTIRQD
1805 init_klmirqd();
1806#endif
1807
1808#ifdef CONFIG_LITMUS_NVIDIA
1809 init_nvidia_info();
1810#endif
1811
809 free_cpumask_var(mask); 1812 free_cpumask_var(mask);
810 clusters_allocated = 1; 1813 clusters_allocated = 1;
811 return 0; 1814 return 0;
@@ -824,6 +1827,33 @@ static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
824 .task_block = cedf_task_block, 1827 .task_block = cedf_task_block,
825 .admit_task = cedf_admit_task, 1828 .admit_task = cedf_admit_task,
826 .activate_plugin = cedf_activate_plugin, 1829 .activate_plugin = cedf_activate_plugin,
1830 .compare = edf_higher_prio,
1831#ifdef CONFIG_LITMUS_LOCKING
1832 .allocate_lock = cedf_allocate_lock,
1833 .increase_prio = increase_priority_inheritance,
1834 .decrease_prio = decrease_priority_inheritance,
1835 .__increase_prio = __increase_priority_inheritance,
1836 .__decrease_prio = __decrease_priority_inheritance,
1837#endif
1838#ifdef CONFIG_LITMUS_NESTED_LOCKING
1839 .nested_increase_prio = nested_increase_priority_inheritance,
1840 .nested_decrease_prio = nested_decrease_priority_inheritance,
1841 .__compare = __edf_higher_prio,
1842#endif
1843#ifdef CONFIG_LITMUS_DGL_SUPPORT
1844 .get_dgl_spinlock = cedf_get_dgl_spinlock,
1845#endif
1846#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1847 .allocate_aff_obs = cedf_allocate_affinity_observer,
1848#endif
1849#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1850 .enqueue_pai_tasklet = cedf_enqueue_pai_tasklet,
1851 .change_prio_pai_tasklet = cedf_change_prio_pai_tasklet,
1852 .run_tasklets = cedf_run_tasklets,
1853#endif
1854#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
1855 .map_gpu_to_cpu = cedf_map_gpu_to_cpu,
1856#endif
827}; 1857};
828 1858
829static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL; 1859static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL;
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
index b8548b885b35..01791a18e8f3 100644
--- a/litmus/sched_gsn_edf.c
+++ b/litmus/sched_gsn_edf.c
@@ -12,24 +12,54 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/uaccess.h>
16#include <linux/module.h>
15 17
16#include <litmus/litmus.h> 18#include <litmus/litmus.h>
17#include <litmus/jobs.h> 19#include <litmus/jobs.h>
18#include <litmus/sched_plugin.h> 20#include <litmus/sched_plugin.h>
19#include <litmus/edf_common.h> 21#include <litmus/edf_common.h>
20#include <litmus/sched_trace.h> 22#include <litmus/sched_trace.h>
21#include <litmus/trace.h>
22 23
23#include <litmus/preempt.h> 24#include <litmus/preempt.h>
24#include <litmus/budget.h> 25#include <litmus/budget.h>
25 26
26#include <litmus/bheap.h> 27#include <litmus/bheap.h>
28#include <litmus/binheap.h>
29#include <litmus/trace.h>
30
31#ifdef CONFIG_LITMUS_LOCKING
32#include <litmus/kfmlp_lock.h>
33#endif
34
35#ifdef CONFIG_LITMUS_NESTED_LOCKING
36#include <litmus/rsm_lock.h>
37#include <litmus/ikglp_lock.h>
38#endif
27 39
28#ifdef CONFIG_SCHED_CPU_AFFINITY 40#ifdef CONFIG_SCHED_CPU_AFFINITY
29#include <litmus/affinity.h> 41#include <litmus/affinity.h>
30#endif 42#endif
31 43
32#include <linux/module.h> 44#ifdef CONFIG_REALTIME_AUX_TASKS
45#include <litmus/aux_tasks.h>
46#endif
47
48#ifdef CONFIG_LITMUS_SOFTIRQD
49#include <litmus/litmus_softirq.h>
50#endif
51
52#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
53#include <linux/interrupt.h>
54#endif
55
56#ifdef CONFIG_LITMUS_NVIDIA
57#include <litmus/nvidia_info.h>
58#endif
59
60#if defined(CONFIG_LITMUS_AFFINITY_LOCKING) && defined(CONFIG_LITMUS_NVIDIA)
61#include <litmus/gpu_affinity.h>
62#endif
33 63
34/* Overview of GSN-EDF operations. 64/* Overview of GSN-EDF operations.
35 * 65 *
@@ -104,52 +134,64 @@ typedef struct {
104 int cpu; 134 int cpu;
105 struct task_struct* linked; /* only RT tasks */ 135 struct task_struct* linked; /* only RT tasks */
106 struct task_struct* scheduled; /* only RT tasks */ 136 struct task_struct* scheduled; /* only RT tasks */
107 struct bheap_node* hn; 137 struct binheap_node hn;
108} cpu_entry_t; 138} cpu_entry_t;
109DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); 139DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
110 140
111cpu_entry_t* gsnedf_cpus[NR_CPUS]; 141cpu_entry_t* gsnedf_cpus[NR_CPUS];
112 142
113/* the cpus queue themselves according to priority in here */ 143/* the cpus queue themselves according to priority in here */
114static struct bheap_node gsnedf_heap_node[NR_CPUS]; 144static struct binheap gsnedf_cpu_heap;
115static struct bheap gsnedf_cpu_heap;
116 145
117static rt_domain_t gsnedf; 146static rt_domain_t gsnedf;
118#define gsnedf_lock (gsnedf.ready_lock) 147#define gsnedf_lock (gsnedf.ready_lock)
119 148
149#ifdef CONFIG_LITMUS_DGL_SUPPORT
150static raw_spinlock_t dgl_lock;
151
152static raw_spinlock_t* gsnedf_get_dgl_spinlock(struct task_struct *t)
153{
154 return(&dgl_lock);
155}
156#endif
157
158#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
159struct tasklet_head gsnedf_pending_tasklets;
160#endif
161
120 162
121/* Uncomment this if you want to see all scheduling decisions in the 163/* Uncomment this if you want to see all scheduling decisions in the
122 * TRACE() log. 164 * TRACE() log.
123#define WANT_ALL_SCHED_EVENTS 165#define WANT_ALL_SCHED_EVENTS
124 */ 166 */
125 167
126static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) 168static int cpu_lower_prio(struct binheap_node *_a, struct binheap_node *_b)
127{ 169{
128 cpu_entry_t *a, *b; 170 cpu_entry_t *a = binheap_entry(_a, cpu_entry_t, hn);
129 a = _a->value; 171 cpu_entry_t *b = binheap_entry(_b, cpu_entry_t, hn);
130 b = _b->value; 172
131 /* Note that a and b are inverted: we want the lowest-priority CPU at 173 /* Note that a and b are inverted: we want the lowest-priority CPU at
132 * the top of the heap. 174 * the top of the heap.
133 */ 175 */
134 return edf_higher_prio(b->linked, a->linked); 176 return edf_higher_prio(b->linked, a->linked);
135} 177}
136 178
179
137/* update_cpu_position - Move the cpu entry to the correct place to maintain 180/* update_cpu_position - Move the cpu entry to the correct place to maintain
138 * order in the cpu queue. Caller must hold gsnedf lock. 181 * order in the cpu queue. Caller must hold gsnedf lock.
139 */ 182 */
140static void update_cpu_position(cpu_entry_t *entry) 183static void update_cpu_position(cpu_entry_t *entry)
141{ 184{
142 if (likely(bheap_node_in_heap(entry->hn))) 185 if (likely(binheap_is_in_heap(&entry->hn))) {
143 bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); 186 binheap_delete(&entry->hn, &gsnedf_cpu_heap);
144 bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); 187 }
188 binheap_add(&entry->hn, &gsnedf_cpu_heap, cpu_entry_t, hn);
145} 189}
146 190
147/* caller must hold gsnedf lock */ 191/* caller must hold gsnedf lock */
148static cpu_entry_t* lowest_prio_cpu(void) 192static cpu_entry_t* lowest_prio_cpu(void)
149{ 193{
150 struct bheap_node* hn; 194 return binheap_top_entry(&gsnedf_cpu_heap, cpu_entry_t, hn);
151 hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
152 return hn->value;
153} 195}
154 196
155 197
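
This hunk is part of the migration from the old pointer-based bheap to the intrusive binheap API: the node lives inside the element, and comparators recover the container with binheap_entry() instead of dereferencing a separate value pointer. A minimal sketch of the pattern using the same macros as above (example_entry_t is hypothetical, illustration only):

typedef struct {
	int key;
	struct binheap_node hn;		/* intrusive node, like cpu_entry_t::hn */
} example_entry_t;

/* Order function: nonzero when 'a' belongs above 'b' (here, a min-heap on key). */
static int example_min_order(struct binheap_node *a, struct binheap_node *b)
{
	example_entry_t *ea = binheap_entry(a, example_entry_t, hn);
	example_entry_t *eb = binheap_entry(b, example_entry_t, hn);
	return ea->key < eb->key;
}

static struct binheap example_heap;	/* once: INIT_BINHEAP_HANDLE(&example_heap, example_min_order); */

static void example_insert(example_entry_t *e)
{
	INIT_BINHEAP_NODE(&e->hn);
	binheap_add(&e->hn, &example_heap, example_entry_t, hn);
}

static example_entry_t* example_peek_min(void)
{
	return binheap_top_entry(&example_heap, example_entry_t, hn);
}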
@@ -164,8 +206,17 @@ static noinline void link_task_to_cpu(struct task_struct* linked,
164 struct task_struct* tmp; 206 struct task_struct* tmp;
165 int on_cpu; 207 int on_cpu;
166 208
209 //int print = (linked != NULL || entry->linked != NULL);
210
167 BUG_ON(linked && !is_realtime(linked)); 211 BUG_ON(linked && !is_realtime(linked));
168 212
213 /*
214 if (print) {
215 TRACE_CUR("linked = %s/%d\n", (linked) ? linked->comm : "(nil)", (linked)? linked->pid : 0);
216 TRACE_CUR("entry->linked = %s/%d\n", (entry->linked) ? entry->linked->comm : "(nil)", (entry->linked)? entry->linked->pid : 0);
217 }
218 */
219
169 /* Currently linked task is set to be unlinked. */ 220 /* Currently linked task is set to be unlinked. */
170 if (entry->linked) { 221 if (entry->linked) {
171 entry->linked->rt_param.linked_on = NO_CPU; 222 entry->linked->rt_param.linked_on = NO_CPU;
@@ -201,12 +252,18 @@ static noinline void link_task_to_cpu(struct task_struct* linked,
201 linked->rt_param.linked_on = entry->cpu; 252 linked->rt_param.linked_on = entry->cpu;
202 } 253 }
203 entry->linked = linked; 254 entry->linked = linked;
204#ifdef WANT_ALL_SCHED_EVENTS 255
205 if (linked) 256 /*
206 TRACE_TASK(linked, "linked to %d.\n", entry->cpu); 257 if (print) {
207 else 258 //#ifdef WANT_ALL_SCHED_EVENTS
208 TRACE("NULL linked to %d.\n", entry->cpu); 259 if (linked)
209#endif 260 TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
261 else
262 TRACE("NULL linked to %d.\n", entry->cpu);
263 //#endif
264 }
265 */
266
210 update_cpu_position(entry); 267 update_cpu_position(entry);
211} 268}
212 269
@@ -251,8 +308,17 @@ static noinline void requeue(struct task_struct* task)
251 /* sanity check before insertion */ 308 /* sanity check before insertion */
252 BUG_ON(is_queued(task)); 309 BUG_ON(is_queued(task));
253 310
254 if (is_released(task, litmus_clock())) 311 if (is_released(task, litmus_clock())) {
255 __add_ready(&gsnedf, task); 312#ifdef CONFIG_REALTIME_AUX_TASKS
313 if (unlikely(tsk_rt(task)->is_aux_task && !is_running(task))) {
314 /* aux_task probably transitioned to real-time while it was blocked */
315 TRACE_CUR("aux task %s/%d is not ready!\n", task->comm, task->pid);
316 unlink(task); /* really needed? */
317 }
318 else
319#endif
320 __add_ready(&gsnedf, task);
321 }
256 else { 322 else {
257 /* it has got to wait */ 323 /* it has got to wait */
258 add_release(&gsnedf, task); 324 add_release(&gsnedf, task);
@@ -326,6 +392,7 @@ static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
326 raw_spin_lock_irqsave(&gsnedf_lock, flags); 392 raw_spin_lock_irqsave(&gsnedf_lock, flags);
327 393
328 __merge_ready(rt, tasks); 394 __merge_ready(rt, tasks);
395
329 check_for_preemptions(); 396 check_for_preemptions();
330 397
331 raw_spin_unlock_irqrestore(&gsnedf_lock, flags); 398 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
@@ -338,12 +405,17 @@ static noinline void job_completion(struct task_struct *t, int forced)
338 405
339 sched_trace_task_completion(t, forced); 406 sched_trace_task_completion(t, forced);
340 407
408#ifdef CONFIG_LITMUS_NVIDIA
409 atomic_set(&tsk_rt(t)->nv_int_count, 0);
410#endif
411
341 TRACE_TASK(t, "job_completion().\n"); 412 TRACE_TASK(t, "job_completion().\n");
342 413
343 /* set flags */ 414 /* set flags */
344 tsk_rt(t)->completed = 1; 415 tsk_rt(t)->completed = 1;
345 /* prepare for next period */ 416 /* prepare for next period */
346 prepare_for_next_period(t); 417 prepare_for_next_period(t);
418
347 if (is_released(t, litmus_clock())) 419 if (is_released(t, litmus_clock()))
348 sched_trace_task_release(t); 420 sched_trace_task_release(t);
349 /* unlink */ 421 /* unlink */
@@ -362,24 +434,350 @@ static noinline void job_completion(struct task_struct *t, int forced)
362 */ 434 */
363static void gsnedf_tick(struct task_struct* t) 435static void gsnedf_tick(struct task_struct* t)
364{ 436{
365 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { 437 if (is_realtime(t) && budget_exhausted(t))
366 if (!is_np(t)) { 438 {
367 /* np tasks will be preempted when they become 439 if (budget_signalled(t) && !sigbudget_sent(t)) {
368 * preemptable again 440 /* signal exhaustion */
369 */ 441 send_sigbudget(t);
370 litmus_reschedule_local(); 442 }
371 TRACE("gsnedf_scheduler_tick: " 443
372 "%d is preemptable " 444 if (budget_enforced(t)) {
373 " => FORCE_RESCHED\n", t->pid); 445 if (!is_np(t)) {
374 } else if (is_user_np(t)) { 446 /* np tasks will be preempted when they become
375 TRACE("gsnedf_scheduler_tick: " 447 * preemptable again
376 "%d is non-preemptable, " 448 */
377 "preemption delayed.\n", t->pid); 449 litmus_reschedule_local();
378 request_exit_np(t); 450 TRACE("gsnedf_scheduler_tick: "
451 "%d is preemptable "
452 " => FORCE_RESCHED\n", t->pid);
453 } else if (is_user_np(t)) {
454 TRACE("gsnedf_scheduler_tick: "
455 "%d is non-preemptable, "
456 "preemption delayed.\n", t->pid);
457 request_exit_np(t);
458 }
459 }
460 }
461
462 /*
463 if(is_realtime(t)) {
464 TRACE_TASK(t, "tick %llu\n", litmus_clock());
465 }
466 */
467}
468
469
470
471
472#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
473
474
475static void __do_lit_tasklet(struct tasklet_struct* tasklet, unsigned long flushed)
476{
477 if (!atomic_read(&tasklet->count)) {
478 if(tasklet->owner) {
479 sched_trace_tasklet_begin(tasklet->owner);
480 }
481
482 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &tasklet->state))
483 {
484 BUG();
485 }
486 TRACE("%s: Invoking tasklet with owner pid = %d (flushed = %d).\n",
487 __FUNCTION__,
488 (tasklet->owner) ? tasklet->owner->pid : -1,
489 (tasklet->owner) ? 0 : 1);
490 tasklet->func(tasklet->data);
491 tasklet_unlock(tasklet);
492
493 if(tasklet->owner) {
494 sched_trace_tasklet_end(tasklet->owner, flushed);
495 }
496 }
497 else {
498 BUG();
499 }
500}
501
502static void do_lit_tasklets(struct task_struct* sched_task)
503{
504 int work_to_do = 1;
505 struct tasklet_struct *tasklet = NULL;
506 unsigned long flags;
507
508 while(work_to_do) {
509
510 TS_NV_SCHED_BOTISR_START;
511
512 // execute one tasklet that has higher priority
513 raw_spin_lock_irqsave(&gsnedf_lock, flags);
514
515 if(gsnedf_pending_tasklets.head != NULL) {
516 struct tasklet_struct *prev = NULL;
517 tasklet = gsnedf_pending_tasklets.head;
518
519 while(tasklet && edf_higher_prio(sched_task, tasklet->owner)) {
520 prev = tasklet;
521 tasklet = tasklet->next;
522 }
523
524 // remove the tasklet from the queue
525 if(prev) {
526 prev->next = tasklet->next;
527 if(prev->next == NULL) {
528 TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid);
 529 					gsnedf_pending_tasklets.tail = &(prev->next);
530 }
531 }
532 else {
533 gsnedf_pending_tasklets.head = tasklet->next;
534 if(tasklet->next == NULL) {
535 TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid);
536 gsnedf_pending_tasklets.tail = &(gsnedf_pending_tasklets.head);
537 }
538 }
539 }
540 else {
541 TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__);
542 }
543
544 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
545
546 if(tasklet) {
547 __do_lit_tasklet(tasklet, 0ul);
548 tasklet = NULL;
549 }
550 else {
551 work_to_do = 0;
552 }
553
554 TS_NV_SCHED_BOTISR_END;
555 }
556}
557
558//static void do_lit_tasklets(struct task_struct* sched_task)
559//{
560// int work_to_do = 1;
561// struct tasklet_struct *tasklet = NULL;
562// //struct tasklet_struct *step;
563// unsigned long flags;
564//
565// while(work_to_do) {
566//
567// TS_NV_SCHED_BOTISR_START;
568//
569// // remove tasklet at head of list if it has higher priority.
570// raw_spin_lock_irqsave(&gsnedf_lock, flags);
571//
572// if(gsnedf_pending_tasklets.head != NULL) {
573// // remove tasklet at head.
574// tasklet = gsnedf_pending_tasklets.head;
575//
576// if(edf_higher_prio(tasklet->owner, sched_task)) {
577//
578// if(NULL == tasklet->next) {
579// // tasklet is at the head, list only has one element
580// TRACE("%s: Tasklet for %d is the last element in tasklet queue.\n", __FUNCTION__, tasklet->owner->pid);
581// gsnedf_pending_tasklets.tail = &(gsnedf_pending_tasklets.head);
582// }
583//
584// // remove the tasklet from the queue
585// gsnedf_pending_tasklets.head = tasklet->next;
586//
587// TRACE("%s: Removed tasklet for %d from tasklet queue.\n", __FUNCTION__, tasklet->owner->pid);
588// }
589// else {
590// TRACE("%s: Pending tasklet (%d) does not have priority to run on this CPU (%d).\n", __FUNCTION__, tasklet->owner->pid, smp_processor_id());
591// tasklet = NULL;
592// }
593// }
594// else {
595// TRACE("%s: Tasklet queue is empty.\n", __FUNCTION__);
596// }
597//
598// raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
599//
600// TS_NV_SCHED_BOTISR_END;
601//
602// if(tasklet) {
603// __do_lit_tasklet(tasklet, 0ul);
604// tasklet = NULL;
605// }
606// else {
607// work_to_do = 0;
608// }
609// }
610//
611// //TRACE("%s: exited.\n", __FUNCTION__);
612//}
613
614static void __add_pai_tasklet(struct tasklet_struct* tasklet)
615{
616 struct tasklet_struct* step;
617
618 tasklet->next = NULL; // make sure there are no old values floating around
619
620 step = gsnedf_pending_tasklets.head;
621 if(step == NULL) {
622 TRACE("%s: tasklet queue empty. inserting tasklet for %d at head.\n", __FUNCTION__, tasklet->owner->pid);
623 // insert at tail.
624 *(gsnedf_pending_tasklets.tail) = tasklet;
625 gsnedf_pending_tasklets.tail = &(tasklet->next);
626 }
627 else if((*(gsnedf_pending_tasklets.tail) != NULL) &&
628 edf_higher_prio((*(gsnedf_pending_tasklets.tail))->owner, tasklet->owner)) {
629 // insert at tail.
630 TRACE("%s: tasklet belongs at end. inserting tasklet for %d at tail.\n", __FUNCTION__, tasklet->owner->pid);
631
632 *(gsnedf_pending_tasklets.tail) = tasklet;
633 gsnedf_pending_tasklets.tail = &(tasklet->next);
634 }
635 else {
636 // insert the tasklet somewhere in the middle.
637
638 TRACE("%s: tasklet belongs somewhere in the middle.\n", __FUNCTION__);
639
640 while(step->next && edf_higher_prio(step->next->owner, tasklet->owner)) {
641 step = step->next;
642 }
643
644 // insert tasklet right before step->next.
645
646 TRACE("%s: inserting tasklet for %d between %d and %d.\n", __FUNCTION__, tasklet->owner->pid, step->owner->pid, (step->next) ? step->next->owner->pid : -1);
647
648 tasklet->next = step->next;
649 step->next = tasklet;
650
651 // patch up the head if needed.
652 if(gsnedf_pending_tasklets.head == step)
653 {
654 TRACE("%s: %d is the new tasklet queue head.\n", __FUNCTION__, tasklet->owner->pid);
655 gsnedf_pending_tasklets.head = tasklet;
656 }
657 }
658}
659
660static void gsnedf_run_tasklets(struct task_struct* sched_task)
661{
662 preempt_disable();
663
664 if(gsnedf_pending_tasklets.head != NULL) {
665 TRACE("%s: There are tasklets to process.\n", __FUNCTION__);
666 do_lit_tasklets(sched_task);
667 }
668
669 preempt_enable_no_resched();
670}
671
672static int gsnedf_enqueue_pai_tasklet(struct tasklet_struct* tasklet)
673{
674 cpu_entry_t *targetCPU = NULL;
675 int thisCPU;
676 int runLocal = 0;
677 int runNow = 0;
678 unsigned long flags;
679
680 if(unlikely((tasklet->owner == NULL) || !is_realtime(tasklet->owner)))
681 {
682 TRACE("%s: No owner associated with this tasklet!\n", __FUNCTION__);
683 return 0;
684 }
685
686
687 raw_spin_lock_irqsave(&gsnedf_lock, flags);
688
689 thisCPU = smp_processor_id();
690
691#ifdef CONFIG_SCHED_CPU_AFFINITY
692 {
693 cpu_entry_t* affinity = NULL;
694
695 // use this CPU if it is in our cluster and isn't running any RT work.
696 if(
697#ifdef CONFIG_RELEASE_MASTER
698 (thisCPU != gsnedf.release_master) &&
699#endif
700 (__get_cpu_var(gsnedf_cpu_entries).linked == NULL)) {
701 affinity = &(__get_cpu_var(gsnedf_cpu_entries));
702 }
703 else {
704 // this CPU is busy or shouldn't run tasklet in this cluster.
705 // look for available near by CPUs.
706 // NOTE: Affinity towards owner and not this CPU. Is this right?
707 affinity =
708 gsnedf_get_nearest_available_cpu(
709 &per_cpu(gsnedf_cpu_entries, task_cpu(tasklet->owner)));
710 }
711
712 targetCPU = affinity;
713 }
714#endif
715
716 if (targetCPU == NULL) {
717 targetCPU = lowest_prio_cpu();
718 }
719
720 if (edf_higher_prio(tasklet->owner, targetCPU->linked)) {
721 if (thisCPU == targetCPU->cpu) {
722 TRACE("%s: Run tasklet locally (and now).\n", __FUNCTION__);
723 runLocal = 1;
724 runNow = 1;
725 }
726 else {
727 TRACE("%s: Run tasklet remotely (and now).\n", __FUNCTION__);
728 runLocal = 0;
729 runNow = 1;
730 }
731 }
732 else {
733 runLocal = 0;
734 runNow = 0;
735 }
736
737 if(!runLocal) {
738 // enqueue the tasklet
739 __add_pai_tasklet(tasklet);
740 }
741
742 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
743
744
745 if (runLocal /*&& runNow */) { // runNow == 1 is implied
746 TRACE("%s: Running tasklet on CPU where it was received.\n", __FUNCTION__);
747 __do_lit_tasklet(tasklet, 0ul);
748 }
749 else if (runNow /*&& !runLocal */) { // runLocal == 0 is implied
750 TRACE("%s: Triggering CPU %d to run tasklet.\n", __FUNCTION__, targetCPU->cpu);
 751 		preempt(targetCPU);  // need to be protected by gsnedf_lock?
752 }
753 else {
754 TRACE("%s: Scheduling of tasklet was deferred.\n", __FUNCTION__);
755 }
756
757 return(1); // success
758}
759
760static void gsnedf_change_prio_pai_tasklet(struct task_struct *old_prio,
761 struct task_struct *new_prio)
762{
763 struct tasklet_struct* step;
764 unsigned long flags;
765
766 if(gsnedf_pending_tasklets.head != NULL) {
767 raw_spin_lock_irqsave(&gsnedf_lock, flags);
768 for(step = gsnedf_pending_tasklets.head; step != NULL; step = step->next) {
769 if(step->owner == old_prio) {
770 TRACE("%s: Found tasklet to change: %d\n", __FUNCTION__, step->owner->pid);
771 step->owner = new_prio;
772 }
379 } 773 }
774 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
380 } 775 }
381} 776}
382 777
778#endif // end PAI
779
780
383/* Getting schedule() right is a bit tricky. schedule() may not make any 781/* Getting schedule() right is a bit tricky. schedule() may not make any
384 * assumptions on the state of the current task since it may be called for a 782 * assumptions on the state of the current task since it may be called for a
385 * number of reasons. The reasons include a scheduler_tick() determined that it 783 * number of reasons. The reasons include a scheduler_tick() determined that it
@@ -404,9 +802,11 @@ static void gsnedf_tick(struct task_struct* t)
404static struct task_struct* gsnedf_schedule(struct task_struct * prev) 802static struct task_struct* gsnedf_schedule(struct task_struct * prev)
405{ 803{
406 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); 804 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
407 int out_of_time, sleep, preempt, np, exists, blocks; 805 int out_of_time, signal_budget, sleep, preempt, np, exists, blocks;
408 struct task_struct* next = NULL; 806 struct task_struct* next = NULL;
409 807
808 //int completion = 0;
809
410#ifdef CONFIG_RELEASE_MASTER 810#ifdef CONFIG_RELEASE_MASTER
411 /* Bail out early if we are the release master. 811 /* Bail out early if we are the release master.
412 * The release master never schedules any real-time tasks. 812 * The release master never schedules any real-time tasks.
@@ -427,8 +827,13 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
427 /* (0) Determine state */ 827 /* (0) Determine state */
428 exists = entry->scheduled != NULL; 828 exists = entry->scheduled != NULL;
429 blocks = exists && !is_running(entry->scheduled); 829 blocks = exists && !is_running(entry->scheduled);
430 out_of_time = exists && budget_enforced(entry->scheduled) 830 out_of_time = exists &&
431 && budget_exhausted(entry->scheduled); 831 budget_enforced(entry->scheduled) &&
832 budget_exhausted(entry->scheduled);
833 signal_budget = exists &&
834 budget_signalled(entry->scheduled) &&
835 budget_exhausted(entry->scheduled) &&
836 !sigbudget_sent(entry->scheduled);
432 np = exists && is_np(entry->scheduled); 837 np = exists && is_np(entry->scheduled);
433 sleep = exists && is_completed(entry->scheduled); 838 sleep = exists && is_completed(entry->scheduled);
434 preempt = entry->scheduled != entry->linked; 839 preempt = entry->scheduled != entry->linked;
@@ -437,21 +842,36 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
437 TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); 842 TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
438#endif 843#endif
439 844
440 if (exists) 845 if (exists) {
441 TRACE_TASK(prev, 846 TRACE_TASK(prev,
442 "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " 847 "blocks:%d out_of_time:%d signal_budget: %d np:%d sleep:%d preempt:%d "
443 "state:%d sig:%d\n", 848 "state:%d sig:%d\n",
444 blocks, out_of_time, np, sleep, preempt, 849 blocks, out_of_time, signal_budget, np, sleep, preempt,
445 prev->state, signal_pending(prev)); 850 prev->state, signal_pending(prev));
851 }
852
446 if (entry->linked && preempt) 853 if (entry->linked && preempt)
447 TRACE_TASK(prev, "will be preempted by %s/%d\n", 854 TRACE_TASK(prev, "will be preempted by %s/%d\n",
448 entry->linked->comm, entry->linked->pid); 855 entry->linked->comm, entry->linked->pid);
449 856
857 /* Send the signal that the budget has been exhausted */
858 if (signal_budget) {
859 send_sigbudget(entry->scheduled);
860 }
450 861
451 /* If a task blocks we have no choice but to reschedule. 862 /* If a task blocks we have no choice but to reschedule.
452 */ 863 */
453 if (blocks) 864 if (blocks) {
454 unlink(entry->scheduled); 865 unlink(entry->scheduled);
866 }
867
868#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_AFFINITY_LOCKING)
869 if(exists && is_realtime(entry->scheduled) && tsk_rt(entry->scheduled)->held_gpus) {
870 if(!blocks || tsk_rt(entry->scheduled)->suspend_gpu_tracker_on_block) {
871 stop_gpu_tracker(entry->scheduled);
872 }
873 }
874#endif
455 875
456 /* Request a sys_exit_np() call if we would like to preempt but cannot. 876 /* Request a sys_exit_np() call if we would like to preempt but cannot.
457 * We need to make sure to update the link structure anyway in case 877 * We need to make sure to update the link structure anyway in case
@@ -468,8 +888,10 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
468 * this. Don't do a job completion if we block (can't have timers running 888 * this. Don't do a job completion if we block (can't have timers running
469 * for blocked jobs). 889 * for blocked jobs).
470 */ 890 */
471 if (!np && (out_of_time || sleep) && !blocks) 891 if (!np && (out_of_time || sleep) && !blocks) {
472 job_completion(entry->scheduled, !sleep); 892 job_completion(entry->scheduled, !sleep);
893 //completion = 1;
894 }
473 895
474 /* Link pending task if we became unlinked. 896 /* Link pending task if we became unlinked.
475 */ 897 */
@@ -492,12 +914,21 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
492 entry->scheduled->rt_param.scheduled_on = NO_CPU; 914 entry->scheduled->rt_param.scheduled_on = NO_CPU;
493 TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); 915 TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
494 } 916 }
495 } else 917 }
918 else
919 {
496 /* Only override Linux scheduler if we have a real-time task 920 /* Only override Linux scheduler if we have a real-time task
497 * scheduled that needs to continue. 921 * scheduled that needs to continue.
498 */ 922 */
499 if (exists) 923 if (exists)
500 next = prev; 924 next = prev;
925 }
926
927#if 0
928 if (completion) {
929 TRACE_CUR("switching away from a completion\n");
930 }
931#endif
501 932
502 sched_state_task_picked(); 933 sched_state_task_picked();
503 934
@@ -512,7 +943,6 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
512 TRACE("becomes idle at %llu.\n", litmus_clock()); 943 TRACE("becomes idle at %llu.\n", litmus_clock());
513#endif 944#endif
514 945
515
516 return next; 946 return next;
517} 947}
518 948
@@ -524,6 +954,7 @@ static void gsnedf_finish_switch(struct task_struct *prev)
524 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); 954 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
525 955
526 entry->scheduled = is_realtime(current) ? current : NULL; 956 entry->scheduled = is_realtime(current) ? current : NULL;
957
527#ifdef WANT_ALL_SCHED_EVENTS 958#ifdef WANT_ALL_SCHED_EVENTS
528 TRACE_TASK(prev, "switched away from\n"); 959 TRACE_TASK(prev, "switched away from\n");
529#endif 960#endif
@@ -537,7 +968,7 @@ static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
537 unsigned long flags; 968 unsigned long flags;
538 cpu_entry_t* entry; 969 cpu_entry_t* entry;
539 970
540 TRACE("gsn edf: task new %d\n", t->pid); 971 TRACE("gsn edf: task new = %d on_rq = %d running = %d\n", t->pid, on_rq, running);
541 972
542 raw_spin_lock_irqsave(&gsnedf_lock, flags); 973 raw_spin_lock_irqsave(&gsnedf_lock, flags);
543 974
@@ -572,11 +1003,14 @@ static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
572static void gsnedf_task_wake_up(struct task_struct *task) 1003static void gsnedf_task_wake_up(struct task_struct *task)
573{ 1004{
574 unsigned long flags; 1005 unsigned long flags;
575 lt_t now; 1006 //lt_t now;
576 1007
577 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); 1008 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
578 1009
579 raw_spin_lock_irqsave(&gsnedf_lock, flags); 1010 raw_spin_lock_irqsave(&gsnedf_lock, flags);
1011
1012#if 0
1013 /* sporadic task model. will increment job numbers automatically */
580 now = litmus_clock(); 1014 now = litmus_clock();
581 if (is_tardy(task, now)) { 1015 if (is_tardy(task, now)) {
582 /* new sporadic release */ 1016 /* new sporadic release */
@@ -590,6 +1024,25 @@ static void gsnedf_task_wake_up(struct task_struct *task)
590 tsk_rt(task)->completed = 0; 1024 tsk_rt(task)->completed = 0;
591 } 1025 }
592 } 1026 }
1027#else
1028 /* don't force job to end. rely on user to say when jobs complete */
1029 tsk_rt(task)->completed = 0;
1030#endif
1031
1032#ifdef CONFIG_REALTIME_AUX_TASKS
1033 if (tsk_rt(task)->has_aux_tasks && !tsk_rt(task)->hide_from_aux_tasks) {
1034 TRACE_CUR("%s/%d is ready so aux tasks may not inherit.\n", task->comm, task->pid);
1035 disable_aux_task_owner(task);
1036 }
1037#endif
1038
1039#ifdef CONFIG_LITMUS_NVIDIA
1040 if (tsk_rt(task)->held_gpus && !tsk_rt(task)->hide_from_gpu) {
1041 TRACE_CUR("%s/%d is ready so gpu klmirqd tasks may not inherit.\n", task->comm, task->pid);
1042 disable_gpu_owner(task);
1043 }
1044#endif
1045
593 gsnedf_job_arrival(task); 1046 gsnedf_job_arrival(task);
594 raw_spin_unlock_irqrestore(&gsnedf_lock, flags); 1047 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
595} 1048}
@@ -602,7 +1055,25 @@ static void gsnedf_task_block(struct task_struct *t)
602 1055
603 /* unlink if necessary */ 1056 /* unlink if necessary */
604 raw_spin_lock_irqsave(&gsnedf_lock, flags); 1057 raw_spin_lock_irqsave(&gsnedf_lock, flags);
1058
605 unlink(t); 1059 unlink(t);
1060
1061#ifdef CONFIG_REALTIME_AUX_TASKS
1062 if (tsk_rt(t)->has_aux_tasks && !tsk_rt(t)->hide_from_aux_tasks) {
1063
1064 TRACE_CUR("%s/%d is blocked so aux tasks may inherit.\n", t->comm, t->pid);
1065 enable_aux_task_owner(t);
1066 }
1067#endif
1068
1069#ifdef CONFIG_LITMUS_NVIDIA
1070 if (tsk_rt(t)->held_gpus && !tsk_rt(t)->hide_from_gpu) {
1071
1072 		TRACE_CUR("%s/%d is blocked so gpu klmirqd tasks may inherit.\n", t->comm, t->pid);
1073 enable_gpu_owner(t);
1074 }
1075#endif
1076
606 raw_spin_unlock_irqrestore(&gsnedf_lock, flags); 1077 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
607 1078
608 BUG_ON(!is_realtime(t)); 1079 BUG_ON(!is_realtime(t));
@@ -613,8 +1084,30 @@ static void gsnedf_task_exit(struct task_struct * t)
613{ 1084{
614 unsigned long flags; 1085 unsigned long flags;
615 1086
1087#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1088 gsnedf_change_prio_pai_tasklet(t, NULL);
1089#endif
1090
616 /* unlink if necessary */ 1091 /* unlink if necessary */
617 raw_spin_lock_irqsave(&gsnedf_lock, flags); 1092 raw_spin_lock_irqsave(&gsnedf_lock, flags);
1093
1094#ifdef CONFIG_REALTIME_AUX_TASKS
1095 /* make sure we clean up on our way out */
1096 if (unlikely(tsk_rt(t)->is_aux_task)) {
1097 exit_aux_task(t);
1098 }
1099 else if(tsk_rt(t)->has_aux_tasks) {
1100 disable_aux_task_owner(t);
1101 }
1102#endif
1103
1104#ifdef CONFIG_LITMUS_NVIDIA
1105 /* make sure we clean up on our way out */
1106 if(tsk_rt(t)->held_gpus) {
1107 disable_gpu_owner(t);
1108 }
1109#endif
1110
618 unlink(t); 1111 unlink(t);
619 if (tsk_rt(t)->scheduled_on != NO_CPU) { 1112 if (tsk_rt(t)->scheduled_on != NO_CPU) {
620 gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; 1113 gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
@@ -623,106 +1116,413 @@ static void gsnedf_task_exit(struct task_struct * t)
623 raw_spin_unlock_irqrestore(&gsnedf_lock, flags); 1116 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
624 1117
625 BUG_ON(!is_realtime(t)); 1118 BUG_ON(!is_realtime(t));
626 TRACE_TASK(t, "RIP\n"); 1119 TRACE_TASK(t, "RIP\n");
627} 1120}
628 1121
629 1122
630static long gsnedf_admit_task(struct task_struct* tsk) 1123static long gsnedf_admit_task(struct task_struct* tsk)
631{ 1124{
1125#ifdef CONFIG_LITMUS_NESTED_LOCKING
1126 INIT_BINHEAP_HANDLE(&tsk_rt(tsk)->hp_blocked_tasks,
1127 edf_max_heap_base_priority_order);
1128#endif
1129
632 return 0; 1130 return 0;
633} 1131}
634 1132
1133
1134
1135
1136
1137
635#ifdef CONFIG_LITMUS_LOCKING 1138#ifdef CONFIG_LITMUS_LOCKING
636 1139
637#include <litmus/fdso.h> 1140#include <litmus/fdso.h>
638 1141
639/* called with IRQs off */ 1142/* called with IRQs off */
640static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) 1143static int __increase_priority_inheritance(struct task_struct* t,
1144 struct task_struct* prio_inh)
641{ 1145{
1146 int success = 1;
642 int linked_on; 1147 int linked_on;
643 int check_preempt = 0; 1148 int check_preempt = 0;
644 1149
645 raw_spin_lock(&gsnedf_lock); 1150 if (prio_inh && prio_inh == effective_priority(t)) {
1151 /* relationship already established. */
1152 TRACE_TASK(t, "already has effective priority of %s/%d\n",
1153 prio_inh->comm, prio_inh->pid);
1154 goto out;
1155 }
646 1156
647 TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); 1157#ifdef CONFIG_LITMUS_NESTED_LOCKING
648 tsk_rt(t)->inh_task = prio_inh; 1158 /* this sanity check allows for weaker locking in protocols */
649 1159 if(__edf_higher_prio(prio_inh, BASE, t, EFFECTIVE)) {
650 linked_on = tsk_rt(t)->linked_on; 1160#endif
651 1161 TRACE_TASK(t, "inherits priority from %s/%d\n",
652 /* If it is scheduled, then we need to reorder the CPU heap. */ 1162 prio_inh->comm, prio_inh->pid);
653 if (linked_on != NO_CPU) { 1163 tsk_rt(t)->inh_task = prio_inh;
654 TRACE_TASK(t, "%s: linked on %d\n", 1164
655 __FUNCTION__, linked_on); 1165 linked_on = tsk_rt(t)->linked_on;
656 /* Holder is scheduled; need to re-order CPUs. 1166
657 * We can't use heap_decrease() here since 1167 /* If it is scheduled, then we need to reorder the CPU heap. */
658 * the cpu_heap is ordered in reverse direction, so 1168 if (linked_on != NO_CPU) {
659 * it is actually an increase. */ 1169 TRACE_TASK(t, "%s: linked on %d\n",
660 bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, 1170 __FUNCTION__, linked_on);
661 gsnedf_cpus[linked_on]->hn); 1171 /* Holder is scheduled; need to re-order CPUs.
662 bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, 1172 * We can't use heap_decrease() here since
663 gsnedf_cpus[linked_on]->hn); 1173 * the cpu_heap is ordered in reverse direction, so
664 } else { 1174 * it is actually an increase. */
665 /* holder may be queued: first stop queue changes */ 1175 binheap_delete(&gsnedf_cpus[linked_on]->hn, &gsnedf_cpu_heap);
666 raw_spin_lock(&gsnedf.release_lock); 1176 binheap_add(&gsnedf_cpus[linked_on]->hn,
667 if (is_queued(t)) { 1177 &gsnedf_cpu_heap, cpu_entry_t, hn);
668 TRACE_TASK(t, "%s: is queued\n",
669 __FUNCTION__);
670 /* We need to update the position of holder in some
671 * heap. Note that this could be a release heap if we
672 * budget enforcement is used and this job overran. */
673 check_preempt =
674 !bheap_decrease(edf_ready_order,
675 tsk_rt(t)->heap_node);
676 } else { 1178 } else {
677 /* Nothing to do: if it is not queued and not linked 1179 /* holder may be queued: first stop queue changes */
678 * then it is either sleeping or currently being moved 1180 raw_spin_lock(&gsnedf.release_lock);
679 * by other code (e.g., a timer interrupt handler) that 1181 if (is_queued(t)) {
680 * will use the correct priority when enqueuing the 1182 TRACE_TASK(t, "%s: is queued\n",
681 * task. */ 1183 __FUNCTION__);
682 TRACE_TASK(t, "%s: is NOT queued => Done.\n", 1184 /* We need to update the position of holder in some
683 __FUNCTION__); 1185 * heap. Note that this could be a release heap if we
684 } 1186 * budget enforcement is used and this job overran. */
685 raw_spin_unlock(&gsnedf.release_lock); 1187 check_preempt =
686 1188 !bheap_decrease(edf_ready_order,
687 /* If holder was enqueued in a release heap, then the following 1189 tsk_rt(t)->heap_node);
688 * preemption check is pointless, but we can't easily detect 1190 } else {
689 * that case. If you want to fix this, then consider that 1191 /* Nothing to do: if it is not queued and not linked
690 * simply adding a state flag requires O(n) time to update when 1192 * then it is either sleeping or currently being moved
691 * releasing n tasks, which conflicts with the goal to have 1193 * by other code (e.g., a timer interrupt handler) that
692 * O(log n) merges. */ 1194 * will use the correct priority when enqueuing the
693 if (check_preempt) { 1195 * task. */
694 /* heap_decrease() hit the top level of the heap: make 1196 TRACE_TASK(t, "%s: is NOT queued => Done.\n",
695 * sure preemption checks get the right task, not the 1197 __FUNCTION__);
696 * potentially stale cache. */ 1198 }
697 bheap_uncache_min(edf_ready_order, 1199 raw_spin_unlock(&gsnedf.release_lock);
698 &gsnedf.ready_queue); 1200
699 check_for_preemptions(); 1201 /* If holder was enqueued in a release heap, then the following
1202 * preemption check is pointless, but we can't easily detect
1203 * that case. If you want to fix this, then consider that
1204 * simply adding a state flag requires O(n) time to update when
1205 * releasing n tasks, which conflicts with the goal to have
1206 * O(log n) merges. */
1207 if (check_preempt) {
1208 /* heap_decrease() hit the top level of the heap: make
1209 * sure preemption checks get the right task, not the
1210 * potentially stale cache. */
1211 bheap_uncache_min(edf_ready_order,
1212 &gsnedf.ready_queue);
1213 check_for_preemptions();
1214 }
1215
1216#ifdef CONFIG_REALTIME_AUX_TASKS
1217 /* propagate to aux tasks */
1218 if (tsk_rt(t)->has_aux_tasks) {
1219 aux_task_owner_increase_priority(t);
1220 }
1221#endif
1222
1223#ifdef CONFIG_LITMUS_NVIDIA
1224 /* propagate to gpu klmirqd */
1225 if (tsk_rt(t)->held_gpus) {
1226 gpu_owner_increase_priority(t);
1227 }
1228#endif
1229
700 } 1230 }
1231#ifdef CONFIG_LITMUS_NESTED_LOCKING
1232 }
1233 else {
1234 TRACE_TASK(t, "Spurious invalid priority increase. "
1235 "Inheritance request: %s/%d [eff_prio = %s/%d] to inherit from %s/%d\n"
1236 "Occurance is likely okay: probably due to (hopefully safe) concurrent priority updates.\n",
1237 t->comm, t->pid,
1238 effective_priority(t)->comm, effective_priority(t)->pid,
1239 (prio_inh) ? prio_inh->comm : "nil",
1240 (prio_inh) ? prio_inh->pid : -1);
1241 WARN_ON(!prio_inh);
1242 success = 0;
701 } 1243 }
1244#endif
702 1245
703 raw_spin_unlock(&gsnedf_lock); 1246out:
1247 return success;
704} 1248}
705 1249
706/* called with IRQs off */ 1250/* called with IRQs off */
707static void clear_priority_inheritance(struct task_struct* t) 1251static void increase_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh)
708{ 1252{
1253 int success;
1254
709 raw_spin_lock(&gsnedf_lock); 1255 raw_spin_lock(&gsnedf_lock);
710 1256
711 /* A job only stops inheriting a priority when it releases a 1257 success = __increase_priority_inheritance(t, prio_inh);
712 * resource. Thus we can make the following assumption.*/
713 BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU);
714 1258
715 TRACE_TASK(t, "priority restored\n"); 1259 raw_spin_unlock(&gsnedf_lock);
716 tsk_rt(t)->inh_task = NULL;
717 1260
718 /* Check if rescheduling is necessary. We can't use heap_decrease() 1261#if defined(CONFIG_LITMUS_PAI_SOFTIRQD) && defined(CONFIG_LITMUS_NVIDIA)
719 * since the priority was effectively lowered. */ 1262 if(tsk_rt(t)->held_gpus) {
720 unlink(t); 1263 int i;
721 gsnedf_job_arrival(t); 1264 for(i = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
1265 i < NV_DEVICE_NUM;
1266 i = find_next_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus), i+1)) {
1267 pai_check_priority_increase(t, i);
1268 }
1269 }
1270#endif
1271}
1272
1273
1274/* called with IRQs off */
1275static int __decrease_priority_inheritance(struct task_struct* t,
1276 struct task_struct* prio_inh)
1277{
1278 int success = 1;
1279
1280 if (prio_inh == tsk_rt(t)->inh_task) {
1281 /* relationship already established. */
1282 TRACE_TASK(t, "already inherits priority from %s/%d\n",
1283 (prio_inh) ? prio_inh->comm : "(nil)",
1284 (prio_inh) ? prio_inh->pid : 0);
1285 goto out;
1286 }
1287
1288#ifdef CONFIG_LITMUS_NESTED_LOCKING
1289 if(__edf_higher_prio(t, EFFECTIVE, prio_inh, BASE)) {
1290#endif
1291 /* A job only stops inheriting a priority when it releases a
1292 * resource. Thus we can make the following assumption.*/
1293 if(prio_inh)
1294 TRACE_TASK(t, "EFFECTIVE priority decreased to %s/%d\n",
1295 prio_inh->comm, prio_inh->pid);
1296 else
1297 TRACE_TASK(t, "base priority restored.\n");
1298
1299 tsk_rt(t)->inh_task = prio_inh;
1300
1301 if(tsk_rt(t)->scheduled_on != NO_CPU) {
1302 TRACE_TASK(t, "is scheduled.\n");
1303
1304 /* Check if rescheduling is necessary. We can't use heap_decrease()
1305 * since the priority was effectively lowered. */
1306 unlink(t);
1307 gsnedf_job_arrival(t);
1308 }
1309 else {
1310 /* task is queued */
1311 raw_spin_lock(&gsnedf.release_lock);
1312 if (is_queued(t)) {
1313 TRACE_TASK(t, "is queued.\n");
1314
1315 /* decrease in priority, so we have to re-add to binomial heap */
1316 unlink(t);
1317 gsnedf_job_arrival(t);
1318 }
1319 else {
1320 TRACE_TASK(t, "is not in scheduler. Probably on wait queue somewhere.\n");
1321 }
1322 raw_spin_unlock(&gsnedf.release_lock);
1323 }
1324
1325#ifdef CONFIG_REALTIME_AUX_TASKS
1326 /* propagate to aux tasks */
1327 if (tsk_rt(t)->has_aux_tasks) {
1328 aux_task_owner_decrease_priority(t);
1329 }
1330#endif
1331
1332#ifdef CONFIG_LITMUS_NVIDIA
1333 /* propagate to gpu */
1334 if (tsk_rt(t)->held_gpus) {
1335 gpu_owner_decrease_priority(t);
1336 }
1337#endif
1338
1339
1340#ifdef CONFIG_LITMUS_NESTED_LOCKING
1341 }
1342 else {
1343 TRACE_TASK(t, "Spurious invalid priority decrease. "
1344 "Inheritance request: %s/%d [eff_prio = %s/%d] to inherit from %s/%d\n"
1345 				"Occurrence is likely okay: probably due to (hopefully safe) concurrent priority updates.\n",
1346 t->comm, t->pid,
1347 effective_priority(t)->comm, effective_priority(t)->pid,
1348 (prio_inh) ? prio_inh->comm : "nil",
1349 (prio_inh) ? prio_inh->pid : -1);
1350 success = 0;
1351 }
1352#endif
1353
1354out:
1355 return success;
1356}
1357
1358static void decrease_priority_inheritance(struct task_struct* t,
1359 struct task_struct* prio_inh)
1360{
1361 int success;
1362
1363 raw_spin_lock(&gsnedf_lock);
1364
1365 success = __decrease_priority_inheritance(t, prio_inh);
722 1366
723 raw_spin_unlock(&gsnedf_lock); 1367 raw_spin_unlock(&gsnedf_lock);
1368
1369#if defined(CONFIG_LITMUS_PAI_SOFTIRQD) && defined(CONFIG_LITMUS_NVIDIA)
1370 if(tsk_rt(t)->held_gpus) {
1371 int i;
1372 for(i = find_first_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus));
1373 i < NV_DEVICE_NUM;
1374 i = find_next_bit(&tsk_rt(t)->held_gpus, sizeof(tsk_rt(t)->held_gpus), i+1)) {
1375 pai_check_priority_decrease(t, i);
1376 }
1377 }
1378#endif
1379}
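
Both priority-change wrappers above walk the held_gpus bitmap with the kernel's generic bit iterators. For reference, find_first_bit() and find_next_bit() take the bitmap length in bits; a standalone loop over an NV_DEVICE_NUM-sized bitmap therefore looks like this (illustration only, not part of the patch):

#include <linux/bitops.h>

/* Illustration: visit every GPU index that is set in a small bitmap. */
static void example_for_each_held_gpu(unsigned long *held_gpus, int num_gpus)
{
	int i;
	for (i = find_first_bit(held_gpus, num_gpus);
	     i < num_gpus;
	     i = find_next_bit(held_gpus, num_gpus, i + 1)) {
		/* e.g., pai_check_priority_decrease(t, i); */
	}
}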
1380
1381
1382
1383#ifdef CONFIG_LITMUS_NESTED_LOCKING
1384
1385/* called with IRQs off */
1386/* preconditions:
1387 (1) The 'hp_blocked_tasks_lock' of task 't' is held.
1388 (2) The lock 'to_unlock' is held.
1389 */
1390static void nested_increase_priority_inheritance(struct task_struct* t,
1391 struct task_struct* prio_inh,
1392 raw_spinlock_t *to_unlock,
1393 unsigned long irqflags)
1394{
1395 struct litmus_lock *blocked_lock = tsk_rt(t)->blocked_lock;
1396
1397 	if(tsk_rt(t)->inh_task != prio_inh) { 		// shield redundant calls.
1398 increase_priority_inheritance(t, prio_inh); // increase our prio.
1399 }
1400
1401 raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock); // unlock the t's heap.
1402
1403
1404 if(blocked_lock) {
1405 if(blocked_lock->ops->propagate_increase_inheritance) {
1406 TRACE_TASK(t, "Inheritor is blocked (...perhaps). Checking lock %d.\n",
1407 blocked_lock->ident);
1408
1409 // beware: recursion
1410 blocked_lock->ops->propagate_increase_inheritance(blocked_lock,
1411 t, to_unlock,
1412 irqflags);
1413 }
1414 else {
1415 TRACE_TASK(t, "Inheritor is blocked on lock (%d) that does not support nesting!\n",
1416 blocked_lock->ident);
1417 unlock_fine_irqrestore(to_unlock, irqflags);
1418 }
1419 }
1420 else {
1421 TRACE_TASK(t, "is not blocked. No propagation.\n");
1422 unlock_fine_irqrestore(to_unlock, irqflags);
1423 }
1424}
1425
1426/* called with IRQs off */
1427/* preconditions:
1428 (1) The 'hp_blocked_tasks_lock' of task 't' is held.
1429 (2) The lock 'to_unlock' is held.
1430 */
1431static void nested_decrease_priority_inheritance(struct task_struct* t,
1432 struct task_struct* prio_inh,
1433 raw_spinlock_t *to_unlock,
1434 unsigned long irqflags)
1435{
1436 struct litmus_lock *blocked_lock = tsk_rt(t)->blocked_lock;
1437 decrease_priority_inheritance(t, prio_inh);
1438
1439 raw_spin_unlock(&tsk_rt(t)->hp_blocked_tasks_lock); // unlock the t's heap.
1440
1441 if(blocked_lock) {
1442 if(blocked_lock->ops->propagate_decrease_inheritance) {
1443 TRACE_TASK(t, "Inheritor is blocked (...perhaps). Checking lock %d.\n",
1444 blocked_lock->ident);
1445
1446 // beware: recursion
1447 blocked_lock->ops->propagate_decrease_inheritance(blocked_lock, t,
1448 to_unlock,
1449 irqflags);
1450 }
1451 else {
1452 TRACE_TASK(t, "Inheritor is blocked on lock (%p) that does not support nesting!\n",
1453 blocked_lock);
1454 unlock_fine_irqrestore(to_unlock, irqflags);
1455 }
1456 }
1457 else {
1458 TRACE_TASK(t, "is not blocked. No propagation.\n");
1459 unlock_fine_irqrestore(to_unlock, irqflags);
1460 }
1461}
1462
1463
1464/* ******************** RSM MUTEX ********************** */
1465
1466static struct litmus_lock_ops gsnedf_rsm_mutex_lock_ops = {
1467 .lock = rsm_mutex_lock,
1468 .unlock = rsm_mutex_unlock,
1469 .close = rsm_mutex_close,
1470 .deallocate = rsm_mutex_free,
1471
1472 .propagate_increase_inheritance = rsm_mutex_propagate_increase_inheritance,
1473 .propagate_decrease_inheritance = rsm_mutex_propagate_decrease_inheritance,
1474
1475#ifdef CONFIG_LITMUS_DGL_SUPPORT
1476 .dgl_lock = rsm_mutex_dgl_lock,
1477 .is_owner = rsm_mutex_is_owner,
1478 .enable_priority = rsm_mutex_enable_priority,
1479#endif
1480};
1481
1482static struct litmus_lock* gsnedf_new_rsm_mutex(void)
1483{
1484 return rsm_mutex_new(&gsnedf_rsm_mutex_lock_ops);
724} 1485}
725 1486
1487/* ******************** IKGLP ********************** */
1488
1489static struct litmus_lock_ops gsnedf_ikglp_lock_ops = {
1490 .lock = ikglp_lock,
1491 .unlock = ikglp_unlock,
1492 .close = ikglp_close,
1493 .deallocate = ikglp_free,
1494
1495 // ikglp can only be an outer-most lock.
1496 .propagate_increase_inheritance = NULL,
1497 .propagate_decrease_inheritance = NULL,
1498};
1499
1500static struct litmus_lock* gsnedf_new_ikglp(void* __user arg)
1501{
1502 return ikglp_new(num_online_cpus(), &gsnedf_ikglp_lock_ops, arg);
1503}
1504
1505#endif /* CONFIG_LITMUS_NESTED_LOCKING */
1506
1507
1508/* ******************** KFMLP support ********************** */
1509
1510static struct litmus_lock_ops gsnedf_kfmlp_lock_ops = {
1511 .lock = kfmlp_lock,
1512 .unlock = kfmlp_unlock,
1513 .close = kfmlp_close,
1514 .deallocate = kfmlp_free,
1515
1516 // kfmlp can only be an outer-most lock.
1517 .propagate_increase_inheritance = NULL,
1518 .propagate_decrease_inheritance = NULL,
1519};
1520
1521
1522static struct litmus_lock* gsnedf_new_kfmlp(void* __user arg)
1523{
1524 return kfmlp_new(&gsnedf_kfmlp_lock_ops, arg);
1525}
726 1526
727/* ******************** FMLP support ********************** */ 1527/* ******************** FMLP support ********************** */
728 1528
@@ -789,7 +1589,7 @@ int gsnedf_fmlp_lock(struct litmus_lock* l)
789 if (edf_higher_prio(t, sem->hp_waiter)) { 1589 if (edf_higher_prio(t, sem->hp_waiter)) {
790 sem->hp_waiter = t; 1590 sem->hp_waiter = t;
791 if (edf_higher_prio(t, sem->owner)) 1591 if (edf_higher_prio(t, sem->owner))
792 set_priority_inheritance(sem->owner, sem->hp_waiter); 1592 increase_priority_inheritance(sem->owner, sem->hp_waiter);
793 } 1593 }
794 1594
795 TS_LOCK_SUSPEND; 1595 TS_LOCK_SUSPEND;
@@ -802,7 +1602,7 @@ int gsnedf_fmlp_lock(struct litmus_lock* l)
802 * there is only one wake up per release. 1602 * there is only one wake up per release.
803 */ 1603 */
804 1604
805 schedule(); 1605 suspend_for_lock();
806 1606
807 TS_LOCK_RESUME; 1607 TS_LOCK_RESUME;
808 1608
@@ -857,7 +1657,7 @@ int gsnedf_fmlp_unlock(struct litmus_lock* l)
857 /* Well, if next is not the highest-priority waiter, 1657 /* Well, if next is not the highest-priority waiter,
858 * then it ought to inherit the highest-priority 1658 * then it ought to inherit the highest-priority
859 * waiter's priority. */ 1659 * waiter's priority. */
860 set_priority_inheritance(next, sem->hp_waiter); 1660 increase_priority_inheritance(next, sem->hp_waiter);
861 } 1661 }
862 1662
863 /* wake up next */ 1663 /* wake up next */
@@ -868,7 +1668,7 @@ int gsnedf_fmlp_unlock(struct litmus_lock* l)
868 1668
869 /* we lose the benefit of priority inheritance (if any) */ 1669 /* we lose the benefit of priority inheritance (if any) */
870 if (tsk_rt(t)->inh_task) 1670 if (tsk_rt(t)->inh_task)
871 clear_priority_inheritance(t); 1671 decrease_priority_inheritance(t, NULL);
872 1672
873out: 1673out:
874 spin_unlock_irqrestore(&sem->wait.lock, flags); 1674 spin_unlock_irqrestore(&sem->wait.lock, flags);
@@ -906,6 +1706,11 @@ static struct litmus_lock_ops gsnedf_fmlp_lock_ops = {
906 .lock = gsnedf_fmlp_lock, 1706 .lock = gsnedf_fmlp_lock,
907 .unlock = gsnedf_fmlp_unlock, 1707 .unlock = gsnedf_fmlp_unlock,
908 .deallocate = gsnedf_fmlp_free, 1708 .deallocate = gsnedf_fmlp_free,
1709
1710#ifdef CONFIG_LITMUS_NESTED_LOCKING
1711 .propagate_increase_inheritance = NULL,
1712 .propagate_decrease_inheritance = NULL
1713#endif
909}; 1714};
910 1715
911static struct litmus_lock* gsnedf_new_fmlp(void) 1716static struct litmus_lock* gsnedf_new_fmlp(void)
@@ -924,31 +1729,110 @@ static struct litmus_lock* gsnedf_new_fmlp(void)
924 return &sem->litmus_lock; 1729 return &sem->litmus_lock;
925} 1730}
926 1731
927/* **** lock constructor **** */
928
929 1732
930static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, 1733static long gsnedf_allocate_lock(struct litmus_lock **lock, int type,
931 void* __user unused) 1734 void* __user args)
932{ 1735{
933 int err = -ENXIO; 1736 int err;
934 1737
935 /* GSN-EDF currently only supports the FMLP for global resources. */
936 switch (type) { 1738 switch (type) {
937 1739
938 case FMLP_SEM: 1740 case FMLP_SEM:
939 /* Flexible Multiprocessor Locking Protocol */ 1741 /* Flexible Multiprocessor Locking Protocol */
940 *lock = gsnedf_new_fmlp(); 1742 *lock = gsnedf_new_fmlp();
941 if (*lock) 1743 break;
942 err = 0; 1744#ifdef CONFIG_LITMUS_NESTED_LOCKING
943 else 1745 case RSM_MUTEX:
944 err = -ENOMEM; 1746 *lock = gsnedf_new_rsm_mutex();
945 break; 1747 break;
946 1748
1749 case IKGLP_SEM:
1750 *lock = gsnedf_new_ikglp(args);
1751 break;
1752#endif
1753 case KFMLP_SEM:
1754 *lock = gsnedf_new_kfmlp(args);
1755 break;
1756 default:
1757 err = -ENXIO;
1758 goto UNSUPPORTED_LOCK;
1759 };
1760
1761 if (*lock)
1762 err = 0;
1763 else
1764 err = -ENOMEM;
1765
1766UNSUPPORTED_LOCK:
1767 return err;
1768}
1769
1770#endif // CONFIG_LITMUS_LOCKING
1771
1772
1773
1774
1775
1776#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1777static struct affinity_observer_ops gsnedf_kfmlp_affinity_ops = {
1778 .close = kfmlp_aff_obs_close,
1779 .deallocate = kfmlp_aff_obs_free,
1780};
1781
1782#ifdef CONFIG_LITMUS_NESTED_LOCKING
1783static struct affinity_observer_ops gsnedf_ikglp_affinity_ops = {
1784 .close = ikglp_aff_obs_close,
1785 .deallocate = ikglp_aff_obs_free,
1786};
1787#endif
1788
1789static long gsnedf_allocate_affinity_observer(
1790 struct affinity_observer **aff_obs,
1791 int type,
1792 void* __user args)
1793{
1794 int err;
1795
1796 switch (type) {
1797
1798 case KFMLP_SIMPLE_GPU_AFF_OBS:
1799 *aff_obs = kfmlp_simple_gpu_aff_obs_new(&gsnedf_kfmlp_affinity_ops, args);
1800 break;
1801
1802 case KFMLP_GPU_AFF_OBS:
1803 *aff_obs = kfmlp_gpu_aff_obs_new(&gsnedf_kfmlp_affinity_ops, args);
1804 break;
1805
1806#ifdef CONFIG_LITMUS_NESTED_LOCKING
1807 case IKGLP_SIMPLE_GPU_AFF_OBS:
1808 *aff_obs = ikglp_simple_gpu_aff_obs_new(&gsnedf_ikglp_affinity_ops, args);
1809 break;
1810
1811 case IKGLP_GPU_AFF_OBS:
1812 *aff_obs = ikglp_gpu_aff_obs_new(&gsnedf_ikglp_affinity_ops, args);
1813 break;
1814#endif
1815 default:
1816 err = -ENXIO;
1817 goto UNSUPPORTED_AFF_OBS;
947 }; 1818 };
948 1819
1820 if (*aff_obs)
1821 err = 0;
1822 else
1823 err = -ENOMEM;
1824
1825UNSUPPORTED_AFF_OBS:
949 return err; 1826 return err;
950} 1827}
1828#endif
1829
951 1830
1831#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
1832static int gsnedf_map_gpu_to_cpu(int gpu)
1833{
1834 return -1; // No CPU affinity needed.
1835}
952#endif 1836#endif
953 1837
954 1838
@@ -957,14 +1841,14 @@ static long gsnedf_activate_plugin(void)
957 int cpu; 1841 int cpu;
958 cpu_entry_t *entry; 1842 cpu_entry_t *entry;
959 1843
960 bheap_init(&gsnedf_cpu_heap); 1844 INIT_BINHEAP_HANDLE(&gsnedf_cpu_heap, cpu_lower_prio);
961#ifdef CONFIG_RELEASE_MASTER 1845#ifdef CONFIG_RELEASE_MASTER
962 gsnedf.release_master = atomic_read(&release_master_cpu); 1846 gsnedf.release_master = atomic_read(&release_master_cpu);
963#endif 1847#endif
964 1848
965 for_each_online_cpu(cpu) { 1849 for_each_online_cpu(cpu) {
966 entry = &per_cpu(gsnedf_cpu_entries, cpu); 1850 entry = &per_cpu(gsnedf_cpu_entries, cpu);
967 bheap_node_init(&entry->hn, entry); 1851 INIT_BINHEAP_NODE(&entry->hn);
968 entry->linked = NULL; 1852 entry->linked = NULL;
969 entry->scheduled = NULL; 1853 entry->scheduled = NULL;
970#ifdef CONFIG_RELEASE_MASTER 1854#ifdef CONFIG_RELEASE_MASTER
@@ -978,6 +1862,20 @@ static long gsnedf_activate_plugin(void)
978 } 1862 }
979#endif 1863#endif
980 } 1864 }
1865
1866#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1867 gsnedf_pending_tasklets.head = NULL;
1868 gsnedf_pending_tasklets.tail = &(gsnedf_pending_tasklets.head);
1869#endif
1870
1871#ifdef CONFIG_LITMUS_SOFTIRQD
1872 init_klmirqd();
1873#endif
1874
1875#ifdef CONFIG_LITMUS_NVIDIA
1876 init_nvidia_info();
1877#endif
1878
981 return 0; 1879 return 0;
982} 1880}
983 1881
@@ -994,8 +1892,32 @@ static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
994 .task_block = gsnedf_task_block, 1892 .task_block = gsnedf_task_block,
995 .admit_task = gsnedf_admit_task, 1893 .admit_task = gsnedf_admit_task,
996 .activate_plugin = gsnedf_activate_plugin, 1894 .activate_plugin = gsnedf_activate_plugin,
1895 .compare = edf_higher_prio,
997#ifdef CONFIG_LITMUS_LOCKING 1896#ifdef CONFIG_LITMUS_LOCKING
998 .allocate_lock = gsnedf_allocate_lock, 1897 .allocate_lock = gsnedf_allocate_lock,
1898 .increase_prio = increase_priority_inheritance,
1899 .decrease_prio = decrease_priority_inheritance,
1900 .__increase_prio = __increase_priority_inheritance,
1901 .__decrease_prio = __decrease_priority_inheritance,
1902#endif
1903#ifdef CONFIG_LITMUS_NESTED_LOCKING
1904 .nested_increase_prio = nested_increase_priority_inheritance,
1905 .nested_decrease_prio = nested_decrease_priority_inheritance,
1906 .__compare = __edf_higher_prio,
1907#endif
1908#ifdef CONFIG_LITMUS_DGL_SUPPORT
1909 .get_dgl_spinlock = gsnedf_get_dgl_spinlock,
1910#endif
1911#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
1912 .allocate_aff_obs = gsnedf_allocate_affinity_observer,
1913#endif
1914#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
1915 .enqueue_pai_tasklet = gsnedf_enqueue_pai_tasklet,
1916 .change_prio_pai_tasklet = gsnedf_change_prio_pai_tasklet,
1917 .run_tasklets = gsnedf_run_tasklets,
1918#endif
1919#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
1920 .map_gpu_to_cpu = gsnedf_map_gpu_to_cpu,
999#endif 1921#endif
1000}; 1922};
1001 1923
@@ -1005,15 +1927,20 @@ static int __init init_gsn_edf(void)
1005 int cpu; 1927 int cpu;
1006 cpu_entry_t *entry; 1928 cpu_entry_t *entry;
1007 1929
1008 bheap_init(&gsnedf_cpu_heap); 1930 INIT_BINHEAP_HANDLE(&gsnedf_cpu_heap, cpu_lower_prio);
1009 /* initialize CPU state */ 1931 /* initialize CPU state */
1010 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1932 for (cpu = 0; cpu < NR_CPUS; ++cpu) {
1011 entry = &per_cpu(gsnedf_cpu_entries, cpu); 1933 entry = &per_cpu(gsnedf_cpu_entries, cpu);
1012 gsnedf_cpus[cpu] = entry; 1934 gsnedf_cpus[cpu] = entry;
1013 entry->cpu = cpu; 1935 entry->cpu = cpu;
1014 entry->hn = &gsnedf_heap_node[cpu]; 1936
1015 bheap_node_init(&entry->hn, entry); 1937 INIT_BINHEAP_NODE(&entry->hn);
1016 } 1938 }
1939
1940#ifdef CONFIG_LITMUS_DGL_SUPPORT
1941 raw_spin_lock_init(&dgl_lock);
1942#endif
1943
1017 edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs); 1944 edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
1018 return register_sched_plugin(&gsn_edf_plugin); 1945 return register_sched_plugin(&gsn_edf_plugin);
1019} 1946}
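
The rewritten gsnedf_allocate_lock() above dispatches on the requested protocol and routes every unsupported type through a single error label, leaving one shared NULL check for -ENOMEM. Below is a minimal user-space sketch of that dispatch-and-shared-tail shape; the lock struct, constructor, and type constants are stand-ins for illustration, not the real LITMUS^RT API.

/* Minimal sketch of the dispatch-and-shared-tail shape of
 * gsnedf_allocate_lock(); the lock struct, constructor, and type
 * constants below are stand-ins, not the real LITMUS^RT API. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

enum lock_type { FMLP_SEM, RSM_MUTEX, IKGLP_SEM, KFMLP_SEM };

struct lock { enum lock_type type; };

/* stand-in for gsnedf_new_fmlp() and friends */
static struct lock *new_lock(enum lock_type type)
{
        struct lock *l = malloc(sizeof(*l));
        if (l)
                l->type = type;
        return l;
}

static long allocate_lock(struct lock **lock, int type)
{
        int err;

        switch (type) {
        case FMLP_SEM:
        case RSM_MUTEX:
        case IKGLP_SEM:
        case KFMLP_SEM:
                *lock = new_lock(type);
                break;
        default:
                err = -ENXIO;                   /* unknown protocol */
                goto UNSUPPORTED_LOCK;
        }

        /* shared tail: a constructor ran, so only allocation can have failed */
        err = (*lock) ? 0 : -ENOMEM;

UNSUPPORTED_LOCK:
        return err;
}

int main(void)
{
        struct lock *l = NULL;
        printf("FMLP_SEM: %ld\n", allocate_lock(&l, FMLP_SEM));
        printf("bogus:    %ld\n", allocate_lock(&l, 42));
        return 0;
}

The real function additionally hides RSM_MUTEX and IKGLP_SEM behind CONFIG_LITMUS_NESTED_LOCKING, which the sketch leaves out.
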
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
index 6b32cf09abbd..9de03c95b825 100644
--- a/litmus/sched_litmus.c
+++ b/litmus/sched_litmus.c
@@ -175,8 +175,10 @@ static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
175 litmus->task_wake_up(p); 175 litmus->task_wake_up(p);
176 176
177 rq->litmus.nr_running++; 177 rq->litmus.nr_running++;
178 } else 178 } else {
179 TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n"); 179 TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
180 //WARN_ON(1);
181 }
180} 182}
181 183
182static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, 184static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c
index 91e52391a173..a96c2b1aa26f 100644
--- a/litmus/sched_pfp.c
+++ b/litmus/sched_pfp.c
@@ -142,17 +142,25 @@ static void pfp_tick(struct task_struct *t)
142 */ 142 */
143 BUG_ON(is_realtime(t) && t != pfp->scheduled); 143 BUG_ON(is_realtime(t) && t != pfp->scheduled);
144 144
145 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { 145 if (is_realtime(t) && budget_exhausted(t))
146 if (!is_np(t)) { 146 {
147 litmus_reschedule_local(); 147 if (budget_signalled(t) && !sigbudget_sent(t)) {
148 TRACE("pfp_scheduler_tick: " 148 /* signal exhaustion */
149 "%d is preemptable " 149 send_sigbudget(t);
150 " => FORCE_RESCHED\n", t->pid); 150 }
151 } else if (is_user_np(t)) { 151
152 TRACE("pfp_scheduler_tick: " 152 if (budget_enforced(t)) {
153 "%d is non-preemptable, " 153 if (!is_np(t)) {
154 "preemption delayed.\n", t->pid); 154 litmus_reschedule_local();
155 request_exit_np(t); 155 TRACE("pfp_scheduler_tick: "
156 "%d is preemptable "
157 " => FORCE_RESCHED\n", t->pid);
158 } else if (is_user_np(t)) {
159 TRACE("pfp_scheduler_tick: "
160 "%d is non-preemptable, "
161 "preemption delayed.\n", t->pid);
162 request_exit_np(t);
163 }
156 } 164 }
157 } 165 }
158} 166}
@@ -162,7 +170,7 @@ static struct task_struct* pfp_schedule(struct task_struct * prev)
162 pfp_domain_t* pfp = local_pfp; 170 pfp_domain_t* pfp = local_pfp;
163 struct task_struct* next; 171 struct task_struct* next;
164 172
165 int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate; 173 int out_of_time, signal_budget, sleep, preempt, np, exists, blocks, resched, migrate;
166 174
167 raw_spin_lock(&pfp->slock); 175 raw_spin_lock(&pfp->slock);
168 176
@@ -179,6 +187,10 @@ static struct task_struct* pfp_schedule(struct task_struct * prev)
179 out_of_time = exists && 187 out_of_time = exists &&
180 budget_enforced(pfp->scheduled) && 188 budget_enforced(pfp->scheduled) &&
181 budget_exhausted(pfp->scheduled); 189 budget_exhausted(pfp->scheduled);
190 signal_budget = exists &&
191 budget_signalled(pfp->scheduled) &&
192 budget_exhausted(pfp->scheduled) &&
193 !sigbudget_sent(pfp->scheduled);
182 np = exists && is_np(pfp->scheduled); 194 np = exists && is_np(pfp->scheduled);
183 sleep = exists && is_completed(pfp->scheduled); 195 sleep = exists && is_completed(pfp->scheduled);
184 migrate = exists && get_partition(pfp->scheduled) != pfp->cpu; 196 migrate = exists && get_partition(pfp->scheduled) != pfp->cpu;
@@ -190,6 +202,10 @@ static struct task_struct* pfp_schedule(struct task_struct * prev)
190 */ 202 */
191 resched = preempt; 203 resched = preempt;
192 204
205 /* Send the signal that the budget has been exhausted */
206 if (signal_budget)
207 send_sigbudget(pfp->scheduled);
208
193 /* If a task blocks we have no choice but to reschedule. 209 /* If a task blocks we have no choice but to reschedule.
194 */ 210 */
195 if (blocks) 211 if (blocks)
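
Both pfp_tick() and pfp_schedule() above derive signal_budget from budget_signalled() && budget_exhausted() && !sigbudget_sent(), so SIG_BUDGET is raised at most once per exhausted job, while enforcement (forcing a reschedule) remains a separate decision; sched_psn_edf.c below repeats the same pattern. A user-space sketch of that send-once latch, with hypothetical stand-ins for the task state and helpers:

/* User-space sketch of the send-once SIG_BUDGET latch; the task struct
 * and helpers are hypothetical stand-ins for the LITMUS^RT ones. */
#include <stdbool.h>
#include <stdio.h>

struct fake_task {
        long budget;            /* remaining budget */
        bool wants_signal;      /* budget_signalled(t) */
        bool wants_enforce;     /* budget_enforced(t) */
        bool sig_sent;          /* sigbudget_sent(t) */
};

static bool budget_exhausted(const struct fake_task *t)
{
        return t->budget <= 0;
}

static void send_sigbudget(struct fake_task *t)
{
        t->sig_sent = true;     /* the kernel raises SIG_BUDGET at the task */
        printf("SIG_BUDGET delivered\n");
}

static void tick(struct fake_task *t)
{
        if (!budget_exhausted(t))
                return;

        /* signal exhaustion exactly once per job */
        if (t->wants_signal && !t->sig_sent)
                send_sigbudget(t);

        /* enforcement (forcing a reschedule) stays a separate decision */
        if (t->wants_enforce)
                printf("would call litmus_reschedule_local()\n");
}

int main(void)
{
        struct fake_task t = { .budget = 0, .wants_signal = true,
                               .wants_enforce = true };
        tick(&t);       /* signals and would reschedule */
        tick(&t);       /* only reschedules; the signal was already sent */
        return 0;
}
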
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
index 00a1900d6457..76ff892122aa 100644
--- a/litmus/sched_plugin.c
+++ b/litmus/sched_plugin.c
@@ -13,6 +13,10 @@
13#include <litmus/preempt.h> 13#include <litmus/preempt.h>
14#include <litmus/jobs.h> 14#include <litmus/jobs.h>
15 15
16#ifdef CONFIG_LITMUS_NVIDIA
17#include <litmus/nvidia_info.h>
18#endif
19
16/* 20/*
17 * Generic function to trigger preemption on either local or remote cpu 21 * Generic function to trigger preemption on either local or remote cpu
18 * from scheduler plugins. The key feature is that this function is 22 * from scheduler plugins. The key feature is that this function is
@@ -27,11 +31,19 @@ void preempt_if_preemptable(struct task_struct* t, int cpu)
27 31
28 int reschedule = 0; 32 int reschedule = 0;
29 33
30 if (!t) 34 TRACE_CUR("preempt_if_preemptable: %s/%d\n",
35 (t) ? t->comm : "(nil)",
36 (t) ? t->pid : 0);
37
38 if (!t) {
 39 TRACE_CUR("unconditionally reschedule\n");
31 /* move non-real-time task out of the way */ 40 /* move non-real-time task out of the way */
32 reschedule = 1; 41 reschedule = 1;
42 }
33 else { 43 else {
34 if (smp_processor_id() == cpu) { 44 if (smp_processor_id() == cpu) {
45 TRACE_CUR("preempt local cpu.\n");
46
35 /* local CPU case */ 47 /* local CPU case */
36 /* check if we need to poke userspace */ 48 /* check if we need to poke userspace */
37 if (is_user_np(t)) 49 if (is_user_np(t))
@@ -43,14 +55,22 @@ void preempt_if_preemptable(struct task_struct* t, int cpu)
43 * currently-executing task */ 55 * currently-executing task */
44 reschedule = 1; 56 reschedule = 1;
45 } else { 57 } else {
58 int is_knp = is_kernel_np(t);
59 int reqexit = request_exit_np_atomic(t);
60 TRACE_CUR("preempt remote cpu: isknp = %d reqexit = %d\n", is_knp, reqexit);
61
46 /* Remote CPU case. Only notify if it's not a kernel 62 /* Remote CPU case. Only notify if it's not a kernel
47 * NP section and if we didn't set the userspace 63 * NP section and if we didn't set the userspace
48 * flag. */ 64 * flag. */
49 reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t)); 65 //reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
66 reschedule = !(is_knp || reqexit);
50 } 67 }
51 } 68 }
52 if (likely(reschedule)) 69
70 if (likely(reschedule)) {
71 TRACE_CUR("calling litmus_reschedule()\n");
53 litmus_reschedule(cpu); 72 litmus_reschedule(cpu);
73 }
54} 74}
55 75
56 76
@@ -102,6 +122,9 @@ static long litmus_dummy_complete_job(void)
102 122
103static long litmus_dummy_activate_plugin(void) 123static long litmus_dummy_activate_plugin(void)
104{ 124{
125#ifdef CONFIG_LITMUS_NVIDIA
126 shutdown_nvidia_info();
127#endif
105 return 0; 128 return 0;
106} 129}
107 130
@@ -110,14 +133,100 @@ static long litmus_dummy_deactivate_plugin(void)
110 return 0; 133 return 0;
111} 134}
112 135
113#ifdef CONFIG_LITMUS_LOCKING 136static int litmus_dummy_compare(struct task_struct* a, struct task_struct* b)
137{
138 TRACE_CUR("WARNING: Dummy compare function called!\n");
139 return 0;
140}
114 141
142#ifdef CONFIG_LITMUS_LOCKING
115static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type, 143static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
116 void* __user config) 144 void* __user config)
117{ 145{
118 return -ENXIO; 146 return -ENXIO;
119} 147}
120 148
149static void litmus_dummy_increase_prio(struct task_struct* t, struct task_struct* prio_inh)
150{
151}
152
153static void litmus_dummy_decrease_prio(struct task_struct* t, struct task_struct* prio_inh)
154{
155}
156
157static int litmus_dummy___increase_prio(struct task_struct* t, struct task_struct* prio_inh)
158{
159 TRACE_CUR("WARNING: Dummy litmus_dummy___increase_prio called!\n");
160 return 0;
161}
162
163static int litmus_dummy___decrease_prio(struct task_struct* t, struct task_struct* prio_inh)
164{
165 TRACE_CUR("WARNING: Dummy litmus_dummy___decrease_prio called!\n");
166 return 0;
167}
168#endif
169
170
171#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
172static int litmus_dummy_enqueue_pai_tasklet(struct tasklet_struct* t)
173{
174 TRACE("%s: PAI Tasklet unsupported in this plugin!!!!!!\n", __FUNCTION__);
175 return(0); // failure.
176}
177
178static void litmus_dummy_change_prio_pai_tasklet(struct task_struct *old_prio,
179 struct task_struct *new_prio)
180{
181 TRACE("%s: PAI Tasklet unsupported in this plugin!!!!!!\n", __FUNCTION__);
182}
183
184static void litmus_dummy_run_tasklets(struct task_struct* t)
185{
186 //TRACE("%s: PAI Tasklet unsupported in this plugin!!!!!!\n", __FUNCTION__);
187}
188#endif
189
190#ifdef CONFIG_LITMUS_NESTED_LOCKING
191static void litmus_dummy_nested_increase_prio(struct task_struct* t, struct task_struct* prio_inh,
192 raw_spinlock_t *to_unlock, unsigned long irqflags)
193{
194}
195
196static void litmus_dummy_nested_decrease_prio(struct task_struct* t, struct task_struct* prio_inh,
197 raw_spinlock_t *to_unlock, unsigned long irqflags)
198{
199}
200
201static int litmus_dummy___compare(struct task_struct* a, comparison_mode_t a_mod,
202 struct task_struct* b, comparison_mode_t b_mode)
203{
204 TRACE_CUR("WARNING: Dummy compare function called!\n");
205 return 0;
206}
207#endif
208
209#ifdef CONFIG_LITMUS_DGL_SUPPORT
210static raw_spinlock_t* litmus_dummy_get_dgl_spinlock(struct task_struct *t)
211{
212 return NULL;
213}
214#endif
215
216#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
217static long litmus_dummy_allocate_aff_obs(struct affinity_observer **aff_obs,
218 int type,
219 void* __user config)
220{
221 return -ENXIO;
222}
223#endif
224
225#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
226static int litmus_dummy_map_gpu_to_cpu(int gpu)
227{
228 return 0;
229}
121#endif 230#endif
122 231
123 232
@@ -136,9 +245,34 @@ struct sched_plugin linux_sched_plugin = {
136 .finish_switch = litmus_dummy_finish_switch, 245 .finish_switch = litmus_dummy_finish_switch,
137 .activate_plugin = litmus_dummy_activate_plugin, 246 .activate_plugin = litmus_dummy_activate_plugin,
138 .deactivate_plugin = litmus_dummy_deactivate_plugin, 247 .deactivate_plugin = litmus_dummy_deactivate_plugin,
248 .compare = litmus_dummy_compare,
139#ifdef CONFIG_LITMUS_LOCKING 249#ifdef CONFIG_LITMUS_LOCKING
140 .allocate_lock = litmus_dummy_allocate_lock, 250 .allocate_lock = litmus_dummy_allocate_lock,
251 .increase_prio = litmus_dummy_increase_prio,
252 .decrease_prio = litmus_dummy_decrease_prio,
253 .__increase_prio = litmus_dummy___increase_prio,
254 .__decrease_prio = litmus_dummy___decrease_prio,
255#endif
256#ifdef CONFIG_LITMUS_NESTED_LOCKING
257 .nested_increase_prio = litmus_dummy_nested_increase_prio,
258 .nested_decrease_prio = litmus_dummy_nested_decrease_prio,
259 .__compare = litmus_dummy___compare,
260#endif
261#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
262 .enqueue_pai_tasklet = litmus_dummy_enqueue_pai_tasklet,
263 .change_prio_pai_tasklet = litmus_dummy_change_prio_pai_tasklet,
264 .run_tasklets = litmus_dummy_run_tasklets,
265#endif
266#ifdef CONFIG_LITMUS_DGL_SUPPORT
267 .get_dgl_spinlock = litmus_dummy_get_dgl_spinlock,
141#endif 268#endif
269#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
270 .allocate_aff_obs = litmus_dummy_allocate_aff_obs,
271#endif
272#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
273 .map_gpu_to_cpu = litmus_dummy_map_gpu_to_cpu,
274#endif
275
142 .admit_task = litmus_dummy_admit_task 276 .admit_task = litmus_dummy_admit_task
143}; 277};
144 278
@@ -174,9 +308,34 @@ int register_sched_plugin(struct sched_plugin* plugin)
174 CHECK(complete_job); 308 CHECK(complete_job);
175 CHECK(activate_plugin); 309 CHECK(activate_plugin);
176 CHECK(deactivate_plugin); 310 CHECK(deactivate_plugin);
311 CHECK(compare);
177#ifdef CONFIG_LITMUS_LOCKING 312#ifdef CONFIG_LITMUS_LOCKING
178 CHECK(allocate_lock); 313 CHECK(allocate_lock);
314 CHECK(increase_prio);
315 CHECK(decrease_prio);
316 CHECK(__increase_prio);
317 CHECK(__decrease_prio);
318#endif
319#ifdef CONFIG_LITMUS_NESTED_LOCKING
320 CHECK(nested_increase_prio);
321 CHECK(nested_decrease_prio);
322 CHECK(__compare);
323#endif
324#ifdef CONFIG_LITMUS_PAI_SOFTIRQD
325 CHECK(enqueue_pai_tasklet);
326 CHECK(change_prio_pai_tasklet);
327 CHECK(run_tasklets);
179#endif 328#endif
329#ifdef CONFIG_LITMUS_DGL_SUPPORT
330 CHECK(get_dgl_spinlock);
331#endif
332#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
333 CHECK(allocate_aff_obs);
334#endif
335#if defined(CONFIG_LITMUS_NVIDIA) && defined(CONFIG_LITMUS_SOFTIRQD)
336 CHECK(map_gpu_to_cpu);
337#endif
338
180 CHECK(admit_task); 339 CHECK(admit_task);
181 340
182 if (!plugin->release_at) 341 if (!plugin->release_at)
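
The new dummy callbacks pair with the CHECK() calls added to register_sched_plugin(): presumably CHECK(func) installs litmus_dummy_##func whenever a plugin leaves that hook NULL, so the new entry points (compare, increase_prio, PAI tasklets, DGL, affinity observers, GPU mapping) are always callable. A reduced stand-alone sketch of that fill-in-the-defaults pattern follows; the two-callback plugin struct and names are illustrative only.

/* Stand-alone sketch of the fill-in-the-defaults pattern behind the
 * litmus_dummy_* callbacks and the CHECK() calls in
 * register_sched_plugin(); the plugin struct and names are illustrative. */
#include <stdio.h>

struct plugin {
        const char *name;
        int  (*compare)(int a, int b);
        void (*increase_prio)(int task, int prio);
};

static int dummy_compare(int a, int b)
{
        printf("WARNING: dummy compare called!\n");
        return 0;
}

static void dummy_increase_prio(int task, int prio)
{
        /* no-op default */
}

/* install the dummy if the plugin left the hook unset */
#define CHECK(func) do { if (!p->func) p->func = dummy_##func; } while (0)

static void register_plugin(struct plugin *p)
{
        CHECK(compare);
        CHECK(increase_prio);
        printf("registered %s\n", p->name);
}

int main(void)
{
        struct plugin partial = { .name = "partial" };  /* both hooks unset */
        register_plugin(&partial);
        partial.compare(1, 2);          /* safe: dummy installed */
        partial.increase_prio(1, 99);   /* safe no-op */
        return 0;
}
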
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
index 0e1675d2e572..63fa6103882a 100644
--- a/litmus/sched_psn_edf.c
+++ b/litmus/sched_psn_edf.c
@@ -174,17 +174,25 @@ static void psnedf_tick(struct task_struct *t)
174 */ 174 */
175 BUG_ON(is_realtime(t) && t != pedf->scheduled); 175 BUG_ON(is_realtime(t) && t != pedf->scheduled);
176 176
177 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { 177 if (is_realtime(t) && budget_exhausted(t))
178 if (!is_np(t)) { 178 {
179 litmus_reschedule_local(); 179 if (budget_signalled(t) && !sigbudget_sent(t)) {
180 TRACE("psnedf_scheduler_tick: " 180 /* signal exhaustion */
181 "%d is preemptable " 181 send_sigbudget(t);
182 " => FORCE_RESCHED\n", t->pid); 182 }
183 } else if (is_user_np(t)) { 183
184 TRACE("psnedf_scheduler_tick: " 184 if (budget_enforced(t)) {
185 "%d is non-preemptable, " 185 if (!is_np(t)) {
186 "preemption delayed.\n", t->pid); 186 litmus_reschedule_local();
187 request_exit_np(t); 187 TRACE("psnedf_scheduler_tick: "
188 "%d is preemptable "
189 " => FORCE_RESCHED\n", t->pid);
190 } else if (is_user_np(t)) {
191 TRACE("psnedf_scheduler_tick: "
192 "%d is non-preemptable, "
193 "preemption delayed.\n", t->pid);
194 request_exit_np(t);
195 }
188 } 196 }
189 } 197 }
190} 198}
@@ -195,8 +203,7 @@ static struct task_struct* psnedf_schedule(struct task_struct * prev)
195 rt_domain_t* edf = &pedf->domain; 203 rt_domain_t* edf = &pedf->domain;
196 struct task_struct* next; 204 struct task_struct* next;
197 205
198 int out_of_time, sleep, preempt, 206 int out_of_time, signal_budget, sleep, preempt, np, exists, blocks, resched;
199 np, exists, blocks, resched;
200 207
201 raw_spin_lock(&pedf->slock); 208 raw_spin_lock(&pedf->slock);
202 209
@@ -213,6 +220,10 @@ static struct task_struct* psnedf_schedule(struct task_struct * prev)
213 out_of_time = exists && 220 out_of_time = exists &&
214 budget_enforced(pedf->scheduled) && 221 budget_enforced(pedf->scheduled) &&
215 budget_exhausted(pedf->scheduled); 222 budget_exhausted(pedf->scheduled);
223 signal_budget = exists &&
224 budget_signalled(pedf->scheduled) &&
225 budget_exhausted(pedf->scheduled) &&
226 !sigbudget_sent(pedf->scheduled);
216 np = exists && is_np(pedf->scheduled); 227 np = exists && is_np(pedf->scheduled);
217 sleep = exists && is_completed(pedf->scheduled); 228 sleep = exists && is_completed(pedf->scheduled);
218 preempt = edf_preemption_needed(edf, prev); 229 preempt = edf_preemption_needed(edf, prev);
@@ -223,6 +234,10 @@ static struct task_struct* psnedf_schedule(struct task_struct * prev)
223 */ 234 */
224 resched = preempt; 235 resched = preempt;
225 236
237 /* Send the signal that the budget has been exhausted */
238 if (signal_budget)
239 send_sigbudget(pedf->scheduled);
240
226 /* If a task blocks we have no choice but to reschedule. 241 /* If a task blocks we have no choice but to reschedule.
227 */ 242 */
228 if (blocks) 243 if (blocks)
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
index 5ef8d09ab41f..f7f575346b54 100644
--- a/litmus/sched_task_trace.c
+++ b/litmus/sched_task_trace.c
@@ -7,6 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/percpu.h> 9#include <linux/percpu.h>
10#include <linux/hardirq.h>
10 11
11#include <litmus/ftdev.h> 12#include <litmus/ftdev.h>
12#include <litmus/litmus.h> 13#include <litmus/litmus.h>
@@ -16,13 +17,13 @@
16#include <litmus/ftdev.h> 17#include <litmus/ftdev.h>
17 18
18 19
19#define NO_EVENTS (1 << CONFIG_SCHED_TASK_TRACE_SHIFT) 20#define NUM_EVENTS (1 << (CONFIG_SCHED_TASK_TRACE_SHIFT+11))
20 21
21#define now() litmus_clock() 22#define now() litmus_clock()
22 23
23struct local_buffer { 24struct local_buffer {
24 struct st_event_record record[NO_EVENTS]; 25 struct st_event_record record[NUM_EVENTS];
25 char flag[NO_EVENTS]; 26 char flag[NUM_EVENTS];
26 struct ft_buffer ftbuf; 27 struct ft_buffer ftbuf;
27}; 28};
28 29
@@ -41,7 +42,7 @@ static int __init init_sched_task_trace(void)
41 int i, ok = 0, err; 42 int i, ok = 0, err;
42 printk("Allocated %u sched_trace_xxx() events per CPU " 43 printk("Allocated %u sched_trace_xxx() events per CPU "
43 "(buffer size: %d bytes)\n", 44 "(buffer size: %d bytes)\n",
44 NO_EVENTS, (int) sizeof(struct local_buffer)); 45 NUM_EVENTS, (int) sizeof(struct local_buffer));
45 46
46 err = ftdev_init(&st_dev, THIS_MODULE, 47 err = ftdev_init(&st_dev, THIS_MODULE,
47 num_online_cpus(), "sched_trace"); 48 num_online_cpus(), "sched_trace");
@@ -50,7 +51,7 @@ static int __init init_sched_task_trace(void)
50 51
51 for (i = 0; i < st_dev.minor_cnt; i++) { 52 for (i = 0; i < st_dev.minor_cnt; i++) {
52 buf = &per_cpu(st_event_buffer, i); 53 buf = &per_cpu(st_event_buffer, i);
53 ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS, 54 ok += init_ft_buffer(&buf->ftbuf, NUM_EVENTS,
54 sizeof(struct st_event_record), 55 sizeof(struct st_event_record),
55 buf->flag, 56 buf->flag,
56 buf->record); 57 buf->record);
@@ -154,7 +155,8 @@ feather_callback void do_sched_trace_task_switch_to(unsigned long id,
154{ 155{
155 struct task_struct *t = (struct task_struct*) _task; 156 struct task_struct *t = (struct task_struct*) _task;
156 struct st_event_record* rec; 157 struct st_event_record* rec;
157 if (is_realtime(t)) { 158 //if (is_realtime(t)) /* comment out to trace EVERYTHING */
159 {
158 rec = get_record(ST_SWITCH_TO, t); 160 rec = get_record(ST_SWITCH_TO, t);
159 if (rec) { 161 if (rec) {
160 rec->data.switch_to.when = now(); 162 rec->data.switch_to.when = now();
@@ -169,7 +171,8 @@ feather_callback void do_sched_trace_task_switch_away(unsigned long id,
169{ 171{
170 struct task_struct *t = (struct task_struct*) _task; 172 struct task_struct *t = (struct task_struct*) _task;
171 struct st_event_record* rec; 173 struct st_event_record* rec;
172 if (is_realtime(t)) { 174 //if (is_realtime(t)) /* comment out to trace EVERYTHING */
175 {
173 rec = get_record(ST_SWITCH_AWAY, t); 176 rec = get_record(ST_SWITCH_AWAY, t);
174 if (rec) { 177 if (rec) {
175 rec->data.switch_away.when = now(); 178 rec->data.switch_away.when = now();
@@ -188,6 +191,9 @@ feather_callback void do_sched_trace_task_completion(unsigned long id,
188 if (rec) { 191 if (rec) {
189 rec->data.completion.when = now(); 192 rec->data.completion.when = now();
190 rec->data.completion.forced = forced; 193 rec->data.completion.forced = forced;
 194#ifdef CONFIG_LITMUS_NVIDIA
195 rec->data.completion.nv_int_count = (u16)atomic_read(&tsk_rt(t)->nv_int_count);
196#endif
191 put_record(rec); 197 put_record(rec);
192 } 198 }
193} 199}
@@ -239,3 +245,265 @@ feather_callback void do_sched_trace_action(unsigned long id,
239 put_record(rec); 245 put_record(rec);
240 } 246 }
241} 247}
248
249
250
251
252feather_callback void do_sched_trace_prediction_err(unsigned long id,
253 unsigned long _task,
254 unsigned long _distance,
255 unsigned long _rel_err)
256{
257 struct task_struct *t = (struct task_struct*) _task;
258 struct st_event_record *rec = get_record(ST_PREDICTION_ERR, t);
259
260 if (rec) {
261 gpu_migration_dist_t* distance = (gpu_migration_dist_t*) _distance;
262 fp_t* rel_err = (fp_t*) _rel_err;
263
264 rec->data.prediction_err.distance = *distance;
265 rec->data.prediction_err.rel_err = rel_err->val;
266 put_record(rec);
267 }
268}
269
270
271feather_callback void do_sched_trace_migration(unsigned long id,
272 unsigned long _task,
273 unsigned long _mig_info)
274{
275 struct task_struct *t = (struct task_struct*) _task;
276 struct st_event_record *rec = get_record(ST_MIGRATION, t);
277
278 if (rec) {
279 struct migration_info* mig_info = (struct migration_info*) _mig_info;
280
281 rec->hdr.extra = mig_info->distance;
282 rec->data.migration.observed = mig_info->observed;
283 rec->data.migration.estimated = mig_info->estimated;
284
285 put_record(rec);
286 }
287}
288
289
290
291
292
293
294
295
296
297feather_callback void do_sched_trace_tasklet_release(unsigned long id,
298 unsigned long _owner)
299{
300 struct task_struct *t = (struct task_struct*) _owner;
301 struct st_event_record *rec = get_record(ST_TASKLET_RELEASE, t);
302
303 if (rec) {
304 rec->data.tasklet_release.when = now();
305 put_record(rec);
306 }
307}
308
309
310feather_callback void do_sched_trace_tasklet_begin(unsigned long id,
311 unsigned long _owner)
312{
313 struct task_struct *t = (struct task_struct*) _owner;
314 struct st_event_record *rec = get_record(ST_TASKLET_BEGIN, t);
315
316 if (rec) {
317 rec->data.tasklet_begin.when = now();
318
319 if(!in_interrupt())
320 rec->data.tasklet_begin.exe_pid = current->pid;
321 else
322 rec->data.tasklet_begin.exe_pid = 0;
323
324 put_record(rec);
325 }
326}
327EXPORT_SYMBOL(do_sched_trace_tasklet_begin);
328
329
330feather_callback void do_sched_trace_tasklet_end(unsigned long id,
331 unsigned long _owner,
332 unsigned long _flushed)
333{
334 struct task_struct *t = (struct task_struct*) _owner;
335 struct st_event_record *rec = get_record(ST_TASKLET_END, t);
336
337 if (rec) {
338 rec->data.tasklet_end.when = now();
339 rec->data.tasklet_end.flushed = _flushed;
340
341 if(!in_interrupt())
342 rec->data.tasklet_end.exe_pid = current->pid;
343 else
344 rec->data.tasklet_end.exe_pid = 0;
345
346 put_record(rec);
347 }
348}
349EXPORT_SYMBOL(do_sched_trace_tasklet_end);
350
351
352feather_callback void do_sched_trace_work_release(unsigned long id,
353 unsigned long _owner)
354{
355 struct task_struct *t = (struct task_struct*) _owner;
356 struct st_event_record *rec = get_record(ST_WORK_RELEASE, t);
357
358 if (rec) {
359 rec->data.work_release.when = now();
360 put_record(rec);
361 }
362}
363
364
365feather_callback void do_sched_trace_work_begin(unsigned long id,
366 unsigned long _owner,
367 unsigned long _exe)
368{
369 struct task_struct *t = (struct task_struct*) _owner;
370 struct st_event_record *rec = get_record(ST_WORK_BEGIN, t);
371
372 if (rec) {
373 struct task_struct *exe = (struct task_struct*) _exe;
374 rec->data.work_begin.exe_pid = exe->pid;
375 rec->data.work_begin.when = now();
376 put_record(rec);
377 }
378}
379EXPORT_SYMBOL(do_sched_trace_work_begin);
380
381
382feather_callback void do_sched_trace_work_end(unsigned long id,
383 unsigned long _owner,
384 unsigned long _exe,
385 unsigned long _flushed)
386{
387 struct task_struct *t = (struct task_struct*) _owner;
388 struct st_event_record *rec = get_record(ST_WORK_END, t);
389
390 if (rec) {
391 struct task_struct *exe = (struct task_struct*) _exe;
392 rec->data.work_end.exe_pid = exe->pid;
393 rec->data.work_end.flushed = _flushed;
394 rec->data.work_end.when = now();
395 put_record(rec);
396 }
397}
398EXPORT_SYMBOL(do_sched_trace_work_end);
399
400
401feather_callback void do_sched_trace_eff_prio_change(unsigned long id,
402 unsigned long _task,
403 unsigned long _inh)
404{
405 struct task_struct *t = (struct task_struct*) _task;
406 struct st_event_record *rec = get_record(ST_EFF_PRIO_CHANGE, t);
407
408 if (rec) {
409 struct task_struct *inh = (struct task_struct*) _inh;
410 rec->data.effective_priority_change.when = now();
411 rec->data.effective_priority_change.inh_pid = (inh != NULL) ?
412 inh->pid :
413 0xffff;
414
415 put_record(rec);
416 }
417}
418
419/* pray for no nesting of nv interrupts on same CPU... */
420struct tracing_interrupt_map
421{
422 int active;
423 int count;
424 unsigned long data[128]; // assume nesting less than 128...
425 unsigned long serial[128];
426};
427DEFINE_PER_CPU(struct tracing_interrupt_map, active_interrupt_tracing);
428
429
430DEFINE_PER_CPU(u32, intCounter);
431
432feather_callback void do_sched_trace_nv_interrupt_begin(unsigned long id,
433 unsigned long _device)
434{
435 struct st_event_record *rec;
436 u32 serialNum;
437
438 {
439 u32* serial;
440 struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id());
 441 if(int_map->active == 0xcafebabe)
442 {
443 int_map->count++;
444 }
445 else
446 {
447 int_map->active = 0xcafebabe;
448 int_map->count = 1;
449 }
450 //int_map->data[int_map->count-1] = _device;
451
452 serial = &per_cpu(intCounter, smp_processor_id());
453 *serial += num_online_cpus();
454 serialNum = *serial;
455 int_map->serial[int_map->count-1] = serialNum;
456 }
457
458 rec = get_record(ST_NV_INTERRUPT_BEGIN, NULL);
459 if(rec) {
460 u32 device = _device;
461 rec->data.nv_interrupt_begin.when = now();
462 rec->data.nv_interrupt_begin.device = device;
463 rec->data.nv_interrupt_begin.serialNumber = serialNum;
464 put_record(rec);
465 }
466}
467EXPORT_SYMBOL(do_sched_trace_nv_interrupt_begin);
468
469/*
470int is_interrupt_tracing_active(void)
471{
472 struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id());
473 if(int_map->active == 0xcafebabe)
474 return 1;
475 return 0;
476}
477*/
478
479feather_callback void do_sched_trace_nv_interrupt_end(unsigned long id, unsigned long _device)
480{
481 struct tracing_interrupt_map* int_map = &per_cpu(active_interrupt_tracing, smp_processor_id());
482 if(int_map->active == 0xcafebabe)
483 {
484 struct st_event_record *rec = get_record(ST_NV_INTERRUPT_END, NULL);
485
486 int_map->count--;
487 if(int_map->count == 0)
488 int_map->active = 0;
489
490 if(rec) {
491 u32 device = _device;
492 rec->data.nv_interrupt_end.when = now();
493 //rec->data.nv_interrupt_end.device = int_map->data[int_map->count];
494 rec->data.nv_interrupt_end.device = device;
495 rec->data.nv_interrupt_end.serialNumber = int_map->serial[int_map->count];
496 put_record(rec);
497 }
498 }
499}
500EXPORT_SYMBOL(do_sched_trace_nv_interrupt_end);
501
502
503
504
505
506
507
508
509
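
do_sched_trace_nv_interrupt_begin() above tracks nesting depth in tracing_interrupt_map.count and draws a serial number from a per-CPU intCounter that it advances by num_online_cpus(). As defined, every per-CPU counter starts at zero, so the streams stay disjoint across CPUs only if each counter is seeded at a distinct offset (for example its CPU id); the sketch below makes that seeding assumption explicit and simulates the per-CPU machinery with a plain array.

/* User-space sketch of the per-CPU serial numbers drawn in
 * do_sched_trace_nv_interrupt_begin(). ASSUMPTION: the streams are only
 * disjoint across CPUs if each per-CPU counter is seeded at a distinct
 * offset (e.g. its CPU id); the kernel code starts every counter at zero,
 * so that seeding is supplied here. CPU ids are simulated. */
#include <stdio.h>

#define NCPUS 4                         /* num_online_cpus() stand-in */

static unsigned int counter[NCPUS];     /* per_cpu(intCounter, cpu) */

static unsigned int next_serial(int cpu)
{
        /* seed each stream at its CPU id on first use (assumption, see above) */
        if (counter[cpu] == 0)
                counter[cpu] = cpu;
        counter[cpu] += NCPUS;          /* *serial += num_online_cpus(); */
        return counter[cpu];
}

int main(void)
{
        for (int round = 0; round < 3; round++)
                for (int cpu = 0; cpu < NCPUS; cpu++)
                        printf("cpu %d -> serial %u\n", cpu, next_serial(cpu));
        return 0;
}
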
diff --git a/litmus/sched_trace_external.c b/litmus/sched_trace_external.c
new file mode 100644
index 000000000000..cf8e1d78aa77
--- /dev/null
+++ b/litmus/sched_trace_external.c
@@ -0,0 +1,64 @@
1#include <linux/module.h>
2
3#include <litmus/trace.h>
4#include <litmus/sched_trace.h>
5#include <litmus/litmus.h>
6
7void __sched_trace_tasklet_begin_external(struct task_struct* t)
8{
9 sched_trace_tasklet_begin(t);
10}
11EXPORT_SYMBOL(__sched_trace_tasklet_begin_external);
12
13void __sched_trace_tasklet_end_external(struct task_struct* t, unsigned long flushed)
14{
15 sched_trace_tasklet_end(t, flushed);
16}
17EXPORT_SYMBOL(__sched_trace_tasklet_end_external);
18
19
20
21void __sched_trace_work_begin_external(struct task_struct* t, struct task_struct* e)
22{
23 sched_trace_work_begin(t, e);
24}
25EXPORT_SYMBOL(__sched_trace_work_begin_external);
26
27void __sched_trace_work_end_external(struct task_struct* t, struct task_struct* e, unsigned long f)
28{
29 sched_trace_work_end(t, e, f);
30}
31EXPORT_SYMBOL(__sched_trace_work_end_external);
32
33
34
35void __sched_trace_nv_interrupt_begin_external(u32 device)
36{
37 //unsigned long _device = device;
38 sched_trace_nv_interrupt_begin((unsigned long)device);
39}
40EXPORT_SYMBOL(__sched_trace_nv_interrupt_begin_external);
41
42void __sched_trace_nv_interrupt_end_external(u32 device)
43{
44 //unsigned long _device = device;
45 sched_trace_nv_interrupt_end((unsigned long)device);
46}
47EXPORT_SYMBOL(__sched_trace_nv_interrupt_end_external);
48
49
50#ifdef CONFIG_LITMUS_NVIDIA
51
52#define EXX_TS(evt) \
53void __##evt(void) { evt; } \
54EXPORT_SYMBOL(__##evt);
55
56EXX_TS(TS_NV_TOPISR_START)
57EXX_TS(TS_NV_TOPISR_END)
58EXX_TS(TS_NV_BOTISR_START)
59EXX_TS(TS_NV_BOTISR_END)
60EXX_TS(TS_NV_RELEASE_BOTISR_START)
61EXX_TS(TS_NV_RELEASE_BOTISR_END)
62
63#endif
64
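
sched_trace_external.c wraps each sched_trace_*() and TS_NV_*() trace point, which are macros, in a small exported function, apparently so that code built outside this file (e.g. the NVIDIA interrupt glue) can trigger them through real symbols; EXX_TS() stamps out the timestamp wrappers mechanically. A user-space sketch of the same macro-generated wrapper pattern, with EXPORT_SYMBOL and the timestamp macros stubbed out:

/* User-space sketch of the macro-generated wrapper pattern in
 * sched_trace_external.c; EXPORT_SYMBOL and the TS_NV_* timestamp macros
 * are stubbed so the example builds outside the kernel. */
#include <stdio.h>

#define EXPORT_SYMBOL(sym)              /* kernel-only; no-op here */

/* stand-ins for the feather-trace timestamp macros */
#define TS_NV_TOPISR_START      printf("timestamp: nv top-half ISR start\n")
#define TS_NV_TOPISR_END        printf("timestamp: nv top-half ISR end\n")

/* generates `void __TS_NV_TOPISR_START(void)' etc., one wrapper per macro */
#define EXX_TS(evt) \
void __##evt(void) { evt; } \
EXPORT_SYMBOL(__##evt)

EXX_TS(TS_NV_TOPISR_START)
EXX_TS(TS_NV_TOPISR_END)

int main(void)
{
        __TS_NV_TOPISR_START();         /* an external module would call these */
        __TS_NV_TOPISR_END();
        return 0;
}
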