Diffstat (limited to 'litmus')
-rw-r--r--   litmus/Kconfig          |  33
-rw-r--r--   litmus/Makefile         |   1
-rw-r--r--   litmus/affinity.c       |  42
-rw-r--r--   litmus/clustered.c      |   2
-rw-r--r--   litmus/ftdev.c          |  73
-rw-r--r--   litmus/litmus.c         |  18
-rw-r--r--   litmus/locking.c        |   2
-rw-r--r--   litmus/preempt.c        |   2
-rw-r--r--   litmus/rt_domain.c      |   4
-rw-r--r--   litmus/sched_cedf.c     | 133
-rw-r--r--   litmus/sched_gsn_edf.c  |  52
-rw-r--r--   litmus/sched_litmus.c   |  15
-rw-r--r--   litmus/sched_pfair.c    | 225
-rw-r--r--   litmus/sched_plugin.c   |  23
-rw-r--r--   litmus/sched_psn_edf.c  |  41
-rw-r--r--   litmus/trace.c          | 109
16 files changed, 562 insertions, 213 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index ad8dc8308cf0..94b48e199577 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -62,6 +62,25 @@ config LITMUS_LOCKING | |||
62 | 62 | ||
63 | endmenu | 63 | endmenu |
64 | 64 | ||
65 | menu "Performance Enhancements" | ||
66 | |||
67 | config SCHED_CPU_AFFINITY | ||
68 | bool "Local Migration Affinity" | ||
69 | depends on X86 | ||
70 | default y | ||
71 | help | ||
72 | Rescheduled tasks prefer CPUs near their previously used CPU. This | ||
73 | may improve performance by preserving cache affinity. | ||
74 | |||
75 | Warning: May make bugs harder to find since tasks may migrate less often. | ||
76 | |||
77 | NOTES: | ||
78 | * Feature is not utilized by PFair/PD^2. | ||
79 | |||
80 | Say Yes if unsure. | ||
81 | |||
82 | endmenu | ||
83 | |||
65 | menu "Tracing" | 84 | menu "Tracing" |
66 | 85 | ||
67 | config FEATHER_TRACE | 86 | config FEATHER_TRACE |
@@ -180,6 +199,20 @@ config SCHED_DEBUG_TRACE_CALLER | |||
180 | 199 | ||
181 | If unsure, say No. | 200 | If unsure, say No. |
182 | 201 | ||
202 | config PREEMPT_STATE_TRACE | ||
203 | bool "Trace preemption state machine transitions" | ||
204 | depends on SCHED_DEBUG_TRACE | ||
205 | default n | ||
206 | help | ||
207 | With this option enabled, each CPU will log when it transitions | ||
208 | states in the preemption state machine. This state machine is | ||
209 | used to determine how to react to IPIs (avoid races with in-flight IPIs). | ||
210 | |||
211 | Warning: this creates a lot of information in the debug trace. Only | ||
212 | recommended when you are debugging preemption-related races. | ||
213 | |||
214 | If unsure, say No. | ||
215 | |||
183 | endmenu | 216 | endmenu |
184 | 217 | ||
185 | endmenu | 218 | endmenu |
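The two options above can be toggled independently. For reference, a minimal .config fragment reflecting the defaults declared in these entries might look as follows (option names are taken verbatim from the Kconfig text; the fragment itself is illustrative and not part of the patch):

CONFIG_SCHED_CPU_AFFINITY=y
# CONFIG_PREEMPT_STATE_TRACE is not set

Note that SCHED_CPU_AFFINITY is only offered on X86 and that PREEMPT_STATE_TRACE additionally requires SCHED_DEBUG_TRACE.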
diff --git a/litmus/Makefile b/litmus/Makefile
index ad9936e07b83..7338180f196f 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -21,6 +21,7 @@ obj-y = sched_plugin.o litmus.o \ | |||
21 | 21 | ||
22 | obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o | 22 | obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o |
23 | obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o | 23 | obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o |
24 | obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o | ||
24 | 25 | ||
25 | obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o | 26 | obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o |
26 | obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o | 27 | obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o |
diff --git a/litmus/affinity.c b/litmus/affinity.c
new file mode 100644
index 000000000000..3fa6dd789400
--- /dev/null
+++ b/litmus/affinity.c
@@ -0,0 +1,42 @@ | |||
1 | #include <linux/cpu.h> | ||
2 | |||
3 | #include <litmus/affinity.h> | ||
4 | |||
5 | struct neighborhood neigh_info[NR_CPUS]; | ||
6 | |||
7 | /* called by _init_litmus() */ | ||
8 | void init_topology(void) { | ||
9 | int cpu; | ||
10 | int i; | ||
11 | int chk; | ||
12 | int depth = num_cache_leaves; | ||
13 | |||
14 | if (depth > NUM_CACHE_LEVELS) | ||
15 | depth = NUM_CACHE_LEVELS; | ||
16 | |||
17 | for_each_online_cpu(cpu) { | ||
18 | for (i = 0; i < depth; ++i) { | ||
19 | chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i); | ||
20 | if (chk) { | ||
21 | /* failed */ | ||
22 | neigh_info[cpu].size[i] = 0; | ||
23 | } else { | ||
24 | /* size = num bits in mask */ | ||
25 | neigh_info[cpu].size[i] = | ||
26 | cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]); | ||
27 | } | ||
28 | printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n", | ||
29 | cpu, neigh_info[cpu].size[i], i, | ||
30 | *cpumask_bits(neigh_info[cpu].neighbors[i])); | ||
31 | } | ||
32 | |||
33 | /* set data for non-existent levels */ | ||
34 | for (; i < NUM_CACHE_LEVELS; ++i) { | ||
35 | neigh_info[cpu].size[i] = 0; | ||
36 | |||
37 | printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n", | ||
38 | cpu, neigh_info[cpu].size[i], i, 0lu); | ||
39 | } | ||
40 | } | ||
41 | } | ||
42 | |||
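The new litmus/affinity.c relies on litmus/affinity.h, which is not part of this diff. A minimal sketch of the declarations it appears to assume is given below; the field types, the value of NUM_CACHE_LEVELS, and the get_shared_cpu_map() prototype are inferred from the usage above and should be treated as assumptions, not as the actual header.

/* Sketch of the assumed litmus/affinity.h interface (not in this diff). */
#include <linux/cpumask.h>

#define NUM_CACHE_LEVELS 4                 /* assumed bound on tracked cache levels */

struct neighborhood {
        unsigned int  size[NUM_CACHE_LEVELS];      /* number of CPUs sharing the level-i cache */
        cpumask_var_t neighbors[NUM_CACHE_LEVELS]; /* mask of those CPUs */
};

extern struct neighborhood neigh_info[NR_CPUS];

/* Fills 'mask' with the CPUs that share the cache at 'index' with 'cpu';
 * returns 0 on success (prototype assumed from the call in init_topology()). */
int get_shared_cpu_map(struct cpumask *mask, unsigned int cpu, unsigned int index);

void init_topology(void); /* called by _init_litmus() */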
diff --git a/litmus/clustered.c b/litmus/clustered.c
index 04450a8ad4fe..6fe1b512f628 100644
--- a/litmus/clustered.c
+++ b/litmus/clustered.c
@@ -102,7 +102,7 @@ int assign_cpus_to_clusters(enum cache_level level, | |||
102 | cpus[i]->cluster = cpus[low_cpu]->cluster; | 102 | cpus[i]->cluster = cpus[low_cpu]->cluster; |
103 | } | 103 | } |
104 | /* enqueue in cpus list */ | 104 | /* enqueue in cpus list */ |
105 | list_add(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus); | 105 | list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus); |
106 | printk(KERN_INFO "Assigning CPU%u to cluster %u\n.", i, cpus[i]->cluster->id); | 106 | printk(KERN_INFO "Assigning CPU%u to cluster %u\n.", i, cpus[i]->cluster->id); |
107 | } | 107 | } |
108 | out: | 108 | out: |
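The list_add() to list_add_tail() change in assign_cpus_to_clusters() only affects the order in which CPUs end up on a cluster's list, as this illustration of the two calls suggests:

/* Adding CPUs 0, 1, 2, 3 to an initially empty cluster list: */
list_add(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);      /* head insert: list reads 3, 2, 1, 0 */
list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus); /* tail insert: list reads 0, 1, 2, 3 */

With tail insertion, walking a cluster's CPU list visits CPUs in ascending enumeration order.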
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
index 4a4b2e3e56c2..06fcf4cf77dc 100644
--- a/litmus/ftdev.c
+++ b/litmus/ftdev.c
@@ -114,6 +114,7 @@ static int ftdev_open(struct inode *in, struct file *filp) | |||
114 | goto out; | 114 | goto out; |
115 | 115 | ||
116 | ftdm = ftdev->minor + buf_idx; | 116 | ftdm = ftdev->minor + buf_idx; |
117 | ftdm->ftdev = ftdev; | ||
117 | filp->private_data = ftdm; | 118 | filp->private_data = ftdm; |
118 | 119 | ||
119 | if (mutex_lock_interruptible(&ftdm->lock)) { | 120 | if (mutex_lock_interruptible(&ftdm->lock)) { |
@@ -250,64 +251,61 @@ out: | |||
250 | return err; | 251 | return err; |
251 | } | 252 | } |
252 | 253 | ||
253 | typedef uint32_t cmd_t; | 254 | static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
254 | |||
255 | static ssize_t ftdev_write(struct file *filp, const char __user *from, | ||
256 | size_t len, loff_t *f_pos) | ||
257 | { | 255 | { |
256 | long err = -ENOIOCTLCMD; | ||
258 | struct ftdev_minor* ftdm = filp->private_data; | 257 | struct ftdev_minor* ftdm = filp->private_data; |
259 | ssize_t err = -EINVAL; | ||
260 | cmd_t cmd; | ||
261 | cmd_t id; | ||
262 | |||
263 | if (len % sizeof(cmd) || len < 2 * sizeof(cmd)) | ||
264 | goto out; | ||
265 | |||
266 | if (copy_from_user(&cmd, from, sizeof(cmd))) { | ||
267 | err = -EFAULT; | ||
268 | goto out; | ||
269 | } | ||
270 | len -= sizeof(cmd); | ||
271 | from += sizeof(cmd); | ||
272 | |||
273 | if (cmd != FTDEV_ENABLE_CMD && cmd != FTDEV_DISABLE_CMD) | ||
274 | goto out; | ||
275 | 258 | ||
276 | if (mutex_lock_interruptible(&ftdm->lock)) { | 259 | if (mutex_lock_interruptible(&ftdm->lock)) { |
277 | err = -ERESTARTSYS; | 260 | err = -ERESTARTSYS; |
278 | goto out; | 261 | goto out; |
279 | } | 262 | } |
280 | 263 | ||
281 | err = sizeof(cmd); | 264 | /* FIXME: check id against list of acceptable events */ |
282 | while (len) { | 265 | |
283 | if (copy_from_user(&id, from, sizeof(cmd))) { | 266 | switch (cmd) { |
284 | err = -EFAULT; | 267 | case FTDEV_ENABLE_CMD: |
285 | goto out_unlock; | 268 | if (activate(&ftdm->events, arg)) |
286 | } | ||
287 | /* FIXME: check id against list of acceptable events */ | ||
288 | len -= sizeof(cmd); | ||
289 | from += sizeof(cmd); | ||
290 | if (cmd == FTDEV_DISABLE_CMD) | ||
291 | deactivate(&ftdm->events, id); | ||
292 | else if (activate(&ftdm->events, id) != 0) { | ||
293 | err = -ENOMEM; | 269 | err = -ENOMEM; |
294 | goto out_unlock; | 270 | else |
295 | } | 271 | err = 0; |
296 | err += sizeof(cmd); | 272 | break; |
297 | } | 273 | |
274 | case FTDEV_DISABLE_CMD: | ||
275 | deactivate(&ftdm->events, arg); | ||
276 | err = 0; | ||
277 | break; | ||
278 | |||
279 | default: | ||
280 | printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg); | ||
281 | }; | ||
298 | 282 | ||
299 | out_unlock: | ||
300 | mutex_unlock(&ftdm->lock); | 283 | mutex_unlock(&ftdm->lock); |
301 | out: | 284 | out: |
302 | return err; | 285 | return err; |
303 | } | 286 | } |
304 | 287 | ||
288 | static ssize_t ftdev_write(struct file *filp, const char __user *from, | ||
289 | size_t len, loff_t *f_pos) | ||
290 | { | ||
291 | struct ftdev_minor* ftdm = filp->private_data; | ||
292 | ssize_t err = -EINVAL; | ||
293 | struct ftdev* ftdev = ftdm->ftdev; | ||
294 | |||
295 | /* dispatch write to buffer-specific code, if available */ | ||
296 | if (ftdev->write) | ||
297 | err = ftdev->write(ftdm->buf, len, from); | ||
298 | |||
299 | return err; | ||
300 | } | ||
301 | |||
305 | struct file_operations ftdev_fops = { | 302 | struct file_operations ftdev_fops = { |
306 | .owner = THIS_MODULE, | 303 | .owner = THIS_MODULE, |
307 | .open = ftdev_open, | 304 | .open = ftdev_open, |
308 | .release = ftdev_release, | 305 | .release = ftdev_release, |
309 | .write = ftdev_write, | 306 | .write = ftdev_write, |
310 | .read = ftdev_read, | 307 | .read = ftdev_read, |
308 | .unlocked_ioctl = ftdev_ioctl, | ||
311 | }; | 309 | }; |
312 | 310 | ||
313 | int ftdev_init( struct ftdev* ftdev, struct module* owner, | 311 | int ftdev_init( struct ftdev* ftdev, struct module* owner, |
@@ -325,6 +323,7 @@ int ftdev_init( struct ftdev* ftdev, struct module* owner, | |||
325 | ftdev->alloc = NULL; | 323 | ftdev->alloc = NULL; |
326 | ftdev->free = NULL; | 324 | ftdev->free = NULL; |
327 | ftdev->can_open = NULL; | 325 | ftdev->can_open = NULL; |
326 | ftdev->write = NULL; | ||
328 | 327 | ||
329 | ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor), | 328 | ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor), |
330 | GFP_KERNEL); | 329 | GFP_KERNEL); |
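Here the write()-based enable/disable protocol is replaced by an ioctl() interface that takes one event ID per call, while write() becomes a per-buffer hook that devices may override. A user-space sketch of the new control path follows; the device path and the assumption that FTDEV_ENABLE_CMD/FTDEV_DISABLE_CMD are visible through a user-space copy of the litmus headers are mine, not part of this patch.

/* Sketch: enable one Feather-Trace event, read records, then disable it. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <litmus/ftdev.h>   /* assumed to define FTDEV_ENABLE_CMD / FTDEV_DISABLE_CMD */

int main(void)
{
        unsigned long event_id = 100;   /* hypothetical event ID */
        char buf[4096];
        ssize_t n;
        int fd = open("/dev/litmus/ft_trace0", O_RDWR);   /* device name assumed */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, FTDEV_ENABLE_CMD, event_id) < 0)    /* previously: write() of a cmd+id record */
                perror("ioctl(FTDEV_ENABLE_CMD)");
        n = read(fd, buf, sizeof(buf));                   /* ftdev_read() is unchanged */
        if (n > 0)
                printf("read %zd bytes of trace data\n", n);
        if (ioctl(fd, FTDEV_DISABLE_CMD, event_id) < 0)
                perror("ioctl(FTDEV_DISABLE_CMD)");
        close(fd);
        return 0;
}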
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 11ccaafd50de..301390148d02 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -17,6 +17,10 @@ | |||
17 | #include <litmus/litmus_proc.h> | 17 | #include <litmus/litmus_proc.h> |
18 | #include <litmus/sched_trace.h> | 18 | #include <litmus/sched_trace.h> |
19 | 19 | ||
20 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
21 | #include <litmus/affinity.h> | ||
22 | #endif | ||
23 | |||
20 | /* Number of RT tasks that exist in the system */ | 24 | /* Number of RT tasks that exist in the system */ |
21 | atomic_t rt_task_count = ATOMIC_INIT(0); | 25 | atomic_t rt_task_count = ATOMIC_INIT(0); |
22 | static DEFINE_RAW_SPINLOCK(task_transition_lock); | 26 | static DEFINE_RAW_SPINLOCK(task_transition_lock); |
@@ -110,6 +114,14 @@ asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) | |||
110 | "because wcet > period\n", pid); | 114 | "because wcet > period\n", pid); |
111 | goto out_unlock; | 115 | goto out_unlock; |
112 | } | 116 | } |
117 | if ( tp.cls != RT_CLASS_HARD && | ||
118 | tp.cls != RT_CLASS_SOFT && | ||
119 | tp.cls != RT_CLASS_BEST_EFFORT) | ||
120 | { | ||
121 | printk(KERN_INFO "litmus: real-time task %d rejected " | ||
122 | "because its class is invalid\n", pid); | ||
123 | goto out_unlock; | ||
124 | } | ||
113 | if (tp.budget_policy != NO_ENFORCEMENT && | 125 | if (tp.budget_policy != NO_ENFORCEMENT && |
114 | tp.budget_policy != QUANTUM_ENFORCEMENT && | 126 | tp.budget_policy != QUANTUM_ENFORCEMENT && |
115 | tp.budget_policy != PRECISE_ENFORCEMENT) | 127 | tp.budget_policy != PRECISE_ENFORCEMENT) |
@@ -517,6 +529,8 @@ static int __init _init_litmus(void) | |||
517 | */ | 529 | */ |
518 | printk("Starting LITMUS^RT kernel\n"); | 530 | printk("Starting LITMUS^RT kernel\n"); |
519 | 531 | ||
532 | BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t)); | ||
533 | |||
520 | register_sched_plugin(&linux_sched_plugin); | 534 | register_sched_plugin(&linux_sched_plugin); |
521 | 535 | ||
522 | bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC); | 536 | bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC); |
@@ -532,6 +546,10 @@ static int __init _init_litmus(void) | |||
532 | 546 | ||
533 | init_litmus_proc(); | 547 | init_litmus_proc(); |
534 | 548 | ||
549 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
550 | init_topology(); | ||
551 | #endif | ||
552 | |||
535 | return 0; | 553 | return 0; |
536 | } | 554 | } |
537 | 555 | ||
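With the added check, sys_set_rt_task_param() now also rejects parameter blocks whose task class is not one of the three known classes, in addition to the existing wcet/period and budget-policy checks. A parameter block that passes all of these checks might look like the sketch below; the exec_cost and period field names are assumed from the LITMUS^RT user-space API (they are not visible in this hunk) and the values are purely illustrative.

/* Sketch: rt_task parameters that satisfy the sanity checks above. */
struct rt_task tp = {
        .exec_cost     = 10000000ULL,    /* 10 ms WCET in ns; wcet <= period (field name assumed) */
        .period        = 100000000ULL,   /* 100 ms period (field name assumed) */
        .cls           = RT_CLASS_SOFT,  /* must be RT_CLASS_HARD, _SOFT, or _BEST_EFFORT */
        .budget_policy = NO_ENFORCEMENT, /* or QUANTUM_/PRECISE_ENFORCEMENT */
};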
diff --git a/litmus/locking.c b/litmus/locking.c
index cfce98e7480d..b3279c1930b7 100644
--- a/litmus/locking.c
+++ b/litmus/locking.c
@@ -80,7 +80,7 @@ asmlinkage long sys_litmus_lock(int lock_od) | |||
80 | 80 | ||
81 | /* Note: task my have been suspended or preempted in between! Take | 81 | /* Note: task my have been suspended or preempted in between! Take |
82 | * this into account when computing overheads. */ | 82 | * this into account when computing overheads. */ |
83 | TS_UNLOCK_END; | 83 | TS_LOCK_END; |
84 | 84 | ||
85 | return err; | 85 | return err; |
86 | } | 86 | } |
diff --git a/litmus/preempt.c b/litmus/preempt.c
index ebe2e3461895..5704d0bf4c0b 100644
--- a/litmus/preempt.c
+++ b/litmus/preempt.c
@@ -30,8 +30,10 @@ void sched_state_will_schedule(struct task_struct* tsk) | |||
30 | /* Litmus tasks should never be subject to a remote | 30 | /* Litmus tasks should never be subject to a remote |
31 | * set_tsk_need_resched(). */ | 31 | * set_tsk_need_resched(). */ |
32 | BUG_ON(is_realtime(tsk)); | 32 | BUG_ON(is_realtime(tsk)); |
33 | #ifdef CONFIG_PREEMPT_STATE_TRACE | ||
33 | TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n", | 34 | TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n", |
34 | __builtin_return_address(0)); | 35 | __builtin_return_address(0)); |
36 | #endif | ||
35 | } | 37 | } |
36 | 38 | ||
37 | /* Called by the IPI handler after another CPU called smp_send_resched(). */ | 39 | /* Called by the IPI handler after another CPU called smp_send_resched(). */ |
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
index 81a5ac16f164..d405854cd39c 100644
--- a/litmus/rt_domain.c
+++ b/litmus/rt_domain.c
@@ -55,12 +55,14 @@ static enum hrtimer_restart on_release_timer(struct hrtimer *timer) | |||
55 | { | 55 | { |
56 | unsigned long flags; | 56 | unsigned long flags; |
57 | struct release_heap* rh; | 57 | struct release_heap* rh; |
58 | rh = container_of(timer, struct release_heap, timer); | ||
59 | |||
60 | TS_RELEASE_LATENCY(rh->release_time); | ||
58 | 61 | ||
59 | VTRACE("on_release_timer(0x%p) starts.\n", timer); | 62 | VTRACE("on_release_timer(0x%p) starts.\n", timer); |
60 | 63 | ||
61 | TS_RELEASE_START; | 64 | TS_RELEASE_START; |
62 | 65 | ||
63 | rh = container_of(timer, struct release_heap, timer); | ||
64 | 66 | ||
65 | raw_spin_lock_irqsave(&rh->dom->release_lock, flags); | 67 | raw_spin_lock_irqsave(&rh->dom->release_lock, flags); |
66 | VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock); | 68 | VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock); |
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
index 5e977dd2fef0..87f8bc9bb50b 100644
--- a/litmus/sched_cedf.c
+++ b/litmus/sched_cedf.c
@@ -44,6 +44,10 @@ | |||
44 | 44 | ||
45 | #include <litmus/bheap.h> | 45 | #include <litmus/bheap.h> |
46 | 46 | ||
47 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
48 | #include <litmus/affinity.h> | ||
49 | #endif | ||
50 | |||
47 | /* to configure the cluster size */ | 51 | /* to configure the cluster size */ |
48 | #include <litmus/litmus_proc.h> | 52 | #include <litmus/litmus_proc.h> |
49 | 53 | ||
@@ -95,7 +99,7 @@ typedef struct clusterdomain { | |||
95 | struct bheap_node *heap_node; | 99 | struct bheap_node *heap_node; |
96 | struct bheap cpu_heap; | 100 | struct bheap cpu_heap; |
97 | /* lock for this cluster */ | 101 | /* lock for this cluster */ |
98 | #define cedf_lock domain.ready_lock | 102 | #define cluster_lock domain.ready_lock |
99 | } cedf_domain_t; | 103 | } cedf_domain_t; |
100 | 104 | ||
101 | /* a cedf_domain per cluster; allocation is done at init/activation time */ | 105 | /* a cedf_domain per cluster; allocation is done at init/activation time */ |
@@ -204,7 +208,7 @@ static noinline void link_task_to_cpu(struct task_struct* linked, | |||
204 | } | 208 | } |
205 | 209 | ||
206 | /* unlink - Make sure a task is not linked any longer to an entry | 210 | /* unlink - Make sure a task is not linked any longer to an entry |
207 | * where it was linked before. Must hold cedf_lock. | 211 | * where it was linked before. Must hold cluster_lock. |
208 | */ | 212 | */ |
209 | static noinline void unlink(struct task_struct* t) | 213 | static noinline void unlink(struct task_struct* t) |
210 | { | 214 | { |
@@ -240,7 +244,7 @@ static void preempt(cpu_entry_t *entry) | |||
240 | } | 244 | } |
241 | 245 | ||
242 | /* requeue - Put an unlinked task into gsn-edf domain. | 246 | /* requeue - Put an unlinked task into gsn-edf domain. |
243 | * Caller must hold cedf_lock. | 247 | * Caller must hold cluster_lock. |
244 | */ | 248 | */ |
245 | static noinline void requeue(struct task_struct* task) | 249 | static noinline void requeue(struct task_struct* task) |
246 | { | 250 | { |
@@ -257,11 +261,34 @@ static noinline void requeue(struct task_struct* task) | |||
257 | } | 261 | } |
258 | } | 262 | } |
259 | 263 | ||
264 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
265 | static cpu_entry_t* cedf_get_nearest_available_cpu( | ||
266 | cedf_domain_t *cluster, cpu_entry_t *start) | ||
267 | { | ||
268 | cpu_entry_t *affinity; | ||
269 | |||
270 | get_nearest_available_cpu(affinity, start, cedf_cpu_entries, | ||
271 | #ifdef CONFIG_RELEASE_MASTER | ||
272 | cluster->domain.release_master | ||
273 | #else | ||
274 | NO_CPU | ||
275 | #endif | ||
276 | ); | ||
277 | |||
278 | /* make sure CPU is in our cluster */ | ||
279 | if (affinity && cpu_isset(affinity->cpu, *cluster->cpu_map)) | ||
280 | return(affinity); | ||
281 | else | ||
282 | return(NULL); | ||
283 | } | ||
284 | #endif | ||
285 | |||
286 | |||
260 | /* check for any necessary preemptions */ | 287 | /* check for any necessary preemptions */ |
261 | static void check_for_preemptions(cedf_domain_t *cluster) | 288 | static void check_for_preemptions(cedf_domain_t *cluster) |
262 | { | 289 | { |
263 | struct task_struct *task; | 290 | struct task_struct *task; |
264 | cpu_entry_t* last; | 291 | cpu_entry_t *last; |
265 | 292 | ||
266 | for(last = lowest_prio_cpu(cluster); | 293 | for(last = lowest_prio_cpu(cluster); |
267 | edf_preemption_needed(&cluster->domain, last->linked); | 294 | edf_preemption_needed(&cluster->domain, last->linked); |
@@ -270,8 +297,20 @@ static void check_for_preemptions(cedf_domain_t *cluster) | |||
270 | task = __take_ready(&cluster->domain); | 297 | task = __take_ready(&cluster->domain); |
271 | TRACE("check_for_preemptions: attempting to link task %d to %d\n", | 298 | TRACE("check_for_preemptions: attempting to link task %d to %d\n", |
272 | task->pid, last->cpu); | 299 | task->pid, last->cpu); |
300 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
301 | { | ||
302 | cpu_entry_t *affinity = | ||
303 | cedf_get_nearest_available_cpu(cluster, | ||
304 | &per_cpu(cedf_cpu_entries, task_cpu(task))); | ||
305 | if(affinity) | ||
306 | last = affinity; | ||
307 | else if(last->linked) | ||
308 | requeue(last->linked); | ||
309 | } | ||
310 | #else | ||
273 | if (last->linked) | 311 | if (last->linked) |
274 | requeue(last->linked); | 312 | requeue(last->linked); |
313 | #endif | ||
275 | link_task_to_cpu(task, last); | 314 | link_task_to_cpu(task, last); |
276 | preempt(last); | 315 | preempt(last); |
277 | } | 316 | } |
@@ -292,15 +331,15 @@ static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) | |||
292 | cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain); | 331 | cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain); |
293 | unsigned long flags; | 332 | unsigned long flags; |
294 | 333 | ||
295 | raw_spin_lock_irqsave(&cluster->cedf_lock, flags); | 334 | raw_spin_lock_irqsave(&cluster->cluster_lock, flags); |
296 | 335 | ||
297 | __merge_ready(&cluster->domain, tasks); | 336 | __merge_ready(&cluster->domain, tasks); |
298 | check_for_preemptions(cluster); | 337 | check_for_preemptions(cluster); |
299 | 338 | ||
300 | raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); | 339 | raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); |
301 | } | 340 | } |
302 | 341 | ||
303 | /* caller holds cedf_lock */ | 342 | /* caller holds cluster_lock */ |
304 | static noinline void job_completion(struct task_struct *t, int forced) | 343 | static noinline void job_completion(struct task_struct *t, int forced) |
305 | { | 344 | { |
306 | BUG_ON(!t); | 345 | BUG_ON(!t); |
@@ -378,7 +417,18 @@ static struct task_struct* cedf_schedule(struct task_struct * prev) | |||
378 | int out_of_time, sleep, preempt, np, exists, blocks; | 417 | int out_of_time, sleep, preempt, np, exists, blocks; |
379 | struct task_struct* next = NULL; | 418 | struct task_struct* next = NULL; |
380 | 419 | ||
381 | raw_spin_lock(&cluster->cedf_lock); | 420 | #ifdef CONFIG_RELEASE_MASTER |
421 | /* Bail out early if we are the release master. | ||
422 | * The release master never schedules any real-time tasks. | ||
423 | */ | ||
424 | if (unlikely(cluster->domain.release_master == entry->cpu)) { | ||
425 | sched_state_task_picked(); | ||
426 | return NULL; | ||
427 | } | ||
428 | #endif | ||
429 | |||
430 | raw_spin_lock(&cluster->cluster_lock); | ||
431 | |||
382 | clear_will_schedule(); | 432 | clear_will_schedule(); |
383 | 433 | ||
384 | /* sanity checking */ | 434 | /* sanity checking */ |
@@ -462,10 +512,10 @@ static struct task_struct* cedf_schedule(struct task_struct * prev) | |||
462 | next = prev; | 512 | next = prev; |
463 | 513 | ||
464 | sched_state_task_picked(); | 514 | sched_state_task_picked(); |
465 | raw_spin_unlock(&cluster->cedf_lock); | 515 | raw_spin_unlock(&cluster->cluster_lock); |
466 | 516 | ||
467 | #ifdef WANT_ALL_SCHED_EVENTS | 517 | #ifdef WANT_ALL_SCHED_EVENTS |
468 | TRACE("cedf_lock released, next=0x%p\n", next); | 518 | TRACE("cluster_lock released, next=0x%p\n", next); |
469 | 519 | ||
470 | if (next) | 520 | if (next) |
471 | TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); | 521 | TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); |
@@ -504,7 +554,7 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running) | |||
504 | /* the cluster doesn't change even if t is running */ | 554 | /* the cluster doesn't change even if t is running */ |
505 | cluster = task_cpu_cluster(t); | 555 | cluster = task_cpu_cluster(t); |
506 | 556 | ||
507 | raw_spin_lock_irqsave(&cluster->cedf_lock, flags); | 557 | raw_spin_lock_irqsave(&cluster->cluster_lock, flags); |
508 | 558 | ||
509 | /* setup job params */ | 559 | /* setup job params */ |
510 | release_at(t, litmus_clock()); | 560 | release_at(t, litmus_clock()); |
@@ -513,15 +563,25 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running) | |||
513 | entry = &per_cpu(cedf_cpu_entries, task_cpu(t)); | 563 | entry = &per_cpu(cedf_cpu_entries, task_cpu(t)); |
514 | BUG_ON(entry->scheduled); | 564 | BUG_ON(entry->scheduled); |
515 | 565 | ||
516 | entry->scheduled = t; | 566 | #ifdef CONFIG_RELEASE_MASTER |
517 | tsk_rt(t)->scheduled_on = task_cpu(t); | 567 | if (entry->cpu != cluster->domain.release_master) { |
568 | #endif | ||
569 | entry->scheduled = t; | ||
570 | tsk_rt(t)->scheduled_on = task_cpu(t); | ||
571 | #ifdef CONFIG_RELEASE_MASTER | ||
572 | } else { | ||
573 | /* do not schedule on release master */ | ||
574 | preempt(entry); /* force resched */ | ||
575 | tsk_rt(t)->scheduled_on = NO_CPU; | ||
576 | } | ||
577 | #endif | ||
518 | } else { | 578 | } else { |
519 | t->rt_param.scheduled_on = NO_CPU; | 579 | t->rt_param.scheduled_on = NO_CPU; |
520 | } | 580 | } |
521 | t->rt_param.linked_on = NO_CPU; | 581 | t->rt_param.linked_on = NO_CPU; |
522 | 582 | ||
523 | cedf_job_arrival(t); | 583 | cedf_job_arrival(t); |
524 | raw_spin_unlock_irqrestore(&(cluster->cedf_lock), flags); | 584 | raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags); |
525 | } | 585 | } |
526 | 586 | ||
527 | static void cedf_task_wake_up(struct task_struct *task) | 587 | static void cedf_task_wake_up(struct task_struct *task) |
@@ -534,7 +594,8 @@ static void cedf_task_wake_up(struct task_struct *task) | |||
534 | 594 | ||
535 | cluster = task_cpu_cluster(task); | 595 | cluster = task_cpu_cluster(task); |
536 | 596 | ||
537 | raw_spin_lock_irqsave(&cluster->cedf_lock, flags); | 597 | raw_spin_lock_irqsave(&cluster->cluster_lock, flags); |
598 | |||
538 | /* We need to take suspensions because of semaphores into | 599 | /* We need to take suspensions because of semaphores into |
539 | * account! If a job resumes after being suspended due to acquiring | 600 | * account! If a job resumes after being suspended due to acquiring |
540 | * a semaphore, it should never be treated as a new job release. | 601 | * a semaphore, it should never be treated as a new job release. |
@@ -557,7 +618,8 @@ static void cedf_task_wake_up(struct task_struct *task) | |||
557 | } | 618 | } |
558 | } | 619 | } |
559 | cedf_job_arrival(task); | 620 | cedf_job_arrival(task); |
560 | raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); | 621 | |
622 | raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); | ||
561 | } | 623 | } |
562 | 624 | ||
563 | static void cedf_task_block(struct task_struct *t) | 625 | static void cedf_task_block(struct task_struct *t) |
@@ -570,9 +632,9 @@ static void cedf_task_block(struct task_struct *t) | |||
570 | cluster = task_cpu_cluster(t); | 632 | cluster = task_cpu_cluster(t); |
571 | 633 | ||
572 | /* unlink if necessary */ | 634 | /* unlink if necessary */ |
573 | raw_spin_lock_irqsave(&cluster->cedf_lock, flags); | 635 | raw_spin_lock_irqsave(&cluster->cluster_lock, flags); |
574 | unlink(t); | 636 | unlink(t); |
575 | raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); | 637 | raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); |
576 | 638 | ||
577 | BUG_ON(!is_realtime(t)); | 639 | BUG_ON(!is_realtime(t)); |
578 | } | 640 | } |
@@ -584,7 +646,7 @@ static void cedf_task_exit(struct task_struct * t) | |||
584 | cedf_domain_t *cluster = task_cpu_cluster(t); | 646 | cedf_domain_t *cluster = task_cpu_cluster(t); |
585 | 647 | ||
586 | /* unlink if necessary */ | 648 | /* unlink if necessary */ |
587 | raw_spin_lock_irqsave(&cluster->cedf_lock, flags); | 649 | raw_spin_lock_irqsave(&cluster->cluster_lock, flags); |
588 | unlink(t); | 650 | unlink(t); |
589 | if (tsk_rt(t)->scheduled_on != NO_CPU) { | 651 | if (tsk_rt(t)->scheduled_on != NO_CPU) { |
590 | cpu_entry_t *cpu; | 652 | cpu_entry_t *cpu; |
@@ -592,7 +654,7 @@ static void cedf_task_exit(struct task_struct * t) | |||
592 | cpu->scheduled = NULL; | 654 | cpu->scheduled = NULL; |
593 | tsk_rt(t)->scheduled_on = NO_CPU; | 655 | tsk_rt(t)->scheduled_on = NO_CPU; |
594 | } | 656 | } |
595 | raw_spin_unlock_irqrestore(&cluster->cedf_lock, flags); | 657 | raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); |
596 | 658 | ||
597 | BUG_ON(!is_realtime(t)); | 659 | BUG_ON(!is_realtime(t)); |
598 | TRACE_TASK(t, "RIP\n"); | 660 | TRACE_TASK(t, "RIP\n"); |
@@ -605,16 +667,6 @@ static long cedf_admit_task(struct task_struct* tsk) | |||
605 | 667 | ||
606 | 668 | ||
607 | 669 | ||
608 | |||
609 | |||
610 | |||
611 | |||
612 | |||
613 | |||
614 | |||
615 | |||
616 | |||
617 | |||
618 | #ifdef CONFIG_LITMUS_LOCKING | 670 | #ifdef CONFIG_LITMUS_LOCKING |
619 | 671 | ||
620 | #include <litmus/fdso.h> | 672 | #include <litmus/fdso.h> |
@@ -692,11 +744,11 @@ static void set_priority_inheritance(struct task_struct* t, struct task_struct* | |||
692 | { | 744 | { |
693 | cedf_domain_t* cluster = task_cpu_cluster(t); | 745 | cedf_domain_t* cluster = task_cpu_cluster(t); |
694 | 746 | ||
695 | raw_spin_lock(&cluster->cedf_lock); | 747 | raw_spin_lock(&cluster->cluster_lock); |
696 | 748 | ||
697 | __set_priority_inheritance(t, prio_inh); | 749 | __set_priority_inheritance(t, prio_inh); |
698 | 750 | ||
699 | raw_spin_unlock(&cluster->cedf_lock); | 751 | raw_spin_unlock(&cluster->cluster_lock); |
700 | } | 752 | } |
701 | 753 | ||
702 | 754 | ||
@@ -727,9 +779,9 @@ static void clear_priority_inheritance(struct task_struct* t) | |||
727 | { | 779 | { |
728 | cedf_domain_t* cluster = task_cpu_cluster(t); | 780 | cedf_domain_t* cluster = task_cpu_cluster(t); |
729 | 781 | ||
730 | raw_spin_lock(&cluster->cedf_lock); | 782 | raw_spin_lock(&cluster->cluster_lock); |
731 | __clear_priority_inheritance(t); | 783 | __clear_priority_inheritance(t); |
732 | raw_spin_unlock(&cluster->cedf_lock); | 784 | raw_spin_unlock(&cluster->cluster_lock); |
733 | } | 785 | } |
734 | 786 | ||
735 | 787 | ||
@@ -857,7 +909,7 @@ static struct task_struct* kfmlp_remove_hp_waiter(struct kfmlp_semaphore* sem) | |||
857 | 909 | ||
858 | cluster = task_cpu_cluster(max_hp); | 910 | cluster = task_cpu_cluster(max_hp); |
859 | 911 | ||
860 | raw_spin_lock(&cluster->cedf_lock); | 912 | raw_spin_lock(&cluster->cluster_lock); |
861 | 913 | ||
862 | if(tsk_rt(my_queue->owner)->inh_task == max_hp) | 914 | if(tsk_rt(my_queue->owner)->inh_task == max_hp) |
863 | { | 915 | { |
@@ -867,7 +919,7 @@ static struct task_struct* kfmlp_remove_hp_waiter(struct kfmlp_semaphore* sem) | |||
867 | __set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); | 919 | __set_priority_inheritance(my_queue->owner, my_queue->hp_waiter); |
868 | } | 920 | } |
869 | } | 921 | } |
870 | raw_spin_unlock(&cluster->cedf_lock); | 922 | raw_spin_unlock(&cluster->cluster_lock); |
871 | 923 | ||
872 | list_for_each(pos, &my_queue->wait.task_list) | 924 | list_for_each(pos, &my_queue->wait.task_list) |
873 | { | 925 | { |
@@ -1270,6 +1322,9 @@ static long cedf_activate_plugin(void) | |||
1270 | 1322 | ||
1271 | if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) | 1323 | if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) |
1272 | return -ENOMEM; | 1324 | return -ENOMEM; |
1325 | #ifdef CONFIG_RELEASE_MASTER | ||
1326 | cedf[i].domain.release_master = atomic_read(&release_master_cpu); | ||
1327 | #endif | ||
1273 | } | 1328 | } |
1274 | 1329 | ||
1275 | /* cycle through cluster and add cpus to them */ | 1330 | /* cycle through cluster and add cpus to them */ |
@@ -1312,7 +1367,11 @@ static long cedf_activate_plugin(void) | |||
1312 | 1367 | ||
1313 | entry->linked = NULL; | 1368 | entry->linked = NULL; |
1314 | entry->scheduled = NULL; | 1369 | entry->scheduled = NULL; |
1315 | update_cpu_position(entry); | 1370 | #ifdef CONFIG_RELEASE_MASTER |
1371 | /* only add CPUs that should schedule jobs */ | ||
1372 | if (entry->cpu != entry->cluster->domain.release_master) | ||
1373 | #endif | ||
1374 | update_cpu_position(entry); | ||
1316 | } | 1375 | } |
1317 | /* done with this cluster */ | 1376 | /* done with this cluster */ |
1318 | break; | 1377 | break; |
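cedf_get_nearest_available_cpu() wraps the get_nearest_available_cpu() helper, which is expected to live in litmus/affinity.h (not shown in this diff), and then restricts the result to the caller's cluster. The sketch below illustrates the kind of search that helper presumably performs over the neighborhoods recorded by init_topology(); everything beyond what is visible in the hunks above is an assumption.

/* Sketch (assumed logic, not the actual affinity.h implementation):
 * starting from the task's previous CPU, return the closest CPU whose
 * cpu_entry_t is idle (linked == NULL) and that is not the release master. */
static cpu_entry_t* nearest_idle_cpu_sketch(cpu_entry_t *start, int release_master)
{
        int level, cpu;
        cpu_entry_t *entry;

        if (!start->linked && start->cpu != release_master)
                return start;   /* the previous CPU is itself idle */

        /* search outward, level by level, through the shared-cache neighborhoods */
        for (level = 0; level < NUM_CACHE_LEVELS; level++) {
                for_each_cpu(cpu, neigh_info[start->cpu].neighbors[level]) {
                        entry = &per_cpu(cedf_cpu_entries, cpu);
                        if (!entry->linked && entry->cpu != release_master)
                                return entry;
                }
        }
        return NULL;    /* nothing idle nearby */
}

Because only idle CPUs are returned, check_for_preemptions() does not need to requeue anything when an affinity match is found; the requeue of last->linked only happens when the search fails and the globally lowest-priority CPU is preempted instead.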
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
index b87524cf1802..d5bb326ebc9b 100644
--- a/litmus/sched_gsn_edf.c
+++ b/litmus/sched_gsn_edf.c
@@ -20,11 +20,16 @@ | |||
20 | #include <litmus/sched_plugin.h> | 20 | #include <litmus/sched_plugin.h> |
21 | #include <litmus/edf_common.h> | 21 | #include <litmus/edf_common.h> |
22 | #include <litmus/sched_trace.h> | 22 | #include <litmus/sched_trace.h> |
23 | #include <litmus/trace.h> | ||
23 | 24 | ||
24 | #include <litmus/preempt.h> | 25 | #include <litmus/preempt.h> |
25 | 26 | ||
26 | #include <litmus/bheap.h> | 27 | #include <litmus/bheap.h> |
27 | 28 | ||
29 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
30 | #include <litmus/affinity.h> | ||
31 | #endif | ||
32 | |||
28 | #include <linux/module.h> | 33 | #include <linux/module.h> |
29 | 34 | ||
30 | /* Overview of GSN-EDF operations. | 35 | /* Overview of GSN-EDF operations. |
@@ -255,21 +260,52 @@ static noinline void requeue(struct task_struct* task) | |||
255 | } | 260 | } |
256 | } | 261 | } |
257 | 262 | ||
263 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
264 | static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start) | ||
265 | { | ||
266 | cpu_entry_t *affinity; | ||
267 | |||
268 | get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries, | ||
269 | #ifdef CONFIG_RELEASE_MASTER | ||
270 | gsnedf.release_master | ||
271 | #else | ||
272 | NO_CPU | ||
273 | #endif | ||
274 | ); | ||
275 | |||
276 | return(affinity); | ||
277 | } | ||
278 | #endif | ||
279 | |||
258 | /* check for any necessary preemptions */ | 280 | /* check for any necessary preemptions */ |
259 | static void check_for_preemptions(void) | 281 | static void check_for_preemptions(void) |
260 | { | 282 | { |
261 | struct task_struct *task; | 283 | struct task_struct *task; |
262 | cpu_entry_t* last; | 284 | cpu_entry_t *last; |
263 | 285 | ||
264 | for(last = lowest_prio_cpu(); | 286 | for (last = lowest_prio_cpu(); |
265 | edf_preemption_needed(&gsnedf, last->linked); | 287 | edf_preemption_needed(&gsnedf, last->linked); |
266 | last = lowest_prio_cpu()) { | 288 | last = lowest_prio_cpu()) { |
267 | /* preemption necessary */ | 289 | /* preemption necessary */ |
268 | task = __take_ready(&gsnedf); | 290 | task = __take_ready(&gsnedf); |
269 | TRACE("check_for_preemptions: attempting to link task %d to %d\n", | 291 | TRACE("check_for_preemptions: attempting to link task %d to %d\n", |
270 | task->pid, last->cpu); | 292 | task->pid, last->cpu); |
293 | |||
294 | #ifdef CONFIG_SCHED_CPU_AFFINITY | ||
295 | { | ||
296 | cpu_entry_t *affinity = | ||
297 | gsnedf_get_nearest_available_cpu( | ||
298 | &per_cpu(gsnedf_cpu_entries, task_cpu(task))); | ||
299 | if (affinity) | ||
300 | last = affinity; | ||
301 | else if (last->linked) | ||
302 | requeue(last->linked); | ||
303 | } | ||
304 | #else | ||
271 | if (last->linked) | 305 | if (last->linked) |
272 | requeue(last->linked); | 306 | requeue(last->linked); |
307 | #endif | ||
308 | |||
273 | link_task_to_cpu(task, last); | 309 | link_task_to_cpu(task, last); |
274 | preempt(last); | 310 | preempt(last); |
275 | } | 311 | } |
@@ -376,8 +412,10 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev) | |||
376 | /* Bail out early if we are the release master. | 412 | /* Bail out early if we are the release master. |
377 | * The release master never schedules any real-time tasks. | 413 | * The release master never schedules any real-time tasks. |
378 | */ | 414 | */ |
379 | if (gsnedf.release_master == entry->cpu) | 415 | if (unlikely(gsnedf.release_master == entry->cpu)) { |
416 | sched_state_task_picked(); | ||
380 | return NULL; | 417 | return NULL; |
418 | } | ||
381 | #endif | 419 | #endif |
382 | 420 | ||
383 | raw_spin_lock(&gsnedf_lock); | 421 | raw_spin_lock(&gsnedf_lock); |
@@ -783,6 +821,8 @@ int gsnedf_fmlp_lock(struct litmus_lock* l) | |||
783 | set_priority_inheritance(sem->owner, sem->hp_waiter); | 821 | set_priority_inheritance(sem->owner, sem->hp_waiter); |
784 | } | 822 | } |
785 | 823 | ||
824 | TS_LOCK_SUSPEND; | ||
825 | |||
786 | /* release lock before sleeping */ | 826 | /* release lock before sleeping */ |
787 | spin_unlock_irqrestore(&sem->wait.lock, flags); | 827 | spin_unlock_irqrestore(&sem->wait.lock, flags); |
788 | 828 | ||
@@ -793,6 +833,8 @@ int gsnedf_fmlp_lock(struct litmus_lock* l) | |||
793 | 833 | ||
794 | schedule(); | 834 | schedule(); |
795 | 835 | ||
836 | TS_LOCK_RESUME; | ||
837 | |||
796 | /* Since we hold the lock, no other task will change | 838 | /* Since we hold the lock, no other task will change |
797 | * ->owner. We can thus check it without acquiring the spin | 839 | * ->owner. We can thus check it without acquiring the spin |
798 | * lock. */ | 840 | * lock. */ |
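TS_LOCK_SUSPEND and TS_LOCK_RESUME bracket the self-suspension inside gsnedf_fmlp_lock() so that overhead post-processing can separate suspension time from the lock-acquisition overhead measured around the system call (compare the TS_LOCK_END fix in the litmus/locking.c hunk above). These macros presumably come from litmus/trace.h; a definition of the kind expected there is sketched below, with placeholder event IDs.

/* Sketch only: the real macro bodies and event IDs live in litmus/trace.h,
 * which is not part of this diff. */
#define TS_LOCK_SUSPEND  TIMESTAMP(122)  /* placeholder ID */
#define TS_LOCK_RESUME   TIMESTAMP(123)  /* placeholder ID */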
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
index 1bca2e1a33cd..9a6fe487718e 100644
--- a/litmus/sched_litmus.c
+++ b/litmus/sched_litmus.c
@@ -254,12 +254,12 @@ static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued) | |||
254 | return; | 254 | return; |
255 | } | 255 | } |
256 | 256 | ||
257 | static void switched_to_litmus(struct rq *rq, struct task_struct *p, int running) | 257 | static void switched_to_litmus(struct rq *rq, struct task_struct *p) |
258 | { | 258 | { |
259 | } | 259 | } |
260 | 260 | ||
261 | static void prio_changed_litmus(struct rq *rq, struct task_struct *p, | 261 | static void prio_changed_litmus(struct rq *rq, struct task_struct *p, |
262 | int oldprio, int running) | 262 | int oldprio) |
263 | { | 263 | { |
264 | } | 264 | } |
265 | 265 | ||
@@ -285,8 +285,8 @@ static void set_curr_task_litmus(struct rq *rq) | |||
285 | * We don't care about the scheduling domain; can gets called from | 285 | * We don't care about the scheduling domain; can gets called from |
286 | * exec, fork, wakeup. | 286 | * exec, fork, wakeup. |
287 | */ | 287 | */ |
288 | static int select_task_rq_litmus(struct rq *rq, struct task_struct *p, | 288 | static int |
289 | int sd_flag, int flags) | 289 | select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags) |
290 | { | 290 | { |
291 | /* preemption is already disabled. | 291 | /* preemption is already disabled. |
292 | * We don't want to change cpu here | 292 | * We don't want to change cpu here |
@@ -296,7 +296,12 @@ static int select_task_rq_litmus(struct rq *rq, struct task_struct *p, | |||
296 | #endif | 296 | #endif |
297 | 297 | ||
298 | static const struct sched_class litmus_sched_class = { | 298 | static const struct sched_class litmus_sched_class = { |
299 | .next = &rt_sched_class, | 299 | /* From 34f971f6 the stop/migrate worker threads have a class on |
300 | * their own, which is the highest prio class. We don't support | ||
301 | * cpu-hotplug or cpu throttling. Allows Litmus to use up to 1.0 | ||
302 | * CPU capacity. | ||
303 | */ | ||
304 | .next = &stop_sched_class, | ||
300 | .enqueue_task = enqueue_task_litmus, | 305 | .enqueue_task = enqueue_task_litmus, |
301 | .dequeue_task = dequeue_task_litmus, | 306 | .dequeue_task = dequeue_task_litmus, |
302 | .yield_task = yield_task_litmus, | 307 | .yield_task = yield_task_litmus, |
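Re-pointing .next from rt_sched_class to stop_sched_class slots the LITMUS^RT class directly above the stop/migration class added in mainline commit 34f971f6. Assuming the mainline chain in which stop_sched_class.next is rt_sched_class, and that sched_class_highest is updated elsewhere to start at litmus_sched_class (which is outside this diff), the resulting pick order is:

/* Sketch of the assumed scheduling-class order after this change:
 *
 *   litmus_sched_class -> stop_sched_class -> rt_sched_class
 *                      -> fair_sched_class -> idle_sched_class
 *
 * LITMUS^RT tasks are therefore picked before SCHED_FIFO/SCHED_RR tasks and
 * before the stop/migration workers, which matches the comment's note that
 * CPU hotplug and throttling are not supported.
 */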
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
index 0a64273daa47..16f1065bbdca 100644
--- a/litmus/sched_pfair.c
+++ b/litmus/sched_pfair.c
@@ -1,7 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched_pfair.c | 2 | * kernel/sched_pfair.c |
3 | * | 3 | * |
4 | * Implementation of the (global) Pfair scheduling algorithm. | 4 | * Implementation of the PD^2 pfair scheduling algorithm. This |
5 | * implementation realizes "early releasing," i.e., it is work-conserving. | ||
5 | * | 6 | * |
6 | */ | 7 | */ |
7 | 8 | ||
@@ -76,36 +77,29 @@ struct pfair_state { | |||
76 | struct task_struct* local; /* the local copy of linked */ | 77 | struct task_struct* local; /* the local copy of linked */ |
77 | struct task_struct* scheduled; /* what is actually scheduled */ | 78 | struct task_struct* scheduled; /* what is actually scheduled */ |
78 | 79 | ||
79 | unsigned long missed_quanta; | ||
80 | lt_t offset; /* stagger offset */ | 80 | lt_t offset; /* stagger offset */ |
81 | unsigned int missed_updates; | ||
82 | unsigned int missed_quanta; | ||
81 | }; | 83 | }; |
82 | 84 | ||
83 | /* Currently, we limit the maximum period of any task to 2000 quanta. | ||
84 | * The reason is that it makes the implementation easier since we do not | ||
85 | * need to reallocate the release wheel on task arrivals. | ||
86 | * In the future | ||
87 | */ | ||
88 | #define PFAIR_MAX_PERIOD 2000 | ||
89 | |||
90 | struct pfair_cluster { | 85 | struct pfair_cluster { |
91 | struct scheduling_cluster topology; | 86 | struct scheduling_cluster topology; |
92 | 87 | ||
93 | /* The "global" time in this cluster. */ | 88 | /* The "global" time in this cluster. */ |
94 | quanta_t pfair_time; /* the "official" PFAIR clock */ | 89 | quanta_t pfair_time; /* the "official" PFAIR clock */ |
95 | quanta_t merge_time; /* Updated after the release queue has been | ||
96 | * merged. Used by drop_all_references(). | ||
97 | */ | ||
98 | 90 | ||
99 | /* The ready queue for this cluster. */ | 91 | /* The ready queue for this cluster. */ |
100 | rt_domain_t pfair; | 92 | rt_domain_t pfair; |
101 | 93 | ||
102 | /* This is the release queue wheel for this cluster. It is indexed by | 94 | /* The set of jobs that should have their release enacted at the next |
103 | * pfair_time % PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR | 95 | * quantum boundary. |
104 | * priority, so that it can be merged with the ready queue. | ||
105 | */ | 96 | */ |
106 | struct bheap release_queue[PFAIR_MAX_PERIOD]; | 97 | struct bheap release_queue; |
98 | raw_spinlock_t release_lock; | ||
107 | }; | 99 | }; |
108 | 100 | ||
101 | #define RT_F_REQUEUE 0x2 | ||
102 | |||
109 | static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state) | 103 | static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state) |
110 | { | 104 | { |
111 | return container_of(state->topology.cluster, struct pfair_cluster, topology); | 105 | return container_of(state->topology.cluster, struct pfair_cluster, topology); |
@@ -121,6 +115,11 @@ static inline struct pfair_state* from_cluster_list(struct list_head* pos) | |||
121 | return list_entry(pos, struct pfair_state, topology.cluster_list); | 115 | return list_entry(pos, struct pfair_state, topology.cluster_list); |
122 | } | 116 | } |
123 | 117 | ||
118 | static inline struct pfair_cluster* from_domain(rt_domain_t* rt) | ||
119 | { | ||
120 | return container_of(rt, struct pfair_cluster, pfair); | ||
121 | } | ||
122 | |||
124 | static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster) | 123 | static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster) |
125 | { | 124 | { |
126 | /* The ready_lock is used to serialize all scheduling events. */ | 125 | /* The ready_lock is used to serialize all scheduling events. */ |
@@ -161,21 +160,11 @@ static quanta_t cur_deadline(struct task_struct* t) | |||
161 | return cur_subtask(t)->deadline + tsk_pfair(t)->release; | 160 | return cur_subtask(t)->deadline + tsk_pfair(t)->release; |
162 | } | 161 | } |
163 | 162 | ||
164 | |||
165 | static quanta_t cur_sub_release(struct task_struct* t) | ||
166 | { | ||
167 | return cur_subtask(t)->release + tsk_pfair(t)->release; | ||
168 | } | ||
169 | |||
170 | static quanta_t cur_release(struct task_struct* t) | 163 | static quanta_t cur_release(struct task_struct* t) |
171 | { | 164 | { |
172 | #ifdef EARLY_RELEASE | 165 | /* This is early releasing: only the release of the first subtask |
173 | /* only the release of the first subtask counts when we early | 166 | * counts. */ |
174 | * release */ | ||
175 | return tsk_pfair(t)->release; | 167 | return tsk_pfair(t)->release; |
176 | #else | ||
177 | return cur_sub_release(t); | ||
178 | #endif | ||
179 | } | 168 | } |
180 | 169 | ||
181 | static quanta_t cur_overlap(struct task_struct* t) | 170 | static quanta_t cur_overlap(struct task_struct* t) |
@@ -235,11 +224,16 @@ int pfair_ready_order(struct bheap_node* a, struct bheap_node* b) | |||
235 | return pfair_higher_prio(bheap2task(a), bheap2task(b)); | 224 | return pfair_higher_prio(bheap2task(a), bheap2task(b)); |
236 | } | 225 | } |
237 | 226 | ||
238 | /* return the proper release queue for time t */ | 227 | static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks) |
239 | static struct bheap* relq(struct pfair_cluster* cluster, quanta_t t) | ||
240 | { | 228 | { |
241 | struct bheap* rq = cluster->release_queue + (t % PFAIR_MAX_PERIOD); | 229 | struct pfair_cluster* cluster = from_domain(rt); |
242 | return rq; | 230 | unsigned long flags; |
231 | |||
232 | raw_spin_lock_irqsave(&cluster->release_lock, flags); | ||
233 | |||
234 | bheap_union(pfair_ready_order, &cluster->release_queue, tasks); | ||
235 | |||
236 | raw_spin_unlock_irqrestore(&cluster->release_lock, flags); | ||
243 | } | 237 | } |
244 | 238 | ||
245 | static void prepare_release(struct task_struct* t, quanta_t at) | 239 | static void prepare_release(struct task_struct* t, quanta_t at) |
@@ -248,25 +242,12 @@ static void prepare_release(struct task_struct* t, quanta_t at) | |||
248 | tsk_pfair(t)->cur = 0; | 242 | tsk_pfair(t)->cur = 0; |
249 | } | 243 | } |
250 | 244 | ||
251 | static void __pfair_add_release(struct task_struct* t, struct bheap* queue) | ||
252 | { | ||
253 | bheap_insert(pfair_ready_order, queue, | ||
254 | tsk_rt(t)->heap_node); | ||
255 | } | ||
256 | |||
257 | static void pfair_add_release(struct pfair_cluster* cluster, | ||
258 | struct task_struct* t) | ||
259 | { | ||
260 | BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node)); | ||
261 | __pfair_add_release(t, relq(cluster, cur_release(t))); | ||
262 | } | ||
263 | |||
264 | /* pull released tasks from the release queue */ | 245 | /* pull released tasks from the release queue */ |
265 | static void poll_releases(struct pfair_cluster* cluster, | 246 | static void poll_releases(struct pfair_cluster* cluster) |
266 | quanta_t time) | ||
267 | { | 247 | { |
268 | __merge_ready(&cluster->pfair, relq(cluster, time)); | 248 | raw_spin_lock(&cluster->release_lock); |
269 | cluster->merge_time = time; | 249 | __merge_ready(&cluster->pfair, &cluster->release_queue); |
250 | raw_spin_unlock(&cluster->release_lock); | ||
270 | } | 251 | } |
271 | 252 | ||
272 | static void check_preempt(struct task_struct* t) | 253 | static void check_preempt(struct task_struct* t) |
@@ -292,16 +273,12 @@ static void drop_all_references(struct task_struct *t) | |||
292 | { | 273 | { |
293 | int cpu; | 274 | int cpu; |
294 | struct pfair_state* s; | 275 | struct pfair_state* s; |
295 | struct bheap* q; | ||
296 | struct pfair_cluster* cluster; | 276 | struct pfair_cluster* cluster; |
297 | if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { | 277 | if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { |
298 | /* figure out what queue the node is in */ | 278 | /* It must be in the ready queue; drop references isn't called |
279 | * when the job is in a release queue. */ | ||
299 | cluster = tsk_pfair(t)->cluster; | 280 | cluster = tsk_pfair(t)->cluster; |
300 | if (time_before_eq(cur_release(t), cluster->merge_time)) | 281 | bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue, |
301 | q = &cluster->pfair.ready_queue; | ||
302 | else | ||
303 | q = relq(cluster, cur_release(t)); | ||
304 | bheap_delete(pfair_ready_order, q, | ||
305 | tsk_rt(t)->heap_node); | 282 | tsk_rt(t)->heap_node); |
306 | } | 283 | } |
307 | for (cpu = 0; cpu < num_online_cpus(); cpu++) { | 284 | for (cpu = 0; cpu < num_online_cpus(); cpu++) { |
@@ -313,6 +290,17 @@ static void drop_all_references(struct task_struct *t) | |||
313 | if (s->scheduled == t) | 290 | if (s->scheduled == t) |
314 | s->scheduled = NULL; | 291 | s->scheduled = NULL; |
315 | } | 292 | } |
293 | /* make sure we don't have a stale linked_on field */ | ||
294 | tsk_rt(t)->linked_on = NO_CPU; | ||
295 | } | ||
296 | |||
297 | static void pfair_prepare_next_period(struct task_struct* t) | ||
298 | { | ||
299 | struct pfair_param* p = tsk_pfair(t); | ||
300 | |||
301 | prepare_for_next_period(t); | ||
302 | get_rt_flags(t) = RT_F_RUNNING; | ||
303 | p->release += p->period; | ||
316 | } | 304 | } |
317 | 305 | ||
318 | /* returns 1 if the task needs to go the release queue */ | 306 | /* returns 1 if the task needs to go the release queue */ |
@@ -322,30 +310,26 @@ static int advance_subtask(quanta_t time, struct task_struct* t, int cpu) | |||
322 | int to_relq; | 310 | int to_relq; |
323 | p->cur = (p->cur + 1) % p->quanta; | 311 | p->cur = (p->cur + 1) % p->quanta; |
324 | if (!p->cur) { | 312 | if (!p->cur) { |
325 | sched_trace_task_completion(t, 1); | ||
326 | if (tsk_rt(t)->present) { | 313 | if (tsk_rt(t)->present) { |
327 | /* we start a new job */ | 314 | /* The job overran; we start a new budget allocation. */ |
328 | prepare_for_next_period(t); | 315 | pfair_prepare_next_period(t); |
329 | sched_trace_task_release(t); | ||
330 | get_rt_flags(t) = RT_F_RUNNING; | ||
331 | p->release += p->period; | ||
332 | } else { | 316 | } else { |
333 | /* remove task from system until it wakes */ | 317 | /* remove task from system until it wakes */ |
334 | drop_all_references(t); | 318 | drop_all_references(t); |
319 | tsk_rt(t)->flags = RT_F_REQUEUE; | ||
335 | TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n", | 320 | TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n", |
336 | cpu, p->cur); | 321 | cpu, p->cur); |
337 | return 0; | 322 | return 0; |
338 | } | 323 | } |
339 | } | 324 | } |
340 | to_relq = time_after(cur_release(t), time); | 325 | to_relq = time_after(cur_release(t), time); |
341 | TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d\n", | 326 | TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n", |
342 | cpu, p->cur, to_relq); | 327 | cpu, p->cur, to_relq, cur_release(t), time); |
343 | return to_relq; | 328 | return to_relq; |
344 | } | 329 | } |
345 | 330 | ||
346 | static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time) | 331 | static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time) |
347 | { | 332 | { |
348 | int missed; | ||
349 | struct task_struct* l; | 333 | struct task_struct* l; |
350 | struct pfair_param* p; | 334 | struct pfair_param* p; |
351 | struct list_head* pos; | 335 | struct list_head* pos; |
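Together with the rewritten cur_release() above, this hunk implements the early releasing announced in the new header comment: a subtask only has to visit the release queue if the job release lies in the future, not its own Pfair window release. A worked example with illustrative numbers:

/* Worked example (illustrative): a task with 2 quanta of execution per
 * 4-quantum period releases a job at quantum 100, so its subtasks have the
 * PD^2 windows [100, 102) and [102, 104).
 *
 * Old code (with EARLY_RELEASE not defined): after subtask 0 finished,
 * cur_release() evaluated to the subtask-1 window release, 102, so
 * to_relq = time_after(102, now) was true before quantum 102 and the task
 * sat in the release-queue wheel until then.
 *
 * New code: cur_release() evaluates to the job release, 100, so to_relq is
 * false and subtask 1 stays in the ready queue, where it may be scheduled
 * as early as quantum 101 if its priority suffices -- i.e., the scheduler
 * no longer idles while work is pending.
 */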
@@ -354,14 +338,17 @@ static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time) | |||
354 | list_for_each(pos, &cluster->topology.cpus) { | 338 | list_for_each(pos, &cluster->topology.cpus) { |
355 | cpu = from_cluster_list(pos); | 339 | cpu = from_cluster_list(pos); |
356 | l = cpu->linked; | 340 | l = cpu->linked; |
357 | missed = cpu->linked != cpu->local; | 341 | cpu->missed_updates += cpu->linked != cpu->local; |
358 | if (l) { | 342 | if (l) { |
359 | p = tsk_pfair(l); | 343 | p = tsk_pfair(l); |
360 | p->last_quantum = time; | 344 | p->last_quantum = time; |
361 | p->last_cpu = cpu_id(cpu); | 345 | p->last_cpu = cpu_id(cpu); |
362 | if (advance_subtask(time, l, cpu_id(cpu))) { | 346 | if (advance_subtask(time, l, cpu_id(cpu))) { |
363 | cpu->linked = NULL; | 347 | //cpu->linked = NULL; |
364 | pfair_add_release(cluster, l); | 348 | PTRACE_TASK(l, "should go to release queue. " |
349 | "scheduled_on=%d present=%d\n", | ||
350 | tsk_rt(l)->scheduled_on, | ||
351 | tsk_rt(l)->present); | ||
365 | } | 352 | } |
366 | } | 353 | } |
367 | } | 354 | } |
@@ -445,6 +432,11 @@ static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time) | |||
445 | list_for_each(pos, &cluster->topology.cpus) { | 432 | list_for_each(pos, &cluster->topology.cpus) { |
446 | cpu_state = from_cluster_list(pos); | 433 | cpu_state = from_cluster_list(pos); |
447 | retry = 1; | 434 | retry = 1; |
435 | #ifdef CONFIG_RELEASE_MASTER | ||
436 | /* skip release master */ | ||
437 | if (cluster->pfair.release_master == cpu_id(cpu_state)) | ||
438 | continue; | ||
439 | #endif | ||
448 | while (retry) { | 440 | while (retry) { |
449 | if (pfair_higher_prio(__peek_ready(&cluster->pfair), | 441 | if (pfair_higher_prio(__peek_ready(&cluster->pfair), |
450 | cpu_state->linked)) | 442 | cpu_state->linked)) |
@@ -471,13 +463,13 @@ static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time) | |||
471 | sched_trace_quantum_boundary(); | 463 | sched_trace_quantum_boundary(); |
472 | 464 | ||
473 | advance_subtasks(cluster, time); | 465 | advance_subtasks(cluster, time); |
474 | poll_releases(cluster, time); | 466 | poll_releases(cluster); |
475 | schedule_subtasks(cluster, time); | 467 | schedule_subtasks(cluster, time); |
476 | 468 | ||
477 | list_for_each(pos, &cluster->topology.cpus) { | 469 | list_for_each(pos, &cluster->topology.cpus) { |
478 | cpu = from_cluster_list(pos); | 470 | cpu = from_cluster_list(pos); |
479 | if (cpu->linked) | 471 | if (cpu->linked) |
480 | PTRACE_TASK(pstate[cpu]->linked, | 472 | PTRACE_TASK(cpu->linked, |
481 | " linked on %d.\n", cpu_id(cpu)); | 473 | " linked on %d.\n", cpu_id(cpu)); |
482 | else | 474 | else |
483 | PTRACE("(null) linked on %d.\n", cpu_id(cpu)); | 475 | PTRACE("(null) linked on %d.\n", cpu_id(cpu)); |
@@ -612,12 +604,42 @@ static int safe_to_schedule(struct task_struct* t, int cpu) | |||
612 | static struct task_struct* pfair_schedule(struct task_struct * prev) | 604 | static struct task_struct* pfair_schedule(struct task_struct * prev) |
613 | { | 605 | { |
614 | struct pfair_state* state = &__get_cpu_var(pfair_state); | 606 | struct pfair_state* state = &__get_cpu_var(pfair_state); |
615 | int blocks; | 607 | struct pfair_cluster* cluster = cpu_cluster(state); |
608 | int blocks, completion, out_of_time; | ||
616 | struct task_struct* next = NULL; | 609 | struct task_struct* next = NULL; |
617 | 610 | ||
611 | #ifdef CONFIG_RELEASE_MASTER | ||
612 | /* Bail out early if we are the release master. | ||
613 | * The release master never schedules any real-time tasks. | ||
614 | */ | ||
615 | if (unlikely(cluster->pfair.release_master == cpu_id(state))) { | ||
616 | sched_state_task_picked(); | ||
617 | return NULL; | ||
618 | } | ||
619 | #endif | ||
620 | |||
618 | raw_spin_lock(cpu_lock(state)); | 621 | raw_spin_lock(cpu_lock(state)); |
619 | 622 | ||
620 | blocks = is_realtime(prev) && !is_running(prev); | 623 | blocks = is_realtime(prev) && !is_running(prev); |
624 | completion = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP; | ||
625 | out_of_time = is_realtime(prev) && time_after(cur_release(prev), | ||
626 | state->local_tick); | ||
627 | |||
628 | if (is_realtime(prev)) | ||
629 | PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n", | ||
630 | blocks, completion, out_of_time); | ||
631 | |||
632 | if (completion) { | ||
633 | sched_trace_task_completion(prev, 0); | ||
634 | pfair_prepare_next_period(prev); | ||
635 | prepare_release(prev, cur_release(prev)); | ||
636 | } | ||
637 | |||
638 | if (!blocks && (completion || out_of_time)) { | ||
639 | drop_all_references(prev); | ||
640 | sched_trace_task_release(prev); | ||
641 | add_release(&cluster->pfair, prev); | ||
642 | } | ||
621 | 643 | ||
622 | if (state->local && safe_to_schedule(state->local, cpu_id(state))) | 644 | if (state->local && safe_to_schedule(state->local, cpu_id(state))) |
623 | next = state->local; | 645 | next = state->local; |
@@ -649,13 +671,19 @@ static void pfair_task_new(struct task_struct * t, int on_rq, int running) | |||
649 | cluster = tsk_pfair(t)->cluster; | 671 | cluster = tsk_pfair(t)->cluster; |
650 | 672 | ||
651 | raw_spin_lock_irqsave(cluster_lock(cluster), flags); | 673 | raw_spin_lock_irqsave(cluster_lock(cluster), flags); |
652 | if (running) | ||
653 | t->rt_param.scheduled_on = task_cpu(t); | ||
654 | else | ||
655 | t->rt_param.scheduled_on = NO_CPU; | ||
656 | 674 | ||
657 | prepare_release(t, cluster->pfair_time + 1); | 675 | prepare_release(t, cluster->pfair_time + 1); |
658 | pfair_add_release(cluster, t); | 676 | |
677 | t->rt_param.scheduled_on = NO_CPU; | ||
678 | |||
679 | if (running) { | ||
680 | #ifdef CONFIG_RELEASE_MASTER | ||
681 | if (task_cpu(t) != cluster->pfair.release_master) | ||
682 | #endif | ||
683 | t->rt_param.scheduled_on = task_cpu(t); | ||
684 | __add_ready(&cluster->pfair, t); | ||
685 | } | ||
686 | |||
659 | check_preempt(t); | 687 | check_preempt(t); |
660 | 688 | ||
661 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); | 689 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); |
@@ -665,6 +693,7 @@ static void pfair_task_wake_up(struct task_struct *t) | |||
665 | { | 693 | { |
666 | unsigned long flags; | 694 | unsigned long flags; |
667 | lt_t now; | 695 | lt_t now; |
696 | int requeue = 0; | ||
668 | struct pfair_cluster* cluster; | 697 | struct pfair_cluster* cluster; |
669 | 698 | ||
670 | cluster = tsk_pfair(t)->cluster; | 699 | cluster = tsk_pfair(t)->cluster; |
@@ -679,13 +708,20 @@ static void pfair_task_wake_up(struct task_struct *t) | |||
679 | * (as if it never blocked at all). Otherwise, we have a | 708 | * (as if it never blocked at all). Otherwise, we have a |
680 | * new sporadic job release. | 709 | * new sporadic job release. |
681 | */ | 710 | */ |
711 | requeue = tsk_rt(t)->flags == RT_F_REQUEUE; | ||
682 | now = litmus_clock(); | 712 | now = litmus_clock(); |
683 | if (lt_before(get_deadline(t), now)) { | 713 | if (lt_before(get_deadline(t), now)) { |
714 | TRACE_TASK(t, "sporadic release!\n"); | ||
684 | release_at(t, now); | 715 | release_at(t, now); |
685 | prepare_release(t, time2quanta(now, CEIL)); | 716 | prepare_release(t, time2quanta(now, CEIL)); |
686 | sched_trace_task_release(t); | 717 | sched_trace_task_release(t); |
687 | /* FIXME: race with pfair_time advancing */ | 718 | } |
688 | pfair_add_release(cluster, t); | 719 | |
720 | /* only add to ready queue if the task isn't still linked somewhere */ | ||
721 | if (requeue) { | ||
722 | TRACE_TASK(t, "requeueing required\n"); | ||
723 | tsk_rt(t)->flags = RT_F_RUNNING; | ||
724 | __add_ready(&cluster->pfair, t); | ||
689 | } | 725 | } |
690 | 726 | ||
691 | check_preempt(t); | 727 | check_preempt(t); |
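
On a wake-up past the deadline, the task gets a fresh sporadic release at litmus_clock(), rounded up to the next quantum boundary via time2quanta(now, CEIL). The rounding itself is plain ceiling division; in the sketch below the 1 ms quantum length, the type names, and the exact formula are assumptions, since the real helper lives in the LITMUS^RT headers.

#include <stdio.h>

typedef unsigned long long lt_t;   /* nanoseconds, as returned by litmus_clock() */
typedef unsigned long quanta_t;

#define QUANTUM_NS 1000000ULL      /* assume 1 ms scheduling quanta */

/* Round a timestamp up to the next quantum boundary (the CEIL variant). */
static quanta_t time2quanta_ceil(lt_t time)
{
	return (quanta_t)((time + QUANTUM_NS - 1) / QUANTUM_NS);
}

int main(void)
{
	/* 3.2 ms after time zero maps to quantum 4 with CEIL rounding. */
	printf("%lu\n", time2quanta_ceil(3200000ULL));
	return 0;
}
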
@@ -744,15 +780,11 @@ static void pfair_release_at(struct task_struct* task, lt_t start) | |||
744 | release_at(task, start); | 780 | release_at(task, start); |
745 | release = time2quanta(start, CEIL); | 781 | release = time2quanta(start, CEIL); |
746 | 782 | ||
747 | /* FIXME: support arbitrary offsets. */ | ||
748 | if (release - cluster->pfair_time >= PFAIR_MAX_PERIOD) | ||
749 | release = cluster->pfair_time + PFAIR_MAX_PERIOD; | ||
750 | |||
751 | TRACE_TASK(task, "sys release at %lu\n", release); | 783 | TRACE_TASK(task, "sys release at %lu\n", release); |
752 | 784 | ||
753 | drop_all_references(task); | 785 | drop_all_references(task); |
754 | prepare_release(task, release); | 786 | prepare_release(task, release); |
755 | pfair_add_release(cluster, task); | 787 | add_release(&cluster->pfair, task); |
756 | 788 | ||
757 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); | 789 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); |
758 | } | 790 | } |
@@ -834,13 +866,6 @@ static long pfair_admit_task(struct task_struct* t) | |||
834 | "The period of %s/%d is not a multiple of %llu.\n", | 866 | "The period of %s/%d is not a multiple of %llu.\n", |
835 | t->comm, t->pid, (unsigned long long) quantum_length); | 867 | t->comm, t->pid, (unsigned long long) quantum_length); |
836 | 868 | ||
837 | if (period >= PFAIR_MAX_PERIOD) { | ||
838 | printk(KERN_WARNING | ||
839 | "PFAIR: Rejecting task %s/%d; its period is too long.\n", | ||
840 | t->comm, t->pid); | ||
841 | return -EINVAL; | ||
842 | } | ||
843 | |||
844 | if (quanta == period) { | 869 | if (quanta == period) { |
845 | /* special case: task has weight 1.0 */ | 870 | /* special case: task has weight 1.0 */ |
846 | printk(KERN_INFO | 871 | printk(KERN_INFO |
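
With the fixed-size release wheel gone (releases now go through the heap-backed rt_domain), arbitrarily long periods no longer have to be rejected, so the PFAIR_MAX_PERIOD check above is dropped; what remains is the quantum-alignment warning and the weight-1.0 special case visible in the surrounding context. A worked user-space example of those two checks, assuming 1 ms quanta and simple truncating division (the real admit code may round differently):

#include <stdio.h>

#define QUANTUM_NS 1000000ULL  /* assumed 1 ms scheduling quantum */

int main(void)
{
	unsigned long long period_ns = 10000000ULL;  /* 10 ms */
	unsigned long long cost_ns   = 10000000ULL;  /* 10 ms */

	if (period_ns % QUANTUM_NS)
		printf("warning: period is not a multiple of the quantum\n");

	unsigned long long quanta = cost_ns   / QUANTUM_NS;  /* 10 */
	unsigned long long period = period_ns / QUANTUM_NS;  /* 10 */

	if (quanta == period)
		printf("weight 1.0 (quanta == period), special-cased by pfair_admit_task()\n");
	else
		printf("weight %llu/%llu\n", quanta, period);
	return 0;
}
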
@@ -880,12 +905,9 @@ static long pfair_admit_task(struct task_struct* t) | |||
880 | 905 | ||
881 | static void pfair_init_cluster(struct pfair_cluster* cluster) | 906 | static void pfair_init_cluster(struct pfair_cluster* cluster) |
882 | { | 907 | { |
883 | int i; | 908 | rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs); |
884 | 909 | bheap_init(&cluster->release_queue); | |
885 | /* initialize release queue */ | 910 | raw_spin_lock_init(&cluster->release_lock); |
886 | for (i = 0; i < PFAIR_MAX_PERIOD; i++) | ||
887 | bheap_init(&cluster->release_queue[i]); | ||
888 | rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, NULL); | ||
889 | INIT_LIST_HEAD(&cluster->topology.cpus); | 911 | INIT_LIST_HEAD(&cluster->topology.cpus); |
890 | } | 912 | } |
891 | 913 | ||
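
pfair_init_cluster() now sets up one heap-backed release queue with its own lock instead of the old PFAIR_MAX_PERIOD-sized array, and registers pfair_release_jobs with the rt_domain. Pieced together only from the members touched in this section, the cluster state looks roughly like the sketch below; the real definition in litmus/sched_pfair.c may contain further fields, and the member types come from the LITMUS^RT headers.

/* Hedged reconstruction from the fields used in this diff. */
struct pfair_cluster {
	struct scheduling_cluster topology;  /* cluster->topology.cpus */
	quanta_t pfair_time;                 /* current quantum boundary */
	rt_domain_t pfair;                   /* ready queue, release_master */
	struct bheap release_queue;          /* single merged release heap */
	raw_spinlock_t release_lock;         /* serializes release_queue access */
};
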
@@ -899,8 +921,11 @@ static void cleanup_clusters(void) | |||
899 | num_pfair_clusters = 0; | 921 | num_pfair_clusters = 0; |
900 | 922 | ||
901 | /* avoid stale pointers */ | 923 | /* avoid stale pointers */ |
902 | for (i = 0; i < NR_CPUS; i++) | 924 | for (i = 0; i < num_online_cpus(); i++) { |
903 | pstate[i]->topology.cluster = NULL; | 925 | pstate[i]->topology.cluster = NULL; |
926 | printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]), | ||
927 | pstate[i]->missed_updates, pstate[i]->missed_quanta); | ||
928 | } | ||
904 | } | 929 | } |
905 | 930 | ||
906 | static long pfair_activate_plugin(void) | 931 | static long pfair_activate_plugin(void) |
@@ -936,6 +961,9 @@ static long pfair_activate_plugin(void) | |||
936 | pfair_init_cluster(cluster); | 961 | pfair_init_cluster(cluster); |
937 | cluster->pfair_time = now; | 962 | cluster->pfair_time = now; |
938 | clust[i] = &cluster->topology; | 963 | clust[i] = &cluster->topology; |
964 | #ifdef CONFIG_RELEASE_MASTER | ||
965 | cluster->pfair.release_master = atomic_read(&release_master_cpu); | ||
966 | #endif | ||
939 | } | 967 | } |
940 | 968 | ||
941 | for (i = 0; i < num_online_cpus(); i++) { | 969 | for (i = 0; i < num_online_cpus(); i++) { |
@@ -943,6 +971,7 @@ static long pfair_activate_plugin(void) | |||
943 | state->cur_tick = now; | 971 | state->cur_tick = now; |
944 | state->local_tick = now; | 972 | state->local_tick = now; |
945 | state->missed_quanta = 0; | 973 | state->missed_quanta = 0; |
974 | state->missed_updates = 0; | ||
946 | state->offset = cpu_stagger_offset(i); | 975 | state->offset = cpu_stagger_offset(i); |
947 | printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus()); | 976 | printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus()); |
948 | cpus[i] = &state->topology; | 977 | cpus[i] = &state->topology; |
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c index d54886df1f57..00a1900d6457 100644 --- a/litmus/sched_plugin.c +++ b/litmus/sched_plugin.c | |||
@@ -35,29 +35,18 @@ void preempt_if_preemptable(struct task_struct* t, int cpu) | |||
35 | /* local CPU case */ | 35 | /* local CPU case */ |
36 | /* check if we need to poke userspace */ | 36 | /* check if we need to poke userspace */ |
37 | if (is_user_np(t)) | 37 | if (is_user_np(t)) |
38 | /* yes, poke it */ | 38 | /* Yes, poke it. This doesn't have to be atomic since |
39 | * the task is definitely not executing. */ | ||
39 | request_exit_np(t); | 40 | request_exit_np(t); |
40 | else if (!is_kernel_np(t)) | 41 | else if (!is_kernel_np(t)) |
41 | /* only if we are allowed to preempt the | 42 | /* only if we are allowed to preempt the |
42 | * currently-executing task */ | 43 | * currently-executing task */ |
43 | reschedule = 1; | 44 | reschedule = 1; |
44 | } else { | 45 | } else { |
45 | /* remote CPU case */ | 46 | /* Remote CPU case. Only notify if it's not a kernel |
46 | if (is_user_np(t)) { | 47 | * NP section and if we didn't set the userspace |
47 | /* need to notify user space of delayed | 48 | * flag. */ |
48 | * preemption */ | 49 | reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t)); |
49 | |||
50 | /* to avoid a race, set the flag, then test | ||
51 | * again */ | ||
52 | request_exit_np(t); | ||
53 | /* make sure it got written */ | ||
54 | mb(); | ||
55 | } | ||
56 | /* Only send an ipi if remote task might have raced our | ||
57 | * request, i.e., send an IPI to make sure in case it | ||
58 | * exited its critical section. | ||
59 | */ | ||
60 | reschedule = !is_np(t) && !is_kernel_np(t); | ||
61 | } | 50 | } |
62 | } | 51 | } |
63 | if (likely(reschedule)) | 52 | if (likely(reschedule)) |
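
On the remote-CPU path, the old sequence of setting the userspace exit flag, issuing mb(), and then deciding whether to send an IPI is folded into one request_exit_np_atomic() call whose result says whether the request could still be delivered, so reschedule is only set when neither a kernel NP section nor a successfully delivered userspace request covers the task. A user-space analogy of that atomic handshake using C11 atomics; the flag layout and helper name below are stand-ins for the LITMUS^RT control-page implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NP_FLAG  0x1u  /* task is inside a non-preemptive section */
#define EXIT_REQ 0x2u  /* scheduler asked it to leave the section  */

/* Atomically set the exit request, but only while the task is still inside
 * its non-preemptive section. Returns true if the request was delivered
 * (no IPI needed); false if the task already left the section. */
static bool request_exit_np_atomic_demo(atomic_uint *np_ctrl)
{
	unsigned int old = atomic_load(np_ctrl);

	while (old & NP_FLAG) {
		if (atomic_compare_exchange_weak(np_ctrl, &old, old | EXIT_REQ))
			return true;
		/* 'old' was refreshed by the failed CAS; retry */
	}
	return false;
}

int main(void)
{
	atomic_uint ctrl = NP_FLAG;

	printf("delivered=%d ctrl=0x%x\n",
	       request_exit_np_atomic_demo(&ctrl), atomic_load(&ctrl));
	atomic_store(&ctrl, 0);
	printf("delivered=%d ctrl=0x%x\n",
	       request_exit_np_atomic_demo(&ctrl), atomic_load(&ctrl));
	return 0;
}
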
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c index 71c02409efa2..8e4a22dd8d6a 100644 --- a/litmus/sched_psn_edf.c +++ b/litmus/sched_psn_edf.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <litmus/sched_plugin.h> | 20 | #include <litmus/sched_plugin.h> |
21 | #include <litmus/edf_common.h> | 21 | #include <litmus/edf_common.h> |
22 | #include <litmus/sched_trace.h> | 22 | #include <litmus/sched_trace.h> |
23 | #include <litmus/trace.h> | ||
23 | 24 | ||
24 | typedef struct { | 25 | typedef struct { |
25 | rt_domain_t domain; | 26 | rt_domain_t domain; |
@@ -383,12 +384,6 @@ static unsigned int psnedf_get_srp_prio(struct task_struct* t) | |||
383 | return get_rt_period(t); | 384 | return get_rt_period(t); |
384 | } | 385 | } |
385 | 386 | ||
386 | static long psnedf_activate_plugin(void) | ||
387 | { | ||
388 | get_srp_prio = psnedf_get_srp_prio; | ||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | /* ******************** FMLP support ********************** */ | 387 | /* ******************** FMLP support ********************** */ |
393 | 388 | ||
394 | /* struct for semaphore with priority inheritance */ | 389 | /* struct for semaphore with priority inheritance */ |
@@ -428,6 +423,8 @@ int psnedf_fmlp_lock(struct litmus_lock* l) | |||
428 | 423 | ||
429 | __add_wait_queue_tail_exclusive(&sem->wait, &wait); | 424 | __add_wait_queue_tail_exclusive(&sem->wait, &wait); |
430 | 425 | ||
426 | TS_LOCK_SUSPEND; | ||
427 | |||
431 | /* release lock before sleeping */ | 428 | /* release lock before sleeping */ |
432 | spin_unlock_irqrestore(&sem->wait.lock, flags); | 429 | spin_unlock_irqrestore(&sem->wait.lock, flags); |
433 | 430 | ||
@@ -438,6 +435,8 @@ int psnedf_fmlp_lock(struct litmus_lock* l) | |||
438 | 435 | ||
439 | schedule(); | 436 | schedule(); |
440 | 437 | ||
438 | TS_LOCK_RESUME; | ||
439 | |||
441 | /* Since we hold the lock, no other task will change | 440 | /* Since we hold the lock, no other task will change |
442 | * ->owner. We can thus check it without acquiring the spin | 441 | * ->owner. We can thus check it without acquiring the spin |
443 | * lock. */ | 442 | * lock. */ |
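
The new TS_LOCK_SUSPEND / TS_LOCK_RESUME pair brackets the suspension inside the FMLP lock slow path so Feather-Trace can attribute the blocked interval to locking overhead. As a rough user-space analogy of that bracketing, with clock_gettime standing in for the Feather-Trace timestamp macros and a short sleep standing in for schedule():

#include <stdio.h>
#include <time.h>

static long long ns_now(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	struct timespec block = { 0, 2000000 };  /* pretend we block for 2 ms */

	long long suspend = ns_now();  /* ~ TS_LOCK_SUSPEND */
	nanosleep(&block, NULL);       /* ~ schedule() while waiting for the lock */
	long long resume = ns_now();   /* ~ TS_LOCK_RESUME  */

	printf("suspended for %lld ns\n", resume - suspend);
	return 0;
}
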
@@ -577,9 +576,35 @@ static long psnedf_allocate_lock(struct litmus_lock **lock, int type, | |||
577 | 576 | ||
578 | #endif | 577 | #endif |
579 | 578 | ||
579 | |||
580 | static long psnedf_activate_plugin(void) | ||
581 | { | ||
582 | #ifdef CONFIG_RELEASE_MASTER | ||
583 | int cpu; | ||
584 | |||
585 | for_each_online_cpu(cpu) { | ||
586 | remote_edf(cpu)->release_master = atomic_read(&release_master_cpu); | ||
587 | } | ||
588 | #endif | ||
589 | |||
590 | #ifdef CONFIG_LITMUS_LOCKING | ||
591 | get_srp_prio = psnedf_get_srp_prio; | ||
592 | #endif | ||
593 | |||
594 | return 0; | ||
595 | } | ||
596 | |||
580 | static long psnedf_admit_task(struct task_struct* tsk) | 597 | static long psnedf_admit_task(struct task_struct* tsk) |
581 | { | 598 | { |
582 | return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; | 599 | if (task_cpu(tsk) == tsk->rt_param.task_params.cpu |
600 | #ifdef CONFIG_RELEASE_MASTER | ||
601 | /* don't allow tasks on release master CPU */ | ||
602 | && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master | ||
603 | #endif | ||
604 | ) | ||
605 | return 0; | ||
606 | else | ||
607 | return -EINVAL; | ||
583 | } | 608 | } |
584 | 609 | ||
585 | /* Plugin object */ | 610 | /* Plugin object */ |
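
psnedf_admit_task() now also rejects a task whose partition is the release-master CPU, in addition to requiring that the task already executes on its assigned partition. Whatever liblitmus entry points you use for admission, the pinning step itself is ordinary Linux affinity; a hedged user-space sketch (the partition number and surrounding setup are illustrative only):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Pin the calling thread to its PSN-EDF partition before admission;
 * pick a partition other than the CPU reserved as release master. */
static int pin_to_partition(int partition)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(partition, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		return -1;
	/* psnedf_admit_task() checks task_cpu(tsk) == task_params.cpu,
	 * so verify the migration actually happened. */
	return sched_getcpu() == partition ? 0 : -1;
}

int main(void)
{
	if (pin_to_partition(1)) {
		fprintf(stderr, "could not pin to partition 1\n");
		return 1;
	}
	printf("running on CPU %d\n", sched_getcpu());
	return 0;
}
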
@@ -593,9 +618,9 @@ static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = { | |||
593 | .task_wake_up = psnedf_task_wake_up, | 618 | .task_wake_up = psnedf_task_wake_up, |
594 | .task_block = psnedf_task_block, | 619 | .task_block = psnedf_task_block, |
595 | .admit_task = psnedf_admit_task, | 620 | .admit_task = psnedf_admit_task, |
621 | .activate_plugin = psnedf_activate_plugin, | ||
596 | #ifdef CONFIG_LITMUS_LOCKING | 622 | #ifdef CONFIG_LITMUS_LOCKING |
597 | .allocate_lock = psnedf_allocate_lock, | 623 | .allocate_lock = psnedf_allocate_lock, |
598 | .activate_plugin = psnedf_activate_plugin, | ||
599 | #endif | 624 | #endif |
600 | }; | 625 | }; |
601 | 626 | ||
diff --git a/litmus/trace.c b/litmus/trace.c index e7ea1c2ab3e4..3c35c527e805 100644 --- a/litmus/trace.c +++ b/litmus/trace.c | |||
@@ -1,5 +1,6 @@ | |||
1 | #include <linux/sched.h> | 1 | #include <linux/sched.h> |
2 | #include <linux/module.h> | 2 | #include <linux/module.h> |
3 | #include <linux/uaccess.h> | ||
3 | 4 | ||
4 | #include <litmus/ftdev.h> | 5 | #include <litmus/ftdev.h> |
5 | #include <litmus/litmus.h> | 6 | #include <litmus/litmus.h> |
@@ -15,6 +16,35 @@ static struct ftdev overhead_dev; | |||
15 | 16 | ||
16 | static unsigned int ts_seq_no = 0; | 17 | static unsigned int ts_seq_no = 0; |
17 | 18 | ||
19 | DEFINE_PER_CPU(atomic_t, irq_fired_count); | ||
20 | |||
21 | static inline void clear_irq_fired(void) | ||
22 | { | ||
23 | atomic_set(&__raw_get_cpu_var(irq_fired_count), 0); | ||
24 | } | ||
25 | |||
26 | static inline unsigned int get_and_clear_irq_fired(void) | ||
27 | { | ||
28 | /* This is potentially not atomic since we might migrate if | ||
29 | * preemptions are not disabled. As a tradeoff between | ||
30 | * accuracy and tracing overheads, this seems acceptable. | ||
31 | * If it proves to be a problem, then one could add a callback | ||
32 | * from the migration code to invalidate irq_fired_count. | ||
33 | */ | ||
34 | return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0); | ||
35 | } | ||
36 | |||
37 | static inline void __save_irq_flags(struct timestamp *ts) | ||
38 | { | ||
39 | unsigned int irq_count; | ||
40 | |||
41 | irq_count = get_and_clear_irq_fired(); | ||
42 | /* Store how many interrupts occurred. */ | ||
43 | ts->irq_count = irq_count; | ||
44 | /* Extra flag because ts->irq_count overflows quickly. */ | ||
45 | ts->irq_flag = irq_count > 0; | ||
46 | } | ||
47 | |||
18 | static inline void __save_timestamp_cpu(unsigned long event, | 48 | static inline void __save_timestamp_cpu(unsigned long event, |
19 | uint8_t type, uint8_t cpu) | 49 | uint8_t type, uint8_t cpu) |
20 | { | 50 | { |
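
irq_fired_count is a per-CPU counter that interrupt paths are expected to bump; get_and_clear_irq_fired() reads and resets it with an atomic exchange so events arriving after the exchange are credited to the next timestamp rather than lost. The same read-and-clear idiom in portable user-space C11, with a signal handler standing in for the kernel's interrupt hook:

#include <signal.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_uint fired;  /* ~ per-CPU irq_fired_count */

static void on_signal(int sig)
{
	(void)sig;
	atomic_fetch_add(&fired, 1);  /* ~ interrupt-entry increment */
}

/* ~ get_and_clear_irq_fired(): return the count and reset it in one step. */
static unsigned int get_and_clear_fired(void)
{
	return atomic_exchange(&fired, 0);
}

int main(void)
{
	signal(SIGALRM, on_signal);
	alarm(1);
	pause();  /* wait for one "interrupt" to fire */

	printf("events since last sample: %u\n", get_and_clear_fired());
	printf("events since last sample: %u\n", get_and_clear_fired());
	return 0;
}
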
@@ -23,10 +53,26 @@ static inline void __save_timestamp_cpu(unsigned long event, | |||
23 | seq_no = fetch_and_inc((int *) &ts_seq_no); | 53 | seq_no = fetch_and_inc((int *) &ts_seq_no); |
24 | if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { | 54 | if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { |
25 | ts->event = event; | 55 | ts->event = event; |
26 | ts->timestamp = ft_timestamp(); | ||
27 | ts->seq_no = seq_no; | 56 | ts->seq_no = seq_no; |
28 | ts->cpu = cpu; | 57 | ts->cpu = cpu; |
29 | ts->task_type = type; | 58 | ts->task_type = type; |
59 | __save_irq_flags(ts); | ||
60 | barrier(); | ||
61 | /* prevent re-ordering of ft_timestamp() */ | ||
62 | ts->timestamp = ft_timestamp(); | ||
63 | ft_buffer_finish_write(trace_ts_buf, ts); | ||
64 | } | ||
65 | } | ||
66 | |||
67 | static void __add_timestamp_user(struct timestamp *pre_recorded) | ||
68 | { | ||
69 | unsigned int seq_no; | ||
70 | struct timestamp *ts; | ||
71 | seq_no = fetch_and_inc((int *) &ts_seq_no); | ||
72 | if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { | ||
73 | *ts = *pre_recorded; | ||
74 | ts->seq_no = seq_no; | ||
75 | __save_irq_flags(ts); | ||
30 | ft_buffer_finish_write(trace_ts_buf, ts); | 76 | ft_buffer_finish_write(trace_ts_buf, ts); |
31 | } | 77 | } |
32 | } | 78 | } |
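
__save_timestamp_cpu() now fills in all metadata, including the IRQ counters, before reading ft_timestamp(), and the barrier() keeps the compiler from hoisting the timestamp read above that bookkeeping, so the recorded cycle count sits as late as possible in the probe. A minimal user-space rendering of that ordering; barrier() in the kernel expands to essentially the same empty asm statement with a memory clobber.

#include <stdint.h>
#include <stdio.h>

#define compiler_barrier() __asm__ __volatile__("" ::: "memory")

/* Stand-in for ft_timestamp(); any monotonic cycle/clock source works. */
static uint64_t read_tsc_stub(void) { static uint64_t t; return ++t; }

struct sample { uint32_t event, seq_no; uint64_t timestamp; };

static void record(struct sample *s, uint32_t event, uint32_t seq)
{
	s->event  = event;       /* bookkeeping first ... */
	s->seq_no = seq;
	compiler_barrier();      /* ... then force the timestamp read last */
	s->timestamp = read_tsc_stub();
}

int main(void)
{
	struct sample s;

	record(&s, 100, 1);
	printf("event=%u seq=%u ts=%llu\n", s.event, s.seq_no,
	       (unsigned long long)s.timestamp);
	return 0;
}
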
@@ -61,6 +107,27 @@ feather_callback void save_timestamp_cpu(unsigned long event, | |||
61 | __save_timestamp_cpu(event, TSK_UNKNOWN, cpu); | 107 | __save_timestamp_cpu(event, TSK_UNKNOWN, cpu); |
62 | } | 108 | } |
63 | 109 | ||
110 | feather_callback void save_task_latency(unsigned long event, | ||
111 | unsigned long when_ptr) | ||
112 | { | ||
113 | lt_t now = litmus_clock(); | ||
114 | lt_t *when = (lt_t*) when_ptr; | ||
115 | unsigned int seq_no; | ||
116 | int cpu = raw_smp_processor_id(); | ||
117 | struct timestamp *ts; | ||
118 | |||
119 | seq_no = fetch_and_inc((int *) &ts_seq_no); | ||
120 | if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { | ||
121 | ts->event = event; | ||
122 | ts->timestamp = now - *when; | ||
123 | ts->seq_no = seq_no; | ||
124 | ts->cpu = cpu; | ||
125 | ts->task_type = TSK_RT; | ||
126 | __save_irq_flags(ts); | ||
127 | ft_buffer_finish_write(trace_ts_buf, ts); | ||
128 | } | ||
129 | } | ||
130 | |||
64 | /******************************************************************************/ | 131 | /******************************************************************************/ |
65 | /* DEVICE FILE DRIVER */ | 132 | /* DEVICE FILE DRIVER */ |
66 | /******************************************************************************/ | 133 | /******************************************************************************/ |
@@ -69,11 +136,15 @@ feather_callback void save_timestamp_cpu(unsigned long event, | |||
69 | * should be 8M; that is the most we can request from the buddy allocator | 136 | * should be 8M; that is the most we can request from the buddy allocator |
70 | * (MAX_ORDER), and we might not even get that much | 137 | * (MAX_ORDER), and we might not even get that much |
71 | */ | 138 | */ |
72 | #define NO_TIMESTAMPS (2 << 11) | 139 | #define NO_TIMESTAMPS (2 << 16) |
73 | 140 | ||
74 | static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) | 141 | static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) |
75 | { | 142 | { |
76 | unsigned int count = NO_TIMESTAMPS; | 143 | unsigned int count = NO_TIMESTAMPS; |
144 | |||
145 | /* An overhead-tracing timestamp should be exactly 16 bytes long. */ | ||
146 | BUILD_BUG_ON(sizeof(struct timestamp) != 16); | ||
147 | |||
77 | while (count && !trace_ts_buf) { | 148 | while (count && !trace_ts_buf) { |
78 | printk("time stamp buffer: trying to allocate %u time stamps.\n", count); | 149 | printk("time stamp buffer: trying to allocate %u time stamps.\n", count); |
79 | ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp)); | 150 | ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp)); |
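
For scale: 2 << 16 is 131072 timestamps, and with the 16-byte records enforced by the BUILD_BUG_ON above that makes the initial request 131072 x 16 B = 2 MiB per buffer, comfortably under the buddy-allocator ceiling that the "8M" comment refers to; the while loop above then retries, presumably with a reduced count, until an allocation succeeds or the count reaches zero.
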
@@ -88,9 +159,35 @@ static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) | |||
88 | ftdev->minor[idx].buf = NULL; | 159 | ftdev->minor[idx].buf = NULL; |
89 | } | 160 | } |
90 | 161 | ||
162 | static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len, | ||
163 | const char __user *from) | ||
164 | { | ||
165 | ssize_t consumed = 0; | ||
166 | struct timestamp ts; | ||
167 | |||
168 | /* don't give us partial timestamps */ | ||
169 | if (len % sizeof(ts)) | ||
170 | return -EINVAL; | ||
171 | |||
172 | while (len >= sizeof(ts)) { | ||
173 | if (copy_from_user(&ts, from, sizeof(ts))) { | ||
174 | consumed = -EFAULT; | ||
175 | goto out; | ||
176 | } | ||
177 | len -= sizeof(ts); | ||
178 | from += sizeof(ts); | ||
179 | consumed += sizeof(ts); | ||
180 | |||
181 | __add_timestamp_user(&ts); | ||
182 | } | ||
183 | |||
184 | out: | ||
185 | return consumed; | ||
186 | } | ||
187 | |||
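
The new write handler lets user space inject pre-recorded timestamps: the write length must be a whole multiple of the 16-byte struct timestamp, and each record is copied in, re-sequenced, and IRQ-tagged by __add_timestamp_user(). Below is a hedged sketch of the writer side; the device node path and the record layout are assumptions and must be taken from the matching LITMUS^RT / Feather-Trace headers and udev setup in practice.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Placeholder 16-byte record; the real layout comes from the kernel headers.
 * Writes that are not a multiple of its size are rejected with -EINVAL. */
struct ft_timestamp_record {
	uint8_t raw[16];
};

int main(void)
{
	struct ft_timestamp_record rec;
	memset(&rec, 0, sizeof(rec));  /* fill in event/timestamp fields here */

	/* Device node name is an assumption; adjust to your ft_trace setup. */
	int fd = open("/dev/litmus/ft_trace0", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	ssize_t n = write(fd, &rec, sizeof(rec));
	if (n < 0)
		perror("write");
	else
		printf("injected %zd bytes (%zu records)\n", n, n / sizeof(rec));

	close(fd);
	return 0;
}
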
91 | static int __init init_ft_overhead_trace(void) | 188 | static int __init init_ft_overhead_trace(void) |
92 | { | 189 | { |
93 | int err; | 190 | int err, cpu; |
94 | 191 | ||
95 | printk("Initializing Feather-Trace overhead tracing device.\n"); | 192 | printk("Initializing Feather-Trace overhead tracing device.\n"); |
96 | err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace"); | 193 | err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace"); |
@@ -99,11 +196,17 @@ static int __init init_ft_overhead_trace(void) | |||
99 | 196 | ||
100 | overhead_dev.alloc = alloc_timestamp_buffer; | 197 | overhead_dev.alloc = alloc_timestamp_buffer; |
101 | overhead_dev.free = free_timestamp_buffer; | 198 | overhead_dev.free = free_timestamp_buffer; |
199 | overhead_dev.write = write_timestamp_from_user; | ||
102 | 200 | ||
103 | err = register_ftdev(&overhead_dev); | 201 | err = register_ftdev(&overhead_dev); |
104 | if (err) | 202 | if (err) |
105 | goto err_dealloc; | 203 | goto err_dealloc; |
106 | 204 | ||
205 | /* initialize IRQ flags */ | ||
206 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
207 | clear_irq_fired(); | ||
208 | } | ||
209 | |||
107 | return 0; | 210 | return 0; |
108 | 211 | ||
109 | err_dealloc: | 212 | err_dealloc: |