author	Jonathan Herman <hermanjl@cs.unc.edu>	2012-09-29 13:04:40 -0400
committer	Jonathan Herman <hermanjl@cs.unc.edu>	2012-09-29 13:04:40 -0400
commit	daf1e620bff2cb6d830ef66725369bba9c858f62 (patch)
tree	1aed8f7cb55371c70d2139b6754d90ea89a26147 /litmus
parent	451ed3b075c2a8e322e5a44f177e2470426a821d (diff)
parent	1cb90226816c7af7808be4c0de866c54da17ecc9 (diff)
Merge branch 'wip-color' into wip-mc
Conflicts:
	include/litmus/budget.h
	include/litmus/litmus.h
	include/litmus/rt_param.h
	include/litmus/sched_trace.h
	include/litmus/trace.h
	include/trace/events/litmus.h
	litmus/Makefile
	litmus/budget.c
	litmus/ftdev.c
	litmus/jobs.c
	litmus/litmus.c
	litmus/locking.c
	litmus/preempt.c
	litmus/rt_domain.c
	litmus/sched_gsn_edf.c
	litmus/trace.c
Diffstat (limited to 'litmus')
-rw-r--r--	litmus/Kconfig	47
-rw-r--r--	litmus/Makefile	14
-rw-r--r--	litmus/affinity.c	42
-rw-r--r--	litmus/budget.c	31
-rw-r--r--	litmus/clustered.c	2
-rw-r--r--	litmus/color.c	357
-rw-r--r--	litmus/color_dev.c	351
-rw-r--r--	litmus/color_proc.c	220
-rw-r--r--	litmus/dgl.c	300
-rw-r--r--	litmus/fifo_common.c	58
-rw-r--r--	litmus/ftdev.c	15
-rw-r--r--	litmus/jobs.c	12
-rw-r--r--	litmus/litmus.c	41
-rw-r--r--	litmus/locking.c	14
-rw-r--r--	litmus/preempt.c	3
-rw-r--r--	litmus/rm_common.c	91
-rw-r--r--	litmus/rt_domain.c	8
-rw-r--r--	litmus/rt_server.c	23
-rw-r--r--	litmus/sched_cedf.c	100
-rw-r--r--	litmus/sched_color.c	888
-rw-r--r--	litmus/sched_gsn_edf.c	56
-rw-r--r--	litmus/sched_litmus.c	19
-rw-r--r--	litmus/sched_pfair.c	225
-rw-r--r--	litmus/sched_plugin.c	23
-rw-r--r--	litmus/sched_psn_edf.c	47
-rw-r--r--	litmus/sched_task_trace.c	34
-rw-r--r--	litmus/trace.c	109
27 files changed, 2932 insertions, 198 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index bd6ec9f2d3e8..48d6f28c6e4a 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,19 @@ config PLUGIN_CEDF
12 On smaller platforms (e.g., ARM PB11MPCore), using C-EDF 12 On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
13 makes little sense since there aren't any shared caches. 13 makes little sense since there aren't any shared caches.
14 14
15config PLUGIN_COLOR
16 bool "Scheduling with Colors"
17 default y
18 help
19 Include the scheduling with colors scheduler.
20
21config PLUGIN_COLOR_UNCACHABLE
22 bool "Colored memory is not cachable"
23 depends on PLUGIN_COLOR && X86_PAT
24 default n
25 help
26 Any memory allocated to the color plugin is not CPU cached.
27
15config PLUGIN_PFAIR 28config PLUGIN_PFAIR
16 bool "PFAIR" 29 bool "PFAIR"
17 depends on HIGH_RES_TIMERS && !NO_HZ 30 depends on HIGH_RES_TIMERS && !NO_HZ
@@ -102,7 +115,6 @@ config NP_SECTION
102 115
103config LITMUS_LOCKING 116config LITMUS_LOCKING
104 bool "Support for real-time locking protocols" 117 bool "Support for real-time locking protocols"
105 depends on NP_SECTION
106 default n 118 default n
107 help 119 help
108 Enable LITMUS^RT's deterministic multiprocessor real-time 120 Enable LITMUS^RT's deterministic multiprocessor real-time
@@ -113,6 +125,25 @@ config LITMUS_LOCKING
113 125
114endmenu 126endmenu
115 127
128menu "Performance Enhancements"
129
130config SCHED_CPU_AFFINITY
131 bool "Local Migration Affinity"
132 depends on X86
133 default y
134 help
135 Rescheduled tasks prefer CPUs near to their previously used CPU. This
136 may improve performance through possible preservation of cache affinity.
137
138 Warning: May make bugs harder to find since tasks may migrate less often.
139
140 NOTES:
141 * Feature is not utilized by PFair/PD^2.
142
143 Say Yes if unsure.
144
145endmenu
146
116menu "Tracing" 147menu "Tracing"
117 148
118config FEATHER_TRACE 149config FEATHER_TRACE
@@ -249,6 +280,20 @@ config SCHED_DEBUG_TRACE_CALLER
249 280
250 If unsure, say No. 281 If unsure, say No.
251 282
283config PREEMPT_STATE_TRACE
284 bool "Trace preemption state machine transitions"
285 depends on SCHED_DEBUG_TRACE
286 default n
287 help
288 With this option enabled, each CPU will log when it transitions
289 states in the preemption state machine. This state machine is
290 used to determine how to react to IPIs (avoid races with in-flight IPIs).
291
292 Warning: this creates a lot of information in the debug trace. Only
293 recommended when you are debugging preemption-related races.
294
295 If unsure, say No.
296
252endmenu 297endmenu
253 298
254endmenu 299endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
index 51e979506d84..b406cf2ad9e1 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -19,14 +19,26 @@ obj-y = sched_plugin.o litmus.o \
19 domain.o \ 19 domain.o \
20 sched_psn_edf.o \ 20 sched_psn_edf.o \
21 sched_gsn_edf.o 21 sched_gsn_edf.o
22 color.o \
23 color_proc.o \
24 color_dev.o \
25 rt_server.o \
26 dgl.o \
27 fifo_common.o \
28 rm_common.o \
29 sched_psn_edf.o \
30 sched_gsn_edf.o
31
22 32
23 33
24obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o 34obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
25obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o 35obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
36obj-$(CONFIG_PLUGIN_COLOR) += sched_color.o
37obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
26obj-$(CONFIG_PLUGIN_MC) += sched_mc.o sched_mc_ce.o ce_domain.o 38obj-$(CONFIG_PLUGIN_MC) += sched_mc.o sched_mc_ce.o ce_domain.o
27obj-$(CONFIG_MERGE_TIMERS) += event_group.o 39obj-$(CONFIG_MERGE_TIMERS) += event_group.o
28
29obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o 40obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
30obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o 41obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
31obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o 42obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
32obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o 43obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
44
diff --git a/litmus/affinity.c b/litmus/affinity.c
new file mode 100644
index 000000000000..3fa6dd789400
--- /dev/null
+++ b/litmus/affinity.c
@@ -0,0 +1,42 @@
1#include <linux/cpu.h>
2
3#include <litmus/affinity.h>
4
5struct neighborhood neigh_info[NR_CPUS];
6
7/* called by _init_litmus() */
8void init_topology(void) {
9 int cpu;
10 int i;
11 int chk;
12 int depth = num_cache_leaves;
13
14 if (depth > NUM_CACHE_LEVELS)
15 depth = NUM_CACHE_LEVELS;
16
17 for_each_online_cpu(cpu) {
18 for (i = 0; i < depth; ++i) {
19 chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i);
20 if (chk) {
21 /* failed */
22 neigh_info[cpu].size[i] = 0;
23 } else {
24 /* size = num bits in mask */
25 neigh_info[cpu].size[i] =
26 cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]);
27 }
28 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
29 cpu, neigh_info[cpu].size[i], i,
30 *cpumask_bits(neigh_info[cpu].neighbors[i]));
31 }
32
33 /* set data for non-existent levels */
34 for (; i < NUM_CACHE_LEVELS; ++i) {
35 neigh_info[cpu].size[i] = 0;
36
37 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
38 cpu, neigh_info[cpu].size[i], i, 0lu);
39 }
40 }
41}
42
diff --git a/litmus/budget.c b/litmus/budget.c
index 172c12b369da..d63e484ba160 100644
--- a/litmus/budget.c
+++ b/litmus/budget.c
@@ -7,16 +7,9 @@
7#include <litmus/budget.h> 7#include <litmus/budget.h>
8#include <litmus/sched_trace.h> 8#include <litmus/sched_trace.h>
9 9
10struct enforcement_timer {
11 /* The enforcement timer is used to accurately police
12 * slice budgets. */
13 struct hrtimer timer;
14 int armed;
15};
16
17DEFINE_PER_CPU(struct enforcement_timer, budget_timer); 10DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
18 11
19static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer) 12enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
20{ 13{
21 struct enforcement_timer* et = container_of(timer, 14 struct enforcement_timer* et = container_of(timer,
22 struct enforcement_timer, 15 struct enforcement_timer,
@@ -34,7 +27,7 @@ static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
34} 27}
35 28
36/* assumes called with IRQs off */ 29/* assumes called with IRQs off */
37static void cancel_enforcement_timer(struct enforcement_timer* et) 30void cancel_enforcement_timer(struct enforcement_timer* et)
38{ 31{
39 int ret; 32 int ret;
40 33
@@ -56,11 +49,10 @@ static void cancel_enforcement_timer(struct enforcement_timer* et)
56} 49}
57 50
58/* assumes called with IRQs off */ 51/* assumes called with IRQs off */
59static void arm_enforcement_timer(struct enforcement_timer* et, 52void arm_enforcement_timer(struct enforcement_timer* et,
60 struct task_struct* t) 53 struct task_struct* t)
61{ 54{
62 lt_t when_to_fire; 55 lt_t when_to_fire;
63 TRACE_TASK(t, "arming enforcement timer.\n");
64 56
65 /* Calling this when there is no budget left for the task 57 /* Calling this when there is no budget left for the task
66 * makes no sense, unless the task is non-preemptive. */ 58 * makes no sense, unless the task is non-preemptive. */
@@ -69,8 +61,11 @@ static void arm_enforcement_timer(struct enforcement_timer* et,
69 /* __hrtimer_start_range_ns() cancels the timer 61 /* __hrtimer_start_range_ns() cancels the timer
70 * anyway, so we don't have to check whether it is still armed */ 62 * anyway, so we don't have to check whether it is still armed */
71 63
72 if (likely(!is_np(t))) { 64 if (likely(!is_user_np(t))) {
73 when_to_fire = litmus_clock() + budget_remaining(t); 65 when_to_fire = litmus_clock() + budget_remaining(t);
66 TRACE_TASK(t, "arming enforcement timer for %llu.\n",
67 when_to_fire);
68
74 __hrtimer_start_range_ns(&et->timer, 69 __hrtimer_start_range_ns(&et->timer,
75 ns_to_ktime(when_to_fire), 70 ns_to_ktime(when_to_fire),
76 0 /* delta */, 71 0 /* delta */,
@@ -96,6 +91,11 @@ void update_enforcement_timer(struct task_struct* t)
96 } 91 }
97} 92}
98 93
94void init_enforcement_timer(struct enforcement_timer *et)
95{
96 hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
97 et->timer.function = on_enforcement_timeout;
98}
99 99
100static int __init init_budget_enforcement(void) 100static int __init init_budget_enforcement(void)
101{ 101{
@@ -104,14 +104,14 @@ static int __init init_budget_enforcement(void)
104 104
105 for (cpu = 0; cpu < NR_CPUS; cpu++) { 105 for (cpu = 0; cpu < NR_CPUS; cpu++) {
106 et = &per_cpu(budget_timer, cpu); 106 et = &per_cpu(budget_timer, cpu);
107 hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 107 init_enforcement_timer(et);
108 et->timer.function = on_enforcement_timeout;
109 } 108 }
110 return 0; 109 return 0;
111} 110}
112 111
113void task_release(struct task_struct *t) 112void task_release(struct task_struct *t)
114{ 113{
114 /* Also wrong */
115 t->rt_param.job_params.real_release = t->rt_param.job_params.real_deadline; 115 t->rt_param.job_params.real_release = t->rt_param.job_params.real_deadline;
116 t->rt_param.job_params.real_deadline += get_rt_period(t); 116 t->rt_param.job_params.real_deadline += get_rt_period(t);
117 t->rt_param.job_params.job_no++; 117 t->rt_param.job_params.job_no++;
@@ -120,6 +120,7 @@ void task_release(struct task_struct *t)
120 120
121void server_release(struct task_struct *t) 121void server_release(struct task_struct *t)
122{ 122{
123 /* TODO: so wrong with respect to time accounting */
123 lt_t now = litmus_clock(); 124 lt_t now = litmus_clock();
124 t->rt_param.job_params.exec_time = 0; 125 t->rt_param.job_params.exec_time = 0;
125 t->rt_param.job_params.release = t->rt_param.job_params.deadline; 126 t->rt_param.job_params.release = t->rt_param.job_params.deadline;
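The budget.c hunks above stop hiding the enforcement timer inside budget.c: struct enforcement_timer moves to a header and init_enforcement_timer(), arm_enforcement_timer(), and cancel_enforcement_timer() become callable from plugins, so a plugin can police per-server budgets itself. A minimal sketch of plugin-side usage follows; the cpu_entry container and the budget_precisely_enforced() check are assumptions for illustration, and the calls are expected to run with IRQs off, as the comments in budget.c require.

/* Sketch: plugin-side use of the now-exported enforcement timer API.
 * cpu_entry and the exact budget check are illustrative assumptions;
 * arm/cancel must be called with IRQs disabled (see budget.c). */
#include <litmus/budget.h>
#include <litmus/litmus.h>

struct cpu_entry {
	struct enforcement_timer timer;
	struct task_struct *scheduled;
};

static void cpu_entry_init(struct cpu_entry *entry)
{
	init_enforcement_timer(&entry->timer);	/* hrtimer + handler setup */
}

/* Called from the plugin's schedule() path with IRQs off. */
static void police_budget(struct cpu_entry *entry, struct task_struct *next)
{
	cancel_enforcement_timer(&entry->timer);	/* stop the old task's timer */
	entry->scheduled = next;
	if (next && budget_precisely_enforced(next) && budget_remaining(next))
		arm_enforcement_timer(&entry->timer, next);	/* fires at budget exhaustion */
}
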
diff --git a/litmus/clustered.c b/litmus/clustered.c
index 04450a8ad4fe..6fe1b512f628 100644
--- a/litmus/clustered.c
+++ b/litmus/clustered.c
@@ -102,7 +102,7 @@ int assign_cpus_to_clusters(enum cache_level level,
102 cpus[i]->cluster = cpus[low_cpu]->cluster; 102 cpus[i]->cluster = cpus[low_cpu]->cluster;
103 } 103 }
104 /* enqueue in cpus list */ 104 /* enqueue in cpus list */
105 list_add(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus); 105 list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);
106 printk(KERN_INFO "Assigning CPU%u to cluster %u\n.", i, cpus[i]->cluster->id); 106 printk(KERN_INFO "Assigning CPU%u to cluster %u\n.", i, cpus[i]->cluster->id);
107 } 107 }
108out: 108out:
diff --git a/litmus/color.c b/litmus/color.c
new file mode 100644
index 000000000000..ecc191137137
--- /dev/null
+++ b/litmus/color.c
@@ -0,0 +1,357 @@
1#include <linux/spinlock.h>
2
3#include <linux/module.h>
4#include <linux/mm.h>
5#include <linux/slab.h>
6#include <linux/sysctl.h>
7#include <linux/lockdep.h>
8#include <linux/sched.h> /* required by litmus.h */
9#include <asm/io.h> /* page_to_phys on SPARC */
10
11#ifdef CONFIG_PLUGIN_COLOR_UNCACHABLE
12#include <asm/cacheflush.h> /* set_memory_uc */
13#endif
14
15#include <litmus/color.h>
16#include <litmus/litmus.h> /* for in_list(...) */
17
18#define PAGES_PER_COLOR 3072
19
20/*
21 * This is used only to "trick" lockdep into permitting dynamically allocated
22 * locks of different classes that are initialized on the same line.
23 */
24#define LOCKDEP_MAX_NR_COLORS 512
25static struct lock_class_key color_lock_keys[LOCKDEP_MAX_NR_COLORS];
26
27struct color_group {
28 spinlock_t lock;
29 char _lock_name[LITMUS_LOCKDEP_NAME_MAX_LEN];
30 struct list_head list;
31 atomic_t nr_pages;
32};
33
34static unsigned long color_mask;
35static struct color_group *color_groups;
36
37
38/* non-static: extern'ed in various files */
39struct color_cache_info color_cache_info;
40int color_sysctl_add_pages_data;
41
42static inline unsigned long page_color(struct page *page)
43{
44 return ((page_to_phys(page) & color_mask) >> PAGE_SHIFT);
45}
46
47/*
48 * Page's count should be one; it should not be on any LRU list.
49 */
50void add_page_to_color_list(struct page *page)
51{
52 const unsigned long color = page_color(page);
53 struct color_group *cgroup = &color_groups[color];
54 BUG_ON(in_list(&page->lru) || PageLRU(page));
55 BUG_ON(page_count(page) > 1);
56 spin_lock(&cgroup->lock);
57 list_add_tail(&page->lru, &cgroup->list);
58 atomic_inc(&cgroup->nr_pages);
59 SetPageLRU(page);
60 spin_unlock(&cgroup->lock);
61}
62
63/*
64 * Increases the page's count to two.
65 */
66struct page* get_colored_page(unsigned long color)
67{
68 struct color_group *cgroup;
69 struct page *page = NULL;
70
71 if (color >= color_cache_info.nr_colors)
72 goto out;
73
74 cgroup = &color_groups[color];
75 spin_lock(&cgroup->lock);
76 if (unlikely(!atomic_read(&cgroup->nr_pages))) {
77 TRACE_CUR("No free %lu colored pages.\n", color);
78 printk(KERN_WARNING "no free %lu colored pages.\n", color);
79 goto out_unlock;
80 }
81 page = list_first_entry(&cgroup->list, struct page, lru);
82 BUG_ON(page_count(page) > 1);
83 get_page(page);
84 list_del(&page->lru);
85 atomic_dec(&cgroup->nr_pages);
86 ClearPageLRU(page);
87out_unlock:
88 spin_unlock(&cgroup->lock);
89out:
90 return page;
91}
92
93static unsigned long smallest_nr_pages(void)
94{
95 unsigned long i, min_pages = -1;
96 struct color_group *cgroup;
97 for (i = 0; i < color_cache_info.nr_colors; ++i) {
98 cgroup = &color_groups[i];
99 if (atomic_read(&cgroup->nr_pages) < min_pages)
100 min_pages = atomic_read(&cgroup->nr_pages);
101 }
102 return min_pages;
103}
104
105static int do_add_pages(void)
106{
107 struct page *page, *page_tmp;
108 LIST_HEAD(free_later);
109 unsigned long color;
110 int ret = 0;
111
112 while (smallest_nr_pages() < PAGES_PER_COLOR) {
113#ifdef CONFIG_PLUGIN_COLOR_UNCACHABLE
114 unsigned long vaddr;
115#endif
116
117#if defined(CONFIG_X86)
118 page = alloc_page(GFP_HIGHUSER | __GFP_ZERO |
119 __GFP_MOVABLE);
120#elif defined(CONFIG_SPARC) /* X86 */
121 page = alloc_page(GFP_HIGHUSER | __GFP_MOVABLE);
122#else
123#error What architecture are you using?
124#endif
125 if (unlikely(!page)) {
126 printk(KERN_WARNING "Could not allocate pages.\n");
127 ret = -ENOMEM;
128 goto out;
129 }
130 color = page_color(page);
131 if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR) {
132 SetPageReserved(page);
133#ifdef CONFIG_PLUGIN_COLOR_UNCACHABLE
134 vaddr = (unsigned long) pfn_to_kaddr(page_to_pfn(page));
135 if (set_memory_uc(vaddr, 1)) {
136 printk(KERN_ALERT "Could not set_memory_uc\n");
137 BUG();
138 }
139#endif
140 add_page_to_color_list(page);
141 } else
142 list_add_tail(&page->lru, &free_later);
143 }
144 list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
145 list_del(&page->lru);
146 __free_page(page);
147 }
148out:
149 return ret;
150}
151
152static struct alloced_pages {
153 spinlock_t lock;
154 struct list_head list;
155} alloced_pages;
156
157struct alloced_page {
158 struct page *page;
159 struct vm_area_struct *vma;
160 struct list_head list;
161};
162
163static struct alloced_page * new_alloced_page(struct page *page,
164 struct vm_area_struct *vma)
165{
166 struct alloced_page *ap = kmalloc(sizeof(*ap), GFP_KERNEL);
167 INIT_LIST_HEAD(&ap->list);
168 ap->page = page;
169 ap->vma = vma;
170 return ap;
171}
172
173/*
174 * Page's count should be two or more. It should not be on any LRU list.
175 */
176void add_page_to_alloced_list(struct page *page, struct vm_area_struct *vma)
177{
178 struct alloced_page *ap;
179
180 BUG_ON(page_count(page) < 2);
181 ap = new_alloced_page(page, vma);
182 spin_lock(&alloced_pages.lock);
183 list_add_tail(&ap->list, &alloced_pages.list);
184 spin_unlock(&alloced_pages.lock);
185}
186
187/*
188 * Reclaim pages.
189 */
190void reclaim_pages(struct vm_area_struct *vma)
191{
192 struct alloced_page *ap, *ap_tmp;
193 unsigned long nr_reclaimed = 0;
194 spin_lock(&alloced_pages.lock);
195 list_for_each_entry_safe(ap, ap_tmp, &alloced_pages.list, list) {
196 if (vma == ap->vma) {
197 list_del(&ap->list);
198 put_page(ap->page);
199 add_page_to_color_list(ap->page);
200 nr_reclaimed++;
201 TRACE_CUR("reclaiming page (pa:0x%10llx, pfn:%8lu, "
202 "color:%3lu)\n", page_to_phys(ap->page),
203 page_to_pfn(ap->page), page_color(ap->page));
204 kfree(ap);
205 }
206 }
207 spin_unlock(&alloced_pages.lock);
208 TRACE_CUR("Reclaimed %lu pages.\n", nr_reclaimed);
209}
210
211/***********************************************************
212 * Proc
213***********************************************************/
214
215int color_add_pages_handler(struct ctl_table *table, int write, void __user *buffer,
216 size_t *lenp, loff_t *ppos)
217{
218 int ret = 0;
219 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
220 if (ret)
221 goto out;
222 if (write && color_sysctl_add_pages_data)
223 ret = do_add_pages();
224out:
225 return ret;
226}
227
228
229int color_nr_pages_handler(struct ctl_table *table, int write, void __user *buffer,
230 size_t *lenp, loff_t *ppos)
231{
232 struct color_group *cgroup;
233 char *buf;
234 unsigned long i;
235 int used = 0, ret = 0;
236
237 if (write) {
238 ret = -EPERM;
239 goto out;
240 }
241 for (i = 0; i < color_cache_info.nr_colors; ++i) {
242 cgroup = &color_groups[i];
243 buf = ((char*)table->data) + used;
244 used += snprintf(buf, table->maxlen - used, ONE_COLOR_FMT,
245 i, atomic_read(&cgroup->nr_pages));
246 }
247 ret = proc_dostring(table, write, buffer, lenp, ppos);
248out:
249 return ret;
250}
251
252/***********************************************************
253 * Initialization
254***********************************************************/
255
256#if defined(CONFIG_X86)
257/* slowest possible way to find a log, but we only do this once on boot */
258static unsigned int __init slow_log(unsigned int v)
259{
260 unsigned int r = 0;
261 while (v >>= 1)
262 r++;
263 return r;
264}
265
266static int __init init_mask(void)
267{
268 unsigned int line_size_log = slow_log(color_cache_info.line_size);
269 int err = 0;
270
271 BUG_ON(color_cache_info.size <= 1048576 ||
272 color_cache_info.ways < 15 ||
273 color_cache_info.line_size != 64);
274
275 printk("Cache size: %lu line-size: %lu ways: %lu sets: %lu\n",
276 color_cache_info.size, color_cache_info.line_size,
277 color_cache_info.ways, color_cache_info.sets);
278 if (!color_cache_info.size) {
279 printk(KERN_WARNING "No cache information found.\n");
280 err = -EINVAL;
281 goto out;
282 }
283
284
285 BUG_ON(color_cache_info.size / color_cache_info.line_size /
286 color_cache_info.ways != color_cache_info.sets);
287 BUG_ON(PAGE_SIZE >= (color_cache_info.sets << line_size_log));
288 color_mask = ((color_cache_info.sets << line_size_log) - 1) ^
289 (PAGE_SIZE - 1);
290 color_cache_info.nr_colors = (color_mask >> PAGE_SHIFT) + 1;
291out:
292 return err;
293}
294#elif defined(CONFIG_SPARC) /* X86 */
295static int __init init_mask(void)
296{
297 /*
298 * Static assuming we are using Flare (our Niagara machine).
299 * This machine has weirdness with cache banks, and I don't want
300 * to waste time trying to auto-detect this.
301 */
302 color_mask = 0x3e000UL; /* bits 17:13 */
303 color_cache_info.size = 3 * 1024 * 1024; /* 3 MB */
304 color_cache_info.line_size = 64;
305 color_cache_info.ways = 12;
306 color_cache_info.sets = 1024 * 4;
307 color_cache_info.nr_colors = (1 << hweight_long(color_mask));
308 return 0;
309}
310#endif /* SPARC/X86 */
311
312
313
314static int __init init_color_groups(void)
315{
316 struct color_group *cgroup;
317 unsigned long i;
318 int err = 0;
319
320 color_groups = kmalloc(color_cache_info.nr_colors *
321 sizeof(struct color_group), GFP_KERNEL);
322 if (!color_groups) {
323 printk(KERN_WARNING "Could not allocate color groups.\n");
324 err = -ENOMEM;
325 goto out;
326 }
327
328 for (i = 0; i < color_cache_info.nr_colors; ++i) {
329 cgroup = &color_groups[i];
330 atomic_set(&cgroup->nr_pages, 0);
331 INIT_LIST_HEAD(&cgroup->list);
332 spin_lock_init(&cgroup->lock);
333 LOCKDEP_DYNAMIC_ALLOC(&cgroup->lock, &color_lock_keys[i],
334 cgroup->_lock_name, "color%lu", i);
335 }
336out:
337 return err;
338}
339
340static int __init init_color(void)
341{
342 int err = 0;
343 printk("Initializing LITMUS^RT cache coloring.\n");
344
345 INIT_LIST_HEAD(&alloced_pages.list);
346 spin_lock_init(&alloced_pages.lock);
347
348 err = init_mask();
349 printk("PAGE_SIZE: %lu Color mask: 0x%lx Total colors: %lu\n",
350 PAGE_SIZE, color_mask, color_cache_info.nr_colors);
351
352 BUG_ON(LOCKDEP_MAX_NR_COLORS < color_cache_info.nr_colors);
353 err = init_color_groups();
354 return err;
355}
356
357module_init(init_color);
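For reference, the x86 init_mask() above derives the color mask from the cache geometry: the mask covers the set-index bits that lie above the page offset, and page_color() simply shifts those bits down to a color number. A self-contained worked example is below; the 2 MiB / 16-way / 64-byte-line cache and 4 KiB pages are assumed values, not the detected ones.

/* Worked example of the init_mask()/page_color() arithmetic.
 * The cache geometry here is an assumption for illustration. */
#include <stdio.h>

int main(void)
{
	unsigned long size = 2UL * 1024 * 1024;	/* 2 MiB cache */
	unsigned long ways = 16;
	unsigned long line_size_log = 6;	/* 64 B lines */
	unsigned long page_size = 4096;		/* PAGE_SHIFT = 12 */

	unsigned long sets = size / (1UL << line_size_log) / ways; /* 2048 sets */

	/* Set-index bits above the page offset select the color. */
	unsigned long color_mask =
		((sets << line_size_log) - 1) ^ (page_size - 1);  /* 0x1f000 */
	unsigned long nr_colors = (color_mask >> 12) + 1;	  /* 32 colors */

	unsigned long phys = 0x12345000UL;	/* some physical address */
	unsigned long color = (phys & color_mask) >> 12;	  /* color 5 */

	printf("mask=0x%lx colors=%lu color(0x%lx)=%lu\n",
	       color_mask, nr_colors, phys, color);
	return 0;
}
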
diff --git a/litmus/color_dev.c b/litmus/color_dev.c
new file mode 100644
index 000000000000..51760328418e
--- /dev/null
+++ b/litmus/color_dev.c
@@ -0,0 +1,351 @@
1#include <linux/sched.h>
2#include <linux/mm.h>
3#include <linux/fs.h>
4#include <linux/miscdevice.h>
5#include <linux/spinlock.h>
6#include <linux/module.h>
7#include <linux/highmem.h>
8#include <asm/io.h> /* page_to_phys on SPARC */
9
10#include <litmus/litmus.h>
11#include <litmus/color.h>
12
13#define ALLOC_NAME "litmus/color_alloc"
14#define CTRL_NAME "litmus/color_ctrl"
15
16extern unsigned long nr_colors;
17
18/***********************************************************
19 * Control device
20***********************************************************/
21
22static void litmus_color_ctrl_vm_close(struct vm_area_struct *vma)
23{
24 TRACE_CUR("%s flags=0x%lx prot=0x%lx\n", __FUNCTION__,
25 vma->vm_flags, pgprot_val(vma->vm_page_prot));
26
27 TRACE_CUR(CTRL_NAME ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
28 (void*) vma->vm_start, (void*) vma->vm_end, vma,
29 vma->vm_private_data);
30}
31
32static int litmus_color_ctrl_vm_fault(struct vm_area_struct *vma,
33 struct vm_fault *vmf)
34{
35 /* This function should never be called, since
36 * all pages should have been mapped by mmap()
37 * already. */
38 TRACE_CUR("%s flags=0x%lx\n", __FUNCTION__, vma->vm_flags);
39 printk(KERN_WARNING "fault: %s flags=0x%lx\n", __FUNCTION__,
40 vma->vm_flags);
41
42 /* nope, you only get one page */
43 return VM_FAULT_SIGBUS;
44}
45
46static struct vm_operations_struct litmus_color_ctrl_vm_ops = {
47 .close = litmus_color_ctrl_vm_close,
48 .fault = litmus_color_ctrl_vm_fault,
49};
50
51static int mmap_common_checks(struct vm_area_struct *vma)
52{
53 /* you can only map the "first" page */
54 if (vma->vm_pgoff != 0)
55 return -EINVAL;
56
57#if 0
58 /* you can't share it with anyone */
59 /* well, maybe you can... */
60 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
61 return -EINVAL;
62#endif
63
64 return 0;
65}
66
67static int alloc_color_ctrl_page(void)
68{
69 struct task_struct *t;
70 int err = 0;
71
72 t = current;
73 /* only allocate if the task doesn't have one yet */
74 if (!tsk_rt(t)->color_ctrl_page) {
75 tsk_rt(t)->color_ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
76 if (!tsk_rt(t)->color_ctrl_page)
77 err = -ENOMEM;
78 /* will get de-allocated in task teardown */
79 TRACE_TASK(t, "%s color_ctrl_page = %p\n", __FUNCTION__,
80 tsk_rt(t)->color_ctrl_page);
81 }
82 return err;
83}
84
85static int map_color_ctrl_page(struct vm_area_struct *vma)
86{
87 int err;
88 unsigned long pfn;
89 struct task_struct *t = current;
90 struct page *color_ctrl = virt_to_page(tsk_rt(t)->color_ctrl_page);
91
92 t = current;
93 /* Increase ref count. Is decreased when vma is destroyed. */
94 get_page(color_ctrl);
95 pfn = page_to_pfn(color_ctrl);
96
97 TRACE_CUR(CTRL_NAME
98 ": mapping %p (pfn:%lx, %lx) to 0x%lx (flags:%lx prot:%lx)\n",
99 tsk_rt(t)->color_ctrl_page, pfn, page_to_pfn(color_ctrl),
100 vma->vm_start, vma->vm_flags, pgprot_val(vma->vm_page_prot));
101
102 /* Map it into the vma. Make sure to use PAGE_SHARED, otherwise
103 * userspace actually gets a copy-on-write page. */
104 err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, PAGE_SHARED);
105
106 if (err)
107 TRACE_CUR(CTRL_NAME ": remap_pfn_range() failed (%d)\n", err);
108
109 return err;
110}
111
112static int litmus_color_ctrl_mmap(struct file *filp, struct vm_area_struct *vma)
113{
114 int err = 0;
115
116 /* you can only get one page */
117 if (vma->vm_end - vma->vm_start != PAGE_SIZE) {
118 TRACE_CUR(CTRL_NAME ": must allocate a multiple of PAGE_SIZE\n");
119 err = -EINVAL;
120 goto out;
121 }
122
123 err = mmap_common_checks(vma);
124 if (err) {
125 TRACE_CUR(CTRL_NAME ": failed common mmap checks.\n");
126 goto out;
127 }
128
129 vma->vm_ops = &litmus_color_ctrl_vm_ops;
130 /* this mapping should not be kept across forks,
131 * and cannot be expanded */
132 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
133
134 err = alloc_color_ctrl_page();
135 if (!err)
136 err = map_color_ctrl_page(vma);
137
138 TRACE_CUR("%s flags=0x%lx prot=0x%lx\n", __FUNCTION__, vma->vm_flags,
139 pgprot_val(vma->vm_page_prot));
140out:
141 return err;
142}
143
144
145/***********************************************************
146 * Allocation device
147***********************************************************/
148
149#define vma_nr_pages(vma) \
150 ({unsigned long v = ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); v;})
151
152static int do_map_colored_pages(struct vm_area_struct *vma)
153{
154 const unsigned long nr_pages = vma_nr_pages(vma);
155 struct color_ctrl_page *color_ctrl = tsk_rt(current)->color_ctrl_page;
156 unsigned long nr_mapped;
157 int i, err = 0;
158
159 TRACE_CUR(ALLOC_NAME ": allocating %lu pages (flags:%lx prot:%lx)\n",
160 nr_pages, vma->vm_flags, pgprot_val(vma->vm_page_prot));
161
162#ifdef CONFIG_PLUGIN_COLOR_UNCACHABLE
163 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
164#endif
165
166 for (i = 0, nr_mapped = 0; nr_mapped < nr_pages; ++i) {
167 const unsigned long color_no = color_ctrl->colors[i];
168 unsigned int page_no = 0;
169
170 for (; page_no < color_ctrl->pages[i]; ++page_no, ++nr_mapped) {
171 const unsigned long addr = vma->vm_start +
172 (nr_mapped << PAGE_SHIFT);
173 struct page *page = get_colored_page(color_no);
174#ifdef CONFIG_PLUGIN_COLOR_UNCACHABLE
175 const pgprot_t ins_prot = pgprot_noncached(PAGE_SHARED);
176#else
177 const pgprot_t ins_prot = PAGE_SHARED;
178#endif
179
180 if (!page) {
181 TRACE_CUR(ALLOC_NAME ": Could not get page with"
182 " color %lu.\n", color_no);
183 /* TODO unmap mapped pages */
184 err = -ENOMEM;
185 goto out;
186 }
187
188#ifdef CONFIG_SPARC
189 clear_user_highpage(page, addr);
190#endif
191
192 TRACE_CUR(ALLOC_NAME ": insert page (pa:0x%10llx, "
193 "pfn:%8lu, color:%3lu, prot:%lx) at 0x%lx "
194 "vma:(flags:%16lx prot:%16lx)\n",
195 page_to_phys(page),
196 page_to_pfn(page), color_no,
197 pgprot_val(ins_prot), addr,
198 vma->vm_flags,
199 pgprot_val(vma->vm_page_prot));
200
201 err = remap_pfn_range(vma, addr, page_to_pfn(page),
202 PAGE_SIZE, ins_prot);
203 if (err) {
204 TRACE_CUR(ALLOC_NAME ": remap_pfn_range() fail "
205 "(%d)\n", err);
206 /* TODO unmap mapped pages */
207 err = -EINVAL;
208 goto out;
209 }
210 add_page_to_alloced_list(page, vma);
211 }
212
213 if (!page_no) {
214 TRACE_CUR(ALLOC_NAME ": 0 pages given for color %lu\n",
215 color_no);
216 err = -EINVAL;
217 goto out;
218 }
219 }
220 out:
221 return err;
222}
223
224static int map_colored_pages(struct vm_area_struct *vma)
225{
226 int err = 0;
227
228 if (!tsk_rt(current)->color_ctrl_page) {
229 TRACE_CUR("Process has no color control page.\n");
230 err = -EINVAL;
231 goto out;
232 }
233
234 if (COLORS_PER_CONTROL_PAGE < vma_nr_pages(vma)) {
235 TRACE_CUR("Max page request %lu but want %lu.\n",
236 COLORS_PER_CONTROL_PAGE, vma_nr_pages(vma));
237 err = -EINVAL;
238 goto out;
239 }
240 err = do_map_colored_pages(vma);
241out:
242 return err;
243}
244
245static void litmus_color_alloc_vm_close(struct vm_area_struct *vma)
246{
247 TRACE_CUR("%s flags=0x%lx prot=0x%lx\n", __FUNCTION__,
248 vma->vm_flags, pgprot_val(vma->vm_page_prot));
249
250 TRACE_CUR(ALLOC_NAME ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
251 (void*) vma->vm_start, (void*) vma->vm_end, vma,
252 vma->vm_private_data);
253 reclaim_pages(vma);
254}
255
256static int litmus_color_alloc_vm_fault(struct vm_area_struct *vma,
257 struct vm_fault *vmf)
258{
259 /* This function should never be called, since
260 * all pages should have been mapped by mmap()
261 * already. */
262 TRACE_CUR("%s flags=0x%lx\n", __FUNCTION__, vma->vm_flags);
263 printk(KERN_WARNING "fault: %s flags=0x%lx\n", __FUNCTION__,
264 vma->vm_flags);
265
266 /* nope, you only get one page */
267 return VM_FAULT_SIGBUS;
268}
269
270static struct vm_operations_struct litmus_color_alloc_vm_ops = {
271 .close = litmus_color_alloc_vm_close,
272 .fault = litmus_color_alloc_vm_fault,
273};
274
275static int litmus_color_alloc_mmap(struct file *filp, struct vm_area_struct *vma)
276{
277 int err = 0;
278
279 /* you may only request integer multiple of PAGE_SIZE */
280 if (offset_in_page(vma->vm_end - vma->vm_start)) {
281 err = -EINVAL;
282 goto out;
283 }
284
285 err = mmap_common_checks(vma);
286 if (err)
287 goto out;
288
289 vma->vm_ops = &litmus_color_alloc_vm_ops;
290 /* this mapping should not be kept across forks,
291 * and cannot be expanded */
292 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
293
294 err = map_colored_pages(vma);
295
296 TRACE_CUR("%s flags=0x%lx prot=0x%lx\n", __FUNCTION__, vma->vm_flags,
297 pgprot_val(vma->vm_page_prot));
298out:
299 return err;
300}
301
302/***********************************************************
303 * Initialization
304***********************************************************/
305
306static struct file_operations litmus_color_ctrl_fops = {
307 .owner = THIS_MODULE,
308 .mmap = litmus_color_ctrl_mmap,
309};
310
311static struct miscdevice litmus_color_ctrl_dev = {
312 .name = CTRL_NAME,
313 .minor = MISC_DYNAMIC_MINOR,
314 .fops = &litmus_color_ctrl_fops,
315};
316
317static struct file_operations litmus_color_alloc_fops = {
318 .owner = THIS_MODULE,
319 .mmap = litmus_color_alloc_mmap,
320};
321
322static struct miscdevice litmus_color_alloc_dev = {
323 .name = ALLOC_NAME,
324 .minor = MISC_DYNAMIC_MINOR,
325 .fops = &litmus_color_alloc_fops,
326};
327
328static int __init init_dev(const char* name, struct miscdevice *dev)
329{
330 int err;
331 err = misc_register(dev);
332 if (err)
333 printk(KERN_WARNING "Could not allocate %s device (%d).\n",
334 name, err);
335 return err;
336}
337
338static int __init init_color_devices(void)
339{
340 int err;
341
342 printk("Allocating LITMUS^RT color devices.\n");
343 err = init_dev(ALLOC_NAME, &litmus_color_alloc_dev);
344 if (err)
345 goto out;
346 err = init_dev(CTRL_NAME, &litmus_color_ctrl_dev);
347out:
348 return err;
349}
350
351module_init(init_color_devices);
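Taken together, the two devices above define a small mmap()-only protocol: map litmus/color_ctrl once to obtain the task's color_ctrl_page, fill in the per-entry color numbers and page counts, then map litmus/color_alloc with a length equal to the total number of requested pages; the pages go back to their color lists when the allocation VMA is closed. The user-space sketch below illustrates that sequence; the /dev paths and the color_ctrl_page layout (colors[] and pages[] arrays) are assumptions inferred from do_map_colored_pages(), with the authoritative definition in include/litmus/color.h.

/* Sketch: user-space driver of the color_ctrl/color_alloc devices.
 * Device paths and struct layout are assumptions for illustration. */
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>

#define COLORS_PER_CONTROL_PAGE 128	/* assumed bound */

struct color_ctrl_page {		/* assumed layout */
	uint64_t colors[COLORS_PER_CONTROL_PAGE];
	uint64_t pages[COLORS_PER_CONTROL_PAGE];
};

int main(void)
{
	int ctrl_fd = open("/dev/litmus/color_ctrl", O_RDWR);
	int alloc_fd = open("/dev/litmus/color_alloc", O_RDWR);
	struct color_ctrl_page *ctrl;
	void *buf;

	if (ctrl_fd < 0 || alloc_fd < 0)
		return 1;

	/* Exactly one shared control page per task. */
	ctrl = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_SHARED, ctrl_fd, 0);
	if (ctrl == MAP_FAILED)
		return 1;

	/* Request 4 pages of color 0 and 4 pages of color 1. */
	ctrl->colors[0] = 0;  ctrl->pages[0] = 4;
	ctrl->colors[1] = 1;  ctrl->pages[1] = 4;

	/* The allocation mapping must cover the 8 requested pages. */
	buf = mmap(NULL, 8 * 4096, PROT_READ | PROT_WRITE,
		   MAP_SHARED, alloc_fd, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* buf is now backed by colored pages; reclaimed on munmap(). */
	munmap(buf, 8 * 4096);
	return 0;
}
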
diff --git a/litmus/color_proc.c b/litmus/color_proc.c
new file mode 100644
index 000000000000..d770123c5f02
--- /dev/null
+++ b/litmus/color_proc.c
@@ -0,0 +1,220 @@
1#include <linux/module.h>
2#include <linux/sysctl.h>
3#include <linux/slab.h>
4
5#include <litmus/sched_trace.h>
6#include <litmus/color.h>
7
8extern int color_sysctl_add_pages_data; /* litmus/color.c */
9
10static int zero = 0;
11static int one = 1;
12/* used as names for server proc entries */
13static char *period_str = "period";
14static char *wcet_str = "wcet";
15
16/* servers have a WCET and period */
17#define NR_SERVER_PARAMS 2
18#define CPU_NAME_LEN 3
19struct color_cpu_server {
20 char name[CPU_NAME_LEN];
21 unsigned long wcet;
22 unsigned long period;
23 /* the + 1 is for the sentinel element */
24 struct ctl_table table[NR_SERVER_PARAMS + 1];
25};
26static struct color_cpu_server color_cpu_servers[NR_CPUS];
27
28/* the + 1 is for the sentinel element */
29static struct ctl_table color_cpu_tables[NR_CPUS + 1];
30
31unsigned long color_chunk;
32
33#define INFO_BUFFER_SIZE 100
34static char info_buffer[100];
35
36#define NR_PAGES_INDEX 0 /* location of nr_pages in the table below */
37static struct ctl_table color_table[] =
38{
39 {
40 /* you MUST update NR_PAGES_INDEX if you move this entry */
41 .procname = "nr_pages",
42 .mode = 0444,
43 .proc_handler = color_nr_pages_handler,
44 .data = NULL, /* dynamically set later */
45 .maxlen = 0, /* also set later */
46 },
47 {
48 .procname = "servers",
49 .mode = 0555,
50 .child = color_cpu_tables,
51 },
52 {
53 .procname = "add_pages",
54 .data = &color_sysctl_add_pages_data,
55 .maxlen = sizeof(int),
56 .mode = 0644,
57 .proc_handler = color_add_pages_handler,
58 .extra1 = &zero,
59 .extra2 = &one,
60 },
61 {
62 .procname = "cache_info",
63 .mode = 0444,
64 .proc_handler = proc_dostring,
65 .data = info_buffer,
66 .maxlen = INFO_BUFFER_SIZE,
67 },
68 {
69 .procname = "chunk_size",
70 .mode = 0666,
71 .proc_handler = proc_doulongvec_minmax,
72 .data = &color_chunk,
73 .maxlen = sizeof(color_chunk),
74 },
75 { }
76};
77
78static struct ctl_table litmus_table[] =
79{
80 {
81 .procname = "color",
82 .mode = 0555,
83 .child = color_table,
84 },
85 { }
86};
87
88static struct ctl_table litmus_dir_table[] = {
89 {
90 .procname = "litmus",
91 .mode = 0555,
92 .child = litmus_table,
93 },
94 { }
95};
96
97int color_server_params(int cpu, unsigned long *wcet, unsigned long *period)
98{
99 struct color_cpu_server *svr;
100
101 if (cpu >= num_online_cpus()) {
102 printk(KERN_WARNING "Cannot access illegal CPU: %d\n", cpu);
103 return -EFAULT;
104 }
105
106 svr = &color_cpu_servers[cpu];
107 if (svr->wcet == 0 || svr->period == 0) {
108 printk(KERN_WARNING "Server %d is uninitialized!\n", cpu);
109 return -EPERM;
110 }
111
112 *wcet = svr->wcet;
113 *period = svr->period;
114
115 TRACE("For %d: %lu, %lu\n", cpu, svr->wcet, svr->period);
116
117 return 0;
118}
119
120/* must be called AFTER nr_colors is set */
121static int __init init_sysctl_nr_colors(void)
122{
123 int ret = 0, maxlen = ONE_COLOR_LEN * color_cache_info.nr_colors;
124 color_table[NR_PAGES_INDEX].data = kmalloc(maxlen, GFP_KERNEL);
125 if (!color_table[NR_PAGES_INDEX].data) {
126 printk(KERN_WARNING "Could not allocate nr_pages buffer.\n");
127 ret = -ENOMEM;
128 goto out;
129 }
130 color_table[NR_PAGES_INDEX].maxlen = maxlen;
131out:
132 return ret;
133}
134
135static void __init init_server_entry(struct ctl_table *entry,
136 unsigned long *parameter,
137 char *name)
138{
139 entry->procname = name;
140 entry->mode = 0666;
141 entry->proc_handler = proc_doulongvec_minmax;
142 entry->data = parameter;
143 entry->maxlen = sizeof(*parameter);
144}
145
146static int __init init_cpu_entry(struct ctl_table *cpu_table,
147 struct color_cpu_server *svr, int cpu)
148{
149 struct ctl_table *entry = svr->table;
150
151 printk(KERN_INFO "Creating cpu %d\n", cpu);
152
153 init_server_entry(entry, &svr->wcet, wcet_str);
154 entry++;
155 init_server_entry(entry, &svr->period, period_str);
156
157 /* minus one for the null byte */
158 snprintf(svr->name, CPU_NAME_LEN - 1, "%d", cpu);
159 cpu_table->procname = svr->name;
160 cpu_table->mode = 0555;
161 cpu_table->child = svr->table;
162
163 return 0;
164}
165
166static int __init init_server_entries(void)
167{
168 int cpu, err = 0;
169 struct ctl_table *cpu_table;
170 struct color_cpu_server *svr;
171
172 for_each_online_cpu(cpu) {
173 cpu_table = &color_cpu_tables[cpu];
174 svr = &color_cpu_servers[cpu];
175 err = init_cpu_entry(cpu_table, svr, cpu);
176 if (err)
177 goto out;
178 }
179out:
180 return err;
181}
182
183
184static struct ctl_table_header *litmus_sysctls;
185
186static int __init litmus_sysctl_init(void)
187{
188 int ret = 0;
189
190 printk(KERN_INFO "Registering LITMUS^RT proc sysctl.\n");
191 litmus_sysctls = register_sysctl_table(litmus_dir_table);
192 if (!litmus_sysctls) {
193 printk(KERN_WARNING "Could not register LITMUS^RT sysctl.\n");
194 ret = -EFAULT;
195 goto out;
196 }
197 ret = init_sysctl_nr_colors();
198 if (ret)
199 goto out;
200
201 ret = init_server_entries();
202 if (ret)
203 goto out;
204
205 snprintf(info_buffer, INFO_BUFFER_SIZE,
206 "Cache size\t: %lu B\n"
207 "Line size\t: %lu B\n"
208 "Page size\t: %lu B\n"
209 "Ways\t\t: %lu\n"
210 "Sets\t\t: %lu\n"
211 "Colors\t\t: %lu",
212 color_cache_info.size, color_cache_info.line_size, PAGE_SIZE,
213 color_cache_info.ways, color_cache_info.sets,
214 color_cache_info.nr_colors);
215
216out:
217 return ret;
218}
219
220module_init(litmus_sysctl_init);
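The sysctl registration above exposes /proc/sys/litmus/color/ with nr_pages, add_pages, cache_info, chunk_size, and per-CPU servers/<cpu>/wcet and servers/<cpu>/period entries. A small sketch of how a setup tool could drive them follows; the paths are derived from the ctl_table names and the server parameters are arbitrary example values (presumably nanoseconds, matching lt_t elsewhere in LITMUS^RT).

/* Sketch: configuring the color plugin through its sysctl files.
 * Paths follow the ctl_table names above; the wcet/period values
 * are arbitrary examples. */
#include <stdio.h>

static int write_ulong(const char *path, unsigned long val)
{
	FILE *f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lu\n", val);
	return fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	/* Top every color group back up to PAGES_PER_COLOR free pages. */
	write_ulong("/proc/sys/litmus/color/add_pages", 1);

	/* Give CPU 0's color server a 2 ms budget every 10 ms. */
	write_ulong("/proc/sys/litmus/color/servers/0/wcet", 2000000);
	write_ulong("/proc/sys/litmus/color/servers/0/period", 10000000);

	/* Dump the per-color free-page counts. */
	f = fopen("/proc/sys/litmus/color/nr_pages", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}
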
diff --git a/litmus/dgl.c b/litmus/dgl.c
new file mode 100644
index 000000000000..dd2a42cc9ca6
--- /dev/null
+++ b/litmus/dgl.c
@@ -0,0 +1,300 @@
1#include <linux/sched.h>
2#include <linux/slab.h>
3
4#include <litmus/litmus.h>
5#include <litmus/dgl.h>
6#include <litmus/sched_trace.h>
7
8#define MASK_SIZE (sizeof(unsigned long) * 8)
9
10/* Return number of MASK_SIZE fields needed to store a mask in d */
11#define WP(num, word) (num / word + (num % word != 0))
12#define MASK_WORDS(d) WP(d->num_resources, MASK_SIZE)
13
14/* Word, bit -> resource id */
15#define ri(w, b) (w * MASK_SIZE + b)
16
17 /* For loop, where @i iterates over each set bit in @bit_arr */
18#define for_each_resource(bit_arr, d, w, b, i) \
19 for(w = 0; w < MASK_WORDS(d); ++w) \
20 for(b = find_first_bit(&bit_arr[w],MASK_SIZE), i = ri(w, b); \
21 b < MASK_SIZE; \
22 b = find_next_bit(&bit_arr[w],MASK_SIZE,b+1), i = ri(w, b))
23
24/* Return resource id in dgl @d for resource @r */
25#define resource_id(d, r) ((((void*)r) - (void*)((d)->resources))/ sizeof(*r))
26
27/* Return request group of req @r for resource @i */
28#define req_group(r, i) (container_of(((void*)r) - sizeof(*r)*(i), \
29 struct dgl_group_req, requests))
30
31/* Resource id -> word, bit */
32static inline void mask_idx(int resource, int *word, int *bit)
33{
34 *word = resource / MASK_SIZE;
35 *bit = resource % MASK_SIZE;
36}
37
38
39static void print_waiting(struct dgl *dgl, struct dgl_resource *resource)
40{
41 struct dgl_req *pos;
42 struct dgl_group_req *greq;
43 unsigned long long last = 0;
44
45 TRACE("List for rid %d\n", resource_id(dgl, resource));
46 list_for_each_entry(pos, &resource->waiting, list) {
47 greq = pos->greq;
48 TRACE(" 0x%p with timestamp %llu\n", greq, greq->ts);
49 BUG_ON(greq->ts < last);
50 last = greq->ts;
51 }
52}
53
54void dgl_init(struct dgl *dgl, unsigned long num_resources,
55 unsigned long num_replicas)
56{
57 int i;
58 struct dgl_resource *resource;
59
60 dgl->num_replicas = num_replicas;
61 dgl->num_resources = num_resources;
62
63 dgl->resources = kmalloc(sizeof(*dgl->resources) * num_resources,
64 GFP_ATOMIC);
65 dgl->acquired = kmalloc(sizeof(*dgl->acquired) * num_online_cpus(),
66 GFP_ATOMIC);
67
68 for (i = 0; i < num_online_cpus(); ++i)
69 dgl->acquired[i] = NULL;
70
71 for (i = 0; i < num_resources; i++) {
72 resource = &dgl->resources[i];
73
74 INIT_LIST_HEAD(&resource->waiting);
75 resource->free_replicas = dgl->num_replicas;
76 }
77
78 dgl->requests = 0;
79 dgl->running = 0;
80 dgl->ts = 0;
81}
82
83void dgl_free(struct dgl *dgl)
84{
85 kfree(dgl->resources);
86 kfree(dgl->acquired);
87}
88
89void dgl_group_req_init(struct dgl *dgl, struct dgl_group_req *greq)
90{
91 int i;
92
93 greq->requested = kmalloc(sizeof(*greq->requested) * MASK_WORDS(dgl),
94 GFP_ATOMIC);
95 greq->waiting = kmalloc(sizeof(*greq->waiting) * MASK_WORDS(dgl),
96 GFP_ATOMIC);
97 greq->requests = kmalloc(sizeof(*greq->requests) * dgl->num_resources,
98 GFP_ATOMIC);
99
100 BUG_ON(!greq->requested);
101 BUG_ON(!greq->waiting);
102 BUG_ON(!greq->requests);
103
104 greq->cpu = NO_CPU;
105 for (i = 0; i < MASK_WORDS(dgl); ++i) {
106 greq->requested[i] = 0;
107 greq->waiting[i] = 0;
108 }
109}
110
111void dgl_group_req_free(struct dgl_group_req *greq)
112{
113 kfree(greq->requested);
114 kfree(greq->waiting);
115 kfree(greq->requests);
116}
117
118/**
119 * set_req - create request for @replicas of @resource.
120 */
121void set_req(struct dgl *dgl, struct dgl_group_req *greq,
122 int resource, int replicas)
123{
124 int word, bit;
125 struct dgl_req *req;
126
127 if (replicas > dgl->num_replicas)
128 replicas = dgl->num_replicas;
129
130 mask_idx(resource, &word, &bit);
131 __set_bit(bit, &greq->requested[word]);
132
133 TRACE("0x%p requesting %d of %d\n", greq, replicas, resource);
134
135 req = &greq->requests[resource];
136 req->greq = greq;
137 INIT_LIST_HEAD(&req->list);
138 req->replicas = replicas;
139}
140
141/*
142 * Attempt to fulfill request @req for @resource.
143 * Return 1 if successful. If the matching group request has acquired all of
144 * its needed resources, this will then set that req as dgl->acquired[cpu].
145 */
146static unsigned long try_acquire(struct dgl *dgl, struct dgl_resource *resource,
147 struct dgl_req *req)
148{
149 int word, bit, rid, head, empty, room;
150 unsigned long waiting;
151 struct dgl_group_req *greq;
152
153 rid = resource_id(dgl, resource);
154 greq = req->greq;
155
156 TRACE("0x%p greq\n", greq);
157
158 head = resource->waiting.next == &req->list;
159 empty = list_empty(&resource->waiting);
160 room = resource->free_replicas >= req->replicas;
161
162 if (! (room && (head || empty)) ) {
163 TRACE("0x%p cannot acquire %d replicas, %d free\n",
164 greq, req->replicas, resource->free_replicas,
165 room, head, empty);
166 return 0;
167 }
168
169 resource->free_replicas -= req->replicas;
170
171 TRACE("0x%p acquired %d replicas of rid %d\n",
172 greq, req->replicas, rid);
173
174 mask_idx(rid, &word, &bit);
175
176
177 TRACE("0x%p, %lu, 0x%p\n", greq->waiting, greq->waiting[word],
178 &greq->waiting[word]);
179
180 clear_bit(bit, &greq->waiting[word]);
181
182 waiting = 0;
183 for (word = 0; word < MASK_WORDS(dgl); word++) {
184 waiting |= greq->waiting[word];
185 if (waiting)
186 break;
187 }
188
189 if (!waiting) {
190 TRACE("0x%p acquired all resources\n", greq);
191 BUG_ON(dgl->acquired[greq->cpu]);
192 dgl->acquired[greq->cpu] = greq;
193 litmus_reschedule(greq->cpu);
194 dgl->running++;
195 }
196
197 return 1;
198}
199
200/**
201 * add_group_req - initiate group request.
202 */
203void add_group_req(struct dgl *dgl, struct dgl_group_req *greq, int cpu)
204{
205 int b, w, i, succ, all_succ = 1;
206 struct dgl_req *req;
207 struct dgl_resource *resource;
208
209 greq->cpu = cpu;
210 greq->ts = dgl->ts++;
211
212 TRACE("0x%p group request added for CPU %d\n", greq, cpu);
213 BUG_ON(dgl->acquired[cpu] == greq);
214
215 ++dgl->requests;
216
217 for_each_resource(greq->requested, dgl, w, b, i) {
218 __set_bit(b, &greq->waiting[w]);
219 }
220
221 for_each_resource(greq->requested, dgl, w, b, i) {
222 req = &greq->requests[i];
223 resource = &dgl->resources[i];
224
225 succ = try_acquire(dgl, resource, req);
226 all_succ &= succ;
227
228 if (!succ) {
229 TRACE("0x%p waiting on rid %d\n", greq, i);
230 list_add_tail(&req->list, &resource->waiting);
231 }
232 }
233
234 /* Grant empty requests */
235 if (all_succ && !dgl->acquired[cpu]) {
236 TRACE("0x%p empty group request acquired cpu %d\n", greq, cpu);
237 dgl->acquired[cpu] = greq;
238 ++dgl->running;
239 }
240
241 BUG_ON(dgl->requests && !dgl->running);
242}
243
244/**
245 * remove_group_req - abandon group request.
246 *
247 * This will also progress the waiting queues of resources acquired by @greq.
248 */
249void remove_group_req(struct dgl *dgl, struct dgl_group_req *greq)
250{
251 int b, w, i;
252 struct dgl_req *req, *next;
253 struct dgl_resource *resource;
254
255 TRACE("0x%p removing group request for CPU %d\n", greq, greq->cpu);
256
257 --dgl->requests;
258
259 if (dgl->acquired[greq->cpu] == greq) {
260 TRACE("0x%p no longer acquired on CPU %d\n", greq, greq->cpu);
261 dgl->acquired[greq->cpu] = NULL;
262 --dgl->running;
263 }
264
265 for_each_resource(greq->requested, dgl, w, b, i) {
266 req = &greq->requests[i];
267 resource = &dgl->resources[i];
268
269 if (!list_empty(&req->list)) {
270 /* Waiting on resource */
271 clear_bit(b, &greq->waiting[w]);
272 list_del_init(&req->list);
273 TRACE("Quitting 0x%p from rid %d\n",
274 req, i);
275 } else {
276 /* Have resource */
277 resource->free_replicas += req->replicas;
278 BUG_ON(resource->free_replicas > dgl->num_replicas);
279 TRACE("0x%p releasing %d of %d replicas, rid %d\n",
280 greq, req->replicas, resource->free_replicas, i);
281
282 if (!list_empty(&resource->waiting)) {
283 /* Give it to the next guy */
284 next = list_first_entry(&resource->waiting,
285 struct dgl_req,
286 list);
287
288 BUG_ON(next->greq->ts < greq->ts);
289
290 if (try_acquire(dgl, resource, next)) {
291 list_del_init(&next->list);
292 print_waiting(dgl, resource);
293
294 }
295 }
296 }
297 }
298
299 BUG_ON(dgl->requests && !dgl->running);
300}
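dgl.c implements a dynamic group lock: a group request names a set of resources via set_req(), is enqueued atomically with add_group_req(), and only appears in dgl->acquired[cpu] once every named resource has granted its replicas; remove_group_req() releases everything and hands freed replicas to the next waiter in timestamp order. A condensed sketch of that call sequence is below; the locking that must serialize these calls (the code allocates with GFP_ATOMIC and calls litmus_reschedule(), so it clearly expects to run under a scheduler spinlock) is assumed and omitted.

/* Sketch: hypothetical caller of the DGL API defined above.
 * Serialization of these calls (e.g. under the plugin's ready-queue
 * lock) is assumed and not shown. */
#include <litmus/dgl.h>

static struct dgl group_lock;
static struct dgl_group_req my_req;

static void dgl_example(int cpu)
{
	/* 4 resources, each with 2 replicas. */
	dgl_init(&group_lock, 4, 2);
	dgl_group_req_init(&group_lock, &my_req);

	/* This request needs 1 replica of resource 0 and 2 of resource 3. */
	set_req(&group_lock, &my_req, 0, 1);
	set_req(&group_lock, &my_req, 3, 2);

	/* Enqueue; if all replicas are free, the request is granted at once. */
	add_group_req(&group_lock, &my_req, cpu);

	if (group_lock.acquired[cpu] == &my_req) {
		/* ... critical section using resources 0 and 3 ... */
	}
	/* Otherwise the CPU is rescheduled via litmus_reschedule()
	 * once the last outstanding resource is granted. */

	/* Release (or abandon) the request and wake the next waiters. */
	remove_group_req(&group_lock, &my_req);
	dgl_group_req_free(&my_req);
	dgl_free(&group_lock);
}
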
diff --git a/litmus/fifo_common.c b/litmus/fifo_common.c
new file mode 100644
index 000000000000..84ae98e42ae4
--- /dev/null
+++ b/litmus/fifo_common.c
@@ -0,0 +1,58 @@
1/*
2 * litmus/fifo_common.c
3 *
4 * Common functions for FIFO based scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15#include <litmus/fifo_common.h>
16
17int fifo_higher_prio(struct task_struct* first,
18 struct task_struct* second)
19{
20 /* There is no point in comparing a task to itself. */
21 if (first && first == second) {
22 TRACE_TASK(first,
23 "WARNING: pointless fifo priority comparison.\n");
24 BUG_ON(1);
25 return 0;
26 }
27
28 if (!first || !second)
29 return first && !second;
30
31 /* Tiebreak by PID */
32 return (get_release(first) == get_release(second) &&
33 first->pid > second->pid) ||
34 (get_release(first) < get_release(second));
35
36
37}
38
39int fifo_ready_order(struct bheap_node* a, struct bheap_node* b)
40{
41 return fifo_higher_prio(bheap2task(a), bheap2task(b));
42}
43
44void fifo_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
45 release_jobs_t release)
46{
47 rt_domain_init(rt, fifo_ready_order, resched, release);
48}
49
50int fifo_preemption_needed(rt_domain_t* rt, struct task_struct *t)
51{
52 if (!__jobs_pending(rt))
53 return 0;
54 if (!t)
55 return 1;
56
57 return !is_realtime(t) || fifo_higher_prio(__next_ready(rt), t);
58}
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
index e282f8a9c067..999290fc8302 100644
--- a/litmus/ftdev.c
+++ b/litmus/ftdev.c
@@ -114,6 +114,7 @@ static int ftdev_open(struct inode *in, struct file *filp)
114 goto out; 114 goto out;
115 115
116 ftdm = ftdev->minor + buf_idx; 116 ftdm = ftdev->minor + buf_idx;
117 ftdm->ftdev = ftdev;
117 filp->private_data = ftdm; 118 filp->private_data = ftdm;
118 119
119 if (mutex_lock_interruptible(&ftdm->lock)) { 120 if (mutex_lock_interruptible(&ftdm->lock)) {
@@ -291,6 +292,19 @@ out:
291 return err; 292 return err;
292} 293}
293 294
295static ssize_t ftdev_write(struct file *filp, const char __user *from,
296 size_t len, loff_t *f_pos)
297{
298 struct ftdev_minor* ftdm = filp->private_data;
299 ssize_t err = -EINVAL;
300 struct ftdev* ftdev = ftdm->ftdev;
301
302 /* dispatch write to buffer-specific code, if available */
303 if (ftdev->write)
304 err = ftdev->write(ftdm->buf, len, from);
305
306 return err;
307}
294 308
295struct file_operations ftdev_fops = { 309struct file_operations ftdev_fops = {
296 .owner = THIS_MODULE, 310 .owner = THIS_MODULE,
@@ -315,6 +329,7 @@ int ftdev_init( struct ftdev* ftdev, struct module* owner,
315 ftdev->alloc = NULL; 329 ftdev->alloc = NULL;
316 ftdev->free = NULL; 330 ftdev->free = NULL;
317 ftdev->can_open = NULL; 331 ftdev->can_open = NULL;
332 ftdev->write = NULL;
318 333
319 ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor), 334 ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor),
320 GFP_KERNEL); 335 GFP_KERNEL);
diff --git a/litmus/jobs.c b/litmus/jobs.c
index 10a42db1165e..7263cabf8c6c 100644
--- a/litmus/jobs.c
+++ b/litmus/jobs.c
@@ -9,15 +9,21 @@
9void prepare_for_next_period(struct task_struct *t) 9void prepare_for_next_period(struct task_struct *t)
10{ 10{
11 BUG_ON(!t); 11 BUG_ON(!t);
12#ifdef CONFIG_PLUGIN_COLOR
13 tsk_rt(t)->tot_exec_time += tsk_rt(t)->job_params.exec_time;
14#endif
12 /* prepare next release */ 15 /* prepare next release */
13
14 t->rt_param.job_params.release = t->rt_param.job_params.deadline; 16 t->rt_param.job_params.release = t->rt_param.job_params.deadline;
15 t->rt_param.job_params.real_release = t->rt_param.job_params.release; 17 t->rt_param.job_params.real_release = t->rt_param.job_params.release;
16 t->rt_param.job_params.deadline += get_rt_period(t); 18 t->rt_param.job_params.deadline += get_rt_period(t);
17 t->rt_param.job_params.real_deadline = t->rt_param.job_params.deadline; 19 t->rt_param.job_params.real_deadline = t->rt_param.job_params.deadline;
18 t->rt_param.job_params.exec_time = 0; 20 t->rt_param.job_params.exec_time = 0;
21 tsk_rt(t)->job_params.release = tsk_rt(t)->job_params.deadline;
22 tsk_rt(t)->job_params.deadline += get_rt_period(t);
23 tsk_rt(t)->job_params.exec_time = 0;
24
19 /* update job sequence number */ 25 /* update job sequence number */
20 t->rt_param.job_params.job_no++; 26 tsk_rt(t)->job_params.job_no++;
21 27
22 /* don't confuse Linux */ 28 /* don't confuse Linux */
23 t->rt.time_slice = 1; 29 t->rt.time_slice = 1;
@@ -25,7 +31,7 @@ void prepare_for_next_period(struct task_struct *t)
25 31
26void release_at(struct task_struct *t, lt_t start) 32void release_at(struct task_struct *t, lt_t start)
27{ 33{
28 t->rt_param.job_params.deadline = start; 34 tsk_rt(t)->job_params.deadline = start;
29 prepare_for_next_period(t); 35 prepare_for_next_period(t);
30 set_rt_flags(t, RT_F_RUNNING); 36 set_rt_flags(t, RT_F_RUNNING);
31} 37}
diff --git a/litmus/litmus.c b/litmus/litmus.c
index f4d676c17d5f..b76e1496d7f4 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -24,6 +24,8 @@
24#include <litmus/sched_mc.h> 24#include <litmus/sched_mc.h>
25#else 25#else
26struct mc_task; 26struct mc_task;
27#ifdef CONFIG_SCHED_CPU_AFFINITY
28#include <litmus/affinity.h>
27#endif 29#endif
28 30
29/* Number of RT tasks that exist in the system */ 31/* Number of RT tasks that exist in the system */
@@ -127,6 +129,14 @@ asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
127 "because wcet > period\n", pid); 129 "because wcet > period\n", pid);
128 goto out_unlock; 130 goto out_unlock;
129 } 131 }
132 if ( tp.cls != RT_CLASS_HARD &&
133 tp.cls != RT_CLASS_SOFT &&
134 tp.cls != RT_CLASS_BEST_EFFORT)
135 {
136 printk(KERN_INFO "litmus: real-time task %d rejected "
137 "because its class is invalid\n", pid);
138 goto out_unlock;
139 }
130 if (tp.budget_policy != NO_ENFORCEMENT && 140 if (tp.budget_policy != NO_ENFORCEMENT &&
131 tp.budget_policy != QUANTUM_ENFORCEMENT && 141 tp.budget_policy != QUANTUM_ENFORCEMENT &&
132 tp.budget_policy != PRECISE_ENFORCEMENT) 142 tp.budget_policy != PRECISE_ENFORCEMENT)
@@ -369,12 +379,14 @@ static void reinit_litmus_state(struct task_struct* p, int restore)
369{ 379{
370 struct rt_task user_config = {}; 380 struct rt_task user_config = {};
371 void* ctrl_page = NULL; 381 void* ctrl_page = NULL;
382 void* color_ctrl_page = NULL;
372 383
373 if (restore) { 384 if (restore) {
374 /* Safe user-space provided configuration data. 385 /* Safe user-space provided configuration data.
375 * and allocated page. */ 386 * and allocated page. */
376 user_config = p->rt_param.task_params; 387 user_config = p->rt_param.task_params;
377 ctrl_page = p->rt_param.ctrl_page; 388 ctrl_page = p->rt_param.ctrl_page;
389 color_ctrl_page = p->rt_param.color_ctrl_page;
378 } 390 }
379 391
380 /* We probably should not be inheriting any task's priority 392 /* We probably should not be inheriting any task's priority
@@ -387,8 +399,9 @@ static void reinit_litmus_state(struct task_struct* p, int restore)
387 399
388 /* Restore preserved fields. */ 400 /* Restore preserved fields. */
389 if (restore) { 401 if (restore) {
390 p->rt_param.task_params = user_config; 402 p->rt_param.task_params = user_config;
391 p->rt_param.ctrl_page = ctrl_page; 403 p->rt_param.ctrl_page = ctrl_page;
404 p->rt_param.color_ctrl_page = color_ctrl_page;
392 } 405 }
393} 406}
394 407
@@ -529,9 +542,11 @@ void litmus_fork(struct task_struct* p)
529 reinit_litmus_state(p, 0); 542 reinit_litmus_state(p, 0);
530 /* Don't let the child be a real-time task. */ 543 /* Don't let the child be a real-time task. */
531 p->sched_reset_on_fork = 1; 544 p->sched_reset_on_fork = 1;
532 } else 545 } else {
533 /* non-rt tasks might have ctrl_page set */ 546 /* non-rt tasks might have ctrl_page set */
534 tsk_rt(p)->ctrl_page = NULL; 547 tsk_rt(p)->ctrl_page = NULL;
548 tsk_rt(p)->color_ctrl_page = NULL;
549 }
535 550
536 /* od tables are never inherited across a fork */ 551 /* od tables are never inherited across a fork */
537 p->od_table = NULL; 552 p->od_table = NULL;
@@ -551,6 +566,10 @@ void litmus_exec(void)
551 free_page((unsigned long) tsk_rt(p)->ctrl_page); 566 free_page((unsigned long) tsk_rt(p)->ctrl_page);
552 tsk_rt(p)->ctrl_page = NULL; 567 tsk_rt(p)->ctrl_page = NULL;
553 } 568 }
569 if (tsk_rt(p)->color_ctrl_page) {
570 free_page((unsigned long) tsk_rt(p)->color_ctrl_page);
571 tsk_rt(p)->color_ctrl_page = NULL;
572 }
554 } 573 }
555} 574}
556 575
@@ -568,6 +587,12 @@ void exit_litmus(struct task_struct *dead_tsk)
568 tsk_rt(dead_tsk)->ctrl_page); 587 tsk_rt(dead_tsk)->ctrl_page);
569 free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page); 588 free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
570 } 589 }
590 if (tsk_rt(dead_tsk)->color_ctrl_page) {
591 TRACE_TASK(dead_tsk,
592 "freeing color_ctrl_page %p\n",
593 tsk_rt(dead_tsk)->color_ctrl_page);
594 free_page((unsigned long) tsk_rt(dead_tsk)->color_ctrl_page);
595 }
571 596
572#ifdef CONFIG_PLUGIN_MC 597#ifdef CONFIG_PLUGIN_MC
573 /* The MC-setup syscall might succeed and allocate mc_data, but the 598 /* The MC-setup syscall might succeed and allocate mc_data, but the
@@ -616,6 +641,8 @@ static int __init _init_litmus(void)
616 */ 641 */
617 printk("Starting LITMUS^RT kernel\n"); 642 printk("Starting LITMUS^RT kernel\n");
618 643
644 BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t));
645
619 register_sched_plugin(&linux_sched_plugin); 646 register_sched_plugin(&linux_sched_plugin);
620 647
621 bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC); 648 bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC);
@@ -637,6 +664,10 @@ static int __init _init_litmus(void)
637 664
638 init_litmus_proc(); 665 init_litmus_proc();
639 666
667#ifdef CONFIG_SCHED_CPU_AFFINITY
668 init_topology();
669#endif
670
640 return 0; 671 return 0;
641} 672}
642 673
diff --git a/litmus/locking.c b/litmus/locking.c
index 91aa0f9724b0..e051a288aba0 100644
--- a/litmus/locking.c
+++ b/litmus/locking.c
@@ -92,8 +92,6 @@ asmlinkage long sys_litmus_lock(int lock_od)
92 * this into account when computing overheads. */ 92 * this into account when computing overheads. */
93 TS_LOCK_END; 93 TS_LOCK_END;
94 94
95 TS_SYSCALL_OUT_START;
96
97 return err; 95 return err;
98} 96}
99 97
@@ -125,16 +123,18 @@ asmlinkage long sys_litmus_unlock(int lock_od)
125 return err; 123 return err;
126} 124}
127 125
128struct task_struct* waitqueue_first(wait_queue_head_t *wq) 126struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
129{ 127{
130 wait_queue_t *q; 128 wait_queue_t* q;
129 struct task_struct* t = NULL;
131 130
132 if (waitqueue_active(wq)) { 131 if (waitqueue_active(wq)) {
133 q = list_entry(wq->task_list.next, 132 q = list_entry(wq->task_list.next,
134 wait_queue_t, task_list); 133 wait_queue_t, task_list);
135 return (struct task_struct*) q->private; 134 t = (struct task_struct*) q->private;
136 } else 135 __remove_wait_queue(wq, q);
137 return NULL; 136 }
137 return(t);
138} 138}
139 139
140 140
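
The hunk above replaces the peek-only waitqueue_first() with __waitqueue_remove_first(), which returns the head waiter and unlinks it while the caller still holds the wait-queue lock; the gsn_edf FMLP hunk further below drops its remove_wait_queue() call accordingly. The stand-alone C sketch below models that pattern with a plain linked list; the types and names are illustrative stand-ins, not the kernel wait-queue API.

/* Stand-alone model of the "remove the head waiter under the lock"
 * pattern; `waiter` and `queue` are illustrative stand-ins, not the
 * kernel wait-queue API. */
#include <stdio.h>

struct waiter {
    int pid;
    struct waiter *next;
};

struct queue {
    struct waiter *head;
};

/* Dequeue and return the first waiter, or NULL if the queue is empty.
 * The caller is assumed to hold whatever lock protects the queue. */
static struct waiter *queue_remove_first(struct queue *q)
{
    struct waiter *w = q->head;
    if (w)
        q->head = w->next;
    return w;
}

int main(void)
{
    struct waiter a = { .pid = 101 }, b = { .pid = 102 };
    struct queue q = { .head = &a };
    a.next = &b;
    b.next = NULL;

    /* The unlock path hands the resource to the first waiter and
     * removes it in one step, so the waiter never touches the queue. */
    struct waiter *next = queue_remove_first(&q);
    printf("woke pid %d, head is now pid %d\n",
           next->pid, q.head ? q.head->pid : -1);
    return 0;
}
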
diff --git a/litmus/preempt.c b/litmus/preempt.c
index 528d7131fe12..3606cd7ffae7 100644
--- a/litmus/preempt.c
+++ b/litmus/preempt.c
@@ -32,8 +32,11 @@ void sched_state_will_schedule(struct task_struct* tsk)
32 /* /\* Litmus tasks should never be subject to a remote */ 32 /* /\* Litmus tasks should never be subject to a remote */
33 /* * set_tsk_need_resched(). *\/ */ 33 /* * set_tsk_need_resched(). *\/ */
34 /* BUG_ON(is_realtime(tsk)); */ 34 /* BUG_ON(is_realtime(tsk)); */
35
36#ifdef CONFIG_PREEMPT_STATE_TRACE
35 TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n", 37 TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
36 __builtin_return_address(0)); 38 __builtin_return_address(0));
39#endif
37} 40}
38 41
39/* Called by the IPI handler after another CPU called smp_send_resched(). */ 42/* Called by the IPI handler after another CPU called smp_send_resched(). */
diff --git a/litmus/rm_common.c b/litmus/rm_common.c
new file mode 100644
index 000000000000..f608a084d3b8
--- /dev/null
+++ b/litmus/rm_common.c
@@ -0,0 +1,91 @@
1/*
2 * kernel/rm_common.c
3 *
4 * Common functions for RM based scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15#include <litmus/rm_common.h>
16
17/* rm_higher_prio - returns true if first has a higher RM priority
18 * than second. Period ties are broken by PID.
19 *
20 * both first and second may be NULL
21 */
22int rm_higher_prio(struct task_struct* first,
23 struct task_struct* second)
24{
25 struct task_struct *first_task = first;
26 struct task_struct *second_task = second;
27
28 /* There is no point in comparing a task to itself. */
29 if (first && first == second) {
30 TRACE_TASK(first,
31 "WARNING: pointless rm priority comparison.\n");
32 return 0;
33 }
34
35
36 /* check for NULL tasks */
37 if (!first || !second)
38 return first && !second;
39
40 return !is_realtime(second_task) ||
41
42 /* is the period of the first task shorter?
43 * Then it has higher priority.
44 */
45 lt_before(get_rt_period(first_task), get_rt_period(second_task)) ||
46
47 /* Do we have a period tie?
48 * Then break by PID.
49 */
50 (get_rt_period(first_task) == get_rt_period(second_task) &&
51 (first_task->pid < second_task->pid ||
52
53 /* If the PIDs are the same then the task with the inherited
54 * priority wins.
55 */
56 (first_task->pid == second_task->pid &&
57 !second->rt_param.inh_task)));
58}
59
60int rm_ready_order(struct bheap_node* a, struct bheap_node* b)
61{
62 return rm_higher_prio(bheap2task(a), bheap2task(b));
63}
64
65void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
66 release_jobs_t release)
67{
68 rt_domain_init(rt, rm_ready_order, resched, release);
69}
70
71/* need_to_preempt - check whether the task t needs to be preempted
72 * call only with irqs disabled and with ready_lock acquired
73 * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
74 */
75int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t)
76{
77 /* we need the read lock for rm_ready_queue */
78 /* no need to preempt if there is nothing pending */
79 if (!__jobs_pending(rt))
80 return 0;
81 /* we need to reschedule if t doesn't exist */
82 if (!t)
83 return 1;
84
85 /* NOTE: We cannot check for non-preemptibility since we
86 * don't know what address space we're currently in.
87 */
88
89 /* make sure to get non-rt stuff out of the way */
90 return !is_realtime(t) || rm_higher_prio(__next_ready(rt), t);
91}
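
rm_higher_prio() above orders tasks rate-monotonically: the shorter period wins, ties go to the lower PID, and an inherited priority breaks a PID tie. The stand-alone sketch below models just that ordering in user space; the struct and helper are illustrative stand-ins for the fields the kernel code reads, not the LITMUS^RT API.

/* Stand-alone model of the RM ordering used by rm_higher_prio():
 * shorter period wins, ties go to the lower PID.  The struct below is
 * an illustrative stand-in for the fields the kernel code reads. */
#include <stdio.h>

struct model_task {
    unsigned long long period;  /* stands in for get_rt_period() */
    int pid;
    int has_inherited_prio;     /* stands in for inh_task != NULL */
};

static int model_rm_higher_prio(const struct model_task *a,
                                const struct model_task *b)
{
    if (a->period != b->period)
        return a->period < b->period;
    if (a->pid != b->pid)
        return a->pid < b->pid;
    return !b->has_inherited_prio;
}

int main(void)
{
    struct model_task t1 = { .period = 10000000, .pid = 200 };
    struct model_task t2 = { .period = 25000000, .pid = 100 };

    /* t1 has the shorter period, so it beats t2 despite the larger PID. */
    printf("t1 higher prio than t2? %d\n", model_rm_higher_prio(&t1, &t2));
    printf("t2 higher prio than t1? %d\n", model_rm_higher_prio(&t2, &t1));
    return 0;
}
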
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
index 3b3b49ed48ea..79243f92edbe 100644
--- a/litmus/rt_domain.c
+++ b/litmus/rt_domain.c
@@ -57,6 +57,14 @@ static void do_release(struct release_heap *rh)
57 TS_LVLB_RELEASE_START; 57 TS_LVLB_RELEASE_START;
58 else 58 else
59 TS_LVLC_RELEASE_START; 59 TS_LVLC_RELEASE_START;
60 struct release_heap* rh;
61 rh = container_of(timer, struct release_heap, timer);
62
63 TS_RELEASE_LATENCY(rh->release_time);
64
65 VTRACE("on_release_timer(0x%p) starts.\n", timer);
66
67 TS_RELEASE_START;
60 68
61 raw_spin_lock_irqsave(&rh->dom->release_lock, flags); 69 raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
62 VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock); 70 VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
diff --git a/litmus/rt_server.c b/litmus/rt_server.c
new file mode 100644
index 000000000000..74d7c7b0f81a
--- /dev/null
+++ b/litmus/rt_server.c
@@ -0,0 +1,23 @@
1#include <litmus/rt_server.h>
2
3static void default_server_update(struct rt_server *srv)
4{
5}
6
7void init_rt_server(struct rt_server *server,
8 int sid, int cpu, rt_domain_t *domain,
9 need_preempt_t need_preempt,
10 server_update_t update)
11{
12 if (!need_preempt)
13 BUG_ON(1);
14
15 server->need_preempt = need_preempt;
16 server->update = (update) ? update : default_server_update;
17
18 server->sid = sid;
19 server->cpu = cpu;
20 server->linked = NULL;
21 server->domain = domain;
22 server->running = 0;
23}
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
index 73fe1c442a0d..480c62bc895b 100644
--- a/litmus/sched_cedf.c
+++ b/litmus/sched_cedf.c
@@ -43,6 +43,10 @@
43 43
44#include <litmus/bheap.h> 44#include <litmus/bheap.h>
45 45
46#ifdef CONFIG_SCHED_CPU_AFFINITY
47#include <litmus/affinity.h>
48#endif
49
46/* to configure the cluster size */ 50/* to configure the cluster size */
47#include <litmus/litmus_proc.h> 51#include <litmus/litmus_proc.h>
48#include <linux/uaccess.h> 52#include <linux/uaccess.h>
@@ -95,7 +99,7 @@ typedef struct clusterdomain {
95 struct bheap_node *heap_node; 99 struct bheap_node *heap_node;
96 struct bheap cpu_heap; 100 struct bheap cpu_heap;
97 /* lock for this cluster */ 101 /* lock for this cluster */
98#define lock domain.ready_lock 102#define cluster_lock domain.ready_lock
99} cedf_domain_t; 103} cedf_domain_t;
100 104
101/* a cedf_domain per cluster; allocation is done at init/activation time */ 105/* a cedf_domain per cluster; allocation is done at init/activation time */
@@ -257,11 +261,34 @@ static noinline void requeue(struct task_struct* task)
257 } 261 }
258} 262}
259 263
264#ifdef CONFIG_SCHED_CPU_AFFINITY
265static cpu_entry_t* cedf_get_nearest_available_cpu(
266 cedf_domain_t *cluster, cpu_entry_t *start)
267{
268 cpu_entry_t *affinity;
269
270 get_nearest_available_cpu(affinity, start, cedf_cpu_entries,
271#ifdef CONFIG_RELEASE_MASTER
272 cluster->domain.release_master
273#else
274 NO_CPU
275#endif
276 );
277
278 /* make sure CPU is in our cluster */
279 if (affinity && cpu_isset(affinity->cpu, *cluster->cpu_map))
280 return(affinity);
281 else
282 return(NULL);
283}
284#endif
285
286
260/* check for any necessary preemptions */ 287/* check for any necessary preemptions */
261static void check_for_preemptions(cedf_domain_t *cluster) 288static void check_for_preemptions(cedf_domain_t *cluster)
262{ 289{
263 struct task_struct *task; 290 struct task_struct *task;
264 cpu_entry_t* last; 291 cpu_entry_t *last;
265 292
266 for(last = lowest_prio_cpu(cluster); 293 for(last = lowest_prio_cpu(cluster);
267 edf_preemption_needed(&cluster->domain, last->linked); 294 edf_preemption_needed(&cluster->domain, last->linked);
@@ -270,8 +297,20 @@ static void check_for_preemptions(cedf_domain_t *cluster)
270 task = __take_ready(&cluster->domain); 297 task = __take_ready(&cluster->domain);
271 TRACE("check_for_preemptions: attempting to link task %d to %d\n", 298 TRACE("check_for_preemptions: attempting to link task %d to %d\n",
272 task->pid, last->cpu); 299 task->pid, last->cpu);
300#ifdef CONFIG_SCHED_CPU_AFFINITY
301 {
302 cpu_entry_t *affinity =
303 cedf_get_nearest_available_cpu(cluster,
304 &per_cpu(cedf_cpu_entries, task_cpu(task)));
305 if(affinity)
306 last = affinity;
307 else if(last->linked)
308 requeue(last->linked);
309 }
310#else
273 if (last->linked) 311 if (last->linked)
274 requeue(last->linked); 312 requeue(last->linked);
313#endif
275 link_task_to_cpu(task, last); 314 link_task_to_cpu(task, last);
276 preempt(last); 315 preempt(last);
277 } 316 }
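
With CONFIG_SCHED_CPU_AFFINITY, the preemption loop above first tries a CPU that is idle, near the task's previous CPU, and inside the same cluster, and only otherwise evicts the lowest-priority CPU. Below is a rough user-space model of that selection, with cache topology reduced to numeric distance and all names illustrative rather than the kernel interface.

/* Stand-alone model of the affinity-aware CPU selection added to
 * check_for_preemptions(): prefer an available CPU near the task's
 * previous CPU if it belongs to the cluster, otherwise evict the
 * lowest-priority CPU handed in by the caller. */
#include <stdio.h>

#define NR_CPUS 4

struct model_cpu {
    int id;
    int in_cluster;     /* 1 if the CPU is part of the task's cluster */
    int linked_prio;    /* -1 if idle, otherwise priority of linked task */
};

/* "Nearest available" is modelled as the idle in-cluster CPU with the
 * smallest distance from prev_cpu; the kernel walks cache levels. */
static struct model_cpu *nearest_available(struct model_cpu cpus[], int prev_cpu)
{
    struct model_cpu *best = NULL;
    int best_dist = NR_CPUS + 1;
    for (int i = 0; i < NR_CPUS; i++) {
        int dist = i > prev_cpu ? i - prev_cpu : prev_cpu - i;
        if (cpus[i].linked_prio < 0 && cpus[i].in_cluster && dist < best_dist) {
            best = &cpus[i];
            best_dist = dist;
        }
    }
    return best;
}

static struct model_cpu *pick_target(struct model_cpu cpus[], int prev_cpu,
                                     struct model_cpu *lowest_prio_cpu)
{
    struct model_cpu *affinity = nearest_available(cpus, prev_cpu);
    /* Fall back to preempting the lowest-priority CPU in the cluster. */
    return affinity ? affinity : lowest_prio_cpu;
}

int main(void)
{
    struct model_cpu cpus[NR_CPUS] = {
        { 0, 1, 5 }, { 1, 1, -1 }, { 2, 1, 9 }, { 3, 0, -1 },
    };
    struct model_cpu *target = pick_target(cpus, 2, &cpus[2]);
    printf("schedule newly released task on CPU %d\n", target->id);
    return 0;
}
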
@@ -292,12 +331,12 @@ static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
292 cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain); 331 cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain);
293 unsigned long flags; 332 unsigned long flags;
294 333
295 raw_spin_lock_irqsave(&cluster->lock, flags); 334 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
296 335
297 __merge_ready(&cluster->domain, tasks); 336 __merge_ready(&cluster->domain, tasks);
298 check_for_preemptions(cluster); 337 check_for_preemptions(cluster);
299 338
300 raw_spin_unlock_irqrestore(&cluster->lock, flags); 339 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
301} 340}
302 341
303/* caller holds cedf_lock */ 342/* caller holds cedf_lock */
@@ -378,7 +417,17 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
378 int out_of_time, sleep, preempt, np, exists, blocks; 417 int out_of_time, sleep, preempt, np, exists, blocks;
379 struct task_struct* next = NULL; 418 struct task_struct* next = NULL;
380 419
381 raw_spin_lock(&cluster->lock); 420#ifdef CONFIG_RELEASE_MASTER
421 /* Bail out early if we are the release master.
422 * The release master never schedules any real-time tasks.
423 */
424 if (unlikely(cluster->domain.release_master == entry->cpu)) {
425 sched_state_task_picked();
426 return NULL;
427 }
428#endif
429
430 raw_spin_lock(&cluster->cluster_lock);
382 clear_will_schedule(); 431 clear_will_schedule();
383 432
384 /* sanity checking */ 433 /* sanity checking */
@@ -462,7 +511,7 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
462 next = prev; 511 next = prev;
463 512
464 sched_state_task_picked(); 513 sched_state_task_picked();
465 raw_spin_unlock(&cluster->lock); 514 raw_spin_unlock(&cluster->cluster_lock);
466 515
467#ifdef WANT_ALL_SCHED_EVENTS 516#ifdef WANT_ALL_SCHED_EVENTS
468 TRACE("cedf_lock released, next=0x%p\n", next); 517 TRACE("cedf_lock released, next=0x%p\n", next);
@@ -504,7 +553,7 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running)
504 /* the cluster doesn't change even if t is running */ 553 /* the cluster doesn't change even if t is running */
505 cluster = task_cpu_cluster(t); 554 cluster = task_cpu_cluster(t);
506 555
507 raw_spin_lock_irqsave(&cluster->domain.ready_lock, flags); 556 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
508 557
509 /* setup job params */ 558 /* setup job params */
510 release_at(t, litmus_clock()); 559 release_at(t, litmus_clock());
@@ -513,15 +562,25 @@ static void cedf_task_new(struct task_struct * t, int on_rq, int running)
513 entry = &per_cpu(cedf_cpu_entries, task_cpu(t)); 562 entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
514 BUG_ON(entry->scheduled); 563 BUG_ON(entry->scheduled);
515 564
516 entry->scheduled = t; 565#ifdef CONFIG_RELEASE_MASTER
517 tsk_rt(t)->scheduled_on = task_cpu(t); 566 if (entry->cpu != cluster->domain.release_master) {
567#endif
568 entry->scheduled = t;
569 tsk_rt(t)->scheduled_on = task_cpu(t);
570#ifdef CONFIG_RELEASE_MASTER
571 } else {
572 /* do not schedule on release master */
573 preempt(entry); /* force resched */
574 tsk_rt(t)->scheduled_on = NO_CPU;
575 }
576#endif
518 } else { 577 } else {
519 t->rt_param.scheduled_on = NO_CPU; 578 t->rt_param.scheduled_on = NO_CPU;
520 } 579 }
521 t->rt_param.linked_on = NO_CPU; 580 t->rt_param.linked_on = NO_CPU;
522 581
523 cedf_job_arrival(t); 582 cedf_job_arrival(t);
524 raw_spin_unlock_irqrestore(&(cluster->domain.ready_lock), flags); 583 raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags);
525} 584}
526 585
527static void cedf_task_wake_up(struct task_struct *task) 586static void cedf_task_wake_up(struct task_struct *task)
@@ -534,7 +593,7 @@ static void cedf_task_wake_up(struct task_struct *task)
534 593
535 cluster = task_cpu_cluster(task); 594 cluster = task_cpu_cluster(task);
536 595
537 raw_spin_lock_irqsave(&cluster->lock, flags); 596 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
538 /* We need to take suspensions because of semaphores into 597 /* We need to take suspensions because of semaphores into
539 * account! If a job resumes after being suspended due to acquiring 598 * account! If a job resumes after being suspended due to acquiring
540 * a semaphore, it should never be treated as a new job release. 599 * a semaphore, it should never be treated as a new job release.
@@ -557,7 +616,7 @@ static void cedf_task_wake_up(struct task_struct *task)
557 } 616 }
558 } 617 }
559 cedf_job_arrival(task); 618 cedf_job_arrival(task);
560 raw_spin_unlock_irqrestore(&cluster->lock, flags); 619 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
561} 620}
562 621
563static void cedf_task_block(struct task_struct *t) 622static void cedf_task_block(struct task_struct *t)
@@ -570,9 +629,9 @@ static void cedf_task_block(struct task_struct *t)
570 cluster = task_cpu_cluster(t); 629 cluster = task_cpu_cluster(t);
571 630
572 /* unlink if necessary */ 631 /* unlink if necessary */
573 raw_spin_lock_irqsave(&cluster->lock, flags); 632 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
574 unlink(t); 633 unlink(t);
575 raw_spin_unlock_irqrestore(&cluster->lock, flags); 634 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
576 635
577 BUG_ON(!is_realtime(t)); 636 BUG_ON(!is_realtime(t));
578} 637}
@@ -584,7 +643,7 @@ static void cedf_task_exit(struct task_struct * t)
584 cedf_domain_t *cluster = task_cpu_cluster(t); 643 cedf_domain_t *cluster = task_cpu_cluster(t);
585 644
586 /* unlink if necessary */ 645 /* unlink if necessary */
587 raw_spin_lock_irqsave(&cluster->lock, flags); 646 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
588 unlink(t); 647 unlink(t);
589 if (tsk_rt(t)->scheduled_on != NO_CPU) { 648 if (tsk_rt(t)->scheduled_on != NO_CPU) {
590 cpu_entry_t *cpu; 649 cpu_entry_t *cpu;
@@ -592,7 +651,7 @@ static void cedf_task_exit(struct task_struct * t)
592 cpu->scheduled = NULL; 651 cpu->scheduled = NULL;
593 tsk_rt(t)->scheduled_on = NO_CPU; 652 tsk_rt(t)->scheduled_on = NO_CPU;
594 } 653 }
595 raw_spin_unlock_irqrestore(&cluster->lock, flags); 654 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
596 655
597 BUG_ON(!is_realtime(t)); 656 BUG_ON(!is_realtime(t));
598 TRACE_TASK(t, "RIP\n"); 657 TRACE_TASK(t, "RIP\n");
@@ -698,6 +757,9 @@ static long cedf_activate_plugin(void)
698 757
699 if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) 758 if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
700 return -ENOMEM; 759 return -ENOMEM;
760#ifdef CONFIG_RELEASE_MASTER
761 cedf[i].domain.release_master = atomic_read(&release_master_cpu);
762#endif
701 } 763 }
702 764
703 /* cycle through cluster and add cpus to them */ 765 /* cycle through cluster and add cpus to them */
@@ -740,7 +802,11 @@ static long cedf_activate_plugin(void)
740 802
741 entry->linked = NULL; 803 entry->linked = NULL;
742 entry->scheduled = NULL; 804 entry->scheduled = NULL;
743 update_cpu_position(entry); 805#ifdef CONFIG_RELEASE_MASTER
806 /* only add CPUs that should schedule jobs */
807 if (entry->cpu != entry->cluster->domain.release_master)
808#endif
809 update_cpu_position(entry);
744 } 810 }
745 /* done with this cluster */ 811 /* done with this cluster */
746 break; 812 break;
diff --git a/litmus/sched_color.c b/litmus/sched_color.c
new file mode 100644
index 000000000000..44327d60aaa5
--- /dev/null
+++ b/litmus/sched_color.c
@@ -0,0 +1,888 @@
1#include <linux/percpu.h>
2#include <linux/sched.h>
3#include <linux/list.h>
4#include <linux/spinlock.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7
8#include <litmus/litmus.h>
9#include <litmus/jobs.h>
10#include <litmus/preempt.h>
11#include <litmus/sched_plugin.h>
12#include <litmus/rm_common.h>
13#include <litmus/sched_trace.h>
14#include <litmus/color.h>
15#include <litmus/fifo_common.h>
16#include <litmus/budget.h>
17#include <litmus/rt_server.h>
18#include <litmus/dgl.h>
19
20/**
21 * @server Common server functionality.
22 * @task Task used to schedule server.
23 * @timer Budget enforcement for @task
24 * @start_time If set, time at which server began running.
25 */
26struct fifo_server {
27 struct rt_server server;
28 struct task_struct* task;
29 struct enforcement_timer timer;
30 lt_t start_time;
31};
32
33/**
34 * @server Common server functionality.
35 * @rm_domain PRM domain.
36 * @scheduled Task physically running on CPU.
37 * @fifo_server Server partitioned to this CPU.
38 */
39struct cpu_entry {
40 struct rt_server server;
41 rt_domain_t rm_domain;
42 struct task_struct* scheduled;
43 struct fifo_server fifo_server;
44 struct hrtimer chunk_timer;
45};
46
47DEFINE_PER_CPU(struct cpu_entry, color_cpus);
48
49static rt_domain_t fifo_domain;
50static raw_spinlock_t fifo_lock;
51
52static struct dgl group_lock;
53static raw_spinlock_t dgl_lock;
54
55#define local_entry (&__get_cpu_var(color_cpus))
56#define remote_entry(cpu) (&per_cpu(color_cpus, cpu))
57#define task_entry(task) remote_entry(get_partition(task))
58#define task_fserver(task) (&task_entry(task)->fifo_server.server)
59#define entry_lock(entry) (&(entry)->rm_domain.ready_lock)
60
61
62#define task_dom(entry, task) (is_be(task) ? &fifo_domain : &entry->rm_domain)
63#define task_lock(entry, task) (is_be(task) ? &fifo_lock : entry_lock(entry))
64#define is_fifo_server(s) ((s)->sid > num_online_cpus())
65#define lock_if(lock, cond) do { if (cond) raw_spin_lock(lock);} while(0)
66#define unlock_if(lock, cond) do { if (cond) raw_spin_unlock(lock);} while(0)
67
68#ifdef CONFIG_NP_SECTION
69#define has_resources(t, c) (tsk_rt(t)->req == group_lock.acquired[c])
70#else
71#define has_resources(t, c) (1)
72#endif
73
74/*
75 * Requeue onto domain's release or ready queue based on task state.
76 */
77static void requeue(rt_domain_t *dom, struct task_struct* t)
78{
79 if (is_server(t) && !tsk_rt(t)->present)
80 /* Remove stopped server from the system */
81 return;
82
83 TRACE_TASK(t, "Requeueing\n");
84 if (is_queued(t)) {
85 TRACE_TASK(t, "Already queued!\n");
86 return;
87 }
88
89 set_rt_flags(t, RT_F_RUNNING);
90 if (is_released(t, litmus_clock()))
91 __add_ready(dom, t);
92 else
93 add_release(dom, t);
94}
95
96enum hrtimer_restart chunk_fire(struct hrtimer *timer)
97{
98 unsigned long flags;
99 local_irq_save(flags);
100 TRACE("Chunk timer fired.\n");
101 litmus_reschedule_local();
102 local_irq_restore(flags);
103 return HRTIMER_NORESTART;
104}
105
106void chunk_arm(struct cpu_entry *entry)
107{
108 unsigned long fire;
109 if (color_chunk) {
110 fire = litmus_clock() + color_chunk;
111 TRACE("Arming chunk timer for %llu\n", fire);
112 __hrtimer_start_range_ns(&entry->chunk_timer,
113 ns_to_ktime(fire), 0,
114 HRTIMER_MODE_ABS_PINNED, 0);
115 }
116}
117
118void chunk_cancel(struct cpu_entry *entry)
119{
120 TRACE("Cancelling chunk timer\n");
121 hrtimer_try_to_cancel(&entry->chunk_timer);
122}
123
124/*
125 * Relinquish resources held by @t (or its children).
126 */
127static void release_resources(struct task_struct *t)
128{
129 struct task_struct *sched;
130#ifdef CONFIG_NP_SECTION
131
132 TRACE_TASK(t, "Releasing resources\n");
133
134 if (is_server(t)) {
135 sched = task_fserver(t)->linked;
136 if (sched)
137 release_resources(sched);
138 } else if (is_kernel_np(t))
139 remove_group_req(&group_lock, tsk_rt(t)->req);
140 take_np(t);
141#endif
142}
143
144/*
145 * Put in requests for resources needed by @t. If @t is a server, this will
146 * set @t's np flag to reflect resources held by @t's children.
147 */
148static void acquire_resources(struct task_struct *t)
149{
150 int cpu;
151 struct rt_server *server;
152 struct task_struct *sched;
153
154#ifdef CONFIG_NP_SECTION
155 /* Can't acquire resources if t is not running */
156 BUG_ON(!get_task_server(t));
157
158 if (is_kernel_np(t)) {
159 TRACE_TASK(t, "Already contending for resources\n");
160 return;
161 }
162 cpu = get_task_server(t)->cpu;
163
164 if (is_server(t)) {
165 server = task_fserver(t);
166 sched = server->linked;
167
168 /* Happens when server is booted off on completion or
169 * has just completed executing a task.
170 */
171 if (sched && !is_kernel_np(sched))
172 acquire_resources(sched);
173
174 /* Become np if there is a running task */
175 if (sched && has_resources(sched, cpu)) {
176 TRACE_TASK(t, "Running task with resource\n");
177 make_np(t);
178 } else {
179 TRACE_TASK(t, "Running no resources\n");
180 take_np(t);
181 }
182 } else {
183 TRACE_TASK(t, "Acquiring resources\n");
184 if (!has_resources(t, cpu))
185 add_group_req(&group_lock, tsk_rt(t)->req, cpu);
186 make_np(t);
187 }
188#endif
189}
190
191/*
192 * Stop logically running the currently linked task.
193 */
194static void unlink(struct rt_server *server)
195{
196 BUG_ON(!server->linked);
197
198 if (is_server(server->linked))
199 task_fserver(server->linked)->running = 0;
200
201
202 sched_trace_server_switch_away(server->sid, 0,
203 server->linked->pid,
204 get_rt_job(server->linked));
205 TRACE_TASK(server->linked, "No longer run by server %d\n", server->sid);
206
207 raw_spin_lock(&dgl_lock);
208 release_resources(server->linked);
209 raw_spin_unlock(&dgl_lock);
210
211 get_task_server(server->linked) = NULL;
212 server->linked = NULL;
213}
214
215static struct task_struct* schedule_server(struct rt_server *server);
216
217/*
218 * Logically run @task.
219 */
220static void link(struct rt_server *server, struct task_struct *task)
221{
222 struct rt_server *tserv;
223
224 BUG_ON(server->linked);
225 BUG_ON(!server->running);
226 BUG_ON(is_kernel_np(task));
227
228 TRACE_TASK(task, "Run by server %d\n", server->sid);
229
230 if (is_server(task)) {
231 tserv = task_fserver(task);
232 tserv->running = 1;
233 schedule_server(tserv);
234 }
235
236 server->linked = task;
237 get_task_server(task) = server;
238
239 sched_trace_server_switch_to(server->sid, 0,
240 task->pid, get_rt_job(task));
241}
242
243/*
244 * Trigger a preemption on the first CPU whose FIFO server is running but has nothing linked.
245 */
246static void check_for_fifo_preempt(void)
247{
248 int ret = 0, cpu;
249 struct cpu_entry *entry;
250 struct rt_server *cpu_server, *fifo_server;
251
252 TRACE("Checking for FIFO preempt\n");
253
254 for_each_online_cpu(cpu) {
255 entry = remote_entry(cpu);
256 cpu_server = &entry->server;
257 fifo_server = &entry->fifo_server.server;
258
259 raw_spin_lock(entry_lock(entry));
260 raw_spin_lock(&fifo_lock);
261
262 if (cpu_server->linked && is_server(cpu_server->linked) &&
263 !fifo_server->linked) {
264 litmus_reschedule(cpu);
265 ret = 1;
266 }
267
268 raw_spin_unlock(&fifo_lock);
269 raw_spin_unlock(entry_lock(entry));
270
271 if (ret)
272 break;
273 }
274}
275
276/*
277 * Rejoin a task into the system.
278 */
279static void job_arrival(struct task_struct *t)
280{
281 int i;
282 rt_domain_t *dom = task_dom(task_entry(t), t);
283 struct dgl_group_req *gr = tsk_rt(t)->req;
284 struct control_page *cp = tsk_rt(t)->ctrl_page;
285 struct color_ctrl_page *ccp = tsk_rt(t)->color_ctrl_page;
286
287 /* Fill request */
288 if (cp && ccp && cp->colors_updated) {
289 cp->colors_updated = 0;
290 dgl_group_req_init(&group_lock, gr);
291 for (i = 0; ccp->pages[i]; ++i)
292 set_req(&group_lock, gr, ccp->colors[i], ccp->pages[i]);
293 } else {
294 TRACE("Oh noz: %p %p %d\n", cp, ccp, ((cp) ? cp->colors_updated : -1));
295 }
296
297 lock_if(&fifo_lock, is_be(t));
298 requeue(dom, t);
299 unlock_if(&fifo_lock, is_be(t));
300}
301
302/*
303 * Complete job for task linked to @server.
304 */
305static void job_completion(struct rt_server *server)
306{
307 struct task_struct *t = server->linked;
308 lt_t et, now = litmus_clock();
309
310 TRACE_TASK(t, "Job completed\n");
311 if (is_server(t))
312 sched_trace_server_completion(t->pid, get_rt_job(t));
313 else
314 sched_trace_task_completion(t, 0);
315
316 if (1 < get_rt_job(t)) {
317 /* our releases happen at the second job */
318 et = get_exec_time(t);
319 if (et > tsk_rt(t)->max_exec_time)
320 tsk_rt(t)->max_exec_time = et;
321 }
322
323 if (is_tardy(t, now)) {
324 lt_t miss = now - get_deadline(t);
325 ++tsk_rt(t)->missed;
326 tsk_rt(t)->total_tardy += miss;
327 if (lt_before(tsk_rt(t)->max_tardy, miss)) {
328 tsk_rt(t)->max_tardy = miss;
329 }
330 }
331
332 unlink(server);
333 set_rt_flags(t, RT_F_SLEEP);
334 prepare_for_next_period(t);
335
336 if (is_server(t))
337 sched_trace_server_release(t->pid, get_rt_job(t),
338 get_release(t), get_deadline(t));
339 else
340 sched_trace_task_release(t);
341
342 if (is_running(t))
343 job_arrival(t);
344}
345
346/*
347 * Update @server state to reflect task's state.
348 */
349static void update_task(struct rt_server *server)
350{
351 int oot, sleep, block, np, chunked;
352 struct task_struct *t = server->linked;
353 lt_t last = tsk_rt(t)->last_exec_time;
354
355 block = !is_running(t);
356 oot = budget_enforced(t) && budget_exhausted(t);
357 np = is_kernel_np(t);
358 sleep = get_rt_flags(t) == RT_F_SLEEP;
359
360 chunked = color_chunk && last && (lt_after(litmus_clock() - last, color_chunk));
361
362 TRACE_TASK(t, "Updating task, block: %d, oot: %d, np: %d, sleep: %d, chunk: %d\n",
363 block, oot, np, sleep, chunked);
364
365 if (block)
366 unlink(server);
367 else if (oot || sleep)
368 job_completion(server);
369 else if (chunked) {
370 unlink(server);
371 job_arrival(t);
372 }
373}
374
375/*
376 * Link next task for @server.
377 */
378static struct task_struct* schedule_server(struct rt_server *server)
379{
380 struct task_struct *next;
381 struct rt_server *lserver;
382
383 TRACE("Scheduling server %d\n", server->sid);
384
385 if (server->linked) {
386 if (is_server(server->linked)) {
387 lserver = task_fserver(server->linked);
388 lserver->update(lserver);
389 }
390 update_task(server);
391 }
392
393 next = server->linked;
394 lock_if(&fifo_lock, is_fifo_server(server));
395 if ((!next || !is_np(next)) &&
396 server->need_preempt(server->domain, next)) {
397 if (next) {
398 TRACE_TASK(next, "Preempted\n");
399 unlink(server);
400 requeue(server->domain, next);
401 }
402 next = __take_ready(server->domain);
403 link(server, next);
404 }
405 unlock_if(&fifo_lock, is_fifo_server(server));
406
407 return next;
408}
409
410/*
411 * Update server state, including picking next running task and incrementing
412 * server execution time.
413 */
414static void fifo_update(struct rt_server *server)
415{
416 lt_t delta;
417 struct fifo_server *fserver;
418
419 fserver = container_of(server, struct fifo_server, server);
420 TRACE_TASK(fserver->task, "Updating FIFO server\n");
421
422 if (!server->linked || has_resources(server->linked, server->cpu)) {
423 /* Running here means linked to a parent server */
424 /* BUG_ON(!server->running); */
425
426 /* Stop executing */
427 if (fserver->start_time) {
428 delta = litmus_clock() - fserver->start_time;
429 tsk_rt(fserver->task)->job_params.exec_time += delta;
430 fserver->start_time = 0;
431 cancel_enforcement_timer(&fserver->timer);
432 } else {
433 /* Server is linked, but not executing */
434 /* BUG_ON(fserver->timer.armed); */
435 }
436
437 /* Calculate next task */
438 schedule_server(&fserver->server);
439
440 /* Reserve needed resources */
441 raw_spin_lock(&dgl_lock);
442 acquire_resources(fserver->task);
443 raw_spin_unlock(&dgl_lock);
444 }
445}
446
447/*
448 * Trigger a preemption if a newly released RM task should preempt the CPU's linked task.
449 */
450static void color_rm_release(rt_domain_t *rm, struct bheap *tasks)
451{
452 unsigned long flags;
453 struct cpu_entry *entry;
454
455 TRACE_TASK(bheap2task(bheap_peek(rm->order, tasks)),
456 "Released set of RM tasks\n");
457
458 entry = container_of(rm, struct cpu_entry, rm_domain);
459 raw_spin_lock_irqsave(entry_lock(entry), flags);
460
461 __merge_ready(rm, tasks);
462
463 if (rm_preemption_needed(rm, entry->server.linked) &&
464 (!entry->server.linked || !is_kernel_np(entry->server.linked))) {
465 litmus_reschedule(entry->server.cpu);
466 }
467
468 raw_spin_unlock_irqrestore(entry_lock(entry), flags);
469}
470
471static void color_fifo_release(rt_domain_t *dom, struct bheap *tasks)
472{
473 unsigned long flags;
474
475 TRACE_TASK(bheap2task(bheap_peek(dom->order, tasks)),
476 "Released set of FIFO tasks\n");
477 local_irq_save(flags);
478
479 raw_spin_lock(&fifo_lock);
480 __merge_ready(dom, tasks);
481 raw_spin_unlock(&fifo_lock);
482
483 check_for_fifo_preempt();
484
485 local_irq_restore(flags);
486}
487
488#define cpu_empty(entry, run) \
489 (!(run) || (is_server(run) && !(entry)->fifo_server.server.linked))
490
491static struct task_struct* color_schedule(struct task_struct *prev)
492{
493 unsigned long flags;
494 int server_running;
495 struct cpu_entry *entry = local_entry;
496 struct task_struct *next, *plink = entry->server.linked;
497
498 TRACE("Reschedule on %d at %llu\n", entry->server.cpu, litmus_clock());
499 BUG_ON(entry->scheduled && entry->scheduled != prev);
500 BUG_ON(entry->scheduled && !is_realtime(prev));
501
502 raw_spin_lock_irqsave(entry_lock(entry), flags);
503
504 if (entry->scheduled && cpu_empty(entry, plink) && is_running(prev)) {
505 TRACE_TASK(prev, "Snuck in on new!\n");
506 job_arrival(entry->scheduled);
507 }
508
509 /* Pick next top-level task */
510 next = schedule_server(&entry->server);
511 /* Schedule hierarchically */
512 server_running = next && is_server(next);
513 if (server_running)
514 next = task_fserver(next)->linked;
515
516 /* Selected tasks must contend for group lock */
517 if (next) {
518 raw_spin_lock(&dgl_lock);
519 acquire_resources(next);
520 if (has_resources(next, entry->server.cpu)) {
521 TRACE_TASK(next, "Has group lock\n");
522 sched_trace_task_resume(next, 1);
523 } else {
524 TRACE_TASK(next, "Does not have lock, 0x%p does\n",
525 group_lock.acquired[entry->server.cpu]);
526 if (next != prev)
527 sched_trace_task_block(next, 1);
528 next = NULL;
529 server_running = 0;
530 }
531 raw_spin_unlock(&dgl_lock);
532 }
533
534 /* Server is blocked if its running task is blocked. Note that if the
535 * server has no running task, the server will now execute NULL.
536 */
537 if (server_running) {
538 TRACE_TASK(entry->server.linked, "Server running\n");
539 arm_enforcement_timer(&entry->fifo_server.timer,
540 entry->fifo_server.task);
541 entry->fifo_server.start_time = litmus_clock();
542 }
543
544 if (prev) {
545 tsk_rt(prev)->scheduled_on = NO_CPU;
546 tsk_rt(prev)->last_exec_time = 0;
547 chunk_cancel(entry);
548 }
549 if (next) {
550 tsk_rt(next)->scheduled_on = entry->server.cpu;
551 tsk_rt(next)->last_exec_time = litmus_clock();
552 chunk_arm(entry);
553 }
554
555 entry->scheduled = next;
556 sched_state_task_picked();
557
558 raw_spin_unlock_irqrestore(entry_lock(entry), flags);
559
560 return entry->scheduled;
561}
562
563static void color_task_new(struct task_struct *t, int on_rq, int running)
564{
565 unsigned long flags;
566 struct cpu_entry *entry;
567 struct dgl_group_req *req;
568
569 TRACE_TASK(t, "New colored task\n");
570 entry = (is_be(t)) ? local_entry : task_entry(t);
571
572 raw_spin_lock_irqsave(entry_lock(entry), flags);
573
574 req = kmalloc(sizeof(*req), GFP_ATOMIC);
575 tsk_rt(t)->req = req;
576 tsk_rt(t)->tot_exec_time = 0;
577 tsk_rt(t)->max_exec_time = 0;
578 tsk_rt(t)->max_tardy = 0;
579 tsk_rt(t)->missed = 0;
580 tsk_rt(t)->total_tardy = 0;
581 tsk_rt(t)->ctrl_page->colors_updated = 1;
582 tsk_rt(t)->last_exec_time = 0;
583
584 release_at(t, litmus_clock());
585
586 if (running) {
587 /* No need to lock with irqs disabled */
588 TRACE_TASK(t, "Already scheduled on %d\n", entry->server.cpu);
589 BUG_ON(entry->scheduled);
590 entry->scheduled = t;
591 tsk_rt(t)->scheduled_on = entry->server.cpu;
592 } else {
593 job_arrival(t);
594 }
595
596 raw_spin_unlock(entry_lock(entry));
597
598 if (is_be(t))
599 check_for_fifo_preempt();
600 else
601 litmus_reschedule_local();
602
603 local_irq_restore(flags);
604}
605
606static void color_task_wake_up(struct task_struct *task)
607{
608 unsigned long flags;
609 struct cpu_entry* entry = local_entry;
610 int sched;
611 lt_t now = litmus_clock();
612
613 TRACE_TASK(task, "Wake up at %llu\n", now);
614
615 raw_spin_lock_irqsave(entry_lock(entry), flags);
616
617 /* Abuse sporadic model */
618 if (is_tardy(task, now)) {
619 release_at(task, now);
620 sched_trace_task_release(task);
621 }
622
623 sched = (entry->scheduled == task);
624
625 if (!sched)
626 job_arrival(task);
627 else
628 TRACE_TASK(task, "Is already scheduled on %d!\n",
629			entry->server.cpu);
630
631 raw_spin_unlock(entry_lock(entry));
632 if (is_be(task))
633 check_for_fifo_preempt();
634 else
635 litmus_reschedule_local();
636
637
638 local_irq_restore(flags);
639}
640
641static void color_task_block(struct task_struct *t)
642{
643 TRACE_TASK(t, "Block at %llu, state=%d\n", litmus_clock(), t->state);
644 BUG_ON(!is_realtime(t));
645 BUG_ON(is_queued(t));
646}
647
648static void color_task_exit(struct task_struct *t)
649{
650 unsigned long flags;
651 struct cpu_entry *entry = task_entry(t);
652 raw_spinlock_t *lock = task_lock(entry, t);
653
654 TRACE_TASK(t, "RIP, now reschedule\n");
655
656 local_irq_save(flags);
657
658 sched_trace_task_exit(t);
659 sched_trace_task_tardy(t);
660
661 /* Remove from scheduler consideration */
662 if (is_queued(t)) {
663 raw_spin_lock(lock);
664 remove(task_dom(entry, t), t);
665 raw_spin_unlock(lock);
666 }
667
668 /* Stop parent server */
669 if (get_task_server(t))
670 unlink(get_task_server(t));
671
672 /* Unschedule running task */
673 if (tsk_rt(t)->scheduled_on != NO_CPU) {
674 entry = remote_entry(tsk_rt(t)->scheduled_on);
675
676 raw_spin_lock(entry_lock(entry));
677
678 tsk_rt(t)->scheduled_on = NO_CPU;
679 entry->scheduled = NULL;
680 litmus_reschedule(entry->server.cpu);
681
682 raw_spin_unlock(entry_lock(entry));
683 }
684
685 /* Remove dgl request from system */
686 raw_spin_lock(&dgl_lock);
687 release_resources(t);
688 raw_spin_unlock(&dgl_lock);
689
690 dgl_group_req_free(tsk_rt(t)->req);
691 kfree(tsk_rt(t)->req);
692
693 local_irq_restore(flags);
694}
695
696/*
697 * Non-be tasks must have migrated to the right CPU.
698 */
699static long color_admit_task(struct task_struct* t)
700{
701	int ret = (is_be(t) || task_cpu(t) == get_partition(t)) ? 0 : -EINVAL;
702	if (ret) {
703 printk(KERN_WARNING "Task failed to migrate to CPU %d\n",
704 get_partition(t));
705 }
706 return ret;
707}
708
709/*
710 * Load server parameters.
711 */
712static long color_activate_plugin(void)
713{
714 int cpu, ret = 0;
715 struct rt_task tp;
716 struct task_struct *server_task;
717 struct cpu_entry *entry;
718
719 color_chunk = 0;
720
721 for_each_online_cpu(cpu) {
722 entry = remote_entry(cpu);
723 server_task = entry->fifo_server.task;
724
725 raw_spin_lock(entry_lock(entry));
726
727 ret = color_server_params(cpu, ((unsigned long*)&tp.exec_cost),
728 ((unsigned long*)&tp.period));
729 if (ret) {
730 printk(KERN_WARNING "Uninitialized server for CPU %d\n",
731 entry->server.cpu);
732 goto loop_end;
733 }
734
735 /* Fill rt parameters */
736 tp.phase = 0;
737 tp.cpu = cpu;
738 tp.cls = RT_CLASS_SOFT;
739 tp.budget_policy = PRECISE_ENFORCEMENT;
740 tsk_rt(server_task)->task_params = tp;
741 tsk_rt(server_task)->present = 1;
742
743 entry->scheduled = NULL;
744
745 TRACE_TASK(server_task, "Created server with wcet: %llu, "
746 "period: %llu\n", tp.exec_cost, tp.period);
747
748 loop_end:
749 raw_spin_unlock(entry_lock(entry));
750 }
751
752 return ret;
753}
754
755/*
756 * Mark servers as unused so that future calls to requeue skip them.
757 */
758static long color_deactivate_plugin(void)
759{
760 int cpu;
761 struct cpu_entry *entry;
762
763 for_each_online_cpu(cpu) {
764 entry = remote_entry(cpu);
765 if (entry->fifo_server.task) {
766 tsk_rt(entry->fifo_server.task)->present = 0;
767 }
768 }
769 return 0;
770}
771
772/*
773 * Dump container and server parameters for tracing.
774 */
775static void color_release_ts(lt_t time)
776{
777 int cpu, fifo_cid;
778 char fifo_name[TASK_COMM_LEN], cpu_name[TASK_COMM_LEN];
779 struct cpu_entry *entry;
780 struct task_struct *stask;
781
782 strcpy(cpu_name, "CPU");
783 strcpy(fifo_name, "BE");
784
785 fifo_cid = num_online_cpus();
786 trace_litmus_container_param(fifo_cid, fifo_name);
787
788 for_each_online_cpu(cpu) {
789 entry = remote_entry(cpu);
790 trace_litmus_container_param(cpu, cpu_name);
791 trace_litmus_server_param(entry->server.sid, cpu, 0, 0);
792 stask = entry->fifo_server.task;
793 trace_litmus_server_param(stask->pid, fifo_cid,
794 get_exec_cost(stask),
795 get_rt_period(stask));
796
797 /* Make runnable */
798 release_at(stask, time);
799 entry->fifo_server.start_time = 0;
800
801 cancel_enforcement_timer(&entry->fifo_server.timer);
802
803 if (!is_queued(stask))
804 requeue(&entry->rm_domain, stask);
805 }
806}
807
808static struct sched_plugin color_plugin __cacheline_aligned_in_smp = {
809 .plugin_name = "COLOR",
810 .task_new = color_task_new,
811 .complete_job = complete_job,
812 .task_exit = color_task_exit,
813 .schedule = color_schedule,
814 .task_wake_up = color_task_wake_up,
815 .task_block = color_task_block,
816 .admit_task = color_admit_task,
817
818 .release_ts = color_release_ts,
819
820 .activate_plugin = color_activate_plugin,
821 .deactivate_plugin = color_deactivate_plugin,
822};
823
824static int __init init_color(void)
825{
826 int cpu;
827 struct cpu_entry *entry;
828 struct task_struct *server_task;
829 struct fifo_server *fifo_server;
830 struct rt_server *cpu_server;
831
832 for_each_online_cpu(cpu) {
833 entry = remote_entry(cpu);
834 rm_domain_init(&entry->rm_domain, NULL, color_rm_release);
835
836 entry->scheduled = NULL;
837
838 /* Create FIFO server */
839 fifo_server = &entry->fifo_server;
840 init_rt_server(&fifo_server->server,
841 cpu + num_online_cpus() + 1,
842 cpu,
843 &fifo_domain,
844 fifo_preemption_needed, fifo_update);
845
846
847 /* Create task struct for FIFO server */
848 server_task = kmalloc(sizeof(struct task_struct), GFP_ATOMIC);
849 memset(server_task, 0, sizeof(*server_task));
850 server_task->policy = SCHED_LITMUS;
851 strcpy(server_task->comm, "server");
852 server_task->pid = fifo_server->server.sid;
853 fifo_server->task = server_task;
854
855 /* Create rt_params for FIFO server */
856 tsk_rt(server_task)->heap_node = bheap_node_alloc(GFP_ATOMIC);
857 tsk_rt(server_task)->rel_heap = release_heap_alloc(GFP_ATOMIC);
858 bheap_node_init(&tsk_rt(server_task)->heap_node, server_task);
859 tsk_rt(server_task)->is_server = 1;
860
861 /* Create CPU server */
862 cpu_server = &entry->server;
863 init_rt_server(cpu_server, cpu + 1, cpu,
864 &entry->rm_domain, rm_preemption_needed, NULL);
865 cpu_server->running = 1;
866
867 init_enforcement_timer(&fifo_server->timer);
868 hrtimer_init(&entry->chunk_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
869 entry->chunk_timer.function = chunk_fire;
870 }
871
872 fifo_domain_init(&fifo_domain, NULL, color_fifo_release);
873 raw_spin_lock_init(&fifo_lock);
874
875 dgl_init(&group_lock, color_cache_info.nr_colors,
876 color_cache_info.ways);
877 raw_spin_lock_init(&dgl_lock);
878
879 return register_sched_plugin(&color_plugin);
880}
881
882static void exit_color(void)
883{
884 dgl_free(&group_lock);
885}
886
887module_init(init_color);
888module_exit(exit_color);
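
In the plugin above, a task's cache-color demands arrive through its color control page, and job_arrival() converts the zero-terminated (colors[i], pages[i]) pairs into a group-lock request before the task is queued; has_resources() then gates execution on that request being the holder for the task's CPU. The stand-alone sketch below models only the conversion step; the request type and setter are illustrative stand-ins for the DGL API shown in the diff, not the kernel interface.

/* Stand-alone model of how job_arrival() in sched_color.c turns the
 * (colors[i], pages[i]) pairs from the color control page into a group
 * request.  The types and the set function are illustrative stand-ins. */
#include <stdio.h>

#define MAX_COLORS 16

struct model_group_req {
    unsigned int demand[MAX_COLORS];    /* pages requested per color */
};

static void model_set_req(struct model_group_req *req,
                          unsigned int color, unsigned int pages)
{
    if (color < MAX_COLORS)
        req->demand[color] = pages;
}

int main(void)
{
    /* As in the kernel code, the pages[] array is zero-terminated. */
    unsigned int colors[] = { 3, 7, 12, 0 };
    unsigned int pages[]  = { 4, 2, 1, 0 };
    struct model_group_req req = { { 0 } };

    for (int i = 0; pages[i]; i++)
        model_set_req(&req, colors[i], pages[i]);

    for (int c = 0; c < MAX_COLORS; c++)
        if (req.demand[c])
            printf("color %2d: %u pages\n", c, req.demand[c]);
    return 0;
}
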
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
index 0afd49155999..0aa44dbddbd6 100644
--- a/litmus/sched_gsn_edf.c
+++ b/litmus/sched_gsn_edf.c
@@ -18,11 +18,16 @@
18#include <litmus/sched_plugin.h> 18#include <litmus/sched_plugin.h>
19#include <litmus/edf_common.h> 19#include <litmus/edf_common.h>
20#include <litmus/sched_trace.h> 20#include <litmus/sched_trace.h>
21#include <litmus/trace.h>
21 22
22#include <litmus/preempt.h> 23#include <litmus/preempt.h>
23 24
24#include <litmus/bheap.h> 25#include <litmus/bheap.h>
25 26
27#ifdef CONFIG_SCHED_CPU_AFFINITY
28#include <litmus/affinity.h>
29#endif
30
26#include <linux/module.h> 31#include <linux/module.h>
27 32
28/* Overview of GSN-EDF operations. 33/* Overview of GSN-EDF operations.
@@ -253,21 +258,52 @@ static noinline void requeue(struct task_struct* task)
253 } 258 }
254} 259}
255 260
261#ifdef CONFIG_SCHED_CPU_AFFINITY
262static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start)
263{
264 cpu_entry_t *affinity;
265
266 get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries,
267#ifdef CONFIG_RELEASE_MASTER
268 gsnedf.release_master
269#else
270 NO_CPU
271#endif
272 );
273
274 return(affinity);
275}
276#endif
277
256/* check for any necessary preemptions */ 278/* check for any necessary preemptions */
257static void check_for_preemptions(void) 279static void check_for_preemptions(void)
258{ 280{
259 struct task_struct *task; 281 struct task_struct *task;
260 cpu_entry_t* last; 282 cpu_entry_t *last;
261 283
262 for(last = lowest_prio_cpu(); 284 for (last = lowest_prio_cpu();
263 edf_preemption_needed(&gsnedf, last->linked); 285 edf_preemption_needed(&gsnedf, last->linked);
264 last = lowest_prio_cpu()) { 286 last = lowest_prio_cpu()) {
265 /* preemption necessary */ 287 /* preemption necessary */
266 task = __take_ready(&gsnedf); 288 task = __take_ready(&gsnedf);
267 TRACE("check_for_preemptions: attempting to link task %d to %d\n", 289 TRACE("check_for_preemptions: attempting to link task %d to %d\n",
268 task->pid, last->cpu); 290 task->pid, last->cpu);
291
292#ifdef CONFIG_SCHED_CPU_AFFINITY
293 {
294 cpu_entry_t *affinity =
295 gsnedf_get_nearest_available_cpu(
296 &per_cpu(gsnedf_cpu_entries, task_cpu(task)));
297 if (affinity)
298 last = affinity;
299 else if (last->linked)
300 requeue(last->linked);
301 }
302#else
269 if (last->linked) 303 if (last->linked)
270 requeue(last->linked); 304 requeue(last->linked);
305#endif
306
271 link_task_to_cpu(task, last); 307 link_task_to_cpu(task, last);
272 preempt(last); 308 preempt(last);
273 } 309 }
@@ -374,8 +410,10 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
374 /* Bail out early if we are the release master. 410 /* Bail out early if we are the release master.
375 * The release master never schedules any real-time tasks. 411 * The release master never schedules any real-time tasks.
376 */ 412 */
377 if (gsnedf.release_master == entry->cpu) 413 if (unlikely(gsnedf.release_master == entry->cpu)) {
414 sched_state_task_picked();
378 return NULL; 415 return NULL;
416 }
379#endif 417#endif
380 418
381 raw_spin_lock(&gsnedf_lock); 419 raw_spin_lock(&gsnedf_lock);
@@ -765,6 +803,8 @@ int gsnedf_fmlp_lock(struct litmus_lock* l)
765 } 803 }
766 } 804 }
767 805
806 TS_LOCK_SUSPEND;
807
768 /* release lock before sleeping */ 808 /* release lock before sleeping */
769 spin_unlock_irqrestore(&sem->wait.lock, flags); 809 spin_unlock_irqrestore(&sem->wait.lock, flags);
770 810
@@ -777,14 +817,12 @@ int gsnedf_fmlp_lock(struct litmus_lock* l)
777 817
778 schedule(); 818 schedule();
779 819
780 sched_trace_task_resume(t, l->id); 820 TS_LOCK_RESUME;
781 821
782 /* Since we hold the lock, no other task will change 822 /* Since we hold the lock, no other task will change
783 * ->owner. We can thus check it without acquiring the spin 823 * ->owner. We can thus check it without acquiring the spin
784 * lock. */ 824 * lock. */
785 BUG_ON(sem->owner != t); 825 BUG_ON(sem->owner != t);
786
787 remove_wait_queue(&sem->wait, &wait);
788 } else { 826 } else {
789 /* it's ours now */ 827 /* it's ours now */
790 sem->owner = t; 828 sem->owner = t;
@@ -812,7 +850,7 @@ int gsnedf_fmlp_unlock(struct litmus_lock* l)
812 } 850 }
813 851
814 /* check if there are jobs waiting for this resource */ 852 /* check if there are jobs waiting for this resource */
815 next = waitqueue_first(&sem->wait); 853 next = __waitqueue_remove_first(&sem->wait);
816 if (next) { 854 if (next) {
817 /* next becomes the resource holder */ 855
818 sem->owner = next; 856 sem->owner = next;
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
index c2f1e49692c3..39673ab6c7cd 100644
--- a/litmus/sched_litmus.c
+++ b/litmus/sched_litmus.c
@@ -102,9 +102,9 @@ litmus_schedule(struct rq *rq, struct task_struct *prev)
102 } 102 }
103 } 103 }
104#ifdef __ARCH_WANT_UNLOCKED_CTXSW 104#ifdef __ARCH_WANT_UNLOCKED_CTXSW
105 if (next->oncpu) 105 if (next->on_cpu)
106 TRACE_TASK(next, "waiting for !oncpu"); 106 TRACE_TASK(next, "waiting for !oncpu");
107 while (next->oncpu) { 107 while (next->on_cpu) {
108 cpu_relax(); 108 cpu_relax();
109 mb(); 109 mb();
110 } 110 }
@@ -257,12 +257,12 @@ static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
257 return; 257 return;
258} 258}
259 259
260static void switched_to_litmus(struct rq *rq, struct task_struct *p, int running) 260static void switched_to_litmus(struct rq *rq, struct task_struct *p)
261{ 261{
262} 262}
263 263
264static void prio_changed_litmus(struct rq *rq, struct task_struct *p, 264static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
265 int oldprio, int running) 265 int oldprio)
266{ 266{
267} 267}
268 268
@@ -288,8 +288,8 @@ static void set_curr_task_litmus(struct rq *rq)
288 * We don't care about the scheduling domain; it can get called from 288
289 * exec, fork, wakeup. 289 * exec, fork, wakeup.
290 */ 290 */
291static int select_task_rq_litmus(struct rq *rq, struct task_struct *p, 291static int
292 int sd_flag, int flags) 292select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
293{ 293{
294 /* preemption is already disabled. 294 /* preemption is already disabled.
295 * We don't want to change cpu here 295 * We don't want to change cpu here
@@ -299,7 +299,12 @@ static int select_task_rq_litmus(struct rq *rq, struct task_struct *p,
299#endif 299#endif
300 300
301static const struct sched_class litmus_sched_class = { 301static const struct sched_class litmus_sched_class = {
302 .next = &rt_sched_class, 302 /* From 34f971f6 the stop/migrate worker threads have a class on
303 * their own, which is the highest prio class. We don't support
304 * cpu-hotplug or cpu throttling. Allows Litmus to use up to 1.0
305 * CPU capacity.
306 */
307 .next = &stop_sched_class,
303 .enqueue_task = enqueue_task_litmus, 308 .enqueue_task = enqueue_task_litmus,
304 .dequeue_task = dequeue_task_litmus, 309 .dequeue_task = dequeue_task_litmus,
305 .yield_task = yield_task_litmus, 310 .yield_task = yield_task_litmus,
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
index 0a64273daa47..16f1065bbdca 100644
--- a/litmus/sched_pfair.c
+++ b/litmus/sched_pfair.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * kernel/sched_pfair.c 2 * kernel/sched_pfair.c
3 * 3 *
4 * Implementation of the (global) Pfair scheduling algorithm. 4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
5 * 6 *
6 */ 7 */
7 8
@@ -76,36 +77,29 @@ struct pfair_state {
76 struct task_struct* local; /* the local copy of linked */ 77 struct task_struct* local; /* the local copy of linked */
77 struct task_struct* scheduled; /* what is actually scheduled */ 78 struct task_struct* scheduled; /* what is actually scheduled */
78 79
79 unsigned long missed_quanta;
80 lt_t offset; /* stagger offset */ 80 lt_t offset; /* stagger offset */
81 unsigned int missed_updates;
82 unsigned int missed_quanta;
81}; 83};
82 84
83/* Currently, we limit the maximum period of any task to 2000 quanta.
84 * The reason is that it makes the implementation easier since we do not
85 * need to reallocate the release wheel on task arrivals.
86 * In the future
87 */
88#define PFAIR_MAX_PERIOD 2000
89
90struct pfair_cluster { 85struct pfair_cluster {
91 struct scheduling_cluster topology; 86 struct scheduling_cluster topology;
92 87
93 /* The "global" time in this cluster. */ 88 /* The "global" time in this cluster. */
94 quanta_t pfair_time; /* the "official" PFAIR clock */ 89 quanta_t pfair_time; /* the "official" PFAIR clock */
95 quanta_t merge_time; /* Updated after the release queue has been
96 * merged. Used by drop_all_references().
97 */
98 90
99 /* The ready queue for this cluster. */ 91 /* The ready queue for this cluster. */
100 rt_domain_t pfair; 92 rt_domain_t pfair;
101 93
102 /* This is the release queue wheel for this cluster. It is indexed by 94 /* The set of jobs that should have their release enacted at the next
103 * pfair_time % PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR 95 * quantum boundary.
104 * priority, so that it can be merged with the ready queue.
105 */ 96 */
106 struct bheap release_queue[PFAIR_MAX_PERIOD]; 97 struct bheap release_queue;
98 raw_spinlock_t release_lock;
107}; 99};
108 100
101#define RT_F_REQUEUE 0x2
102
109static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state) 103static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
110{ 104{
111 return container_of(state->topology.cluster, struct pfair_cluster, topology); 105 return container_of(state->topology.cluster, struct pfair_cluster, topology);
@@ -121,6 +115,11 @@ static inline struct pfair_state* from_cluster_list(struct list_head* pos)
121 return list_entry(pos, struct pfair_state, topology.cluster_list); 115 return list_entry(pos, struct pfair_state, topology.cluster_list);
122} 116}
123 117
118static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
119{
120 return container_of(rt, struct pfair_cluster, pfair);
121}
122
124static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster) 123static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
125{ 124{
126 /* The ready_lock is used to serialize all scheduling events. */ 125 /* The ready_lock is used to serialize all scheduling events. */
@@ -161,21 +160,11 @@ static quanta_t cur_deadline(struct task_struct* t)
161 return cur_subtask(t)->deadline + tsk_pfair(t)->release; 160 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
162} 161}
163 162
164
165static quanta_t cur_sub_release(struct task_struct* t)
166{
167 return cur_subtask(t)->release + tsk_pfair(t)->release;
168}
169
170static quanta_t cur_release(struct task_struct* t) 163static quanta_t cur_release(struct task_struct* t)
171{ 164{
172#ifdef EARLY_RELEASE 165 /* This is early releasing: only the release of the first subtask
173 /* only the release of the first subtask counts when we early 166 * counts. */
174 * release */
175 return tsk_pfair(t)->release; 167 return tsk_pfair(t)->release;
176#else
177 return cur_sub_release(t);
178#endif
179} 168}
180 169
181static quanta_t cur_overlap(struct task_struct* t) 170static quanta_t cur_overlap(struct task_struct* t)
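
The hunk above hard-codes early releasing: cur_release() now returns the release of the job's first subtask, so later subtasks become eligible as soon as their predecessors complete instead of waiting for their Pfair windows, which is what makes the implementation work-conserving. Below is a tiny stand-alone comparison of the two policies, with illustrative quantum values rather than real task parameters.

/* Stand-alone model of the "early releasing" change to cur_release():
 * without early releasing, subtask i becomes eligible at
 * job_release + subtask_release[i]; with early releasing, every subtask
 * uses the job release itself. */
#include <stdio.h>

static unsigned long cur_release(unsigned long job_release,
                                 unsigned long subtask_release,
                                 int early_releasing)
{
    return early_releasing ? job_release
                           : job_release + subtask_release;
}

int main(void)
{
    unsigned long job_release = 100;            /* in quanta */
    unsigned long subtask_release[] = { 0, 3, 6 };

    for (int i = 0; i < 3; i++)
        printf("subtask %d eligible at %lu (early) vs %lu (windowed)\n",
               i,
               cur_release(job_release, subtask_release[i], 1),
               cur_release(job_release, subtask_release[i], 0));
    return 0;
}
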
@@ -235,11 +224,16 @@ int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
235 return pfair_higher_prio(bheap2task(a), bheap2task(b)); 224 return pfair_higher_prio(bheap2task(a), bheap2task(b));
236} 225}
237 226
238/* return the proper release queue for time t */ 227static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
239static struct bheap* relq(struct pfair_cluster* cluster, quanta_t t)
240{ 228{
241 struct bheap* rq = cluster->release_queue + (t % PFAIR_MAX_PERIOD); 229 struct pfair_cluster* cluster = from_domain(rt);
242 return rq; 230 unsigned long flags;
231
232 raw_spin_lock_irqsave(&cluster->release_lock, flags);
233
234 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
235
236 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
243} 237}
244 238
245static void prepare_release(struct task_struct* t, quanta_t at) 239static void prepare_release(struct task_struct* t, quanta_t at)
@@ -248,25 +242,12 @@ static void prepare_release(struct task_struct* t, quanta_t at)
248 tsk_pfair(t)->cur = 0; 242 tsk_pfair(t)->cur = 0;
249} 243}
250 244
251static void __pfair_add_release(struct task_struct* t, struct bheap* queue)
252{
253 bheap_insert(pfair_ready_order, queue,
254 tsk_rt(t)->heap_node);
255}
256
257static void pfair_add_release(struct pfair_cluster* cluster,
258 struct task_struct* t)
259{
260 BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
261 __pfair_add_release(t, relq(cluster, cur_release(t)));
262}
263
264/* pull released tasks from the release queue */ 245/* pull released tasks from the release queue */
265static void poll_releases(struct pfair_cluster* cluster, 246static void poll_releases(struct pfair_cluster* cluster)
266 quanta_t time)
267{ 247{
268 __merge_ready(&cluster->pfair, relq(cluster, time)); 248 raw_spin_lock(&cluster->release_lock);
269 cluster->merge_time = time; 249 __merge_ready(&cluster->pfair, &cluster->release_queue);
250 raw_spin_unlock(&cluster->release_lock);
270} 251}
271 252
272static void check_preempt(struct task_struct* t) 253static void check_preempt(struct task_struct* t)
@@ -292,16 +273,12 @@ static void drop_all_references(struct task_struct *t)
292{ 273{
293 int cpu; 274 int cpu;
294 struct pfair_state* s; 275 struct pfair_state* s;
295 struct bheap* q;
296 struct pfair_cluster* cluster; 276 struct pfair_cluster* cluster;
297 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { 277 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
298 /* figure out what queue the node is in */ 278 /* It must be in the ready queue; drop references isn't called
279 * when the job is in a release queue. */
299 cluster = tsk_pfair(t)->cluster; 280 cluster = tsk_pfair(t)->cluster;
300 if (time_before_eq(cur_release(t), cluster->merge_time)) 281 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
301 q = &cluster->pfair.ready_queue;
302 else
303 q = relq(cluster, cur_release(t));
304 bheap_delete(pfair_ready_order, q,
305 tsk_rt(t)->heap_node); 282 tsk_rt(t)->heap_node);
306 } 283 }
307 for (cpu = 0; cpu < num_online_cpus(); cpu++) { 284 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
@@ -313,6 +290,17 @@ static void drop_all_references(struct task_struct *t)
313 if (s->scheduled == t) 290 if (s->scheduled == t)
314 s->scheduled = NULL; 291 s->scheduled = NULL;
315 } 292 }
293 /* make sure we don't have a stale linked_on field */
294 tsk_rt(t)->linked_on = NO_CPU;
295}
296
297static void pfair_prepare_next_period(struct task_struct* t)
298{
299 struct pfair_param* p = tsk_pfair(t);
300
301 prepare_for_next_period(t);
302 get_rt_flags(t) = RT_F_RUNNING;
303 p->release += p->period;
316} 304}
317 305
318/* returns 1 if the task needs to go the release queue */ 306/* returns 1 if the task needs to go the release queue */
@@ -322,30 +310,26 @@ static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
322 int to_relq; 310 int to_relq;
323 p->cur = (p->cur + 1) % p->quanta; 311 p->cur = (p->cur + 1) % p->quanta;
324 if (!p->cur) { 312 if (!p->cur) {
325 sched_trace_task_completion(t, 1);
326 if (tsk_rt(t)->present) { 313 if (tsk_rt(t)->present) {
327 /* we start a new job */ 314 /* The job overran; we start a new budget allocation. */
328 prepare_for_next_period(t); 315 pfair_prepare_next_period(t);
329 sched_trace_task_release(t);
330 get_rt_flags(t) = RT_F_RUNNING;
331 p->release += p->period;
332 } else { 316 } else {
333 /* remove task from system until it wakes */ 317 /* remove task from system until it wakes */
334 drop_all_references(t); 318 drop_all_references(t);
319 tsk_rt(t)->flags = RT_F_REQUEUE;
335 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n", 320 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
336 cpu, p->cur); 321 cpu, p->cur);
337 return 0; 322 return 0;
338 } 323 }
339 } 324 }
340 to_relq = time_after(cur_release(t), time); 325 to_relq = time_after(cur_release(t), time);
341 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d\n", 326 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
342 cpu, p->cur, to_relq); 327 cpu, p->cur, to_relq, cur_release(t), time);
343 return to_relq; 328 return to_relq;
344} 329}
345 330
346static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time) 331static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
347{ 332{
348 int missed;
349 struct task_struct* l; 333 struct task_struct* l;
350 struct pfair_param* p; 334 struct pfair_param* p;
351 struct list_head* pos; 335 struct list_head* pos;
@@ -354,14 +338,17 @@ static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
354 list_for_each(pos, &cluster->topology.cpus) { 338 list_for_each(pos, &cluster->topology.cpus) {
355 cpu = from_cluster_list(pos); 339 cpu = from_cluster_list(pos);
356 l = cpu->linked; 340 l = cpu->linked;
357 missed = cpu->linked != cpu->local; 341 cpu->missed_updates += cpu->linked != cpu->local;
358 if (l) { 342 if (l) {
359 p = tsk_pfair(l); 343 p = tsk_pfair(l);
360 p->last_quantum = time; 344 p->last_quantum = time;
361 p->last_cpu = cpu_id(cpu); 345 p->last_cpu = cpu_id(cpu);
362 if (advance_subtask(time, l, cpu_id(cpu))) { 346 if (advance_subtask(time, l, cpu_id(cpu))) {
363 cpu->linked = NULL; 347 //cpu->linked = NULL;
364 pfair_add_release(cluster, l); 348 PTRACE_TASK(l, "should go to release queue. "
349 "scheduled_on=%d present=%d\n",
350 tsk_rt(l)->scheduled_on,
351 tsk_rt(l)->present);
365 } 352 }
366 } 353 }
367 } 354 }
@@ -445,6 +432,11 @@ static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
445 list_for_each(pos, &cluster->topology.cpus) { 432 list_for_each(pos, &cluster->topology.cpus) {
446 cpu_state = from_cluster_list(pos); 433 cpu_state = from_cluster_list(pos);
447 retry = 1; 434 retry = 1;
435#ifdef CONFIG_RELEASE_MASTER
436 /* skip release master */
437 if (cluster->pfair.release_master == cpu_id(cpu_state))
438 continue;
439#endif
448 while (retry) { 440 while (retry) {
449 if (pfair_higher_prio(__peek_ready(&cluster->pfair), 441 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
450 cpu_state->linked)) 442 cpu_state->linked))
@@ -471,13 +463,13 @@ static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
471 sched_trace_quantum_boundary(); 463 sched_trace_quantum_boundary();
472 464
473 advance_subtasks(cluster, time); 465 advance_subtasks(cluster, time);
474 poll_releases(cluster, time); 466 poll_releases(cluster);
475 schedule_subtasks(cluster, time); 467 schedule_subtasks(cluster, time);
476 468
477 list_for_each(pos, &cluster->topology.cpus) { 469 list_for_each(pos, &cluster->topology.cpus) {
478 cpu = from_cluster_list(pos); 470 cpu = from_cluster_list(pos);
479 if (cpu->linked) 471 if (cpu->linked)
480 PTRACE_TASK(pstate[cpu]->linked, 472 PTRACE_TASK(cpu->linked,
481 " linked on %d.\n", cpu_id(cpu)); 473 " linked on %d.\n", cpu_id(cpu));
482 else 474 else
483 PTRACE("(null) linked on %d.\n", cpu_id(cpu)); 475 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
@@ -612,12 +604,42 @@ static int safe_to_schedule(struct task_struct* t, int cpu)
612static struct task_struct* pfair_schedule(struct task_struct * prev) 604static struct task_struct* pfair_schedule(struct task_struct * prev)
613{ 605{
614 struct pfair_state* state = &__get_cpu_var(pfair_state); 606 struct pfair_state* state = &__get_cpu_var(pfair_state);
615 int blocks; 607 struct pfair_cluster* cluster = cpu_cluster(state);
608 int blocks, completion, out_of_time;
616 struct task_struct* next = NULL; 609 struct task_struct* next = NULL;
617 610
611#ifdef CONFIG_RELEASE_MASTER
612 /* Bail out early if we are the release master.
613 * The release master never schedules any real-time tasks.
614 */
615 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
616 sched_state_task_picked();
617 return NULL;
618 }
619#endif
620
618 raw_spin_lock(cpu_lock(state)); 621 raw_spin_lock(cpu_lock(state));
619 622
620 blocks = is_realtime(prev) && !is_running(prev); 623 blocks = is_realtime(prev) && !is_running(prev);
624 completion = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP;
625 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
626 state->local_tick);
627
628 if (is_realtime(prev))
629 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
630 blocks, completion, out_of_time);
631
632 if (completion) {
633 sched_trace_task_completion(prev, 0);
634 pfair_prepare_next_period(prev);
635 prepare_release(prev, cur_release(prev));
636 }
637
638 if (!blocks && (completion || out_of_time)) {
639 drop_all_references(prev);
640 sched_trace_task_release(prev);
641 add_release(&cluster->pfair, prev);
642 }
621 643
622 if (state->local && safe_to_schedule(state->local, cpu_id(state))) 644 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
623 next = state->local; 645 next = state->local;
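pfair_schedule() now handles job completions and budget exhaustion itself rather than leaving everything to the quantum tick: blocks, completion (RT_F_SLEEP) and out_of_time are computed up front, a completing job immediately gets its next period prepared, and any non-blocked task that completed or ran out of budget is stripped of queue references and pushed onto the cluster's release heap. The sketch below restates that requeue decision; the three flags are assumed to be computed exactly as in the hunk above, and note that in the actual code the completion bookkeeping runs even when the task also blocks, only the requeue is skipped.

/* Sketch only: restates the requeue decision made in pfair_schedule(). */
enum prev_action { PREV_KEEP, PREV_WAIT_FOR_WAKEUP, PREV_REQUEUE };

static enum prev_action classify_prev(int blocks, int completion,
				      int out_of_time)
{
	if (blocks)
		return PREV_WAIT_FOR_WAKEUP;  /* pfair_task_wake_up() requeues later */
	if (completion || out_of_time)
		return PREV_REQUEUE;          /* drop references, add to release heap */
	return PREV_KEEP;                     /* keep competing as-is */
}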
@@ -649,13 +671,19 @@ static void pfair_task_new(struct task_struct * t, int on_rq, int running)
649 cluster = tsk_pfair(t)->cluster; 671 cluster = tsk_pfair(t)->cluster;
650 672
651 raw_spin_lock_irqsave(cluster_lock(cluster), flags); 673 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
652 if (running)
653 t->rt_param.scheduled_on = task_cpu(t);
654 else
655 t->rt_param.scheduled_on = NO_CPU;
656 674
657 prepare_release(t, cluster->pfair_time + 1); 675 prepare_release(t, cluster->pfair_time + 1);
658 pfair_add_release(cluster, t); 676
677 t->rt_param.scheduled_on = NO_CPU;
678
679 if (running) {
680#ifdef CONFIG_RELEASE_MASTER
681 if (task_cpu(t) != cluster->pfair.release_master)
682#endif
683 t->rt_param.scheduled_on = task_cpu(t);
684 __add_ready(&cluster->pfair, t);
685 }
686
659 check_preempt(t); 687 check_preempt(t);
660 688
661 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); 689 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
@@ -665,6 +693,7 @@ static void pfair_task_wake_up(struct task_struct *t)
665{ 693{
666 unsigned long flags; 694 unsigned long flags;
667 lt_t now; 695 lt_t now;
696 int requeue = 0;
668 struct pfair_cluster* cluster; 697 struct pfair_cluster* cluster;
669 698
670 cluster = tsk_pfair(t)->cluster; 699 cluster = tsk_pfair(t)->cluster;
@@ -679,13 +708,20 @@ static void pfair_task_wake_up(struct task_struct *t)
679 * (as if it never blocked at all). Otherwise, we have a 708 * (as if it never blocked at all). Otherwise, we have a
680 * new sporadic job release. 709 * new sporadic job release.
681 */ 710 */
711 requeue = tsk_rt(t)->flags == RT_F_REQUEUE;
682 now = litmus_clock(); 712 now = litmus_clock();
683 if (lt_before(get_deadline(t), now)) { 713 if (lt_before(get_deadline(t), now)) {
714 TRACE_TASK(t, "sporadic release!\n");
684 release_at(t, now); 715 release_at(t, now);
685 prepare_release(t, time2quanta(now, CEIL)); 716 prepare_release(t, time2quanta(now, CEIL));
686 sched_trace_task_release(t); 717 sched_trace_task_release(t);
687 /* FIXME: race with pfair_time advancing */ 718 }
688 pfair_add_release(cluster, t); 719
720 /* only add to ready queue if the task isn't still linked somewhere */
721 if (requeue) {
722 TRACE_TASK(t, "requeueing required\n");
723 tsk_rt(t)->flags = RT_F_RUNNING;
724 __add_ready(&cluster->pfair, t);
689 } 725 }
690 726
691 check_preempt(t); 727 check_preempt(t);
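The wake-up path now relies on the RT_F_REQUEUE tag introduced in advance_subtask(): a task that was dropped from all queues because it was not present is flagged RT_F_REQUEUE, and pfair_task_wake_up() re-adds it to the ready queue only if that tag is still set, so a task that is still linked on some CPU is not enqueued twice. A small stand-alone sketch of the handshake; the enum values stand in for the RT_F_* constants and are not the kernel definitions.

/* Sketch of the flag handshake; TOY_* values stand in for RT_F_*. */
enum toy_rt_flag { TOY_RUNNING, TOY_SLEEP, TOY_REQUEUE };

struct toy_task {
	enum toy_rt_flag flags;
	int queued;
};

/* what drop_all_references() + the !present branch of advance_subtask() do */
static void toy_drop_not_present(struct toy_task *t)
{
	t->queued = 0;
	t->flags = TOY_REQUEUE;           /* remember that a requeue is owed */
}

/* what pfair_task_wake_up() now does */
static void toy_wake_up(struct toy_task *t)
{
	if (t->flags == TOY_REQUEUE) {    /* requeue only if nobody else will */
		t->flags = TOY_RUNNING;
		t->queued = 1;            /* analogue of __add_ready() */
	}
}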
@@ -744,15 +780,11 @@ static void pfair_release_at(struct task_struct* task, lt_t start)
744 release_at(task, start); 780 release_at(task, start);
745 release = time2quanta(start, CEIL); 781 release = time2quanta(start, CEIL);
746 782
747 /* FIXME: support arbitrary offsets. */
748 if (release - cluster->pfair_time >= PFAIR_MAX_PERIOD)
749 release = cluster->pfair_time + PFAIR_MAX_PERIOD;
750
751 TRACE_TASK(task, "sys release at %lu\n", release); 783 TRACE_TASK(task, "sys release at %lu\n", release);
752 784
753 drop_all_references(task); 785 drop_all_references(task);
754 prepare_release(task, release); 786 prepare_release(task, release);
755 pfair_add_release(cluster, task); 787 add_release(&cluster->pfair, task);
756 788
757 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); 789 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
758} 790}
@@ -834,13 +866,6 @@ static long pfair_admit_task(struct task_struct* t)
834 "The period of %s/%d is not a multiple of %llu.\n", 866 "The period of %s/%d is not a multiple of %llu.\n",
835 t->comm, t->pid, (unsigned long long) quantum_length); 867 t->comm, t->pid, (unsigned long long) quantum_length);
836 868
837 if (period >= PFAIR_MAX_PERIOD) {
838 printk(KERN_WARNING
839 "PFAIR: Rejecting task %s/%d; its period is too long.\n",
840 t->comm, t->pid);
841 return -EINVAL;
842 }
843
844 if (quanta == period) { 869 if (quanta == period) {
845 /* special case: task has weight 1.0 */ 870 /* special case: task has weight 1.0 */
846 printk(KERN_INFO 871 printk(KERN_INFO
@@ -880,12 +905,9 @@ static long pfair_admit_task(struct task_struct* t)
880 905
881static void pfair_init_cluster(struct pfair_cluster* cluster) 906static void pfair_init_cluster(struct pfair_cluster* cluster)
882{ 907{
883 int i; 908 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
884 909 bheap_init(&cluster->release_queue);
885 /* initialize release queue */ 910 raw_spin_lock_init(&cluster->release_lock);
886 for (i = 0; i < PFAIR_MAX_PERIOD; i++)
887 bheap_init(&cluster->release_queue[i]);
888 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, NULL);
889 INIT_LIST_HEAD(&cluster->topology.cpus); 911 INIT_LIST_HEAD(&cluster->topology.cpus);
890} 912}
891 913
@@ -899,8 +921,11 @@ static void cleanup_clusters(void)
899 num_pfair_clusters = 0; 921 num_pfair_clusters = 0;
900 922
901 /* avoid stale pointers */ 923 /* avoid stale pointers */
902 for (i = 0; i < NR_CPUS; i++) 924 for (i = 0; i < num_online_cpus(); i++) {
903 pstate[i]->topology.cluster = NULL; 925 pstate[i]->topology.cluster = NULL;
926 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
927 pstate[i]->missed_updates, pstate[i]->missed_quanta);
928 }
904} 929}
905 930
906static long pfair_activate_plugin(void) 931static long pfair_activate_plugin(void)
@@ -936,6 +961,9 @@ static long pfair_activate_plugin(void)
936 pfair_init_cluster(cluster); 961 pfair_init_cluster(cluster);
937 cluster->pfair_time = now; 962 cluster->pfair_time = now;
938 clust[i] = &cluster->topology; 963 clust[i] = &cluster->topology;
964#ifdef CONFIG_RELEASE_MASTER
965 cluster->pfair.release_master = atomic_read(&release_master_cpu);
966#endif
939 } 967 }
940 968
941 for (i = 0; i < num_online_cpus(); i++) { 969 for (i = 0; i < num_online_cpus(); i++) {
@@ -943,6 +971,7 @@ static long pfair_activate_plugin(void)
943 state->cur_tick = now; 971 state->cur_tick = now;
944 state->local_tick = now; 972 state->local_tick = now;
945 state->missed_quanta = 0; 973 state->missed_quanta = 0;
974 state->missed_updates = 0;
946 state->offset = cpu_stagger_offset(i); 975 state->offset = cpu_stagger_offset(i);
947 printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus()); 976 printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus());
948 cpus[i] = &state->topology; 977 cpus[i] = &state->topology;
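Taken together, the sched_pfair.c changes retire the old per-cluster ring of release buckets: the relq() lookup in drop_all_references(), the PFAIR_MAX_PERIOD clamp in pfair_release_at(), the admission-time rejection of long periods, and the PFAIR_MAX_PERIOD-sized heap array in pfair_init_cluster() are all replaced by the rt_domain's own release handling (add_release() plus pfair_release_jobs). The removed structure is not shown in full here, but the period cap only makes sense if releases were hashed into a fixed ring of buckets by release quantum, roughly as in the sketch below; the modulo indexing is an assumption about the removed relq(), not something visible in this diff.

#include <stdio.h>

/* Toy version of a fixed ring of release buckets.  Two releases that are
 * TOY_MAX_PERIOD quanta apart map to the same bucket, which is why the
 * removed code had to reject periods >= PFAIR_MAX_PERIOD.  A single
 * time-ordered release queue (the rt_domain used now) has no such cap. */
#define TOY_MAX_PERIOD 8

static unsigned int toy_relq_slot(unsigned long release_quantum)
{
	return release_quantum % TOY_MAX_PERIOD;
}

int main(void)
{
	printf("release 5  -> bucket %u\n", toy_relq_slot(5));
	printf("release 13 -> bucket %u\n", toy_relq_slot(13)); /* collides with 5 */
	return 0;
}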
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
index 4eaf30659af3..123c7516fb76 100644
--- a/litmus/sched_plugin.c
+++ b/litmus/sched_plugin.c
@@ -35,29 +35,18 @@ void preempt_if_preemptable(struct task_struct* t, int cpu)
35 /* local CPU case */ 35 /* local CPU case */
36 /* check if we need to poke userspace */ 36 /* check if we need to poke userspace */
37 if (is_user_np(t)) 37 if (is_user_np(t))
38 /* yes, poke it */ 38 /* Yes, poke it. This doesn't have to be atomic since
39 * the task is definitely not executing. */
39 request_exit_np(t); 40 request_exit_np(t);
40 else if (!is_kernel_np(t)) 41 else if (!is_kernel_np(t))
41 /* only if we are allowed to preempt the 42 /* only if we are allowed to preempt the
42 * currently-executing task */ 43 * currently-executing task */
43 reschedule = 1; 44 reschedule = 1;
44 } else { 45 } else {
45 /* remote CPU case */ 46 /* Remote CPU case. Only notify if it's not a kernel
46 if (is_user_np(t)) { 47 * NP section and if we didn't set the userspace
47 /* need to notify user space of delayed 48 * flag. */
48 * preemption */ 49 reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
49
50 /* to avoid a race, set the flag, then test
51 * again */
52 request_exit_np(t);
53 /* make sure it got written */
54 mb();
55 }
56 /* Only send an ipi if remote task might have raced our
57 * request, i.e., send an IPI to make sure in case it
58 * exited its critical section.
59 */
60 reschedule = !is_np(t) && !is_kernel_np(t);
61 } 50 }
62 } 51 }
63 if (likely(reschedule)) 52 if (likely(reschedule))
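The remote-CPU branch of preempt_if_preemptable() used to set the userspace exit-np flag, issue mb(), and then re-test whether an IPI was still required; that open-coded sequence is now collapsed into request_exit_np_atomic(). Below is a generic userspace illustration of the flag-then-check idea using C11 atomics. It is not the LITMUS^RT control-page layout, and the return-value semantics of request_exit_np_atomic() are inferred from its use here: true when the target will notice the request on its own, so the IPI can be skipped.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: one flag owned by the task itself (am I inside a
 * non-preemptive section?) and one set by a remote scheduler (please
 * reschedule when you leave).  This is not the real ctrl-page layout. */
struct toy_np_ctrl {
	atomic_bool np_section;
	atomic_bool exit_request;
};

/* Set the request, then check whether the task is still inside its NP
 * section.  Returns true when the task will see the request as it leaves,
 * so the caller may skip the IPI; false means a reschedule IPI is needed.
 * The sequentially consistent C11 atomics stand in for the removed mb(). */
static bool toy_request_exit_np_atomic(struct toy_np_ctrl *c)
{
	atomic_store(&c->exit_request, true);
	return atomic_load(&c->np_section);
}

int main(void)
{
	struct toy_np_ctrl c;

	atomic_init(&c.np_section, true);
	atomic_init(&c.exit_request, false);
	printf("skip IPI? %d\n", toy_request_exit_np_atomic(&c));   /* prints 1 */
	return 0;
}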
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
index e66397b7e8a2..eaaec38f43da 100644
--- a/litmus/sched_psn_edf.c
+++ b/litmus/sched_psn_edf.c
@@ -20,6 +20,7 @@
20#include <litmus/sched_plugin.h> 20#include <litmus/sched_plugin.h>
21#include <litmus/edf_common.h> 21#include <litmus/edf_common.h>
22#include <litmus/sched_trace.h> 22#include <litmus/sched_trace.h>
23#include <litmus/trace.h>
23 24
24typedef struct { 25typedef struct {
25 rt_domain_t domain; 26 rt_domain_t domain;
@@ -386,12 +387,6 @@ static unsigned int psnedf_get_srp_prio(struct task_struct* t)
386 return get_rt_period(t); 387 return get_rt_period(t);
387} 388}
388 389
389static long psnedf_activate_plugin(void)
390{
391 get_srp_prio = psnedf_get_srp_prio;
392 return 0;
393}
394
395/* ******************** FMLP support ********************** */ 390/* ******************** FMLP support ********************** */
396 391
397/* struct for semaphore with priority inheritance */ 392/* struct for semaphore with priority inheritance */
@@ -431,6 +426,8 @@ int psnedf_fmlp_lock(struct litmus_lock* l)
431 426
432 __add_wait_queue_tail_exclusive(&sem->wait, &wait); 427 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
433 428
429 TS_LOCK_SUSPEND;
430
434 /* release lock before sleeping */ 431 /* release lock before sleeping */
435 spin_unlock_irqrestore(&sem->wait.lock, flags); 432 spin_unlock_irqrestore(&sem->wait.lock, flags);
436 433
@@ -441,14 +438,12 @@ int psnedf_fmlp_lock(struct litmus_lock* l)
441 438
442 schedule(); 439 schedule();
443 440
441 TS_LOCK_RESUME;
442
444 /* Since we hold the lock, no other task will change 443 /* Since we hold the lock, no other task will change
445 * ->owner. We can thus check it without acquiring the spin 444 * ->owner. We can thus check it without acquiring the spin
446 * lock. */ 445 * lock. */
447 BUG_ON(sem->owner != t); 446 BUG_ON(sem->owner != t);
448
449 /* FIXME: could we punt the dequeuing to the previous job,
450 * which is holding the spinlock anyway? */
451 remove_wait_queue(&sem->wait, &wait);
452 } else { 447 } else {
453 /* it's ours now */ 448 /* it's ours now */
454 sem->owner = t; 449 sem->owner = t;
@@ -481,7 +476,7 @@ int psnedf_fmlp_unlock(struct litmus_lock* l)
481 unboost_priority(t); 476 unboost_priority(t);
482 477
483 /* check if there are jobs waiting for this resource */ 478 /* check if there are jobs waiting for this resource */
484 next = waitqueue_first(&sem->wait); 479 next = __waitqueue_remove_first(&sem->wait);
485 if (next) { 480 if (next) {
486 /* boost next job */ 481 /* boost next job */
487 boost_priority(next); 482 boost_priority(next);
@@ -584,9 +579,35 @@ static long psnedf_allocate_lock(struct litmus_lock **lock, int type,
584 579
585#endif 580#endif
586 581
582
583static long psnedf_activate_plugin(void)
584{
585#ifdef CONFIG_RELEASE_MASTER
586 int cpu;
587
588 for_each_online_cpu(cpu) {
589 remote_edf(cpu)->release_master = atomic_read(&release_master_cpu);
590 }
591#endif
592
593#ifdef CONFIG_LITMUS_LOCKING
594 get_srp_prio = psnedf_get_srp_prio;
595#endif
596
597 return 0;
598}
599
587static long psnedf_admit_task(struct task_struct* tsk) 600static long psnedf_admit_task(struct task_struct* tsk)
588{ 601{
589 return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; 602 if (task_cpu(tsk) == tsk->rt_param.task_params.cpu
603#ifdef CONFIG_RELEASE_MASTER
604 /* don't allow tasks on release master CPU */
605 && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master
606#endif
607 )
608 return 0;
609 else
610 return -EINVAL;
590} 611}
591 612
592/* Plugin object */ 613/* Plugin object */
@@ -600,9 +621,9 @@ static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
600 .task_wake_up = psnedf_task_wake_up, 621 .task_wake_up = psnedf_task_wake_up,
601 .task_block = psnedf_task_block, 622 .task_block = psnedf_task_block,
602 .admit_task = psnedf_admit_task, 623 .admit_task = psnedf_admit_task,
624 .activate_plugin = psnedf_activate_plugin,
603#ifdef CONFIG_LITMUS_LOCKING 625#ifdef CONFIG_LITMUS_LOCKING
604 .allocate_lock = psnedf_allocate_lock, 626 .allocate_lock = psnedf_allocate_lock,
605 .activate_plugin = psnedf_activate_plugin,
606#endif 627#endif
607}; 628};
608 629
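The TS_LOCK_SUSPEND / TS_LOCK_RESUME pair added around the blocking path of psnedf_fmlp_lock() brackets the suspension so that it appears as its own interval in the overhead trace; these are assumed to be the usual feather-trace timestamp macros from litmus/trace.h, which this file now includes. A userspace analogue of the pattern, using CLOCK_MONOTONIC in place of feather-trace timestamps:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static long long ns_now(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	long long suspend = ns_now();   /* ~ TS_LOCK_SUSPEND, before giving up the CPU */
	usleep(1000);                   /* stand-in for schedule() while the lock is held */
	long long resume = ns_now();    /* ~ TS_LOCK_RESUME, after being woken as owner */

	printf("suspended for %lld ns\n", resume - suspend);
	return 0;
}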
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
index f923280b3146..8c1ca188bce1 100644
--- a/litmus/sched_task_trace.c
+++ b/litmus/sched_task_trace.c
@@ -191,7 +191,7 @@ feather_callback void do_sched_trace_task_completion(unsigned long id,
191 struct task_struct *t = (struct task_struct*) _task; 191 struct task_struct *t = (struct task_struct*) _task;
192 struct st_event_record* rec = get_record(ST_COMPLETION, t); 192 struct st_event_record* rec = get_record(ST_COMPLETION, t);
193 if (rec) { 193 if (rec) {
194 rec->data.completion.when = now(); 194 rec->data.completion.when = get_exec_time(t);
195 rec->data.completion.forced = forced; 195 rec->data.completion.forced = forced;
196 put_record(rec); 196 put_record(rec);
197 } 197 }
@@ -231,6 +231,38 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
231 } 231 }
232} 232}
233 233
234feather_callback void do_sched_trace_task_exit(unsigned long id,
235 unsigned long _task)
236{
237 struct task_struct *t = (struct task_struct*) _task;
238#ifdef CONFIG_PLUGIN_COLOR
239 const lt_t max_exec_time = tsk_rt(t)->max_exec_time;
240 const lt_t avg_exec_time = tsk_rt(t)->tot_exec_time / (get_rt_job(t) - 1);
241#else
242 const lt_t max_exec_time = 0;
243 const lt_t avg_exec_time = 0;
244#endif
245 struct st_event_record *rec = get_record(ST_TASK_EXIT, t);
246 if (rec) {
247 rec->data.task_exit.avg_exec_time = avg_exec_time;
248 rec->data.task_exit.max_exec_time = max_exec_time;
249 put_record(rec);
250 }
251}
252
253feather_callback void do_sched_trace_task_tardy(unsigned long id,
254 unsigned long _task)
255{
256 struct task_struct *t = (struct task_struct*) _task;
257 struct st_event_record *rec = get_record(ST_TASK_TARDY, t);
258 if (rec) {
259 rec->data.task_tardy.max_tardy = tsk_rt(t)->max_tardy;
260 rec->data.task_tardy.total_tardy = tsk_rt(t)->total_tardy;
261 rec->data.task_tardy.missed = tsk_rt(t)->missed;
262 put_record(rec);
263 }
264}
265
234feather_callback void do_sched_trace_action(unsigned long id, 266feather_callback void do_sched_trace_action(unsigned long id,
235 unsigned long _task, 267 unsigned long _task,
236 unsigned long action) 268 unsigned long action)
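The two new trace records export per-task summaries at exit time: ST_TASK_EXIT carries the maximum and average execution time (the average divides tot_exec_time by get_rt_job(t) - 1), and ST_TASK_TARDY carries maximum tardiness, total tardiness, and the number of missed deadlines. The sketch below shows the kind of accumulation those records summarize; the field names mirror the rt_param members used above, but the update sites are assumptions, since the code that maintains them (in the color plugin's completion path) is not part of this hunk.

#include <stdio.h>

typedef unsigned long long lt_t;

struct toy_stats {
	lt_t tot_exec_time, max_exec_time;
	lt_t total_tardy, max_tardy;
	unsigned int missed;     /* deadline misses */
	unsigned int jobs;       /* analogue of get_rt_job(t) */
};

/* assumed update site: called once per completed job */
static void toy_job_complete(struct toy_stats *s, lt_t exec, lt_t tardy)
{
	s->jobs++;
	s->tot_exec_time += exec;
	if (exec > s->max_exec_time)
		s->max_exec_time = exec;
	if (tardy) {
		s->missed++;
		s->total_tardy += tardy;
		if (tardy > s->max_tardy)
			s->max_tardy = tardy;
	}
}

static void toy_task_exit(const struct toy_stats *s)
{
	/* mirrors do_sched_trace_task_exit(): average over jobs - 1,
	 * presumably to discount the job still in flight at exit time
	 * (the guard is only for this toy) */
	if (s->jobs > 1)
		printf("avg=%llu max=%llu\n",
		       s->tot_exec_time / (s->jobs - 1), s->max_exec_time);
}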
diff --git a/litmus/trace.c b/litmus/trace.c
index 209524fd345e..4722ffa443c6 100644
--- a/litmus/trace.c
+++ b/litmus/trace.c
@@ -1,5 +1,6 @@
1#include <linux/sched.h> 1#include <linux/sched.h>
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/uaccess.h>
3 4
4#include <litmus/ftdev.h> 5#include <litmus/ftdev.h>
5#include <litmus/litmus.h> 6#include <litmus/litmus.h>
@@ -19,6 +20,35 @@ static struct ftdev overhead_dev;
19 20
20static unsigned int ts_seq_no = 0; 21static unsigned int ts_seq_no = 0;
21 22
23DEFINE_PER_CPU(atomic_t, irq_fired_count);
24
25static inline void clear_irq_fired(void)
26{
27 atomic_set(&__raw_get_cpu_var(irq_fired_count), 0);
28}
29
30static inline unsigned int get_and_clear_irq_fired(void)
31{
32 /* This is potentially not atomic since we might migrate if
33 * preemptions are not disabled. As a tradeoff between
34 * accuracy and tracing overheads, this seems acceptable.
35 * If it proves to be a problem, then one could add a callback
36 * from the migration code to invalidate irq_fired_count.
37 */
38 return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0);
39}
40
41static inline void __save_irq_flags(struct timestamp *ts)
42{
43 unsigned int irq_count;
44
45 irq_count = get_and_clear_irq_fired();
46 /* Store how many interrupts occurred. */
47 ts->irq_count = irq_count;
48 /* Extra flag because ts->irq_count overflows quickly. */
49 ts->irq_flag = irq_count > 0;
50}
51
22static inline void __save_timestamp_cpu(unsigned long event, 52static inline void __save_timestamp_cpu(unsigned long event,
23 uint8_t type, uint8_t cpu) 53 uint8_t type, uint8_t cpu)
24{ 54{
@@ -27,10 +57,26 @@ static inline void __save_timestamp_cpu(unsigned long event,
27 seq_no = fetch_and_inc((int *) &ts_seq_no); 57 seq_no = fetch_and_inc((int *) &ts_seq_no);
28 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { 58 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
29 ts->event = event; 59 ts->event = event;
30 ts->timestamp = ft_timestamp();
31 ts->seq_no = seq_no; 60 ts->seq_no = seq_no;
32 ts->cpu = cpu; 61 ts->cpu = cpu;
33 ts->task_type = type; 62 ts->task_type = type;
63 __save_irq_flags(ts);
64 barrier();
65 /* prevent re-ordering of ft_timestamp() */
66 ts->timestamp = ft_timestamp();
67 ft_buffer_finish_write(trace_ts_buf, ts);
68 }
69}
70
71static void __add_timestamp_user(struct timestamp *pre_recorded)
72{
73 unsigned int seq_no;
74 struct timestamp *ts;
75 seq_no = fetch_and_inc((int *) &ts_seq_no);
76 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
77 *ts = *pre_recorded;
78 ts->seq_no = seq_no;
79 __save_irq_flags(ts);
34 ft_buffer_finish_write(trace_ts_buf, ts); 80 ft_buffer_finish_write(trace_ts_buf, ts);
35 } 81 }
36} 82}
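The interrupt accounting added above works as follows: the interrupt path increments a per-CPU counter, and every recorded timestamp atomically swaps that counter back to zero, so each sample reports only the interrupts that fired since the previous timestamp; the separate one-bit irq_flag survives even when the narrow irq_count field wraps. A userspace sketch of the same scheme with C11 atomics; the per-CPU placement and the increment site (presumably an ft_irq_fired() hook) are assumptions not visible in this hunk.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint irq_fired_count;     /* per-CPU in the kernel */

static void toy_irq_handler(void)
{
	atomic_fetch_add(&irq_fired_count, 1);
}

static void toy_save_irq_flags(uint8_t *irq_count, uint8_t *irq_flag)
{
	unsigned int fired = atomic_exchange(&irq_fired_count, 0);

	*irq_count = (uint8_t)fired;    /* narrow field: may wrap quickly... */
	*irq_flag  = fired > 0;         /* ...so keep a sticky "any at all?" bit */
}

int main(void)
{
	uint8_t count, flag;

	for (int i = 0; i < 300; i++)
		toy_irq_handler();
	toy_save_irq_flags(&count, &flag);
	printf("irq_count=%u irq_flag=%u\n", (unsigned)count, (unsigned)flag);  /* 44 1 */
	return 0;
}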
@@ -108,6 +154,27 @@ feather_callback void save_timestamp_cpu(unsigned long event,
108 __save_timestamp_cpu(event, TSK_UNKNOWN, cpu); 154 __save_timestamp_cpu(event, TSK_UNKNOWN, cpu);
109} 155}
110 156
157feather_callback void save_task_latency(unsigned long event,
158 unsigned long when_ptr)
159{
160 lt_t now = litmus_clock();
161 lt_t *when = (lt_t*) when_ptr;
162 unsigned int seq_no;
163 int cpu = raw_smp_processor_id();
164 struct timestamp *ts;
165
166 seq_no = fetch_and_inc((int *) &ts_seq_no);
167 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
168 ts->event = event;
169 ts->timestamp = now - *when;
170 ts->seq_no = seq_no;
171 ts->cpu = cpu;
172 ts->task_type = TSK_RT;
173 __save_irq_flags(ts);
174 ft_buffer_finish_write(trace_ts_buf, ts);
175 }
176}
177
111/******************************************************************************/ 178/******************************************************************************/
112/* DEVICE FILE DRIVER */ 179/* DEVICE FILE DRIVER */
113/******************************************************************************/ 180/******************************************************************************/
@@ -116,11 +183,15 @@ feather_callback void save_timestamp_cpu(unsigned long event,
116 * should be 8M; it is the max we can ask to buddy system allocator (MAX_ORDER) 183 * should be 8M; it is the max we can ask to buddy system allocator (MAX_ORDER)
117 * and we might not get as much 184 * and we might not get as much
118 */ 185 */
119#define NO_TIMESTAMPS (2 << 13) 186#define NO_TIMESTAMPS (2 << 16)
120 187
121static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) 188static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
122{ 189{
123 unsigned int count = NO_TIMESTAMPS; 190 unsigned int count = NO_TIMESTAMPS;
191
192 /* An overhead-tracing timestamp should be exactly 16 bytes long. */
193 BUILD_BUG_ON(sizeof(struct timestamp) != 16);
194
124 while (count && !trace_ts_buf) { 195 while (count && !trace_ts_buf) {
125 printk("time stamp buffer: trying to allocate %u time stamps.\n", count); 196 printk("time stamp buffer: trying to allocate %u time stamps.\n", count);
126 ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp)); 197 ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
@@ -135,9 +206,35 @@ static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
135 ftdev->minor[idx].buf = NULL; 206 ftdev->minor[idx].buf = NULL;
136} 207}
137 208
209static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len,
210 const char __user *from)
211{
212 ssize_t consumed = 0;
213 struct timestamp ts;
214
215 /* don't give us partial timestamps */
216 if (len % sizeof(ts))
217 return -EINVAL;
218
219 while (len >= sizeof(ts)) {
220 if (copy_from_user(&ts, from, sizeof(ts))) {
221 consumed = -EFAULT;
222 goto out;
223 }
224 len -= sizeof(ts);
225 from += sizeof(ts);
226 consumed += sizeof(ts);
227
228 __add_timestamp_user(&ts);
229 }
230
231out:
232 return consumed;
233}
234
138static int __init init_ft_overhead_trace(void) 235static int __init init_ft_overhead_trace(void)
139{ 236{
140 int err; 237 int err, cpu;
141 238
142 printk("Initializing Feather-Trace overhead tracing device.\n"); 239 printk("Initializing Feather-Trace overhead tracing device.\n");
143 err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace"); 240 err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace");
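With overhead_dev.write wired up to write_timestamp_from_user(), userspace can append pre-recorded timestamps (for example, ones captured in userspace) to the same buffer that kernel-side events go to, as long as each write is a whole multiple of the 16-byte struct timestamp. A userspace sketch of such a write; the device path is an assumption, and the record contents are left zeroed because the actual layout lives in the litmus headers, not in this diff.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TS_SIZE 16   /* matches the BUILD_BUG_ON above */

int main(void)
{
	unsigned char recs[2 * TS_SIZE];

	memset(recs, 0, sizeof(recs));   /* real code would fill in event, cpu, ... */

	int fd = open("/dev/litmus/ft_trace0", O_WRONLY);   /* assumed path */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* must be a multiple of TS_SIZE, otherwise the kernel returns -EINVAL */
	ssize_t n = write(fd, recs, sizeof(recs));
	printf("wrote %zd bytes\n", n);
	close(fd);
	return 0;
}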
@@ -146,11 +243,17 @@ static int __init init_ft_overhead_trace(void)
146 243
147 overhead_dev.alloc = alloc_timestamp_buffer; 244 overhead_dev.alloc = alloc_timestamp_buffer;
148 overhead_dev.free = free_timestamp_buffer; 245 overhead_dev.free = free_timestamp_buffer;
246 overhead_dev.write = write_timestamp_from_user;
149 247
150 err = register_ftdev(&overhead_dev); 248 err = register_ftdev(&overhead_dev);
151 if (err) 249 if (err)
152 goto err_dealloc; 250 goto err_dealloc;
153 251
252 /* initialize IRQ flags */
253 for (cpu = 0; cpu < NR_CPUS; cpu++) {
254 clear_irq_fired();
255 }
256
154 return 0; 257 return 0;
155 258
156err_dealloc: 259err_dealloc: