author     Glenn Elliott <gelliott@cs.unc.edu>  2013-03-06 14:20:55 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2013-03-06 14:20:55 -0500
commit     22da1b2b4f02413e58bf01caa5b14e42e7913598 (patch)
tree       6e4022a5140e682d287c4206550848300bb7986b
parent     da954aa12e99b502356ca62bff822cb6a95cba7a (diff)
parent     c7cd5432b98df518b05bc8978d34382797fd9a05 (diff)
Merge remote-tracking branch 'github/master' into wip-mmap-uncache
-rw-r--r--  arch/x86/include/asm/feather_trace_32.h    96
-rw-r--r--  arch/x86/include/asm/feather_trace_64.h   101
-rw-r--r--  arch/x86/kernel/smp.c                      16
-rw-r--r--  include/linux/completion.h                  1
-rw-r--r--  include/litmus/binheap.h                  206
-rw-r--r--  include/litmus/budget.h                    27
-rw-r--r--  include/litmus/debug_trace.h                7
-rw-r--r--  include/litmus/fdso.h                      10
-rw-r--r--  include/litmus/fp_common.h                105
-rw-r--r--  include/litmus/fpmath.h                   147
-rw-r--r--  include/litmus/litmus.h                    77
-rw-r--r--  include/litmus/rt_param.h                  60
-rw-r--r--  include/litmus/sched_plugin.h               2
-rw-r--r--  include/litmus/sched_trace.h              105
-rw-r--r--  include/litmus/trace.h                     49
-rw-r--r--  include/litmus/trace_irq.h                  9
-rw-r--r--  include/litmus/wait.h                      57
-rw-r--r--  include/trace/events/litmus.h             231
-rw-r--r--  kernel/sched.c                             45
-rw-r--r--  kernel/sched_rt.c                          13
-rw-r--r--  kernel/softirq.c                            3
-rw-r--r--  litmus/Kconfig                             66
-rw-r--r--  litmus/Makefile                             5
-rw-r--r--  litmus/binheap.c                          388
-rw-r--r--  litmus/budget.c                             2
-rw-r--r--  litmus/ctrldev.c                           56
-rw-r--r--  litmus/edf_common.c                       108
-rw-r--r--  litmus/fdso.c                              26
-rw-r--r--  litmus/fp_common.c                        119
-rw-r--r--  litmus/ftdev.c                              9
-rw-r--r--  litmus/jobs.c                              30
-rw-r--r--  litmus/litmus.c                           129
-rw-r--r--  litmus/locking.c                           49
-rw-r--r--  litmus/preempt.c                            6
-rw-r--r--  litmus/rt_domain.c                         18
-rw-r--r--  litmus/sched_cedf.c                        47
-rw-r--r--  litmus/sched_gsn_edf.c                     54
-rw-r--r--  litmus/sched_litmus.c                       9
-rw-r--r--  litmus/sched_pfair.c                       19
-rw-r--r--  litmus/sched_pfp.c                       1711
-rw-r--r--  litmus/sched_psn_edf.c                     30
-rw-r--r--  litmus/sync.c                             106
-rw-r--r--  litmus/trace.c                            145
43 files changed, 4072 insertions, 427 deletions
diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h
index 70202f90f169..75e81a9f9382 100644
--- a/arch/x86/include/asm/feather_trace_32.h
+++ b/arch/x86/include/asm/feather_trace_32.h
@@ -1,12 +1,45 @@
1/* Copyright (c) 2007-2012 Björn Brandenburg, <bbb@mpi-sws.org>
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining
4 * a copy of this software and associated documentation files (the
5 * "Software"), to deal in the Software without restriction, including
6 * without limitation the rights to use, copy, modify, merge, publish,
7 * distribute, sublicense, and/or sell copies of the Software, and to
8 * permit persons to whom the Software is furnished to do so, subject to
9 * the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
1/* Do not directly include this file. Include feather_trace.h instead */ 24/* Do not directly include this file. Include feather_trace.h instead */
2 25
3#define feather_callback __attribute__((regparm(0))) 26#define feather_callback __attribute__((regparm(3))) __attribute__((used))
4 27
5/* 28/*
6 * make the compiler reload any register that is not saved in 29 * Make the compiler reload any register that is not saved in a cdecl function
7 * a cdecl function call 30 * call (minus the registers that we explicitly clobber as output registers).
8 */ 31 */
9#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" 32#define __FT_CLOBBER_LIST0 "memory", "cc", "eax", "edx", "ecx"
33#define __FT_CLOBBER_LIST1 "memory", "cc", "eax", "ecx"
34#define __FT_CLOBBER_LIST2 "memory", "cc", "eax"
35#define __FT_CLOBBER_LIST3 "memory", "cc", "eax"
36
37#define __FT_TMP1(x) "=d" (x)
38#define __FT_ARG1(x) "0" ((long) (x))
39#define __FT_TMP2(x) "=c" (x)
40#define __FT_ARG2(x) "1" ((long) (x))
41
42#define __FT_ARG3(x) "r" ((long) (x))
10 43
11#define ft_event(id, callback) \ 44#define ft_event(id, callback) \
12 __asm__ __volatile__( \ 45 __asm__ __volatile__( \
@@ -16,64 +49,67 @@
16 ".long " #id ", 0, 1b, 2f \n\t" \ 49 ".long " #id ", 0, 1b, 2f \n\t" \
17 ".previous \n\t" \ 50 ".previous \n\t" \
18 "2: \n\t" \ 51 "2: \n\t" \
19 : : : CLOBBER_LIST) 52 : : : __FT_CLOBBER_LIST0)
20 53
21#define ft_event0(id, callback) \ 54#define ft_event0(id, callback) \
22 __asm__ __volatile__( \ 55 __asm__ __volatile__( \
23 "1: jmp 2f \n\t" \ 56 "1: jmp 2f \n\t" \
24 " subl $4, %%esp \n\t" \ 57 " movl $" #id ", %%eax \n\t" \
25 " movl $" #id ", (%%esp) \n\t" \
26 " call " #callback " \n\t" \ 58 " call " #callback " \n\t" \
27 " addl $4, %%esp \n\t" \
28 ".section __event_table, \"aw\" \n\t" \ 59 ".section __event_table, \"aw\" \n\t" \
29 ".long " #id ", 0, 1b, 2f \n\t" \ 60 ".long " #id ", 0, 1b, 2f \n\t" \
30 ".previous \n\t" \ 61 ".previous \n\t" \
31 "2: \n\t" \ 62 "2: \n\t" \
32 : : : CLOBBER_LIST) 63 : : : __FT_CLOBBER_LIST0)
33 64
34#define ft_event1(id, callback, param) \ 65#define ft_event1(id, callback, param) \
66 do { \
67 long __ft_tmp1; \
35 __asm__ __volatile__( \ 68 __asm__ __volatile__( \
36 "1: jmp 2f \n\t" \ 69 "1: jmp 2f \n\t" \
37 " subl $8, %%esp \n\t" \ 70 " movl $" #id ", %%eax \n\t" \
38 " movl %0, 4(%%esp) \n\t" \
39 " movl $" #id ", (%%esp) \n\t" \
40 " call " #callback " \n\t" \ 71 " call " #callback " \n\t" \
41 " addl $8, %%esp \n\t" \
42 ".section __event_table, \"aw\" \n\t" \ 72 ".section __event_table, \"aw\" \n\t" \
43 ".long " #id ", 0, 1b, 2f \n\t" \ 73 ".long " #id ", 0, 1b, 2f \n\t" \
44 ".previous \n\t" \ 74 ".previous \n\t" \
45 "2: \n\t" \ 75 "2: \n\t" \
46 : : "r" (param) : CLOBBER_LIST) 76 : __FT_TMP1(__ft_tmp1) \
77 : __FT_ARG1(param) \
78 : __FT_CLOBBER_LIST1); \
79 } while (0);
47 80
48#define ft_event2(id, callback, param, param2) \ 81#define ft_event2(id, callback, param, param2) \
82 do { \
83 long __ft_tmp1, __ft_tmp2; \
49 __asm__ __volatile__( \ 84 __asm__ __volatile__( \
50 "1: jmp 2f \n\t" \ 85 "1: jmp 2f \n\t" \
51 " subl $12, %%esp \n\t" \ 86 " movl $" #id ", %%eax \n\t" \
52 " movl %1, 8(%%esp) \n\t" \
53 " movl %0, 4(%%esp) \n\t" \
54 " movl $" #id ", (%%esp) \n\t" \
55 " call " #callback " \n\t" \ 87 " call " #callback " \n\t" \
56 " addl $12, %%esp \n\t" \
57 ".section __event_table, \"aw\" \n\t" \ 88 ".section __event_table, \"aw\" \n\t" \
58 ".long " #id ", 0, 1b, 2f \n\t" \ 89 ".long " #id ", 0, 1b, 2f \n\t" \
59 ".previous \n\t" \ 90 ".previous \n\t" \
60 "2: \n\t" \ 91 "2: \n\t" \
61 : : "r" (param), "r" (param2) : CLOBBER_LIST) 92 : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \
93 : __FT_ARG1(param), __FT_ARG2(param2) \
94 : __FT_CLOBBER_LIST2); \
95 } while (0);
62 96
63 97
64#define ft_event3(id, callback, p, p2, p3) \ 98#define ft_event3(id, callback, param, param2, param3) \
99 do { \
100 long __ft_tmp1, __ft_tmp2; \
65 __asm__ __volatile__( \ 101 __asm__ __volatile__( \
66 "1: jmp 2f \n\t" \ 102 "1: jmp 2f \n\t" \
67 " subl $16, %%esp \n\t" \ 103 " subl $4, %%esp \n\t" \
68 " movl %2, 12(%%esp) \n\t" \ 104 " movl $" #id ", %%eax \n\t" \
69 " movl %1, 8(%%esp) \n\t" \ 105 " movl %2, (%%esp) \n\t" \
70 " movl %0, 4(%%esp) \n\t" \
71 " movl $" #id ", (%%esp) \n\t" \
72 " call " #callback " \n\t" \ 106 " call " #callback " \n\t" \
73 " addl $16, %%esp \n\t" \ 107 " addl $4, %%esp \n\t" \
74 ".section __event_table, \"aw\" \n\t" \ 108 ".section __event_table, \"aw\" \n\t" \
75 ".long " #id ", 0, 1b, 2f \n\t" \ 109 ".long " #id ", 0, 1b, 2f \n\t" \
76 ".previous \n\t" \ 110 ".previous \n\t" \
77 "2: \n\t" \ 111 "2: \n\t" \
78 : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) 112 : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \
79 113 : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3) \
114 : __FT_CLOBBER_LIST3); \
115 } while (0);
diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h
index 54ac2aeb3a28..5ce49e2eebba 100644
--- a/arch/x86/include/asm/feather_trace_64.h
+++ b/arch/x86/include/asm/feather_trace_64.h
@@ -1,67 +1,124 @@
1/* Copyright (c) 2010 Andrea Bastoni, <bastoni@cs.unc.edu>
2 * Copyright (c) 2012 Björn Brandenburg, <bbb@mpi-sws.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
1/* Do not directly include this file. Include feather_trace.h instead */ 25/* Do not directly include this file. Include feather_trace.h instead */
2 26
3/* regparm is the default on x86_64 */ 27/* regparm is the default on x86_64 */
4#define feather_callback 28#define feather_callback __attribute__((used))
5 29
6# define _EVENT_TABLE(id,from,to) \ 30#define __FT_EVENT_TABLE(id,from,to) \
7 ".section __event_table, \"aw\"\n\t" \ 31 ".section __event_table, \"aw\"\n\t" \
8 ".balign 8\n\t" \ 32 ".balign 8\n\t" \
9 ".quad " #id ", 0, " #from ", " #to " \n\t" \ 33 ".quad " #id ", 0, " #from ", " #to " \n\t" \
10 ".previous \n\t" 34 ".previous \n\t"
11 35
12/* 36/*
13 * x86_64 callee only owns rbp, rbx, r12 -> r15 37 * x86_64 caller only owns rbp, rbx, r12-r15;
14 * the called can freely modify the others 38 * the callee can freely modify the others.
15 */ 39 */
16#define CLOBBER_LIST "memory", "cc", "rdi", "rsi", "rdx", "rcx", \ 40#define __FT_CLOBBER_LIST0 "memory", "cc", "rdi", "rsi", "rdx", "rcx", \
41 "r8", "r9", "r10", "r11", "rax"
42
43#define __FT_CLOBBER_LIST1 "memory", "cc", "rdi", "rdx", "rcx", \
44 "r8", "r9", "r10", "r11", "rax"
45
46#define __FT_CLOBBER_LIST2 "memory", "cc", "rdi", "rcx", \
17 "r8", "r9", "r10", "r11", "rax" 47 "r8", "r9", "r10", "r11", "rax"
18 48
49#define __FT_CLOBBER_LIST3 "memory", "cc", "rdi", \
50 "r8", "r9", "r10", "r11", "rax"
51
52/* The registers RDI, RSI, RDX, RCX, R8 and R9 are used for integer and pointer
53 * arguments. */
54
55/* RSI */
56#define __FT_TMP1(x) "=S" (x)
57#define __FT_ARG1(x) "0" ((long) (x))
58
59/* RDX */
60#define __FT_TMP2(x) "=d" (x)
61#define __FT_ARG2(x) "1" ((long) (x))
62
63/* RCX */
64#define __FT_TMP3(x) "=c" (x)
65#define __FT_ARG3(x) "2" ((long) (x))
66
19#define ft_event(id, callback) \ 67#define ft_event(id, callback) \
20 __asm__ __volatile__( \ 68 __asm__ __volatile__( \
21 "1: jmp 2f \n\t" \ 69 "1: jmp 2f \n\t" \
22 " call " #callback " \n\t" \ 70 " call " #callback " \n\t" \
23 _EVENT_TABLE(id,1b,2f) \ 71 __FT_EVENT_TABLE(id,1b,2f) \
24 "2: \n\t" \ 72 "2: \n\t" \
25 : : : CLOBBER_LIST) 73 : : : __FT_CLOBBER_LIST0)
26 74
27#define ft_event0(id, callback) \ 75#define ft_event0(id, callback) \
28 __asm__ __volatile__( \ 76 __asm__ __volatile__( \
29 "1: jmp 2f \n\t" \ 77 "1: jmp 2f \n\t" \
30 " movq $" #id ", %%rdi \n\t" \ 78 " movq $" #id ", %%rdi \n\t" \
31 " call " #callback " \n\t" \ 79 " call " #callback " \n\t" \
32 _EVENT_TABLE(id,1b,2f) \ 80 __FT_EVENT_TABLE(id,1b,2f) \
33 "2: \n\t" \ 81 "2: \n\t" \
34 : : : CLOBBER_LIST) 82 : : : __FT_CLOBBER_LIST0)
35 83
36#define ft_event1(id, callback, param) \ 84#define ft_event1(id, callback, param) \
85 do { \
86 long __ft_tmp1; \
37 __asm__ __volatile__( \ 87 __asm__ __volatile__( \
38 "1: jmp 2f \n\t" \ 88 "1: jmp 2f \n\t" \
39 " movq %0, %%rsi \n\t" \
40 " movq $" #id ", %%rdi \n\t" \ 89 " movq $" #id ", %%rdi \n\t" \
41 " call " #callback " \n\t" \ 90 " call " #callback " \n\t" \
42 _EVENT_TABLE(id,1b,2f) \ 91 __FT_EVENT_TABLE(id,1b,2f) \
43 "2: \n\t" \ 92 "2: \n\t" \
44 : : "r" (param) : CLOBBER_LIST) 93 : __FT_TMP1(__ft_tmp1) \
94 : __FT_ARG1(param) \
95 : __FT_CLOBBER_LIST1); \
96 } while (0);
45 97
46#define ft_event2(id, callback, param, param2) \ 98#define ft_event2(id, callback, param, param2) \
99 do { \
100 long __ft_tmp1, __ft_tmp2; \
47 __asm__ __volatile__( \ 101 __asm__ __volatile__( \
48 "1: jmp 2f \n\t" \ 102 "1: jmp 2f \n\t" \
49 " movq %1, %%rdx \n\t" \
50 " movq %0, %%rsi \n\t" \
51 " movq $" #id ", %%rdi \n\t" \ 103 " movq $" #id ", %%rdi \n\t" \
52 " call " #callback " \n\t" \ 104 " call " #callback " \n\t" \
53 _EVENT_TABLE(id,1b,2f) \ 105 __FT_EVENT_TABLE(id,1b,2f) \
54 "2: \n\t" \ 106 "2: \n\t" \
55 : : "r" (param), "r" (param2) : CLOBBER_LIST) 107 : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \
108 : __FT_ARG1(param), __FT_ARG2(param2) \
109 : __FT_CLOBBER_LIST2); \
110 } while (0);
56 111
57#define ft_event3(id, callback, p, p2, p3) \ 112#define ft_event3(id, callback, param, param2, param3) \
113 do { \
114 long __ft_tmp1, __ft_tmp2, __ft_tmp3; \
58 __asm__ __volatile__( \ 115 __asm__ __volatile__( \
59 "1: jmp 2f \n\t" \ 116 "1: jmp 2f \n\t" \
60 " movq %2, %%rcx \n\t" \
61 " movq %1, %%rdx \n\t" \
62 " movq %0, %%rsi \n\t" \
63 " movq $" #id ", %%rdi \n\t" \ 117 " movq $" #id ", %%rdi \n\t" \
64 " call " #callback " \n\t" \ 118 " call " #callback " \n\t" \
65 _EVENT_TABLE(id,1b,2f) \ 119 __FT_EVENT_TABLE(id,1b,2f) \
66 "2: \n\t" \ 120 "2: \n\t" \
67 : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) 121 : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2), __FT_TMP3(__ft_tmp3) \
122 : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3) \
123 : __FT_CLOBBER_LIST3); \
124 } while (0);
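
The rewritten ft_event*() macros above pass the event id and up to three parameters in registers rather than on the stack, and the handler is a feather_callback function whose first argument is the event id. A minimal handler/trigger sketch follows; the event id 42 and both function names are made up for illustration:

#include <litmus/feather_trace.h>

/* Handler must have external linkage: the ft_event*() asm calls it by name. */
feather_callback void demo_handler(unsigned long event, unsigned long payload)
{
	/* record (event, payload), e.g. into a Feather-Trace buffer */
}

static inline void demo_trigger(unsigned long payload)
{
	/* The leading "jmp 2f" skips the call until the event is enabled
	 * (ft_enable_event()); the payload travels in a register. */
	ft_event1(42, demo_handler, payload);
}
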
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ed4c4f54e2ae..7539d84628f7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -25,7 +25,6 @@
25 25
26#include <litmus/preempt.h> 26#include <litmus/preempt.h>
27#include <litmus/debug_trace.h> 27#include <litmus/debug_trace.h>
28#include <litmus/trace.h>
29 28
30#include <asm/mtrr.h> 29#include <asm/mtrr.h>
31#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
@@ -122,7 +121,6 @@ static void native_smp_send_reschedule(int cpu)
122 WARN_ON(1); 121 WARN_ON(1);
123 return; 122 return;
124 } 123 }
125 TS_SEND_RESCHED_START(cpu);
126 apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); 124 apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
127} 125}
128 126
@@ -214,18 +212,16 @@ static void native_stop_other_cpus(int wait)
214void smp_reschedule_interrupt(struct pt_regs *regs) 212void smp_reschedule_interrupt(struct pt_regs *regs)
215{ 213{
216 ack_APIC_irq(); 214 ack_APIC_irq();
217 /* LITMUS^RT: this IPI might need to trigger the sched state machine. */
218 sched_state_ipi();
219 inc_irq_stat(irq_resched_count); 215 inc_irq_stat(irq_resched_count);
220 /*
221 * LITMUS^RT: starting from 3.0 schedule_ipi() actually does something.
222 * This may increase IPI latencies compared with previous versions.
223 */
224 scheduler_ipi(); 216 scheduler_ipi();
225 TS_SEND_RESCHED_END;
226 /* 217 /*
227 * KVM uses this interrupt to force a cpu out of guest mode 218 * KVM uses this interrupt to force a cpu out of guest mode
228 */ 219 */
220
221 /* LITMUS^RT: this IPI might need to trigger the sched state machine.
222 * Starting from 3.0 schedule_ipi() actually does something. This may
223 * increase IPI latencies compared with previous versions. */
224 sched_state_ipi();
229} 225}
230 226
231void smp_call_function_interrupt(struct pt_regs *regs) 227void smp_call_function_interrupt(struct pt_regs *regs)
@@ -251,8 +247,10 @@ extern void hrtimer_pull(void);
251void smp_pull_timers_interrupt(struct pt_regs *regs) 247void smp_pull_timers_interrupt(struct pt_regs *regs)
252{ 248{
253 ack_APIC_irq(); 249 ack_APIC_irq();
250 irq_enter();
254 TRACE("pull timer interrupt\n"); 251 TRACE("pull timer interrupt\n");
255 hrtimer_pull(); 252 hrtimer_pull();
253 irq_exit();
256} 254}
257 255
258struct smp_ops smp_ops = { 256struct smp_ops smp_ops = {
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 9d727271c9fe..51494e6b5548 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -90,7 +90,6 @@ extern bool completion_done(struct completion *x);
90 90
91extern void complete(struct completion *); 91extern void complete(struct completion *);
92extern void complete_all(struct completion *); 92extern void complete_all(struct completion *);
93extern void complete_n(struct completion *, int n);
94 93
95/** 94/**
96 * INIT_COMPLETION - reinitialize a completion structure 95 * INIT_COMPLETION - reinitialize a completion structure
diff --git a/include/litmus/binheap.h b/include/litmus/binheap.h
new file mode 100644
index 000000000000..901a30a3e296
--- /dev/null
+++ b/include/litmus/binheap.h
@@ -0,0 +1,206 @@
1#ifndef LITMUS_BINARY_HEAP_H
2#define LITMUS_BINARY_HEAP_H
3
4#include <linux/kernel.h>
5
6/**
7 * Simple binary heap with add, arbitrary delete, delete_root, and top
8 * operations.
9 *
10 * Style meant to conform with list.h.
11 *
12 * Motivation: Linux's prio_heap.h is of fixed size. Litmus's binomial
13 * heap may be overkill (and perhaps not general enough) for some applications.
14 *
15 * Note: In order to make node swaps fast, a node inserted with a data pointer
16 * may not always hold said data pointer. This is similar to the binomial heap
17 * implementation. This does make node deletion tricky since we have to
18 * (1) locate the node that holds the data pointer to delete, and (2) the
19 * node that was originally inserted with said data pointer. These have to be
20 * coalesced into a single node before removal (see usage of
21 * __binheap_safe_swap()). We have to track node references to accomplish this.
22 */
23
24struct binheap_node {
25 void *data;
26 struct binheap_node *parent;
27 struct binheap_node *left;
28 struct binheap_node *right;
29
30 /* pointer to binheap_node that holds *data for which this binheap_node
31 * was originally inserted. (*data "owns" this node)
32 */
33 struct binheap_node *ref;
34 struct binheap_node **ref_ptr;
35};
36
37/**
 38 * Signature of comparator function. Assumed 'less-than' (min-heap).
39 * Pass in 'greater-than' for max-heap.
40 *
41 * TODO: Consider macro-based implementation that allows comparator to be
42 * inlined (similar to Linux red/black tree) for greater efficiency.
43 */
44typedef int (*binheap_order_t)(struct binheap_node *a,
45 struct binheap_node *b);
46
47
48struct binheap {
49 struct binheap_node *root;
50
51 /* pointer to node to take next inserted child */
52 struct binheap_node *next;
53
54 /* pointer to last node in complete binary tree */
55 struct binheap_node *last;
56
57 /* comparator function pointer */
58 binheap_order_t compare;
59};
60
61
62/* Initialized heap nodes not in a heap have parent
63 * set to BINHEAP_POISON.
64 */
65#define BINHEAP_POISON ((void*)(0xdeadbeef))
66
67
68/**
69 * binheap_entry - get the struct for this heap node.
70 * Only valid when called upon heap nodes other than the root handle.
71 * @ptr: the heap node.
72 * @type: the type of struct pointed to by binheap_node::data.
73 * @member: unused.
74 */
75#define binheap_entry(ptr, type, member) \
76((type *)((ptr)->data))
77
78/**
79 * binheap_node_container - get the struct that contains this node.
80 * Only valid when called upon heap nodes other than the root handle.
81 * @ptr: the heap node.
82 * @type: the type of struct the node is embedded in.
83 * @member: the name of the binheap_struct within the (type) struct.
84 */
85#define binheap_node_container(ptr, type, member) \
86container_of((ptr), type, member)
87
88/**
89 * binheap_top_entry - get the struct for the node at the top of the heap.
90 * Only valid when called upon the heap handle node.
91 * @ptr: the special heap-handle node.
92 * @type: the type of the struct the head is embedded in.
93 * @member: the name of the binheap_struct within the (type) struct.
94 */
95#define binheap_top_entry(ptr, type, member) \
96binheap_entry((ptr)->root, type, member)
97
98/**
99 * binheap_delete_root - remove the root element from the heap.
100 * @handle: handle to the heap.
101 * @type: the type of the struct the head is embedded in.
102 * @member: the name of the binheap_struct within the (type) struct.
103 */
104#define binheap_delete_root(handle, type, member) \
105__binheap_delete_root((handle), &((type *)((handle)->root->data))->member)
106
107/**
108 * binheap_delete - remove an arbitrary element from the heap.
109 * @to_delete: pointer to node to be removed.
110 * @handle: handle to the heap.
111 */
112#define binheap_delete(to_delete, handle) \
113__binheap_delete((to_delete), (handle))
114
115/**
116 * binheap_add - insert an element to the heap
117 * new_node: node to add.
118 * @handle: handle to the heap.
119 * @type: the type of the struct the head is embedded in.
120 * @member: the name of the binheap_struct within the (type) struct.
121 */
122#define binheap_add(new_node, handle, type, member) \
123__binheap_add((new_node), (handle), container_of((new_node), type, member))
124
125/**
126 * binheap_decrease - re-eval the position of a node (based upon its
127 * original data pointer).
128 * @handle: handle to the heap.
129 * @orig_node: node that was associated with the data pointer
130 * (whose value has changed) when said pointer was
131 * added to the heap.
132 */
133#define binheap_decrease(orig_node, handle) \
134__binheap_decrease((orig_node), (handle))
135
136#define BINHEAP_NODE_INIT() { NULL, BINHEAP_POISON, NULL, NULL , NULL, NULL}
137
138#define BINHEAP_NODE(name) \
139 struct binheap_node name = BINHEAP_NODE_INIT()
140
141
142static inline void INIT_BINHEAP_NODE(struct binheap_node *n)
143{
144 n->data = NULL;
145 n->parent = BINHEAP_POISON;
146 n->left = NULL;
147 n->right = NULL;
148 n->ref = NULL;
149 n->ref_ptr = NULL;
150}
151
152static inline void INIT_BINHEAP_HANDLE(struct binheap *handle,
153 binheap_order_t compare)
154{
155 handle->root = NULL;
156 handle->next = NULL;
157 handle->last = NULL;
158 handle->compare = compare;
159}
160
161/* Returns true if binheap is empty. */
162static inline int binheap_empty(struct binheap *handle)
163{
164 return(handle->root == NULL);
165}
166
167/* Returns true if binheap node is in a heap. */
168static inline int binheap_is_in_heap(struct binheap_node *node)
169{
170 return (node->parent != BINHEAP_POISON);
171}
172
173/* Returns true if binheap node is in given heap. */
174int binheap_is_in_this_heap(struct binheap_node *node, struct binheap* heap);
175
176/* Add a node to a heap */
177void __binheap_add(struct binheap_node *new_node,
178 struct binheap *handle,
179 void *data);
180
181/**
182 * Removes the root node from the heap. The node is removed after coalescing
183 * the binheap_node with its original data pointer at the root of the tree.
184 *
185 * The 'last' node in the tree is then swapped up to the root and bubbled
186 * down.
187 */
188void __binheap_delete_root(struct binheap *handle,
189 struct binheap_node *container);
190
191/**
 192 * Delete an arbitrary node. Bubbles the node to delete up to the root,
 193 * and then deletes it at the root.
194 */
195void __binheap_delete(struct binheap_node *node_to_delete,
196 struct binheap *handle);
197
198/**
199 * Bubble up a node whose pointer has decreased in value.
200 */
201void __binheap_decrease(struct binheap_node *orig_node,
202 struct binheap *handle);
203
204
205#endif
206
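
Because the new <litmus/binheap.h> interface is macro-based, a short usage sketch may help; the struct, field, and function names below are hypothetical:

#include <litmus/binheap.h>

struct my_item {
	unsigned long long deadline;
	struct binheap_node node;       /* heap node embedded in the item */
};

/* 'less-than' comparator => min-heap ordered by deadline */
static int earlier_deadline(struct binheap_node *a, struct binheap_node *b)
{
	struct my_item *ia = binheap_entry(a, struct my_item, node);
	struct my_item *ib = binheap_entry(b, struct my_item, node);
	return ia->deadline < ib->deadline;
}

static struct binheap my_heap;

static void binheap_example(struct my_item *it)
{
	INIT_BINHEAP_HANDLE(&my_heap, earlier_deadline);
	INIT_BINHEAP_NODE(&it->node);

	binheap_add(&it->node, &my_heap, struct my_item, node);

	if (!binheap_empty(&my_heap)) {
		struct my_item *top =
			binheap_top_entry(&my_heap, struct my_item, node);
		/* ... use top ... */
		binheap_delete_root(&my_heap, struct my_item, node);
	}
}
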
diff --git a/include/litmus/budget.h b/include/litmus/budget.h
index 732530e63491..33344ee8d5f9 100644
--- a/include/litmus/budget.h
+++ b/include/litmus/budget.h
@@ -5,4 +5,31 @@
5 * the next task. */ 5 * the next task. */
6void update_enforcement_timer(struct task_struct* t); 6void update_enforcement_timer(struct task_struct* t);
7 7
8inline static int budget_exhausted(struct task_struct* t)
9{
10 return get_exec_time(t) >= get_exec_cost(t);
11}
12
13inline static lt_t budget_remaining(struct task_struct* t)
14{
15 if (!budget_exhausted(t))
16 return get_exec_cost(t) - get_exec_time(t);
17 else
18 /* avoid overflow */
19 return 0;
20}
21
22#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
23
24#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
25 == PRECISE_ENFORCEMENT)
26
27static inline int requeue_preempted_job(struct task_struct* t)
28{
29 /* Add task to ready queue only if not subject to budget enforcement or
30 * if the job has budget remaining. t may be NULL.
31 */
32 return t && (!budget_exhausted(t) || !budget_enforced(t));
33}
34
8#endif 35#endif
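
The budget helpers moved in from litmus.h, together with the new requeue_preempted_job(), are intended for a plugin's scheduling decision. A minimal sketch, where the requeue helper is hypothetical:

#include <litmus/litmus.h>
#include <litmus/budget.h>

static void my_requeue(struct task_struct *t);  /* hypothetical plugin helper */

static void put_back_if_viable(struct task_struct *prev)
{
	/* NULL-safe; skips jobs whose enforced budget is already exhausted */
	if (requeue_preempted_job(prev))
		my_requeue(prev);
}
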
diff --git a/include/litmus/debug_trace.h b/include/litmus/debug_trace.h
index 48d086d5a44c..1266ac6a760c 100644
--- a/include/litmus/debug_trace.h
+++ b/include/litmus/debug_trace.h
@@ -28,8 +28,11 @@ extern atomic_t __log_seq_no;
28 TRACE_ARGS, ## args) 28 TRACE_ARGS, ## args)
29 29
30#define TRACE_TASK(t, fmt, args...) \ 30#define TRACE_TASK(t, fmt, args...) \
31 TRACE("(%s/%d:%d) " fmt, (t)->comm, (t)->pid, \ 31 TRACE("(%s/%d:%d) " fmt, \
32 (t)->rt_param.job_params.job_no, ##args) 32 t ? (t)->comm : "null", \
33 t ? (t)->pid : 0, \
34 t ? (t)->rt_param.job_params.job_no : 0, \
35 ##args)
33 36
34#define TRACE_CUR(fmt, args...) \ 37#define TRACE_CUR(fmt, args...) \
35 TRACE_TASK(current, fmt, ## args) 38 TRACE_TASK(current, fmt, ## args)
diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
index caf2a1e6918c..f2115b83f1e4 100644
--- a/include/litmus/fdso.h
+++ b/include/litmus/fdso.h
@@ -12,7 +12,7 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14 14
15#define MAX_OBJECT_DESCRIPTORS 32 15#define MAX_OBJECT_DESCRIPTORS 85
16 16
17typedef enum { 17typedef enum {
18 MIN_OBJ_TYPE = 0, 18 MIN_OBJ_TYPE = 0,
@@ -20,7 +20,13 @@ typedef enum {
20 FMLP_SEM = 0, 20 FMLP_SEM = 0,
21 SRP_SEM = 1, 21 SRP_SEM = 1,
22 22
23 MAX_OBJ_TYPE = 1 23 MPCP_SEM = 2,
24 MPCP_VS_SEM = 3,
25 DPCP_SEM = 4,
26
27 PCP_SEM = 5,
28
29 MAX_OBJ_TYPE = 5
24} obj_type_t; 30} obj_type_t;
25 31
26struct inode_obj_id { 32struct inode_obj_id {
diff --git a/include/litmus/fp_common.h b/include/litmus/fp_common.h
new file mode 100644
index 000000000000..19356c0fa6c1
--- /dev/null
+++ b/include/litmus/fp_common.h
@@ -0,0 +1,105 @@
1/* Fixed-priority scheduler support.
2 */
3
4#ifndef __FP_COMMON_H__
5#define __FP_COMMON_H__
6
7#include <litmus/rt_domain.h>
8
9#include <asm/bitops.h>
10
11
12void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
13 release_jobs_t release);
14
15int fp_higher_prio(struct task_struct* first,
16 struct task_struct* second);
17
18int fp_ready_order(struct bheap_node* a, struct bheap_node* b);
19
20#define FP_PRIO_BIT_WORDS (LITMUS_MAX_PRIORITY / BITS_PER_LONG)
21
22#if (LITMUS_MAX_PRIORITY % BITS_PER_LONG)
23#error LITMUS_MAX_PRIORITY must be a multiple of BITS_PER_LONG
24#endif
25
 26/* bitmask-indexed priority queue */
27struct fp_prio_queue {
28 unsigned long bitmask[FP_PRIO_BIT_WORDS];
29 struct bheap queue[LITMUS_MAX_PRIORITY];
30};
31
32void fp_prio_queue_init(struct fp_prio_queue* q);
33
34static inline void fpq_set(struct fp_prio_queue* q, unsigned int index)
35{
36 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
37 __set_bit(index % BITS_PER_LONG, word);
38}
39
40static inline void fpq_clear(struct fp_prio_queue* q, unsigned int index)
41{
42 unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
43 __clear_bit(index % BITS_PER_LONG, word);
44}
45
46static inline unsigned int fpq_find(struct fp_prio_queue* q)
47{
48 int i;
49
50 /* loop optimizer should unroll this */
51 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
52 if (q->bitmask[i])
53 return __ffs(q->bitmask[i]) + i * BITS_PER_LONG;
54
55 return LITMUS_MAX_PRIORITY; /* nothing found */
56}
57
58static inline void fp_prio_add(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
59{
60 BUG_ON(index >= LITMUS_MAX_PRIORITY);
61 BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
62
63 fpq_set(q, index);
64 bheap_insert(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
65}
66
67static inline void fp_prio_remove(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
68{
69 BUG_ON(!is_queued(t));
70
71 bheap_delete(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
72 if (likely(bheap_empty(&q->queue[index])))
73 fpq_clear(q, index);
74}
75
76static inline struct task_struct* fp_prio_peek(struct fp_prio_queue* q)
77{
78 unsigned int idx = fpq_find(q);
79 struct bheap_node* hn;
80
81 if (idx < LITMUS_MAX_PRIORITY) {
82 hn = bheap_peek(fp_ready_order, &q->queue[idx]);
83 return bheap2task(hn);
84 } else
85 return NULL;
86}
87
88static inline struct task_struct* fp_prio_take(struct fp_prio_queue* q)
89{
90 unsigned int idx = fpq_find(q);
91 struct bheap_node* hn;
92
93 if (idx < LITMUS_MAX_PRIORITY) {
94 hn = bheap_take(fp_ready_order, &q->queue[idx]);
95 if (likely(bheap_empty(&q->queue[idx])))
96 fpq_clear(q, idx);
97 return bheap2task(hn);
98 } else
99 return NULL;
100}
101
102int fp_preemption_needed(struct fp_prio_queue* q, struct task_struct *t);
103
104
105#endif
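
A usage sketch for the new fixed-priority ready queue; the static queue and surrounding dispatch logic are hypothetical, and get_priority() comes from the litmus.h changes later in this patch:

#include <litmus/litmus.h>
#include <litmus/fp_common.h>

static struct fp_prio_queue ready;

static void fp_example(struct task_struct *t)
{
	fp_prio_queue_init(&ready);

	/* insert at the task's fixed priority (lower index = higher priority);
	 * assumes t is an admitted real-time task, i.e. its heap_node exists */
	fp_prio_add(&ready, t, get_priority(t));

	/* pull the highest-priority pending task, if any */
	t = fp_prio_take(&ready);
	if (t) {
		/* ... dispatch t ... */
	}
}
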
diff --git a/include/litmus/fpmath.h b/include/litmus/fpmath.h
new file mode 100644
index 000000000000..642de98542c8
--- /dev/null
+++ b/include/litmus/fpmath.h
@@ -0,0 +1,147 @@
1#ifndef __FP_MATH_H__
2#define __FP_MATH_H__
3
4#include <linux/math64.h>
5
6#ifndef __KERNEL__
7#include <stdint.h>
8#define abs(x) (((x) < 0) ? -(x) : x)
9#endif
10
11// Use 64-bit because we want to track things at the nanosecond scale.
12// This can lead to very large numbers.
13typedef int64_t fpbuf_t;
14typedef struct
15{
16 fpbuf_t val;
17} fp_t;
18
19#define FP_SHIFT 10
20#define ROUND_BIT (FP_SHIFT - 1)
21
22#define _fp(x) ((fp_t) {x})
23
24#ifdef __KERNEL__
25static const fp_t LITMUS_FP_ZERO = {.val = 0};
26static const fp_t LITMUS_FP_ONE = {.val = (1 << FP_SHIFT)};
27#endif
28
29static inline fp_t FP(fpbuf_t x)
30{
31 return _fp(((fpbuf_t) x) << FP_SHIFT);
32}
33
34/* divide two integers to obtain a fixed point value */
35static inline fp_t _frac(fpbuf_t a, fpbuf_t b)
36{
37 return _fp(div64_s64(FP(a).val, (b)));
38}
39
40static inline fpbuf_t _point(fp_t x)
41{
42 return (x.val % (1 << FP_SHIFT));
43
44}
45
46#define fp2str(x) x.val
47/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */
48#define _FP_ "%ld/1024"
49
50static inline fpbuf_t _floor(fp_t x)
51{
52 return x.val >> FP_SHIFT;
53}
54
55/* FIXME: negative rounding */
56static inline fpbuf_t _round(fp_t x)
57{
58 return _floor(x) + ((x.val >> ROUND_BIT) & 1);
59}
60
61/* multiply two fixed point values */
62static inline fp_t _mul(fp_t a, fp_t b)
63{
64 return _fp((a.val * b.val) >> FP_SHIFT);
65}
66
67static inline fp_t _div(fp_t a, fp_t b)
68{
69#if !defined(__KERNEL__) && !defined(unlikely)
70#define unlikely(x) (x)
71#define DO_UNDEF_UNLIKELY
72#endif
73 /* try not to overflow */
74 if (unlikely( a.val > (2l << ((sizeof(fpbuf_t)*8) - FP_SHIFT)) ))
75 return _fp((a.val / b.val) << FP_SHIFT);
76 else
77 return _fp((a.val << FP_SHIFT) / b.val);
78#ifdef DO_UNDEF_UNLIKELY
79#undef unlikely
80#undef DO_UNDEF_UNLIKELY
81#endif
82}
83
84static inline fp_t _add(fp_t a, fp_t b)
85{
86 return _fp(a.val + b.val);
87}
88
89static inline fp_t _sub(fp_t a, fp_t b)
90{
91 return _fp(a.val - b.val);
92}
93
94static inline fp_t _neg(fp_t x)
95{
96 return _fp(-x.val);
97}
98
99static inline fp_t _abs(fp_t x)
100{
101 return _fp(abs(x.val));
102}
103
104/* works the same as casting float/double to integer */
105static inline fpbuf_t _fp_to_integer(fp_t x)
106{
107 return _floor(_abs(x)) * ((x.val > 0) ? 1 : -1);
108}
109
110static inline fp_t _integer_to_fp(fpbuf_t x)
111{
112 return _frac(x,1);
113}
114
115static inline int _leq(fp_t a, fp_t b)
116{
117 return a.val <= b.val;
118}
119
120static inline int _geq(fp_t a, fp_t b)
121{
122 return a.val >= b.val;
123}
124
125static inline int _lt(fp_t a, fp_t b)
126{
127 return a.val < b.val;
128}
129
130static inline int _gt(fp_t a, fp_t b)
131{
132 return a.val > b.val;
133}
134
135static inline int _eq(fp_t a, fp_t b)
136{
137 return a.val == b.val;
138}
139
140static inline fp_t _max(fp_t a, fp_t b)
141{
142 if (a.val < b.val)
143 return b;
144 else
145 return a;
146}
147#endif
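
As an example, the fpmath helpers can express a task's utilization without floating point; a sketch (the function name is made up, and FP_SHIFT = 10 gives roughly three decimal digits of precision):

#include <litmus/litmus.h>
#include <litmus/fpmath.h>

static int utilization_above_half(struct task_struct *t)
{
	fp_t util = _frac(get_exec_cost(t), get_rt_period(t));  /* C / P */

	return _gt(_mul(util, FP(2)), LITMUS_FP_ONE);           /* 2C/P > 1 ? */
}
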
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
index 0b071fd359f9..875783e6a67b 100644
--- a/include/litmus/litmus.h
+++ b/include/litmus/litmus.h
@@ -45,38 +45,23 @@ void litmus_exit_task(struct task_struct *tsk);
45#define tsk_rt(t) (&(t)->rt_param) 45#define tsk_rt(t) (&(t)->rt_param)
46 46
47/* Realtime utility macros */ 47/* Realtime utility macros */
48#define get_rt_flags(t) (tsk_rt(t)->flags) 48#define is_priority_boosted(t) (tsk_rt(t)->priority_boosted)
49#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f)) 49#define get_boost_start(t) (tsk_rt(t)->boost_start_time)
50
51/* task_params macros */
50#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost) 52#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost)
51#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time)
52#define get_rt_period(t) (tsk_rt(t)->task_params.period) 53#define get_rt_period(t) (tsk_rt(t)->task_params.period)
54#define get_rt_relative_deadline(t) (tsk_rt(t)->task_params.relative_deadline)
53#define get_rt_phase(t) (tsk_rt(t)->task_params.phase) 55#define get_rt_phase(t) (tsk_rt(t)->task_params.phase)
54#define get_partition(t) (tsk_rt(t)->task_params.cpu) 56#define get_partition(t) (tsk_rt(t)->task_params.cpu)
57#define get_priority(t) (tsk_rt(t)->task_params.priority)
58#define get_class(t) (tsk_rt(t)->task_params.cls)
59
60/* job_param macros */
61#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time)
55#define get_deadline(t) (tsk_rt(t)->job_params.deadline) 62#define get_deadline(t) (tsk_rt(t)->job_params.deadline)
56#define get_release(t) (tsk_rt(t)->job_params.release) 63#define get_release(t) (tsk_rt(t)->job_params.release)
57#define get_class(t) (tsk_rt(t)->task_params.cls) 64#define get_lateness(t) (tsk_rt(t)->job_params.lateness)
58
59#define is_priority_boosted(t) (tsk_rt(t)->priority_boosted)
60#define get_boost_start(t) (tsk_rt(t)->boost_start_time)
61
62inline static int budget_exhausted(struct task_struct* t)
63{
64 return get_exec_time(t) >= get_exec_cost(t);
65}
66
67inline static lt_t budget_remaining(struct task_struct* t)
68{
69 if (!budget_exhausted(t))
70 return get_exec_cost(t) - get_exec_time(t);
71 else
72 /* avoid overflow */
73 return 0;
74}
75
76#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
77
78#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
79 == PRECISE_ENFORCEMENT)
80 65
81#define is_hrt(t) \ 66#define is_hrt(t) \
82 (tsk_rt(t)->task_params.cls == RT_CLASS_HARD) 67 (tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
@@ -245,6 +230,11 @@ static inline int is_present(struct task_struct* t)
245 return t && tsk_rt(t)->present; 230 return t && tsk_rt(t)->present;
246} 231}
247 232
233static inline int is_completed(struct task_struct* t)
234{
235 return t && tsk_rt(t)->completed;
236}
237
248 238
249/* make the unit explicit */ 239/* make the unit explicit */
250typedef unsigned long quanta_t; 240typedef unsigned long quanta_t;
@@ -272,4 +262,39 @@ static inline quanta_t time2quanta(lt_t time, enum round round)
272/* By how much is cpu staggered behind CPU 0? */ 262/* By how much is cpu staggered behind CPU 0? */
273u64 cpu_stagger_offset(int cpu); 263u64 cpu_stagger_offset(int cpu);
274 264
265static inline struct control_page* get_control_page(struct task_struct *t)
266{
267 return tsk_rt(t)->ctrl_page;
268}
269
270static inline int has_control_page(struct task_struct* t)
271{
272 return tsk_rt(t)->ctrl_page != NULL;
273}
274
275
276#ifdef CONFIG_SCHED_OVERHEAD_TRACE
277
278#define TS_SYSCALL_IN_START \
279 if (has_control_page(current)) { \
280 __TS_SYSCALL_IN_START(&get_control_page(current)->ts_syscall_start); \
281 }
282
283#define TS_SYSCALL_IN_END \
284 if (has_control_page(current)) { \
285 uint64_t irqs; \
286 local_irq_disable(); \
287 irqs = get_control_page(current)->irq_count - \
288 get_control_page(current)->irq_syscall_start; \
289 __TS_SYSCALL_IN_END(&irqs); \
290 local_irq_enable(); \
291 }
292
293#else
294
295#define TS_SYSCALL_IN_START
296#define TS_SYSCALL_IN_END
297
298#endif
299
275#endif 300#endif
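
The new TS_SYSCALL_IN_* wrappers consume the time stamp and IRQ count that userspace deposits in its control page before entering the kernel. A sketch of how a LITMUS^RT system call entry point might use them; the syscall itself is hypothetical:

#include <linux/linkage.h>
#include <litmus/litmus.h>

asmlinkage long sys_demo_litmus_call(void)
{
	TS_SYSCALL_IN_START;   /* uses control_page->ts_syscall_start */
	TS_SYSCALL_IN_END;     /* records IRQs since irq_syscall_start */

	/* ... actual system call work ... */

	return 0;
}
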
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index d6d799174160..4cd06dd32906 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -33,22 +33,44 @@ typedef enum {
33 PRECISE_ENFORCEMENT /* budgets are enforced with hrtimers */ 33 PRECISE_ENFORCEMENT /* budgets are enforced with hrtimers */
34} budget_policy_t; 34} budget_policy_t;
35 35
36/* We use the common priority interpretation "lower index == higher priority",
37 * which is commonly used in fixed-priority schedulability analysis papers.
38 * So, a numerically lower priority value implies higher scheduling priority,
39 * with priority 1 being the highest priority. Priority 0 is reserved for
40 * priority boosting. LITMUS_MAX_PRIORITY denotes the maximum priority value
41 * range.
42 */
43
44#define LITMUS_MAX_PRIORITY 512
45#define LITMUS_HIGHEST_PRIORITY 1
46#define LITMUS_LOWEST_PRIORITY (LITMUS_MAX_PRIORITY - 1)
47
48/* Provide generic comparison macros for userspace,
49 * in case that we change this later. */
50#define litmus_higher_fixed_prio(a, b) (a < b)
51#define litmus_lower_fixed_prio(a, b) (a > b)
52#define litmus_is_valid_fixed_prio(p) \
53 ((p) >= LITMUS_HIGHEST_PRIORITY && \
54 (p) <= LITMUS_LOWEST_PRIORITY)
55
36struct rt_task { 56struct rt_task {
37 lt_t exec_cost; 57 lt_t exec_cost;
38 lt_t period; 58 lt_t period;
59 lt_t relative_deadline;
39 lt_t phase; 60 lt_t phase;
40 unsigned int cpu; 61 unsigned int cpu;
62 unsigned int priority;
41 task_class_t cls; 63 task_class_t cls;
42 budget_policy_t budget_policy; /* ignored by pfair */ 64 budget_policy_t budget_policy; /* ignored by pfair */
43}; 65};
44 66
45union np_flag { 67union np_flag {
46 uint32_t raw; 68 uint64_t raw;
47 struct { 69 struct {
48 /* Is the task currently in a non-preemptive section? */ 70 /* Is the task currently in a non-preemptive section? */
49 uint32_t flag:31; 71 uint64_t flag:31;
50 /* Should the task call into the scheduler? */ 72 /* Should the task call into the scheduler? */
51 uint32_t preempt:1; 73 uint64_t preempt:1;
52 } np; 74 } np;
53}; 75};
54 76
@@ -67,11 +89,29 @@ union np_flag {
67 * determining preemption/migration overheads). 89 * determining preemption/migration overheads).
68 */ 90 */
69struct control_page { 91struct control_page {
 92 /* This flag is used by userspace to communicate non-preemptive
93 * sections. */
70 volatile union np_flag sched; 94 volatile union np_flag sched;
71 95
96 volatile uint64_t irq_count; /* Incremented by the kernel each time an IRQ is
97 * handled. */
98
99 /* Locking overhead tracing: userspace records here the time stamp
100 * and IRQ counter prior to starting the system call. */
101 uint64_t ts_syscall_start; /* Feather-Trace cycles */
102 uint64_t irq_syscall_start; /* Snapshot of irq_count when the syscall
103 * started. */
104
72 /* to be extended */ 105 /* to be extended */
73}; 106};
74 107
108/* Expected offsets within the control page. */
109
110#define LITMUS_CP_OFFSET_SCHED 0
111#define LITMUS_CP_OFFSET_IRQ_COUNT 8
112#define LITMUS_CP_OFFSET_TS_SC_START 16
113#define LITMUS_CP_OFFSET_IRQ_SC_START 24
114
75/* don't export internal data structures to user space (liblitmus) */ 115/* don't export internal data structures to user space (liblitmus) */
76#ifdef __KERNEL__ 116#ifdef __KERNEL__
77 117
@@ -88,6 +128,12 @@ struct rt_job {
88 /* How much service has this job received so far? */ 128 /* How much service has this job received so far? */
89 lt_t exec_time; 129 lt_t exec_time;
90 130
 131 /* By how much did the prior job miss its deadline?
132 * Value differs from tardiness in that lateness may
133 * be negative (when job finishes before its deadline).
134 */
135 long long lateness;
136
91 /* Which job is this. This is used to let user space 137 /* Which job is this. This is used to let user space
92 * specify which job to wait for, which is important if jobs 138 * specify which job to wait for, which is important if jobs
93 * overrun. If we just call sys_sleep_next_period() then we 139 * overrun. If we just call sys_sleep_next_period() then we
@@ -114,6 +160,9 @@ struct rt_param {
114 /* is the task present? (true if it can be scheduled) */ 160 /* is the task present? (true if it can be scheduled) */
115 unsigned int present:1; 161 unsigned int present:1;
116 162
163 /* has the task completed? */
164 unsigned int completed:1;
165
117#ifdef CONFIG_LITMUS_LOCKING 166#ifdef CONFIG_LITMUS_LOCKING
118 /* Is the task being priority-boosted by a locking protocol? */ 167 /* Is the task being priority-boosted by a locking protocol? */
119 unsigned int priority_boosted:1; 168 unsigned int priority_boosted:1;
@@ -199,11 +248,6 @@ struct rt_param {
199 struct control_page * ctrl_page; 248 struct control_page * ctrl_page;
200}; 249};
201 250
202/* Possible RT flags */
203#define RT_F_RUNNING 0x00000000
204#define RT_F_SLEEP 0x00000001
205#define RT_F_EXIT_SEM 0x00000008
206
207#endif 251#endif
208 252
209#endif 253#endif
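
Since userspace (liblitmus) relies on the advertised LITMUS_CP_OFFSET_* constants, a compile-time layout check is the natural companion; a sketch (placement and function name are hypothetical):

#include <linux/kernel.h>
#include <litmus/rt_param.h>

static inline void check_control_page_layout(void)
{
	BUILD_BUG_ON(offsetof(struct control_page, sched) !=
		     LITMUS_CP_OFFSET_SCHED);
	BUILD_BUG_ON(offsetof(struct control_page, irq_count) !=
		     LITMUS_CP_OFFSET_IRQ_COUNT);
	BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start) !=
		     LITMUS_CP_OFFSET_TS_SC_START);
	BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start) !=
		     LITMUS_CP_OFFSET_IRQ_SC_START);
}
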
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
index 6e7cabdddae8..1546ab7f1d66 100644
--- a/include/litmus/sched_plugin.h
+++ b/include/litmus/sched_plugin.h
@@ -53,10 +53,12 @@ typedef void (*task_block_t) (struct task_struct *task);
53 */ 53 */
54typedef void (*task_exit_t) (struct task_struct *); 54typedef void (*task_exit_t) (struct task_struct *);
55 55
56#ifdef CONFIG_LITMUS_LOCKING
56/* Called when the current task attempts to create a new lock of a given 57/* Called when the current task attempts to create a new lock of a given
57 * protocol type. */ 58 * protocol type. */
58typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type, 59typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type,
59 void* __user config); 60 void* __user config);
61#endif
60 62
61 63
62/********************* sys call backends ********************/ 64/********************* sys call backends ********************/
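
Together with the object types added to fdso.h earlier in this patch, a plugin's allocate_lock() callback would dispatch roughly as follows; the constructor names are hypothetical:

#include <linux/errno.h>
#include <litmus/fdso.h>
#include <litmus/locking.h>

/* hypothetical protocol constructors */
static struct litmus_lock* demo_new_mpcp(void* __user config);
static struct litmus_lock* demo_new_dpcp(void* __user config);

static long demo_allocate_lock(struct litmus_lock **lock, int type,
			       void* __user config)
{
	switch (type) {
	case MPCP_SEM:
		*lock = demo_new_mpcp(config);
		break;
	case DPCP_SEM:
		*lock = demo_new_dpcp(config);
		break;
	default:
		return -ENXIO;          /* protocol not supported */
	}
	return *lock ? 0 : -ENOMEM;
}
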
diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
index 7ca34cb13881..82bde8241298 100644
--- a/include/litmus/sched_trace.h
+++ b/include/litmus/sched_trace.h
@@ -164,34 +164,93 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
164 164
165#endif 165#endif
166 166
167#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
168
169#include <trace/events/litmus.h>
170
171#else
172
173/* Override trace macros to actually do nothing */
174#define trace_litmus_task_param(t)
175#define trace_litmus_task_release(t)
176#define trace_litmus_switch_to(t)
177#define trace_litmus_switch_away(prev)
178#define trace_litmus_task_completion(t, forced)
179#define trace_litmus_task_block(t)
180#define trace_litmus_task_resume(t)
181#define trace_litmus_sys_release(start)
182
183#endif
184
167 185
168#define SCHED_TRACE_BASE_ID 500 186#define SCHED_TRACE_BASE_ID 500
169 187
170 188
171#define sched_trace_task_name(t) \ 189#define sched_trace_task_name(t) \
172 SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, do_sched_trace_task_name, t) 190 SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, \
173#define sched_trace_task_param(t) \ 191 do_sched_trace_task_name, t)
174 SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, do_sched_trace_task_param, t) 192
175#define sched_trace_task_release(t) \ 193#define sched_trace_task_param(t) \
176 SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, do_sched_trace_task_release, t) 194 do { \
177#define sched_trace_task_switch_to(t) \ 195 SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, \
178 SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, do_sched_trace_task_switch_to, t) 196 do_sched_trace_task_param, t); \
179#define sched_trace_task_switch_away(t) \ 197 trace_litmus_task_param(t); \
180 SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, do_sched_trace_task_switch_away, t) 198 } while (0)
181#define sched_trace_task_completion(t, forced) \ 199
182 SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, do_sched_trace_task_completion, t, \ 200#define sched_trace_task_release(t) \
183 (unsigned long) forced) 201 do { \
184#define sched_trace_task_block(t) \ 202 SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, \
185 SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, do_sched_trace_task_block, t) 203 do_sched_trace_task_release, t); \
186#define sched_trace_task_resume(t) \ 204 trace_litmus_task_release(t); \
187 SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, do_sched_trace_task_resume, t) 205 } while (0)
188#define sched_trace_action(t, action) \ 206
189 SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, do_sched_trace_action, t, \ 207#define sched_trace_task_switch_to(t) \
190 (unsigned long) action); 208 do { \
191/* when is a pointer, it does not need an explicit cast to unsigned long */ 209 SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, \
192#define sched_trace_sys_release(when) \ 210 do_sched_trace_task_switch_to, t); \
193 SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, do_sched_trace_sys_release, when) 211 trace_litmus_switch_to(t); \
212 } while (0)
213
214#define sched_trace_task_switch_away(t) \
215 do { \
216 SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, \
217 do_sched_trace_task_switch_away, t); \
218 trace_litmus_switch_away(t); \
219 } while (0)
220
221#define sched_trace_task_completion(t, forced) \
222 do { \
223 SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, \
224 do_sched_trace_task_completion, t, \
225 (unsigned long) forced); \
226 trace_litmus_task_completion(t, forced); \
227 } while (0)
228
229#define sched_trace_task_block(t) \
230 do { \
231 SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, \
232 do_sched_trace_task_block, t); \
233 trace_litmus_task_block(t); \
234 } while (0)
235
236#define sched_trace_task_resume(t) \
237 do { \
238 SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, \
239 do_sched_trace_task_resume, t); \
240 trace_litmus_task_resume(t); \
241 } while (0)
242
243#define sched_trace_action(t, action) \
244 SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, \
245 do_sched_trace_action, t, (unsigned long) action);
194 246
247/* when is a pointer, it does not need an explicit cast to unsigned long */
248#define sched_trace_sys_release(when) \
249 do { \
250 SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, \
251 do_sched_trace_sys_release, when); \
252 trace_litmus_sys_release(when); \
253 } while (0)
195 254
196#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */ 255#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
197 256
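
From a plugin's perspective nothing changes: a single call to the wrapper now emits both the Feather-Trace binary record and, when CONFIG_SCHED_LITMUS_TRACEPOINT is set, the corresponding kernel tracepoint. Sketch (the release helper is hypothetical):

#include <litmus/sched_trace.h>

static void demo_release_job(struct task_struct *t)
{
	/* ... set up release time and deadline of the next job ... */
	sched_trace_task_release(t);
}
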
diff --git a/include/litmus/trace.h b/include/litmus/trace.h
index e809376d6487..8ad4966c602e 100644
--- a/include/litmus/trace.h
+++ b/include/litmus/trace.h
@@ -3,6 +3,7 @@
3 3
4#ifdef CONFIG_SCHED_OVERHEAD_TRACE 4#ifdef CONFIG_SCHED_OVERHEAD_TRACE
5 5
6
6#include <litmus/feather_trace.h> 7#include <litmus/feather_trace.h>
7#include <litmus/feather_buffer.h> 8#include <litmus/feather_buffer.h>
8 9
@@ -16,7 +17,8 @@ enum task_type_marker {
16}; 17};
17 18
18struct timestamp { 19struct timestamp {
19 uint64_t timestamp; 20 uint64_t timestamp:48;
21 uint64_t pid:16;
20 uint32_t seq_no; 22 uint32_t seq_no;
21 uint8_t cpu; 23 uint8_t cpu;
22 uint8_t event; 24 uint8_t event;
@@ -31,11 +33,16 @@ feather_callback void save_timestamp_def(unsigned long event, unsigned long type
31feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr); 33feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr);
32feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu); 34feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu);
33feather_callback void save_task_latency(unsigned long event, unsigned long when_ptr); 35feather_callback void save_task_latency(unsigned long event, unsigned long when_ptr);
36feather_callback void save_timestamp_time(unsigned long event, unsigned long time_ptr);
37feather_callback void save_timestamp_irq(unsigned long event, unsigned long irq_count_ptr);
38feather_callback void save_timestamp_hide_irq(unsigned long event);
34 39
35#define TIMESTAMP(id) ft_event0(id, save_timestamp) 40#define TIMESTAMP(id) ft_event0(id, save_timestamp)
36 41
37#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, (unsigned long) def) 42#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, (unsigned long) def)
38 43
44#define TIMESTAMP_CUR(id) DTIMESTAMP(id, is_realtime(current) ? TSK_RT : TSK_BE)
45
39#define TTIMESTAMP(id, task) \ 46#define TTIMESTAMP(id, task) \
40 ft_event1(id, save_timestamp_task, (unsigned long) task) 47 ft_event1(id, save_timestamp_task, (unsigned long) task)
41 48
@@ -45,18 +52,35 @@ feather_callback void save_task_latency(unsigned long event, unsigned long when_
45#define LTIMESTAMP(id, task) \ 52#define LTIMESTAMP(id, task) \
46 ft_event1(id, save_task_latency, (unsigned long) task) 53 ft_event1(id, save_task_latency, (unsigned long) task)
47 54
55#define TIMESTAMP_TIME(id, time_ptr) \
56 ft_event1(id, save_timestamp_time, (unsigned long) time_ptr)
57
58#define TIMESTAMP_IRQ(id, irq_count_ptr) \
59 ft_event1(id, save_timestamp_irq, (unsigned long) irq_count_ptr)
60
61#define TIMESTAMP_IN_IRQ(id) \
62 ft_event0(id, save_timestamp_hide_irq)
63
48#else /* !CONFIG_SCHED_OVERHEAD_TRACE */ 64#else /* !CONFIG_SCHED_OVERHEAD_TRACE */
49 65
50#define TIMESTAMP(id) /* no tracing */ 66#define TIMESTAMP(id) /* no tracing */
51 67
52#define DTIMESTAMP(id, def) /* no tracing */ 68#define DTIMESTAMP(id, def) /* no tracing */
53 69
70#define TIMESTAMP_CUR(id) /* no tracing */
71
54#define TTIMESTAMP(id, task) /* no tracing */ 72#define TTIMESTAMP(id, task) /* no tracing */
55 73
56#define CTIMESTAMP(id, cpu) /* no tracing */ 74#define CTIMESTAMP(id, cpu) /* no tracing */
57 75
58#define LTIMESTAMP(id, when_ptr) /* no tracing */ 76#define LTIMESTAMP(id, when_ptr) /* no tracing */
59 77
78#define TIMESTAMP_TIME(id, time_ptr) /* no tracing */
79
80#define TIMESTAMP_IRQ(id, irq_count_ptr) /* no tracing */
81
82#define TIMESTAMP_IN_IRQ(id) /* no tracing */
83
60#endif 84#endif
61 85
62 86
@@ -68,7 +92,20 @@ feather_callback void save_task_latency(unsigned long event, unsigned long when_
68 * always the next number after the start time event id. 92 * always the next number after the start time event id.
69 */ 93 */
70 94
95#define __TS_SYSCALL_IN_START(p) TIMESTAMP_TIME(10, p)
96#define __TS_SYSCALL_IN_END(p) TIMESTAMP_IRQ(11, p)
97
98#define TS_SYSCALL_OUT_START TIMESTAMP_CUR(20)
99#define TS_SYSCALL_OUT_END TIMESTAMP_CUR(21)
100
101#define TS_LOCK_START TIMESTAMP_CUR(30)
102#define TS_LOCK_END TIMESTAMP_CUR(31)
71 103
104#define TS_LOCK_SUSPEND TIMESTAMP_CUR(38)
105#define TS_LOCK_RESUME TIMESTAMP_CUR(39)
106
107#define TS_UNLOCK_START TIMESTAMP_CUR(40)
108#define TS_UNLOCK_END TIMESTAMP_CUR(41)
72 109
73#define TS_SCHED_START DTIMESTAMP(100, TSK_UNKNOWN) /* we only 110#define TS_SCHED_START DTIMESTAMP(100, TSK_UNKNOWN) /* we only
74 * care 111 * care
@@ -100,16 +137,8 @@ feather_callback void save_task_latency(unsigned long event, unsigned long when_
100#define TS_EXIT_NP_START TIMESTAMP(150) 137#define TS_EXIT_NP_START TIMESTAMP(150)
101#define TS_EXIT_NP_END TIMESTAMP(151) 138#define TS_EXIT_NP_END TIMESTAMP(151)
102 139
103#define TS_LOCK_START TIMESTAMP(170)
104#define TS_LOCK_SUSPEND TIMESTAMP(171)
105#define TS_LOCK_RESUME TIMESTAMP(172)
106#define TS_LOCK_END TIMESTAMP(173)
107
108#define TS_UNLOCK_START TIMESTAMP(180)
109#define TS_UNLOCK_END TIMESTAMP(181)
110
111#define TS_SEND_RESCHED_START(c) CTIMESTAMP(190, c) 140#define TS_SEND_RESCHED_START(c) CTIMESTAMP(190, c)
112#define TS_SEND_RESCHED_END DTIMESTAMP(191, TSK_UNKNOWN) 141#define TS_SEND_RESCHED_END TIMESTAMP_IN_IRQ(191)
113 142
114#define TS_RELEASE_LATENCY(when) LTIMESTAMP(208, &(when)) 143#define TS_RELEASE_LATENCY(when) LTIMESTAMP(208, &(when))
115 144
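
With the lock probes now defined via TIMESTAMP_CUR(), a locking protocol's acquire path would be instrumented roughly like this; protocol internals are omitted and the function name is hypothetical:

#include <litmus/trace.h>

static void demo_protocol_acquire(void)
{
	TS_LOCK_START;
	/* ... attempt the acquisition; if the caller must block, emit
	 * TS_LOCK_SUSPEND before suspending and TS_LOCK_RESUME after
	 * waking up ... */
	TS_LOCK_END;
}
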
diff --git a/include/litmus/trace_irq.h b/include/litmus/trace_irq.h
index f18b127a089d..0d0c042ba9c3 100644
--- a/include/litmus/trace_irq.h
+++ b/include/litmus/trace_irq.h
@@ -3,14 +3,7 @@
3 3
4#ifdef CONFIG_SCHED_OVERHEAD_TRACE 4#ifdef CONFIG_SCHED_OVERHEAD_TRACE
5 5
6extern DEFINE_PER_CPU(atomic_t, irq_fired_count); 6void ft_irq_fired(void);
7
8static inline void ft_irq_fired(void)
9{
10 /* Only called with preemptions disabled. */
11 atomic_inc(&__get_cpu_var(irq_fired_count));
12}
13
14 7
15#else 8#else
16 9
diff --git a/include/litmus/wait.h b/include/litmus/wait.h
new file mode 100644
index 000000000000..ce1347c355f8
--- /dev/null
+++ b/include/litmus/wait.h
@@ -0,0 +1,57 @@
1#ifndef _LITMUS_WAIT_H_
2#define _LITMUS_WAIT_H_
3
4struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
5
6/* wrap regular wait_queue_t head */
7struct __prio_wait_queue {
8 wait_queue_t wq;
9
10 /* some priority point */
11 lt_t priority;
12 /* break ties in priority by lower tie_breaker */
13 unsigned int tie_breaker;
14};
15
16typedef struct __prio_wait_queue prio_wait_queue_t;
17
18static inline void init_prio_waitqueue_entry(prio_wait_queue_t *pwq,
19 struct task_struct* t,
20 lt_t priority)
21{
22 init_waitqueue_entry(&pwq->wq, t);
23 pwq->priority = priority;
24 pwq->tie_breaker = 0;
25}
26
27static inline void init_prio_waitqueue_entry_tie(prio_wait_queue_t *pwq,
28 struct task_struct* t,
29 lt_t priority,
30 unsigned int tie_breaker)
31{
32 init_waitqueue_entry(&pwq->wq, t);
33 pwq->priority = priority;
34 pwq->tie_breaker = tie_breaker;
35}
36
37unsigned int __add_wait_queue_prio_exclusive(
38 wait_queue_head_t* head,
39 prio_wait_queue_t *new);
40
41static inline unsigned int add_wait_queue_prio_exclusive(
42 wait_queue_head_t* head,
43 prio_wait_queue_t *new)
44{
45 unsigned long flags;
46 unsigned int passed;
47
48 spin_lock_irqsave(&head->lock, flags);
49 passed = __add_wait_queue_prio_exclusive(head, new);
50
51 spin_unlock_irqrestore(&head->lock, flags);
52
53 return passed;
54}
55
56
57#endif
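
A sketch of the intended use of the priority-ordered wait queue, using the task's deadline as the priority point; the function name is hypothetical:

#include <linux/wait.h>
#include <litmus/litmus.h>
#include <litmus/wait.h>

static void enqueue_by_deadline(wait_queue_head_t *wq)
{
	prio_wait_queue_t pwq;

	init_prio_waitqueue_entry(&pwq, current, get_deadline(current));
	add_wait_queue_prio_exclusive(wq, &pwq);
	/* the caller would now set its task state and call schedule() */
}
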
diff --git a/include/trace/events/litmus.h b/include/trace/events/litmus.h
new file mode 100644
index 000000000000..0fffcee02be0
--- /dev/null
+++ b/include/trace/events/litmus.h
@@ -0,0 +1,231 @@
1/*
2 * LITMUS^RT kernel style scheduling tracepoints
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM litmus
6
7#if !defined(_SCHED_TASK_TRACEPOINT_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _SCHED_TASK_TRACEPOINT_H
9
10#include <linux/tracepoint.h>
11
12#include <litmus/litmus.h>
13#include <litmus/rt_param.h>
14
15/*
16 * Tracing task admission
17 */
18TRACE_EVENT(litmus_task_param,
19
20 TP_PROTO(struct task_struct *t),
21
22 TP_ARGS(t),
23
24 TP_STRUCT__entry(
25 __field( pid_t, pid )
26 __field( unsigned int, job )
27 __field( lt_t, wcet )
28 __field( lt_t, period )
29 __field( lt_t, phase )
30 __field( int, partition )
31 ),
32
33 TP_fast_assign(
34 __entry->pid = t ? t->pid : 0;
35 __entry->job = t ? t->rt_param.job_params.job_no : 0;
36 __entry->wcet = get_exec_cost(t);
37 __entry->period = get_rt_period(t);
38 __entry->phase = get_rt_phase(t);
39 __entry->partition = get_partition(t);
40 ),
41
42 TP_printk("period(%d, %Lu).\nwcet(%d, %Lu).\n",
43 __entry->pid, __entry->period,
44 __entry->pid, __entry->wcet)
45);
46
47/*
48 * Tracing jobs release
49 */
50TRACE_EVENT(litmus_task_release,
51
52 TP_PROTO(struct task_struct *t),
53
54 TP_ARGS(t),
55
56 TP_STRUCT__entry(
57 __field( pid_t, pid )
58 __field( unsigned int, job )
59 __field( lt_t, release )
60 __field( lt_t, deadline )
61 ),
62
63 TP_fast_assign(
64 __entry->pid = t ? t->pid : 0;
65 __entry->job = t ? t->rt_param.job_params.job_no : 0;
66 __entry->release = get_release(t);
67 __entry->deadline = get_deadline(t);
68 ),
69
70 TP_printk("release(job(%u, %u)): %Lu\ndeadline(job(%u, %u)): %Lu\n",
71 __entry->pid, __entry->job, __entry->release,
72 __entry->pid, __entry->job, __entry->deadline)
73);
74
75/*
76 * Tracepoint for switching to new task
77 */
78TRACE_EVENT(litmus_switch_to,
79
80 TP_PROTO(struct task_struct *t),
81
82 TP_ARGS(t),
83
84 TP_STRUCT__entry(
85 __field( pid_t, pid )
86 __field( unsigned int, job )
87 __field( lt_t, when )
88 __field( lt_t, exec_time )
89 ),
90
91 TP_fast_assign(
92 __entry->pid = is_realtime(t) ? t->pid : 0;
93 __entry->job = is_realtime(t) ? t->rt_param.job_params.job_no : 0;
94 __entry->when = litmus_clock();
95 __entry->exec_time = get_exec_time(t);
96 ),
97
98 TP_printk("switch_to(job(%u, %u)): %Lu (exec: %Lu)\n",
99 __entry->pid, __entry->job,
100 __entry->when, __entry->exec_time)
101);
102
103/*
104 * Tracepoint for switching away previous task
105 */
106TRACE_EVENT(litmus_switch_away,
107
108 TP_PROTO(struct task_struct *t),
109
110 TP_ARGS(t),
111
112 TP_STRUCT__entry(
113 __field( pid_t, pid )
114 __field( unsigned int, job )
115 __field( lt_t, when )
116 __field( lt_t, exec_time )
117 ),
118
119 TP_fast_assign(
120 __entry->pid = is_realtime(t) ? t->pid : 0;
121 __entry->job = is_realtime(t) ? t->rt_param.job_params.job_no : 0;
122 __entry->when = litmus_clock();
123 __entry->exec_time = get_exec_time(t);
124 ),
125
126 TP_printk("switch_away(job(%u, %u)): %Lu (exec: %Lu)\n",
127 __entry->pid, __entry->job,
128 __entry->when, __entry->exec_time)
129);
130
131/*
132 * Tracing jobs completion
133 */
134TRACE_EVENT(litmus_task_completion,
135
136 TP_PROTO(struct task_struct *t, unsigned long forced),
137
138 TP_ARGS(t, forced),
139
140 TP_STRUCT__entry(
141 __field( pid_t, pid )
142 __field( unsigned int, job )
143 __field( lt_t, when )
144 __field( unsigned long, forced )
145 ),
146
147 TP_fast_assign(
148 __entry->pid = t ? t->pid : 0;
149 __entry->job = t ? t->rt_param.job_params.job_no : 0;
150 __entry->when = litmus_clock();
151 __entry->forced = forced;
152 ),
153
154 TP_printk("completed(job(%u, %u)): %Lu (forced: %lu)\n",
155 __entry->pid, __entry->job,
156 __entry->when, __entry->forced)
157);
158
159/*
160 * Trace blocking tasks.
161 */
162TRACE_EVENT(litmus_task_block,
163
164 TP_PROTO(struct task_struct *t),
165
166 TP_ARGS(t),
167
168 TP_STRUCT__entry(
169 __field( pid_t, pid )
170 __field( lt_t, when )
171 ),
172
173 TP_fast_assign(
174 __entry->pid = t ? t->pid : 0;
175 __entry->when = litmus_clock();
176 ),
177
178 TP_printk("(%u) blocks: %Lu\n", __entry->pid, __entry->when)
179);
180
181/*
182 * Tracing jobs resume
183 */
184TRACE_EVENT(litmus_task_resume,
185
186 TP_PROTO(struct task_struct *t),
187
188 TP_ARGS(t),
189
190 TP_STRUCT__entry(
191 __field( pid_t, pid )
192 __field( unsigned int, job )
193 __field( lt_t, when )
194 ),
195
196 TP_fast_assign(
197 __entry->pid = t ? t->pid : 0;
198 __entry->job = t ? t->rt_param.job_params.job_no : 0;
199 __entry->when = litmus_clock();
200 ),
201
202 TP_printk("resume(job(%u, %u)): %Lu\n",
203 __entry->pid, __entry->job, __entry->when)
204);
205
206/*
207 * Trace synchronous release
208 */
209TRACE_EVENT(litmus_sys_release,
210
211 TP_PROTO(lt_t *start),
212
213 TP_ARGS(start),
214
215 TP_STRUCT__entry(
216 __field( lt_t, rel )
217 __field( lt_t, when )
218 ),
219
220 TP_fast_assign(
221 __entry->rel = *start;
222 __entry->when = litmus_clock();
223 ),
224
225 TP_printk("SynRelease(%Lu) at %Lu\n", __entry->rel, __entry->when)
226);
227
228#endif /* _SCHED_TASK_TRACEPOINT_H */
229
230/* Must stay outside the protection */
231#include <trace/define_trace.h>
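A brief sketch of how these events would be emitted, assuming the usual Linux tracepoint conventions: exactly one compilation unit defines CREATE_TRACE_POINTS before including the event header, and call sites simply invoke the generated trace_<event>() stubs, which compile to no-ops while the event is disabled.

#include <trace/events/litmus.h>

static void demo_trace_release(struct task_struct *t)
{
	/* emits the litmus_task_release event defined above */
	trace_litmus_task_release(t);
}

static void demo_trace_completion(struct task_struct *t)
{
	/* second argument: 0 marks a normal (non-forced) completion */
	trace_litmus_task_completion(t, 0);
}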
diff --git a/kernel/sched.c b/kernel/sched.c
index baaca61bc3a3..c4b6bd5151ff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -80,14 +80,14 @@
80#include "workqueue_sched.h" 80#include "workqueue_sched.h"
81#include "sched_autogroup.h" 81#include "sched_autogroup.h"
82 82
83#define CREATE_TRACE_POINTS
84#include <trace/events/sched.h>
85
83#include <litmus/sched_trace.h> 86#include <litmus/sched_trace.h>
84#include <litmus/trace.h> 87#include <litmus/trace.h>
85 88
86static void litmus_tick(struct rq*, struct task_struct*); 89static void litmus_tick(struct rq*, struct task_struct*);
87 90
88#define CREATE_TRACE_POINTS
89#include <trace/events/sched.h>
90
91/* 91/*
92 * Convert user-nice values [ -20 ... 0 ... 19 ] 92 * Convert user-nice values [ -20 ... 0 ... 19 ]
93 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 93 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -2597,8 +2597,12 @@ void scheduler_ipi(void)
2597 struct rq *rq = this_rq(); 2597 struct rq *rq = this_rq();
2598 struct task_struct *list = xchg(&rq->wake_list, NULL); 2598 struct task_struct *list = xchg(&rq->wake_list, NULL);
2599 2599
2600 if (!list) 2600 if (!list) {
2601 /* If we don't call irq_enter(), we need to trigger the IRQ
2602 * tracing manually. */
2603 ft_irq_fired();
2601 return; 2604 return;
2605 }
2602 2606
2603 /* 2607 /*
2604 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 2608 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
@@ -3163,16 +3167,26 @@ static inline void post_schedule(struct rq *rq)
3163asmlinkage void schedule_tail(struct task_struct *prev) 3167asmlinkage void schedule_tail(struct task_struct *prev)
3164 __releases(rq->lock) 3168 __releases(rq->lock)
3165{ 3169{
3166 struct rq *rq = this_rq(); 3170 struct rq *rq;
3167 3171
3172 preempt_disable();
3173
3174 rq = this_rq();
3168 finish_task_switch(rq, prev); 3175 finish_task_switch(rq, prev);
3169 3176
3177 sched_trace_task_switch_to(current);
3178
3170 /* 3179 /*
3171 * FIXME: do we need to worry about rq being invalidated by the 3180 * FIXME: do we need to worry about rq being invalidated by the
3172 * task_switch? 3181 * task_switch?
3173 */ 3182 */
3174 post_schedule(rq); 3183 post_schedule(rq);
3175 3184
3185 if (sched_state_validate_switch())
3186 litmus_reschedule_local();
3187
3188 preempt_enable();
3189
3176#ifdef __ARCH_WANT_UNLOCKED_CTXSW 3190#ifdef __ARCH_WANT_UNLOCKED_CTXSW
3177 /* In this case, finish_task_switch does not reenable preemption */ 3191 /* In this case, finish_task_switch does not reenable preemption */
3178 preempt_enable(); 3192 preempt_enable();
@@ -4403,14 +4417,20 @@ litmus_need_resched_nonpreemptible:
4403 raw_spin_unlock_irq(&rq->lock); 4417 raw_spin_unlock_irq(&rq->lock);
4404 } 4418 }
4405 4419
4420 TS_SCHED2_START(prev);
4406 sched_trace_task_switch_to(current); 4421 sched_trace_task_switch_to(current);
4407 4422
4408 post_schedule(rq); 4423 post_schedule(rq);
4409 4424
4410 if (sched_state_validate_switch()) 4425 if (sched_state_validate_switch()) {
4426 TS_SCHED2_END(prev);
4411 goto litmus_need_resched_nonpreemptible; 4427 goto litmus_need_resched_nonpreemptible;
4428 }
4412 4429
4413 preempt_enable_no_resched(); 4430 preempt_enable_no_resched();
4431
4432 TS_SCHED2_END(prev);
4433
4414 if (need_resched()) 4434 if (need_resched())
4415 goto need_resched; 4435 goto need_resched;
4416 4436
@@ -4684,17 +4704,6 @@ void complete_all(struct completion *x)
4684} 4704}
4685EXPORT_SYMBOL(complete_all); 4705EXPORT_SYMBOL(complete_all);
4686 4706
4687void complete_n(struct completion *x, int n)
4688{
4689 unsigned long flags;
4690
4691 spin_lock_irqsave(&x->wait.lock, flags);
4692 x->done += n;
4693 __wake_up_common(&x->wait, TASK_NORMAL, n, 0, NULL);
4694 spin_unlock_irqrestore(&x->wait.lock, flags);
4695}
4696EXPORT_SYMBOL(complete_n);
4697
4698static inline long __sched 4707static inline long __sched
4699do_wait_for_common(struct completion *x, long timeout, int state) 4708do_wait_for_common(struct completion *x, long timeout, int state)
4700{ 4709{
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 58cf5d18dfdc..db04161fe37c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,8 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include <litmus/litmus.h>
7
6#ifdef CONFIG_RT_GROUP_SCHED 8#ifdef CONFIG_RT_GROUP_SCHED
7 9
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 10#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
@@ -228,8 +230,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
228 if (rt_rq->rt_nr_running) { 230 if (rt_rq->rt_nr_running) {
229 if (rt_se && !on_rt_rq(rt_se)) 231 if (rt_se && !on_rt_rq(rt_se))
230 enqueue_rt_entity(rt_se, false); 232 enqueue_rt_entity(rt_se, false);
231 if (rt_rq->highest_prio.curr < curr->prio) 233 if (rt_rq->highest_prio.curr < curr->prio &&
234 /* Don't subject LITMUS tasks to remote reschedules */
235 !is_realtime(curr)) {
232 resched_task(curr); 236 resched_task(curr);
237 }
233 } 238 }
234} 239}
235 240
@@ -322,8 +327,10 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
322 327
323static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 328static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
324{ 329{
325 if (rt_rq->rt_nr_running) 330 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
326 resched_task(rq_of_rt_rq(rt_rq)->curr); 331
332 if (rt_rq->rt_nr_running && !is_realtime(curr))
333 resched_task(curr);
327} 334}
328 335
329static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 336static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c32042b..2f2df08df395 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -211,6 +211,9 @@ asmlinkage void __do_softirq(void)
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 213
214 /* Mark Feather-Trace samples as "disturbed". */
215 ft_irq_fired();
216
214 pending = local_softirq_pending(); 217 pending = local_softirq_pending();
215 account_system_vtime(current); 218 account_system_vtime(current);
216 219
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 94b48e199577..bd6635c8de08 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -79,6 +79,52 @@ config SCHED_CPU_AFFINITY
79 79
80 Say Yes if unsure. 80 Say Yes if unsure.
81 81
82choice
83 prompt "EDF Tie-Break Behavior"
84 default EDF_TIE_BREAK_LATENESS_NORM
85 help
86 Allows the configuration of tie-breaking behavior when the deadlines
87 of two EDF-scheduled tasks are equal.
88
89 config EDF_TIE_BREAK_LATENESS
90 bool "Lateness-based Tie Break"
91 help
92 Break ties between two jobs, A and B, based upon the lateness of their
93 prior jobs. The job with the greatest lateness has priority. Note that
94 lateness has a negative value if the prior job finished before its
95 deadline.
96
97 config EDF_TIE_BREAK_LATENESS_NORM
98 bool "Normalized Lateness-based Tie Break"
99 help
100 Break ties between two jobs, A and B, based upon the lateness, normalized
101 by relative deadline, of their prior jobs. The job with the greatest
102 normalized lateness has priority. Note that lateness has a negative value
103 if the prior job finished before its deadline.
104
105	  Normalized lateness tie-breaks are likely desirable over non-normalized
106 tie-breaks if the execution times and/or relative deadlines of tasks in a
107 task set vary greatly.
108
109 config EDF_TIE_BREAK_HASH
110 bool "Hash-based Tie Breaks"
111 help
112 Break ties between two jobs, A and B, with equal deadlines by using a
113 uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job
114	  A has a ~50% chance of winning a given tie-break.
115
116 config EDF_PID_TIE_BREAK
117 bool "PID-based Tie Breaks"
118 help
119	  Break ties based upon OS-assigned thread IDs. Use this option if it is
120	  required by the algorithm's real-time analysis or if per-task
121	  response-time jitter must be minimized.
122
123 NOTES:
124	  * This tie-breaking method was the default in Litmus 2012.2 and before.
125
126endchoice
127
82endmenu 128endmenu
83 129
84menu "Tracing" 130menu "Tracing"
@@ -138,6 +184,24 @@ config SCHED_TASK_TRACE_SHIFT
138 10 => 1k events 184 10 => 1k events
139 8 => 512 events 185 8 => 512 events
140 186
187config SCHED_LITMUS_TRACEPOINT
188 bool "Enable Event/Tracepoint Tracing for real-time task tracing"
189 depends on TRACEPOINTS
190 default n
191 help
192 Enable kernel-style events (tracepoint) for Litmus. Litmus events
193 trace the same functions as the above sched_trace_XXX(), but can
194 be enabled independently.
195 Litmus tracepoints can be recorded and analyzed together (single
196 time reference) with all other kernel tracing events (e.g.,
197 sched:sched_switch, etc.).
198
199	  This also enables a quick way to visualize schedule traces using the
200	  trace-cmd utility and the kernelshark visualizer.
201
202 Say Yes for debugging and visualization purposes.
203 Say No for overhead tracing.
204
141config SCHED_OVERHEAD_TRACE 205config SCHED_OVERHEAD_TRACE
142 bool "Record timestamps for overhead measurements" 206 bool "Record timestamps for overhead measurements"
143 depends on FEATHER_TRACE 207 depends on FEATHER_TRACE
@@ -201,7 +265,7 @@ config SCHED_DEBUG_TRACE_CALLER
201 265
202config PREEMPT_STATE_TRACE 266config PREEMPT_STATE_TRACE
203 bool "Trace preemption state machine transitions" 267 bool "Trace preemption state machine transitions"
204 depends on SCHED_DEBUG_TRACE 268 depends on SCHED_DEBUG_TRACE && DEBUG_KERNEL
205 default n 269 default n
206 help 270 help
207 With this option enabled, each CPU will log when it transitions 271 With this option enabled, each CPU will log when it transitions
diff --git a/litmus/Makefile b/litmus/Makefile
index 7338180f196f..d26ca7076b62 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -11,13 +11,16 @@ obj-y = sched_plugin.o litmus.o \
11 sync.o \ 11 sync.o \
12 rt_domain.o \ 12 rt_domain.o \
13 edf_common.o \ 13 edf_common.o \
14 fp_common.o \
14 fdso.o \ 15 fdso.o \
15 locking.o \ 16 locking.o \
16 srp.o \ 17 srp.o \
17 bheap.o \ 18 bheap.o \
19 binheap.o \
18 ctrldev.o \ 20 ctrldev.o \
19 sched_gsn_edf.o \ 21 sched_gsn_edf.o \
20 sched_psn_edf.o 22 sched_psn_edf.o \
23 sched_pfp.o
21 24
22obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o 25obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
23obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o 26obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
diff --git a/litmus/binheap.c b/litmus/binheap.c
new file mode 100644
index 000000000000..40a913f4b5a7
--- /dev/null
+++ b/litmus/binheap.c
@@ -0,0 +1,388 @@
1#include <litmus/binheap.h>
2
3/* Returns true if the root ancestor of node is the root of the given heap. */
4int binheap_is_in_this_heap(struct binheap_node *node,
5 struct binheap* heap)
6{
7 if(!binheap_is_in_heap(node)) {
8 return 0;
9 }
10
11 while(node->parent != NULL) {
12 node = node->parent;
13 }
14
15 return (node == heap->root);
16}
17
18
19/* Update the node reference pointers. Same logic as Litmus binomial heap. */
20static void __update_ref(struct binheap_node *parent,
21 struct binheap_node *child)
22{
23 *(parent->ref_ptr) = child;
24 *(child->ref_ptr) = parent;
25
26 swap(parent->ref_ptr, child->ref_ptr);
27}
28
29
30/* Swaps data between two nodes. */
31static void __binheap_swap(struct binheap_node *parent,
32 struct binheap_node *child)
33{
34 swap(parent->data, child->data);
35 __update_ref(parent, child);
36}
37
38
39/* Swaps memory and data between two nodes. The nodes themselves swap
40 * positions, not just their data. Needed when we delete nodes from the heap.
41 */
42static void __binheap_swap_safe(struct binheap *handle,
43 struct binheap_node *a,
44 struct binheap_node *b)
45{
46 swap(a->data, b->data);
47 __update_ref(a, b);
48
49 if((a->parent != NULL) && (a->parent == b->parent)) {
50 /* special case: shared parent */
51 swap(a->parent->left, a->parent->right);
52 }
53 else {
54 /* Update pointers to swap parents. */
55
56 if(a->parent) {
57 if(a == a->parent->left) {
58 a->parent->left = b;
59 }
60 else {
61 a->parent->right = b;
62 }
63 }
64
65 if(b->parent) {
66 if(b == b->parent->left) {
67 b->parent->left = a;
68 }
69 else {
70 b->parent->right = a;
71 }
72 }
73
74 swap(a->parent, b->parent);
75 }
76
77 /* swap children */
78
79 if(a->left) {
80 a->left->parent = b;
81
82 if(a->right) {
83 a->right->parent = b;
84 }
85 }
86
87 if(b->left) {
88 b->left->parent = a;
89
90 if(b->right) {
91 b->right->parent = a;
92 }
93 }
94
95 swap(a->left, b->left);
96 swap(a->right, b->right);
97
98
99 /* update next/last/root pointers */
100
101 if(a == handle->next) {
102 handle->next = b;
103 }
104 else if(b == handle->next) {
105 handle->next = a;
106 }
107
108 if(a == handle->last) {
109 handle->last = b;
110 }
111 else if(b == handle->last) {
112 handle->last = a;
113 }
114
115 if(a == handle->root) {
116 handle->root = b;
117 }
118 else if(b == handle->root) {
119 handle->root = a;
120 }
121}
122
123
124/**
125 * Update the pointer to the last node in the complete binary tree.
126 * Called internally after the root node has been deleted.
127 */
128static void __binheap_update_last(struct binheap *handle)
129{
130 struct binheap_node *temp = handle->last;
131
132 /* find a "bend" in the tree. */
133 while(temp->parent && (temp == temp->parent->left)) {
134 temp = temp->parent;
135 }
136
137 /* step over to sibling if we're not at root */
138 if(temp->parent != NULL) {
139 temp = temp->parent->left;
140 }
141
142 /* now travel right as far as possible. */
143 while(temp->right != NULL) {
144 temp = temp->right;
145 }
146
147 /* take one step to the left if we're not at the bottom-most level. */
148 if(temp->left != NULL) {
149 temp = temp->left;
150 }
151
152 handle->last = temp;
153}
154
155
156/**
157 * Update the pointer to the node that will take the next inserted node.
158 * Called internally after a node has been inserted.
159 */
160static void __binheap_update_next(struct binheap *handle)
161{
162 struct binheap_node *temp = handle->next;
163
164 /* find a "bend" in the tree. */
165 while(temp->parent && (temp == temp->parent->right)) {
166 temp = temp->parent;
167 }
168
169 /* step over to sibling if we're not at root */
170 if(temp->parent != NULL) {
171 temp = temp->parent->right;
172 }
173
174 /* now travel left as far as possible. */
175 while(temp->left != NULL) {
176 temp = temp->left;
177 }
178
179 handle->next = temp;
180}
181
182
183
184/* bubble node up towards root */
185static void __binheap_bubble_up(struct binheap *handle,
186 struct binheap_node *node)
187{
188 /* let BINHEAP_POISON data bubble to the top */
189
190 while((node->parent != NULL) &&
191 ((node->data == BINHEAP_POISON) ||
192 handle->compare(node, node->parent))) {
193 __binheap_swap(node->parent, node);
194 node = node->parent;
195 }
196}
197
198
199/* bubble node down, swapping with min-child */
200static void __binheap_bubble_down(struct binheap *handle)
201{
202 struct binheap_node *node = handle->root;
203
204 while(node->left != NULL) {
205 if(node->right && handle->compare(node->right, node->left)) {
206 if(handle->compare(node->right, node)) {
207 __binheap_swap(node, node->right);
208 node = node->right;
209 }
210 else {
211 break;
212 }
213 }
214 else {
215 if(handle->compare(node->left, node)) {
216 __binheap_swap(node, node->left);
217 node = node->left;
218 }
219 else {
220 break;
221 }
222 }
223 }
224}
225
226
227void __binheap_add(struct binheap_node *new_node,
228 struct binheap *handle,
229 void *data)
230{
231 new_node->data = data;
232 new_node->ref = new_node;
233 new_node->ref_ptr = &(new_node->ref);
234
235 if(!binheap_empty(handle)) {
236 /* insert left side first */
237 if(handle->next->left == NULL) {
238 handle->next->left = new_node;
239 new_node->parent = handle->next;
240 new_node->left = NULL;
241 new_node->right = NULL;
242
243 handle->last = new_node;
244
245 __binheap_bubble_up(handle, new_node);
246 }
247 else {
248 /* left occupied. insert right. */
249 handle->next->right = new_node;
250 new_node->parent = handle->next;
251 new_node->left = NULL;
252 new_node->right = NULL;
253
254 handle->last = new_node;
255
256 __binheap_update_next(handle);
257 __binheap_bubble_up(handle, new_node);
258 }
259 }
260 else {
261 /* first node in heap */
262
263 new_node->parent = NULL;
264 new_node->left = NULL;
265 new_node->right = NULL;
266
267 handle->root = new_node;
268 handle->next = new_node;
269 handle->last = new_node;
270 }
271}
272
273
274/**
275 * Removes the root node from the heap. The node is removed after coalescing
276 * the binheap_node with its original data pointer at the root of the tree.
277 *
278 * The 'last' node in the tree is then swapped up to the root and bubbled
279 * down.
280 */
281void __binheap_delete_root(struct binheap *handle,
282 struct binheap_node *container)
283{
284 struct binheap_node *root = handle->root;
285
286 if(root != container) {
287 /* coalesce */
288 __binheap_swap_safe(handle, root, container);
289 root = container;
290 }
291
292 if(handle->last != root) {
293 /* swap 'last' node up to root and bubble it down. */
294
295 struct binheap_node *to_move = handle->last;
296
297 if(to_move->parent != root) {
298 handle->next = to_move->parent;
299
300 if(handle->next->right == to_move) {
301 /* disconnect from parent */
302 to_move->parent->right = NULL;
303 handle->last = handle->next->left;
304 }
305 else {
306 /* find new 'last' before we disconnect */
307 __binheap_update_last(handle);
308
309 /* disconnect from parent */
310 to_move->parent->left = NULL;
311 }
312 }
313 else {
314 /* 'last' is direct child of root */
315
316 handle->next = to_move;
317
318 if(to_move == to_move->parent->right) {
319 to_move->parent->right = NULL;
320 handle->last = to_move->parent->left;
321 }
322 else {
323 to_move->parent->left = NULL;
324 handle->last = to_move;
325 }
326 }
327 to_move->parent = NULL;
328
329 /* reconnect as root. We can't just swap data ptrs since root node
330 * may be freed after this function returns.
331 */
332 to_move->left = root->left;
333 to_move->right = root->right;
334 if(to_move->left != NULL) {
335 to_move->left->parent = to_move;
336 }
337 if(to_move->right != NULL) {
338 to_move->right->parent = to_move;
339 }
340
341 handle->root = to_move;
342
343 /* bubble down */
344 __binheap_bubble_down(handle);
345 }
346 else {
347 /* removing last node in tree */
348 handle->root = NULL;
349 handle->next = NULL;
350 handle->last = NULL;
351 }
352
353 /* mark as removed */
354 container->parent = BINHEAP_POISON;
355}
356
357
358/**
359 * Delete an arbitrary node. Bubble the node to delete up to the root,
360 * and then delete the root.
361 */
362void __binheap_delete(struct binheap_node *node_to_delete,
363 struct binheap *handle)
364{
365 struct binheap_node *target = node_to_delete->ref;
366 void *temp_data = target->data;
367
368	/* temporarily set data to BINHEAP_POISON to allow the node to bubble up to the root. */
369 target->data = BINHEAP_POISON;
370
371 __binheap_bubble_up(handle, target);
372 __binheap_delete_root(handle, node_to_delete);
373
374 node_to_delete->data = temp_data; /* restore node data pointer */
375}
376
377
378/**
379 * Bubble up a node whose key has decreased in value.
380 */
381void __binheap_decrease(struct binheap_node *orig_node,
382 struct binheap *handle)
383{
384 struct binheap_node *target = orig_node->ref;
385
386 __binheap_bubble_up(handle, target);
387}
388
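A minimal sketch of how a client would use the new binary heap (illustrative only): each element embeds a binheap_node, the node's data pointer refers back to the element, and the handle's compare callback orders nodes through that pointer. The handle-initialization helper is assumed to live in include/litmus/binheap.h and is not shown here.

struct demo_item {
	int key;
	struct binheap_node node;
};

/* min-heap order: nonzero if 'a' should sit above 'b' */
static int demo_item_order(struct binheap_node *a, struct binheap_node *b)
{
	struct demo_item *ia = (struct demo_item *) a->data;
	struct demo_item *ib = (struct demo_item *) b->data;
	return ia->key < ib->key;
}

static void demo_insert(struct binheap *heap, struct demo_item *item)
{
	/* links the node as the new 'last' leaf, then bubbles it up
	 * via the compare callback installed in the handle */
	__binheap_add(&item->node, heap, item);
}

static void demo_remove(struct binheap *heap, struct demo_item *item)
{
	/* works for any element: the node is first bubbled to the root
	 * with BINHEAP_POISON, then removed as the root */
	__binheap_delete(&item->node, heap);
}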
diff --git a/litmus/budget.c b/litmus/budget.c
index 310e9a3d4172..f7712be29adb 100644
--- a/litmus/budget.c
+++ b/litmus/budget.c
@@ -5,6 +5,8 @@
5#include <litmus/litmus.h> 5#include <litmus/litmus.h>
6#include <litmus/preempt.h> 6#include <litmus/preempt.h>
7 7
8#include <litmus/budget.h>
9
8struct enforcement_timer { 10struct enforcement_timer {
9 /* The enforcement timer is used to accurately police 11 /* The enforcement timer is used to accurately police
10 * slice budgets. */ 12 * slice budgets. */
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
index 6677a67cc945..41919b2714cb 100644
--- a/litmus/ctrldev.c
+++ b/litmus/ctrldev.c
@@ -30,27 +30,19 @@ static int alloc_ctrl_page(struct task_struct *t)
30static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma) 30static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
31{ 31{
32 int err; 32 int err;
33 unsigned long pfn;
34 33
35 struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page); 34 struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
36 35
37 /* Increase ref count. Is decreased when vma is destroyed. */
38 get_page(ctrl);
39
40 /* compute page frame number */
41 pfn = page_to_pfn(ctrl);
42
43 TRACE_CUR(CTRL_NAME 36 TRACE_CUR(CTRL_NAME
44 ": mapping %p (pfn:%lx, %lx) to 0x%lx (prot:%lx)\n", 37 ": mapping %p (pfn:%lx) to 0x%lx (prot:%lx)\n",
45 tsk_rt(t)->ctrl_page, pfn, page_to_pfn(ctrl), vma->vm_start, 38 tsk_rt(t)->ctrl_page,page_to_pfn(ctrl), vma->vm_start,
46 vma->vm_page_prot); 39 vma->vm_page_prot);
47 40
48 /* Map it into the vma. Make sure to use PAGE_SHARED, otherwise 41 /* Map it into the vma. */
49 * userspace actually gets a copy-on-write page. */ 42 err = vm_insert_page(vma, vma->vm_start, ctrl);
50 err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, PAGE_SHARED);
51 43
52 if (err) 44 if (err)
53 TRACE_CUR(CTRL_NAME ": remap_pfn_range() failed (%d)\n", err); 45 TRACE_CUR(CTRL_NAME ": vm_insert_page() failed (%d)\n", err);
54 46
55 return err; 47 return err;
56} 48}
@@ -63,19 +55,19 @@ static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
63 TRACE_CUR(CTRL_NAME 55 TRACE_CUR(CTRL_NAME
64 ": %p:%p vma:%p vma->vm_private_data:%p closed.\n", 56 ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
65 (void*) vma->vm_start, (void*) vma->vm_end, vma, 57 (void*) vma->vm_start, (void*) vma->vm_end, vma,
66 vma->vm_private_data, current->comm, 58 vma->vm_private_data);
67 current->pid);
68} 59}
69 60
70static int litmus_ctrl_vm_fault(struct vm_area_struct* vma, 61static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
71 struct vm_fault* vmf) 62 struct vm_fault* vmf)
72{ 63{
73 /* This function should never be called, since 64 TRACE_CUR("%s flags=0x%x (off:%ld)\n", __FUNCTION__,
74 * all pages should have been mapped by mmap() 65 vma->vm_flags, vmf->pgoff);
75 * already. */ 66
76 TRACE_CUR("%s flags=0x%x\n", __FUNCTION__, vma->vm_flags); 67 /* This function should never be called, since all pages should have
68 * been mapped by mmap() already. */
69 WARN_ONCE(1, "Page faults should be impossible in the control page\n");
77 70
78 /* nope, you only get one page */
79 return VM_FAULT_SIGBUS; 71 return VM_FAULT_SIGBUS;
80} 72}
81 73
@@ -103,9 +95,16 @@ static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
103 return -EINVAL; 95 return -EINVAL;
104 96
105 vma->vm_ops = &litmus_ctrl_vm_ops; 97 vma->vm_ops = &litmus_ctrl_vm_ops;
106 /* this mapping should not be kept across forks, 98 /* This mapping should not be kept across forks,
107 * and cannot be expanded */ 99 * cannot be expanded, and is not a "normal" page. */
108 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; 100 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_IO;
101
102 /* We don't want the first write access to trigger a "minor" page fault
103	 * to mark the page as dirty. This is transient, private memory; we
104 * don't care if it was touched or not. __S011 means RW access, but not
105 * execute, and avoids copy-on-write behavior.
106 * See protection_map in mmap.c. */
107 vma->vm_page_prot = __S011;
109 108
110 err = alloc_ctrl_page(current); 109 err = alloc_ctrl_page(current);
111 if (!err) 110 if (!err)
@@ -134,6 +133,17 @@ static int __init init_litmus_ctrl_dev(void)
134 133
135 BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE); 134 BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
136 135
136 BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint64_t));
137
138 BUILD_BUG_ON(offsetof(struct control_page, sched.raw)
139 != LITMUS_CP_OFFSET_SCHED);
140 BUILD_BUG_ON(offsetof(struct control_page, irq_count)
141 != LITMUS_CP_OFFSET_IRQ_COUNT);
142 BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start)
143 != LITMUS_CP_OFFSET_TS_SC_START);
144 BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start)
145 != LITMUS_CP_OFFSET_IRQ_SC_START);
146
137 printk("Initializing LITMUS^RT control device.\n"); 147 printk("Initializing LITMUS^RT control device.\n");
138 err = misc_register(&litmus_ctrl_dev); 148 err = misc_register(&litmus_ctrl_dev);
139 if (err) 149 if (err)
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
index 9b44dc2d8d1e..5aca2934a7b5 100644
--- a/litmus/edf_common.c
+++ b/litmus/edf_common.c
@@ -14,6 +14,32 @@
14 14
15#include <litmus/edf_common.h> 15#include <litmus/edf_common.h>
16 16
17#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM
18#include <litmus/fpmath.h>
19#endif
20
21#ifdef CONFIG_EDF_TIE_BREAK_HASH
22#include <linux/hash.h>
23static inline long edf_hash(struct task_struct *t)
24{
25 /* pid is 32 bits, so normally we would shove that into the
26	 * upper 32-bits and put the job number in the bottom
27	 * and hash the 64-bit number with hash_64(). Sadly,
28	 * in testing, hash_64() doesn't distribute keys where the
29 * upper bits are close together (as would be the case with
30 * pids) and job numbers are equal (as would be the case with
31 * synchronous task sets with all relative deadlines equal).
32 *
33 * A 2006 Linux patch proposed the following solution
34 * (but for some reason it wasn't accepted...).
35 *
36 * At least this workaround works for 32-bit systems as well.
37 */
38 return hash_32(hash_32((u32)tsk_rt(t)->job_params.job_no, 32) ^ t->pid, 32);
39}
40#endif
41
42
17/* edf_higher_prio - returns true if first has a higher EDF priority 43/* edf_higher_prio - returns true if first has a higher EDF priority
18 * than second. Deadline ties are broken by PID. 44 * than second. Deadline ties are broken by PID.
19 * 45 *
@@ -63,25 +89,81 @@ int edf_higher_prio(struct task_struct* first,
63 89
64#endif 90#endif
65 91
92 if (earlier_deadline(first_task, second_task)) {
93 return 1;
94 }
95 else if (get_deadline(first_task) == get_deadline(second_task)) {
96 /* Need to tie break. All methods must set pid_break to 0/1 if
97 * first_task does not have priority over second_task.
98 */
99 int pid_break;
66 100
67 return !is_realtime(second_task) ||
68 101
69 /* is the deadline of the first task earlier? 102#if defined(CONFIG_EDF_TIE_BREAK_LATENESS)
70 * Then it has higher priority. 103 /* Tie break by lateness. Jobs with greater lateness get
104 * priority. This should spread tardiness across all tasks,
105 * especially in task sets where all tasks have the same
106 * period and relative deadlines.
71 */ 107 */
72 earlier_deadline(first_task, second_task) || 108 if (get_lateness(first_task) > get_lateness(second_task)) {
73 109 return 1;
74 /* Do we have a deadline tie? 110 }
75 * Then break by PID. 111 pid_break = (get_lateness(first_task) == get_lateness(second_task));
112
113
114#elif defined(CONFIG_EDF_TIE_BREAK_LATENESS_NORM)
115 /* Tie break by lateness, normalized by relative deadline. Jobs with
116 * greater normalized lateness get priority.
117 *
118 * Note: Considered using the algebraically equivalent
119 * lateness(first)*relative_deadline(second) >
120 lateness(second)*relative_deadline(first)
121 * to avoid fixed-point math, but values are prone to overflow if inputs
122 * are on the order of several seconds, even in 64-bit.
76 */ 123 */
77 (get_deadline(first_task) == get_deadline(second_task) && 124 fp_t fnorm = _frac(get_lateness(first_task),
78 (first_task->pid < second_task->pid || 125 get_rt_relative_deadline(first_task));
126 fp_t snorm = _frac(get_lateness(second_task),
127 get_rt_relative_deadline(second_task));
128 if (_gt(fnorm, snorm)) {
129 return 1;
130 }
131 pid_break = _eq(fnorm, snorm);
79 132
80 /* If the PIDs are the same then the task with the inherited 133
81 * priority wins. 134#elif defined(CONFIG_EDF_TIE_BREAK_HASH)
135	/* Tie break by comparing hashes of the (pid, job#) tuple. There should be
136 * a 50% chance that first_task has a higher priority than second_task.
82 */ 137 */
83 (first_task->pid == second_task->pid && 138 long fhash = edf_hash(first_task);
84 !second->rt_param.inh_task))); 139 long shash = edf_hash(second_task);
140 if (fhash < shash) {
141 return 1;
142 }
143 pid_break = (fhash == shash);
144#else
145
146
147 /* CONFIG_EDF_PID_TIE_BREAK */
148	pid_break = 1; /* fall through to tie-break by PID */
149#endif
150
151 /* Tie break by pid */
152 if(pid_break) {
153 if (first_task->pid < second_task->pid) {
154 return 1;
155 }
156 else if (first_task->pid == second_task->pid) {
157 /* If the PIDs are the same then the task with the
158 * inherited priority wins.
159 */
160 if (!second->rt_param.inh_task) {
161 return 1;
162 }
163 }
164 }
165 }
166 return 0; /* fall-through. prio(second_task) > prio(first_task) */
85} 167}
86 168
87int edf_ready_order(struct bheap_node* a, struct bheap_node* b) 169int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
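To make the normalized-lateness policy concrete, a small user-space illustration (floating point here is only for exposition; the kernel code above uses the fixed-point helpers from litmus/fpmath.h): a prior job that ran 2 ms late against a 10 ms relative deadline out-prioritizes one that ran 3 ms late against a 30 ms relative deadline, even though its absolute lateness is smaller.

#include <stdio.h>

int main(void)
{
	/* hypothetical prior-job lateness and relative deadlines, in microseconds */
	long late_a = 2000, rel_dl_a = 10000;
	long late_b = 3000, rel_dl_b = 30000;

	double norm_a = (double) late_a / rel_dl_a;	/* 0.20 */
	double norm_b = (double) late_b / rel_dl_b;	/* 0.10 */

	/* on a deadline tie, the greater normalized lateness wins */
	printf("A beats B: %d\n", norm_a > norm_b);	/* prints: A beats B: 1 */
	return 0;
}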
diff --git a/litmus/fdso.c b/litmus/fdso.c
index aa7b384264e3..c4b450be4509 100644
--- a/litmus/fdso.c
+++ b/litmus/fdso.c
@@ -23,10 +23,16 @@ extern struct fdso_ops generic_lock_ops;
23static const struct fdso_ops* fdso_ops[] = { 23static const struct fdso_ops* fdso_ops[] = {
24 &generic_lock_ops, /* FMLP_SEM */ 24 &generic_lock_ops, /* FMLP_SEM */
25 &generic_lock_ops, /* SRP_SEM */ 25 &generic_lock_ops, /* SRP_SEM */
26 &generic_lock_ops, /* MPCP_SEM */
27 &generic_lock_ops, /* MPCP_VS_SEM */
28 &generic_lock_ops, /* DPCP_SEM */
29 &generic_lock_ops, /* PCP_SEM */
26}; 30};
27 31
28static int fdso_create(void** obj_ref, obj_type_t type, void* __user config) 32static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
29{ 33{
34 BUILD_BUG_ON(ARRAY_SIZE(fdso_ops) != MAX_OBJ_TYPE + 1);
35
30 if (fdso_ops[type]->create) 36 if (fdso_ops[type]->create)
31 return fdso_ops[type]->create(obj_ref, type, config); 37 return fdso_ops[type]->create(obj_ref, type, config);
32 else 38 else
@@ -162,6 +168,18 @@ static int put_od_entry(struct od_table_entry* od)
162 return 0; 168 return 0;
163} 169}
164 170
171static long close_od_entry(struct od_table_entry *od)
172{
173 long ret;
174
175 /* Give the class a chance to reject the close. */
176 ret = fdso_close(od);
177 if (ret == 0)
178 ret = put_od_entry(od);
179
180 return ret;
181}
182
165void exit_od_table(struct task_struct* t) 183void exit_od_table(struct task_struct* t)
166{ 184{
167 int i; 185 int i;
@@ -169,7 +187,7 @@ void exit_od_table(struct task_struct* t)
169 if (t->od_table) { 187 if (t->od_table) {
170 for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++) 188 for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
171 if (t->od_table[i].used) 189 if (t->od_table[i].used)
172 put_od_entry(t->od_table + i); 190 close_od_entry(t->od_table + i);
173 kfree(t->od_table); 191 kfree(t->od_table);
174 t->od_table = NULL; 192 t->od_table = NULL;
175 } 193 }
@@ -283,11 +301,7 @@ asmlinkage long sys_od_close(int od)
283 return ret; 301 return ret;
284 302
285 303
286 /* give the class a chance to reject the close 304 ret = close_od_entry(t->od_table + od);
287 */
288 ret = fdso_close(t->od_table + od);
289 if (ret == 0)
290 ret = put_od_entry(t->od_table + od);
291 305
292 return ret; 306 return ret;
293} 307}
diff --git a/litmus/fp_common.c b/litmus/fp_common.c
new file mode 100644
index 000000000000..964a4729deff
--- /dev/null
+++ b/litmus/fp_common.c
@@ -0,0 +1,119 @@
1/*
2 * litmus/fp_common.c
3 *
4 * Common functions for fixed-priority scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15#include <litmus/fp_common.h>
16
17/* fp_higher_prio - returns true if first has a higher static priority
18 * than second. Ties are broken by PID.
19 *
20 * both first and second may be NULL
21 */
22int fp_higher_prio(struct task_struct* first,
23 struct task_struct* second)
24{
25 struct task_struct *first_task = first;
26 struct task_struct *second_task = second;
27
28 /* There is no point in comparing a task to itself. */
29 if (unlikely(first && first == second)) {
30 TRACE_TASK(first,
31 "WARNING: pointless FP priority comparison.\n");
32 return 0;
33 }
34
35
36 /* check for NULL tasks */
37 if (!first || !second)
38 return first && !second;
39
40 if (!is_realtime(second_task))
41 return 1;
42
43#ifdef CONFIG_LITMUS_LOCKING
44
45 /* Check for inherited priorities. Change task
46 * used for comparison in such a case.
47 */
48 if (unlikely(first->rt_param.inh_task))
49 first_task = first->rt_param.inh_task;
50 if (unlikely(second->rt_param.inh_task))
51 second_task = second->rt_param.inh_task;
52
53 /* Check for priority boosting. Tie-break by start of boosting.
54 */
55 if (unlikely(is_priority_boosted(first_task))) {
56 /* first_task is boosted, how about second_task? */
57 if (is_priority_boosted(second_task))
58 /* break by priority point */
59 return lt_before(get_boost_start(first_task),
60 get_boost_start(second_task));
61 else
62 /* priority boosting wins. */
63 return 1;
64 } else if (unlikely(is_priority_boosted(second_task)))
65		/* second_task is boosted, first is not */
66 return 0;
67
68#endif
69
70 /* Comparisons to itself are not expected; priority inheritance
71 * should also not cause this to happen. */
72 BUG_ON(first_task == second_task);
73
74 if (get_priority(first_task) < get_priority(second_task))
75 return 1;
76 else if (get_priority(first_task) == get_priority(second_task))
77 /* Break by PID. */
78 return first_task->pid < second_task->pid;
79 else
80 return 0;
81}
82
83int fp_ready_order(struct bheap_node* a, struct bheap_node* b)
84{
85 return fp_higher_prio(bheap2task(a), bheap2task(b));
86}
87
88void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
89 release_jobs_t release)
90{
91 rt_domain_init(rt, fp_ready_order, resched, release);
92}
93
94/* fp_preemption_needed - check whether the task t needs to be preempted
95 */
96int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t)
97{
98 struct task_struct *pending;
99
100 pending = fp_prio_peek(q);
101
102 if (!pending)
103 return 0;
104 if (!t)
105 return 1;
106
107 /* make sure to get non-rt stuff out of the way */
108 return !is_realtime(t) || fp_higher_prio(pending, t);
109}
110
111void fp_prio_queue_init(struct fp_prio_queue* q)
112{
113 int i;
114
115 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
116 q->bitmask[i] = 0;
117 for (i = 0; i < LITMUS_MAX_PRIORITY; i++)
118 bheap_init(&q->queue[i]);
119}
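A sketch of how a fixed-priority plugin would typically consult these helpers on a job release; 'ready' and 'scheduled' are hypothetical names for the plugin's priority queue and the locally running task.

static void demo_check_for_preemption(struct fp_prio_queue *ready,
				      struct task_struct *scheduled)
{
	/* fp_preemption_needed() peeks at the highest-priority pending job
	 * and compares it against 'scheduled' via fp_higher_prio() */
	if (fp_preemption_needed(ready, scheduled))
		litmus_reschedule_local();
}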
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
index 06fcf4cf77dc..99bc39ffbcef 100644
--- a/litmus/ftdev.c
+++ b/litmus/ftdev.c
@@ -230,13 +230,20 @@ static ssize_t ftdev_read(struct file *filp,
230 * here with copied data because that data would get 230 * here with copied data because that data would get
231 * lost if the task is interrupted (e.g., killed). 231 * lost if the task is interrupted (e.g., killed).
232 */ 232 */
233 mutex_unlock(&ftdm->lock);
233 set_current_state(TASK_INTERRUPTIBLE); 234 set_current_state(TASK_INTERRUPTIBLE);
235
234 schedule_timeout(50); 236 schedule_timeout(50);
237
235 if (signal_pending(current)) { 238 if (signal_pending(current)) {
236 if (err == 0) 239 if (err == 0)
237 /* nothing read yet, signal problem */ 240 /* nothing read yet, signal problem */
238 err = -ERESTARTSYS; 241 err = -ERESTARTSYS;
239 break; 242 goto out;
243 }
244 if (mutex_lock_interruptible(&ftdm->lock)) {
245 err = -ERESTARTSYS;
246 goto out;
240 } 247 }
241 } else if (copied < 0) { 248 } else if (copied < 0) {
242 /* page fault */ 249 /* page fault */
diff --git a/litmus/jobs.c b/litmus/jobs.c
index 36e314625d86..13a4ed4c9e93 100644
--- a/litmus/jobs.c
+++ b/litmus/jobs.c
@@ -6,13 +6,13 @@
6#include <litmus/litmus.h> 6#include <litmus/litmus.h>
7#include <litmus/jobs.h> 7#include <litmus/jobs.h>
8 8
9void prepare_for_next_period(struct task_struct *t) 9static inline void setup_release(struct task_struct *t, lt_t release)
10{ 10{
11 BUG_ON(!t);
12 /* prepare next release */ 11 /* prepare next release */
13 t->rt_param.job_params.release = t->rt_param.job_params.deadline; 12 t->rt_param.job_params.release = release;
14 t->rt_param.job_params.deadline += get_rt_period(t); 13 t->rt_param.job_params.deadline = release + get_rt_relative_deadline(t);
15 t->rt_param.job_params.exec_time = 0; 14 t->rt_param.job_params.exec_time = 0;
15
16 /* update job sequence number */ 16 /* update job sequence number */
17 t->rt_param.job_params.job_no++; 17 t->rt_param.job_params.job_no++;
18 18
@@ -20,11 +20,25 @@ void prepare_for_next_period(struct task_struct *t)
20 t->rt.time_slice = 1; 20 t->rt.time_slice = 1;
21} 21}
22 22
23void prepare_for_next_period(struct task_struct *t)
24{
25 BUG_ON(!t);
26
27 /* Record lateness before we set up the next job's
28 * release and deadline. Lateness may be negative.
29 */
30 t->rt_param.job_params.lateness =
31 (long long)litmus_clock() -
32 (long long)t->rt_param.job_params.deadline;
33
34 setup_release(t, get_release(t) + get_rt_period(t));
35}
36
23void release_at(struct task_struct *t, lt_t start) 37void release_at(struct task_struct *t, lt_t start)
24{ 38{
25 t->rt_param.job_params.deadline = start; 39 BUG_ON(!t);
26 prepare_for_next_period(t); 40 setup_release(t, start);
27 set_rt_flags(t, RT_F_RUNNING); 41 tsk_rt(t)->completed = 0;
28} 42}
29 43
30 44
@@ -34,7 +48,7 @@ void release_at(struct task_struct *t, lt_t start)
34long complete_job(void) 48long complete_job(void)
35{ 49{
36	/* Mark that we do not execute anymore */ 50	/* Mark that we do not execute anymore */
37 set_rt_flags(current, RT_F_SLEEP); 51 tsk_rt(current)->completed = 1;
38 /* call schedule, this will return when a new job arrives 52 /* call schedule, this will return when a new job arrives
39 * it also takes care of preparing for the next release 53 * it also takes care of preparing for the next release
40 */ 54 */
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 301390148d02..dc94be71bfb6 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -9,6 +9,8 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/reboot.h>
13#include <linux/stop_machine.h>
12 14
13#include <litmus/litmus.h> 15#include <litmus/litmus.h>
14#include <litmus/bheap.h> 16#include <litmus/bheap.h>
@@ -23,9 +25,6 @@
23 25
24/* Number of RT tasks that exist in the system */ 26/* Number of RT tasks that exist in the system */
25atomic_t rt_task_count = ATOMIC_INIT(0); 27atomic_t rt_task_count = ATOMIC_INIT(0);
26static DEFINE_RAW_SPINLOCK(task_transition_lock);
27/* synchronize plugin switching */
28atomic_t cannot_use_plugin = ATOMIC_INIT(0);
29 28
30/* Give log messages sequential IDs. */ 29/* Give log messages sequential IDs. */
31atomic_t __log_seq_no = ATOMIC_INIT(0); 30atomic_t __log_seq_no = ATOMIC_INIT(0);
@@ -102,21 +101,25 @@ asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
102 goto out_unlock; 101 goto out_unlock;
103 } 102 }
104 103
104 /* set relative deadline to be implicit if left unspecified */
105 if (tp.relative_deadline == 0)
106 tp.relative_deadline = tp.period;
107
105 if (tp.exec_cost <= 0) 108 if (tp.exec_cost <= 0)
106 goto out_unlock; 109 goto out_unlock;
107 if (tp.period <= 0) 110 if (tp.period <= 0)
108 goto out_unlock; 111 goto out_unlock;
109 if (!cpu_online(tp.cpu)) 112 if (!cpu_online(tp.cpu))
110 goto out_unlock; 113 goto out_unlock;
111 if (tp.period < tp.exec_cost) 114 if (min(tp.relative_deadline, tp.period) < tp.exec_cost) /*density check*/
112 { 115 {
113 printk(KERN_INFO "litmus: real-time task %d rejected " 116 printk(KERN_INFO "litmus: real-time task %d rejected "
114 "because wcet > period\n", pid); 117 "because task density > 1.0\n", pid);
115 goto out_unlock; 118 goto out_unlock;
116 } 119 }
117 if ( tp.cls != RT_CLASS_HARD && 120 if (tp.cls != RT_CLASS_HARD &&
118 tp.cls != RT_CLASS_SOFT && 121 tp.cls != RT_CLASS_SOFT &&
119 tp.cls != RT_CLASS_BEST_EFFORT) 122 tp.cls != RT_CLASS_BEST_EFFORT)
120 { 123 {
121 printk(KERN_INFO "litmus: real-time task %d rejected " 124 printk(KERN_INFO "litmus: real-time task %d rejected "
122 "because its class is invalid\n", pid); 125 "because its class is invalid\n", pid);
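Restated as a stand-alone predicate (a hypothetical helper, not part of the patch), the admission rule above reads: after the implicit-deadline fallback, the execution cost must fit within the smaller of the relative deadline and the period.

static int demo_density_at_most_one(unsigned long exec_cost,
				    unsigned long relative_deadline,
				    unsigned long period)
{
	/* implicit deadline if left unspecified, as above */
	unsigned long d = relative_deadline ? relative_deadline : period;
	unsigned long m = (d < period) ? d : period;

	/* e.g. (e=3, d=5, p=10) passes; (e=6, d=5, p=10) is rejected
	 * even though e <= p */
	return exec_cost <= m;
}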
@@ -317,15 +320,20 @@ static void reinit_litmus_state(struct task_struct* p, int restore)
317long litmus_admit_task(struct task_struct* tsk) 320long litmus_admit_task(struct task_struct* tsk)
318{ 321{
319 long retval = 0; 322 long retval = 0;
320 unsigned long flags;
321 323
322 BUG_ON(is_realtime(tsk)); 324 BUG_ON(is_realtime(tsk));
323 325
324 if (get_rt_period(tsk) == 0 || 326 tsk_rt(tsk)->heap_node = NULL;
325 get_exec_cost(tsk) > get_rt_period(tsk)) { 327 tsk_rt(tsk)->rel_heap = NULL;
326 TRACE_TASK(tsk, "litmus admit: invalid task parameters " 328
327 "(%lu, %lu)\n", 329 if (get_rt_relative_deadline(tsk) == 0 ||
328 get_exec_cost(tsk), get_rt_period(tsk)); 330 get_exec_cost(tsk) >
331 min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
332 TRACE_TASK(tsk,
333 "litmus admit: invalid task parameters "
334 "(e = %lu, p = %lu, d = %lu)\n",
335 get_exec_cost(tsk), get_rt_period(tsk),
336 get_rt_relative_deadline(tsk));
329 retval = -EINVAL; 337 retval = -EINVAL;
330 goto out; 338 goto out;
331 } 339 }
@@ -339,9 +347,6 @@ long litmus_admit_task(struct task_struct* tsk)
339 347
340 INIT_LIST_HEAD(&tsk_rt(tsk)->list); 348 INIT_LIST_HEAD(&tsk_rt(tsk)->list);
341 349
342 /* avoid scheduler plugin changing underneath us */
343 raw_spin_lock_irqsave(&task_transition_lock, flags);
344
345 /* allocate heap node for this task */ 350 /* allocate heap node for this task */
346 tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC); 351 tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
347 tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC); 352 tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
@@ -349,15 +354,14 @@ long litmus_admit_task(struct task_struct* tsk)
349 if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) { 354 if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
350 printk(KERN_WARNING "litmus: no more heap node memory!?\n"); 355 printk(KERN_WARNING "litmus: no more heap node memory!?\n");
351 356
352 bheap_node_free(tsk_rt(tsk)->heap_node);
353 release_heap_free(tsk_rt(tsk)->rel_heap);
354
355 retval = -ENOMEM; 357 retval = -ENOMEM;
356 goto out_unlock; 358 goto out;
357 } else { 359 } else {
358 bheap_node_init(&tsk_rt(tsk)->heap_node, tsk); 360 bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
359 } 361 }
360 362
363 preempt_disable();
364
361 retval = litmus->admit_task(tsk); 365 retval = litmus->admit_task(tsk);
362 366
363 if (!retval) { 367 if (!retval) {
@@ -366,9 +370,13 @@ long litmus_admit_task(struct task_struct* tsk)
366 atomic_inc(&rt_task_count); 370 atomic_inc(&rt_task_count);
367 } 371 }
368 372
369out_unlock: 373 preempt_enable();
370 raw_spin_unlock_irqrestore(&task_transition_lock, flags); 374
371out: 375out:
376 if (retval) {
377 bheap_node_free(tsk_rt(tsk)->heap_node);
378 release_heap_free(tsk_rt(tsk)->rel_heap);
379 }
372 return retval; 380 return retval;
373} 381}
374 382
@@ -388,37 +396,10 @@ void litmus_exit_task(struct task_struct* tsk)
388 } 396 }
389} 397}
390 398
391/* IPI callback to synchronize plugin switching */ 399static int do_plugin_switch(void *_plugin)
392static void synch_on_plugin_switch(void* info)
393{ 400{
394 atomic_inc(&cannot_use_plugin); 401 int ret;
395 while (atomic_read(&cannot_use_plugin) > 0) 402 struct sched_plugin* plugin = _plugin;
396 cpu_relax();
397}
398
399/* Switching a plugin in use is tricky.
400 * We must watch out that no real-time tasks exists
401 * (and that none is created in parallel) and that the plugin is not
402 * currently in use on any processor (in theory).
403 */
404int switch_sched_plugin(struct sched_plugin* plugin)
405{
406 unsigned long flags;
407 int ret = 0;
408
409 BUG_ON(!plugin);
410
411 /* forbid other cpus to use the plugin */
412 atomic_set(&cannot_use_plugin, 1);
413 /* send IPI to force other CPUs to synch with us */
414 smp_call_function(synch_on_plugin_switch, NULL, 0);
415
416 /* wait until all other CPUs have started synch */
417 while (atomic_read(&cannot_use_plugin) < num_online_cpus())
418 cpu_relax();
419
420 /* stop task transitions */
421 raw_spin_lock_irqsave(&task_transition_lock, flags);
422 403
423 /* don't switch if there are active real-time tasks */ 404 /* don't switch if there are active real-time tasks */
424 if (atomic_read(&rt_task_count) == 0) { 405 if (atomic_read(&rt_task_count) == 0) {
@@ -436,11 +417,24 @@ int switch_sched_plugin(struct sched_plugin* plugin)
436 } else 417 } else
437 ret = -EBUSY; 418 ret = -EBUSY;
438out: 419out:
439 raw_spin_unlock_irqrestore(&task_transition_lock, flags);
440 atomic_set(&cannot_use_plugin, 0);
441 return ret; 420 return ret;
442} 421}
443 422
423/* Switching a plugin in use is tricky.
424 * We must watch out that no real-time tasks exist
425 * (and that none is created in parallel) and that the plugin is not
426 * currently in use on any processor (in theory).
427 */
428int switch_sched_plugin(struct sched_plugin* plugin)
429{
430 BUG_ON(!plugin);
431
432 if (atomic_read(&rt_task_count) == 0)
433 return stop_machine(do_plugin_switch, plugin, NULL);
434 else
435 return -EBUSY;
436}
437
444/* Called upon fork. 438/* Called upon fork.
445 * p is the newly forked task. 439 * p is the newly forked task.
446 */ 440 */
@@ -521,6 +515,25 @@ static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
521 515
522extern struct sched_plugin linux_sched_plugin; 516extern struct sched_plugin linux_sched_plugin;
523 517
518static int litmus_shutdown_nb(struct notifier_block *unused1,
519 unsigned long unused2, void *unused3)
520{
521 /* Attempt to switch back to regular Linux scheduling.
522 * Forces the active plugin to clean up.
523 */
524 if (litmus != &linux_sched_plugin) {
525 int ret = switch_sched_plugin(&linux_sched_plugin);
526 if (ret) {
527 printk("Auto-shutdown of active Litmus plugin failed.\n");
528 }
529 }
530 return NOTIFY_DONE;
531}
532
533static struct notifier_block shutdown_notifier = {
534 .notifier_call = litmus_shutdown_nb,
535};
536
524static int __init _init_litmus(void) 537static int __init _init_litmus(void)
525{ 538{
526 /* Common initializers, 539 /* Common initializers,
@@ -529,8 +542,6 @@ static int __init _init_litmus(void)
529 */ 542 */
530 printk("Starting LITMUS^RT kernel\n"); 543 printk("Starting LITMUS^RT kernel\n");
531 544
532 BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t));
533
534 register_sched_plugin(&linux_sched_plugin); 545 register_sched_plugin(&linux_sched_plugin);
535 546
536 bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC); 547 bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC);
@@ -550,11 +561,15 @@ static int __init _init_litmus(void)
550 init_topology(); 561 init_topology();
551#endif 562#endif
552 563
564 register_reboot_notifier(&shutdown_notifier);
565
553 return 0; 566 return 0;
554} 567}
555 568
556static void _exit_litmus(void) 569static void _exit_litmus(void)
557{ 570{
571 unregister_reboot_notifier(&shutdown_notifier);
572
558 exit_litmus_proc(); 573 exit_litmus_proc();
559 kmem_cache_destroy(bheap_node_cache); 574 kmem_cache_destroy(bheap_node_cache);
560 kmem_cache_destroy(release_heap_cache); 575 kmem_cache_destroy(release_heap_cache);
diff --git a/litmus/locking.c b/litmus/locking.c
index 0c1aa6aa40b7..43d9aece2e74 100644
--- a/litmus/locking.c
+++ b/litmus/locking.c
@@ -1,9 +1,14 @@
1#include <linux/sched.h>
2#include <litmus/litmus.h>
1#include <litmus/fdso.h> 3#include <litmus/fdso.h>
2 4
3#ifdef CONFIG_LITMUS_LOCKING 5#ifdef CONFIG_LITMUS_LOCKING
4 6
7#include <linux/sched.h>
8#include <litmus/litmus.h>
5#include <litmus/sched_plugin.h> 9#include <litmus/sched_plugin.h>
6#include <litmus/trace.h> 10#include <litmus/trace.h>
11#include <litmus/wait.h>
7 12
8static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg); 13static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
9static int open_generic_lock(struct od_table_entry* entry, void* __user arg); 14static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
@@ -69,6 +74,10 @@ asmlinkage long sys_litmus_lock(int lock_od)
69 struct od_table_entry* entry; 74 struct od_table_entry* entry;
70 struct litmus_lock* l; 75 struct litmus_lock* l;
71 76
77 TS_SYSCALL_IN_START;
78
79 TS_SYSCALL_IN_END;
80
72 TS_LOCK_START; 81 TS_LOCK_START;
73 82
74 entry = get_entry_for_od(lock_od); 83 entry = get_entry_for_od(lock_od);
@@ -82,6 +91,8 @@ asmlinkage long sys_litmus_lock(int lock_od)
82 * this into account when computing overheads. */ 91 * this into account when computing overheads. */
83 TS_LOCK_END; 92 TS_LOCK_END;
84 93
94 TS_SYSCALL_OUT_START;
95
85 return err; 96 return err;
86} 97}
87 98
@@ -91,6 +102,10 @@ asmlinkage long sys_litmus_unlock(int lock_od)
91 struct od_table_entry* entry; 102 struct od_table_entry* entry;
92 struct litmus_lock* l; 103 struct litmus_lock* l;
93 104
105 TS_SYSCALL_IN_START;
106
107 TS_SYSCALL_IN_END;
108
94 TS_UNLOCK_START; 109 TS_UNLOCK_START;
95 110
96 entry = get_entry_for_od(lock_od); 111 entry = get_entry_for_od(lock_od);
@@ -104,6 +119,8 @@ asmlinkage long sys_litmus_unlock(int lock_od)
104 * account when computing overheads. */ 119 * account when computing overheads. */
105 TS_UNLOCK_END; 120 TS_UNLOCK_END;
106 121
122 TS_SYSCALL_OUT_START;
123
107 return err; 124 return err;
108} 125}
109 126
@@ -121,6 +138,38 @@ struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
121 return(t); 138 return(t);
122} 139}
123 140
141unsigned int __add_wait_queue_prio_exclusive(
142 wait_queue_head_t* head,
143 prio_wait_queue_t *new)
144{
145 struct list_head *pos;
146 unsigned int passed = 0;
147
148 new->wq.flags |= WQ_FLAG_EXCLUSIVE;
149
150 /* find a spot where the new entry is less than the next */
151 list_for_each(pos, &head->task_list) {
152 prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t,
153 wq.task_list);
154
155 if (unlikely(lt_before(new->priority, queued->priority) ||
156 (new->priority == queued->priority &&
157 new->tie_breaker < queued->tie_breaker))) {
158 /* pos is not less than new, thus insert here */
159 __list_add(&new->wq.task_list, pos->prev, pos);
160 goto out;
161 }
162 passed++;
163 }
164
165	/* if we get to this point either the list is empty or every
166 * queued element is less than new.
167 * Let's add new to the end. */
168 list_add_tail(&new->wq.task_list, &head->task_list);
169out:
170 return passed;
171}
172
124 173
125#else 174#else
126 175
diff --git a/litmus/preempt.c b/litmus/preempt.c
index 5704d0bf4c0b..6be2f26728b8 100644
--- a/litmus/preempt.c
+++ b/litmus/preempt.c
@@ -2,6 +2,7 @@
2 2
3#include <litmus/litmus.h> 3#include <litmus/litmus.h>
4#include <litmus/preempt.h> 4#include <litmus/preempt.h>
5#include <litmus/trace.h>
5 6
6/* The rescheduling state of each processor. 7/* The rescheduling state of each processor.
7 */ 8 */
@@ -47,6 +48,7 @@ void sched_state_ipi(void)
47 set_tsk_need_resched(current); 48 set_tsk_need_resched(current);
48 TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n", 49 TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
49 current->comm, current->pid); 50 current->comm, current->pid);
51 TS_SEND_RESCHED_END;
50 } else { 52 } else {
51 /* ignore */ 53 /* ignore */
52 TRACE_STATE("ignoring IPI in state %x (%s)\n", 54 TRACE_STATE("ignoring IPI in state %x (%s)\n",
@@ -85,8 +87,10 @@ void litmus_reschedule(int cpu)
85 if (scheduled_transition_ok) { 87 if (scheduled_transition_ok) {
86 if (smp_processor_id() == cpu) 88 if (smp_processor_id() == cpu)
87 set_tsk_need_resched(current); 89 set_tsk_need_resched(current);
88 else 90 else {
91 TS_SEND_RESCHED_START(cpu);
89 smp_send_reschedule(cpu); 92 smp_send_reschedule(cpu);
93 }
90 } 94 }
91 95
92 TRACE_STATE("%s picked-ok:%d sched-ok:%d\n", 96 TRACE_STATE("%s picked-ok:%d sched-ok:%d\n",
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
index d405854cd39c..1683d3847560 100644
--- a/litmus/rt_domain.c
+++ b/litmus/rt_domain.c
@@ -300,9 +300,11 @@ void rt_domain_init(rt_domain_t *rt,
300 */ 300 */
301void __add_ready(rt_domain_t* rt, struct task_struct *new) 301void __add_ready(rt_domain_t* rt, struct task_struct *new)
302{ 302{
303 TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n", 303 TRACE("rt: adding %s/%d (%llu, %llu, %llu) rel=%llu "
304 new->comm, new->pid, get_exec_cost(new), get_rt_period(new), 304 "to ready queue at %llu\n",
305 get_release(new), litmus_clock()); 305 new->comm, new->pid,
306 get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new),
307 get_release(new), litmus_clock());
306 308
307 BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node)); 309 BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
308 310
@@ -329,12 +331,7 @@ void __add_release_on(rt_domain_t* rt, struct task_struct *task,
329 list_add(&tsk_rt(task)->list, &rt->tobe_released); 331 list_add(&tsk_rt(task)->list, &rt->tobe_released);
330 task->rt_param.domain = rt; 332 task->rt_param.domain = rt;
331 333
332 /* start release timer */
333 TS_SCHED2_START(task);
334
335 arm_release_timer_on(rt, target_cpu); 334 arm_release_timer_on(rt, target_cpu);
336
337 TS_SCHED2_END(task);
338} 335}
339#endif 336#endif
340 337
@@ -347,11 +344,6 @@ void __add_release(rt_domain_t* rt, struct task_struct *task)
347 list_add(&tsk_rt(task)->list, &rt->tobe_released); 344 list_add(&tsk_rt(task)->list, &rt->tobe_released);
348 task->rt_param.domain = rt; 345 task->rt_param.domain = rt;
349 346
350 /* start release timer */
351 TS_SCHED2_START(task);
352
353 arm_release_timer(rt); 347 arm_release_timer(rt);
354
355 TS_SCHED2_END(task);
356} 348}
357 349
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
index 480c62bc895b..b45b46fc4fca 100644
--- a/litmus/sched_cedf.c
+++ b/litmus/sched_cedf.c
@@ -35,6 +35,7 @@
35#include <litmus/litmus.h> 35#include <litmus/litmus.h>
36#include <litmus/jobs.h> 36#include <litmus/jobs.h>
37#include <litmus/preempt.h> 37#include <litmus/preempt.h>
38#include <litmus/budget.h>
38#include <litmus/sched_plugin.h> 39#include <litmus/sched_plugin.h>
39#include <litmus/edf_common.h> 40#include <litmus/edf_common.h>
40#include <litmus/sched_trace.h> 41#include <litmus/sched_trace.h>
@@ -170,7 +171,7 @@ static noinline void link_task_to_cpu(struct task_struct* linked,
170 171
171 /* Link new task to CPU. */ 172 /* Link new task to CPU. */
172 if (linked) { 173 if (linked) {
173 set_rt_flags(linked, RT_F_RUNNING); 174 tsk_rt(linked)->completed = 0;
174 /* handle task is already scheduled somewhere! */ 175 /* handle task is already scheduled somewhere! */
175 on_cpu = linked->rt_param.scheduled_on; 176 on_cpu = linked->rt_param.scheduled_on;
176 if (on_cpu != NO_CPU) { 177 if (on_cpu != NO_CPU) {
@@ -304,11 +305,11 @@ static void check_for_preemptions(cedf_domain_t *cluster)
304 &per_cpu(cedf_cpu_entries, task_cpu(task))); 305 &per_cpu(cedf_cpu_entries, task_cpu(task)));
305 if(affinity) 306 if(affinity)
306 last = affinity; 307 last = affinity;
307 else if(last->linked) 308 else if(requeue_preempted_job(last->linked))
308 requeue(last->linked); 309 requeue(last->linked);
309 } 310 }
310#else 311#else
311 if (last->linked) 312 if (requeue_preempted_job(last->linked))
312 requeue(last->linked); 313 requeue(last->linked);
313#endif 314#endif
314 link_task_to_cpu(task, last); 315 link_task_to_cpu(task, last);
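The new helper requeue_preempted_job() comes from the litmus/budget.h header added by this commit; it guards against requeuing a preempted job whose enforced budget is already exhausted. A plausible shape of the helper, shown only as a sketch since the header itself is not part of this excerpt:

/* Sketch (assumed definition in include/litmus/budget.h): requeue a
 * preempted job only if it exists and still has budget left, or if
 * budget enforcement is disabled for it. */
static inline int requeue_preempted_job(struct task_struct *t)
{
	return t && (!budget_exhausted(t) || !budget_enforced(t));
}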
@@ -349,7 +350,7 @@ static noinline void job_completion(struct task_struct *t, int forced)
349 TRACE_TASK(t, "job_completion().\n"); 350 TRACE_TASK(t, "job_completion().\n");
350 351
351 /* set flags */ 352 /* set flags */
352 set_rt_flags(t, RT_F_SLEEP); 353 tsk_rt(t)->completed = 1;
353 /* prepare for next period */ 354 /* prepare for next period */
354 prepare_for_next_period(t); 355 prepare_for_next_period(t);
355 if (is_released(t, litmus_clock())) 356 if (is_released(t, litmus_clock()))
@@ -403,7 +404,7 @@ static void cedf_tick(struct task_struct* t)
403 * 404 *
404 * - !is_running(scheduled) // the job blocks 405 * - !is_running(scheduled) // the job blocks
405 * - scheduled->timeslice == 0 // the job completed (forcefully) 406 * - scheduled->timeslice == 0 // the job completed (forcefully)
406 * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) 407 * - is_completed() // the job completed (by syscall)
407 * - linked != scheduled // we need to reschedule (for any reason) 408 * - linked != scheduled // we need to reschedule (for any reason)
408 * - is_np(scheduled) // rescheduling must be delayed, 409 * - is_np(scheduled) // rescheduling must be delayed,
409 * sys_exit_np must be requested 410 * sys_exit_np must be requested
@@ -442,7 +443,7 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
442 budget_enforced(entry->scheduled) && 443 budget_enforced(entry->scheduled) &&
443 budget_exhausted(entry->scheduled); 444 budget_exhausted(entry->scheduled);
444 np = exists && is_np(entry->scheduled); 445 np = exists && is_np(entry->scheduled);
445 sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; 446 sleep = exists && is_completed(entry->scheduled);
446 preempt = entry->scheduled != entry->linked; 447 preempt = entry->scheduled != entry->linked;
447 448
448#ifdef WANT_ALL_SCHED_EVENTS 449#ifdef WANT_ALL_SCHED_EVENTS
@@ -478,9 +479,9 @@ static struct task_struct* cedf_schedule(struct task_struct * prev)
478 /* Any task that is preemptable and either exhausts its execution 479 /* Any task that is preemptable and either exhausts its execution
479 * budget or wants to sleep completes. We may have to reschedule after 480 * budget or wants to sleep completes. We may have to reschedule after
480 * this. Don't do a job completion if we block (can't have timers running 481 * this. Don't do a job completion if we block (can't have timers running
481 * for blocked jobs). Preemption go first for the same reason. 482 * for blocked jobs).
482 */ 483 */
483 if (!np && (out_of_time || sleep) && !blocks && !preempt) 484 if (!np && (out_of_time || sleep) && !blocks)
484 job_completion(entry->scheduled, !sleep); 485 job_completion(entry->scheduled, !sleep);
485 486
486 /* Link pending task if we became unlinked. 487 /* Link pending task if we became unlinked.
@@ -594,25 +595,17 @@ static void cedf_task_wake_up(struct task_struct *task)
594 cluster = task_cpu_cluster(task); 595 cluster = task_cpu_cluster(task);
595 596
596 raw_spin_lock_irqsave(&cluster->cluster_lock, flags); 597 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
597 /* We need to take suspensions because of semaphores into 598 now = litmus_clock();
598 * account! If a job resumes after being suspended due to acquiring 599 if (is_tardy(task, now)) {
599 * a semaphore, it should never be treated as a new job release. 600 /* new sporadic release */
600 */ 601 release_at(task, now);
601 if (get_rt_flags(task) == RT_F_EXIT_SEM) { 602 sched_trace_task_release(task);
602 set_rt_flags(task, RT_F_RUNNING); 603 }
603 } else { 604 else {
604 now = litmus_clock(); 605 if (task->rt.time_slice) {
605 if (is_tardy(task, now)) { 606 /* came back in time before deadline
606 /* new sporadic release */ 607 */
607 release_at(task, now); 608 tsk_rt(task)->completed = 0;
608 sched_trace_task_release(task);
609 }
610 else {
611 if (task->rt.time_slice) {
612 /* came back in time before deadline
613 */
614 set_rt_flags(task, RT_F_RUNNING);
615 }
616 } 609 }
617 } 610 }
618 cedf_job_arrival(task); 611 cedf_job_arrival(task);
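The rewritten wake-up path treats a job that resumes past its deadline as a new sporadic release. Roughly, release_at(task, now), whose actual implementation lives in litmus/jobs.c and is not part of this excerpt, re-initializes the job parameters along these lines (sketch under that assumption):

/* Sketch of the intended effect of release_at(t, now) on a tardy wake-up:
 * the job is treated as released "now" with a fresh deadline and budget. */
tsk_rt(t)->job_params.release   = now;
tsk_rt(t)->job_params.deadline  = now + get_rt_relative_deadline(t);
tsk_rt(t)->job_params.exec_time = 0;
tsk_rt(t)->completed = 0;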
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
index 6ed504f4750e..b8548b885b35 100644
--- a/litmus/sched_gsn_edf.c
+++ b/litmus/sched_gsn_edf.c
@@ -21,6 +21,7 @@
21#include <litmus/trace.h> 21#include <litmus/trace.h>
22 22
23#include <litmus/preempt.h> 23#include <litmus/preempt.h>
24#include <litmus/budget.h>
24 25
25#include <litmus/bheap.h> 26#include <litmus/bheap.h>
26 27
@@ -43,7 +44,7 @@
43 * (thereby removing its association with this 44 * (thereby removing its association with this
44 * CPU). However, it will not requeue the 45 * CPU). However, it will not requeue the
45 * previously linked task (if any). It will set 46 * previously linked task (if any). It will set
46 * T's state to RT_F_RUNNING and check whether 47 * T's state to not completed and check whether
47 * it is already running somewhere else. If T 48 * it is already running somewhere else. If T
48 * is scheduled somewhere else it will link 49 * is scheduled somewhere else it will link
49 * it to that CPU instead (and pull the linked 50 * it to that CPU instead (and pull the linked
@@ -172,7 +173,7 @@ static noinline void link_task_to_cpu(struct task_struct* linked,
172 173
173 /* Link new task to CPU. */ 174 /* Link new task to CPU. */
174 if (linked) { 175 if (linked) {
175 set_rt_flags(linked, RT_F_RUNNING); 176 tsk_rt(linked)->completed = 0;
176 /* handle task is already scheduled somewhere! */ 177 /* handle task is already scheduled somewhere! */
177 on_cpu = linked->rt_param.scheduled_on; 178 on_cpu = linked->rt_param.scheduled_on;
178 if (on_cpu != NO_CPU) { 179 if (on_cpu != NO_CPU) {
@@ -296,11 +297,11 @@ static void check_for_preemptions(void)
296 &per_cpu(gsnedf_cpu_entries, task_cpu(task))); 297 &per_cpu(gsnedf_cpu_entries, task_cpu(task)));
297 if (affinity) 298 if (affinity)
298 last = affinity; 299 last = affinity;
299 else if (last->linked) 300 else if (requeue_preempted_job(last->linked))
300 requeue(last->linked); 301 requeue(last->linked);
301 } 302 }
302#else 303#else
303 if (last->linked) 304 if (requeue_preempted_job(last->linked))
304 requeue(last->linked); 305 requeue(last->linked);
305#endif 306#endif
306 307
@@ -340,7 +341,7 @@ static noinline void job_completion(struct task_struct *t, int forced)
340 TRACE_TASK(t, "job_completion().\n"); 341 TRACE_TASK(t, "job_completion().\n");
341 342
342 /* set flags */ 343 /* set flags */
343 set_rt_flags(t, RT_F_SLEEP); 344 tsk_rt(t)->completed = 1;
344 /* prepare for next period */ 345 /* prepare for next period */
345 prepare_for_next_period(t); 346 prepare_for_next_period(t);
346 if (is_released(t, litmus_clock())) 347 if (is_released(t, litmus_clock()))
@@ -393,7 +394,7 @@ static void gsnedf_tick(struct task_struct* t)
393 * 394 *
394 * - !is_running(scheduled) // the job blocks 395 * - !is_running(scheduled) // the job blocks
395 * - scheduled->timeslice == 0 // the job completed (forcefully) 396 * - scheduled->timeslice == 0 // the job completed (forcefully)
396 * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) 397 * - is_completed() // the job completed (by syscall)
397 * - linked != scheduled // we need to reschedule (for any reason) 398 * - linked != scheduled // we need to reschedule (for any reason)
398 * - is_np(scheduled) // rescheduling must be delayed, 399 * - is_np(scheduled) // rescheduling must be delayed,
399 * sys_exit_np must be requested 400 * sys_exit_np must be requested
@@ -426,11 +427,10 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
426 /* (0) Determine state */ 427 /* (0) Determine state */
427 exists = entry->scheduled != NULL; 428 exists = entry->scheduled != NULL;
428 blocks = exists && !is_running(entry->scheduled); 429 blocks = exists && !is_running(entry->scheduled);
429 out_of_time = exists && 430 out_of_time = exists && budget_enforced(entry->scheduled)
430 budget_enforced(entry->scheduled) && 431 && budget_exhausted(entry->scheduled);
431 budget_exhausted(entry->scheduled);
432 np = exists && is_np(entry->scheduled); 432 np = exists && is_np(entry->scheduled);
433 sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; 433 sleep = exists && is_completed(entry->scheduled);
434 preempt = entry->scheduled != entry->linked; 434 preempt = entry->scheduled != entry->linked;
435 435
436#ifdef WANT_ALL_SCHED_EVENTS 436#ifdef WANT_ALL_SCHED_EVENTS
@@ -466,9 +466,9 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev)
466 /* Any task that is preemptable and either exhausts its execution 466 /* Any task that is preemptable and either exhausts its execution
467 * budget or wants to sleep completes. We may have to reschedule after 467 * budget or wants to sleep completes. We may have to reschedule after
468 * this. Don't do a job completion if we block (can't have timers running 468 * this. Don't do a job completion if we block (can't have timers running
469 * for blocked jobs). Preemption go first for the same reason. 469 * for blocked jobs).
470 */ 470 */
471 if (!np && (out_of_time || sleep) && !blocks && !preempt) 471 if (!np && (out_of_time || sleep) && !blocks)
472 job_completion(entry->scheduled, !sleep); 472 job_completion(entry->scheduled, !sleep);
473 473
474 /* Link pending task if we became unlinked. 474 /* Link pending task if we became unlinked.
@@ -577,25 +577,17 @@ static void gsnedf_task_wake_up(struct task_struct *task)
577 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); 577 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
578 578
579 raw_spin_lock_irqsave(&gsnedf_lock, flags); 579 raw_spin_lock_irqsave(&gsnedf_lock, flags);
580 /* We need to take suspensions because of semaphores into 580 now = litmus_clock();
581 * account! If a job resumes after being suspended due to acquiring 581 if (is_tardy(task, now)) {
582 * a semaphore, it should never be treated as a new job release. 582 /* new sporadic release */
583 */ 583 release_at(task, now);
584 if (get_rt_flags(task) == RT_F_EXIT_SEM) { 584 sched_trace_task_release(task);
585 set_rt_flags(task, RT_F_RUNNING); 585 }
586 } else { 586 else {
587 now = litmus_clock(); 587 if (task->rt.time_slice) {
588 if (is_tardy(task, now)) { 588 /* came back in time before deadline
589 /* new sporadic release */ 589 */
590 release_at(task, now); 590 tsk_rt(task)->completed = 0;
591 sched_trace_task_release(task);
592 }
593 else {
594 if (task->rt.time_slice) {
595 /* came back in time before deadline
596 */
597 set_rt_flags(task, RT_F_RUNNING);
598 }
599 } 591 }
600 } 592 }
601 gsnedf_job_arrival(task); 593 gsnedf_job_arrival(task);
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
index 5a15ce938984..6b32cf09abbd 100644
--- a/litmus/sched_litmus.c
+++ b/litmus/sched_litmus.c
@@ -102,9 +102,9 @@ litmus_schedule(struct rq *rq, struct task_struct *prev)
102 } 102 }
103 } 103 }
104#ifdef __ARCH_WANT_UNLOCKED_CTXSW 104#ifdef __ARCH_WANT_UNLOCKED_CTXSW
105 if (next->oncpu) 105 if (next->on_cpu)
106 TRACE_TASK(next, "waiting for !oncpu"); 106 TRACE_TASK(next, "waiting for !oncpu");
107 while (next->oncpu) { 107 while (next->on_cpu) {
108 cpu_relax(); 108 cpu_relax();
109 mb(); 109 mb();
110 } 110 }
@@ -194,6 +194,9 @@ static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
194 194
195static void yield_task_litmus(struct rq *rq) 195static void yield_task_litmus(struct rq *rq)
196{ 196{
197 TS_SYSCALL_IN_START;
198 TS_SYSCALL_IN_END;
199
197 BUG_ON(rq->curr != current); 200 BUG_ON(rq->curr != current);
198 /* sched_yield() is called to trigger delayed preemptions. 201 /* sched_yield() is called to trigger delayed preemptions.
199 * Thus, mark the current task as needing to be rescheduled. 202 * Thus, mark the current task as needing to be rescheduled.
@@ -202,6 +205,8 @@ static void yield_task_litmus(struct rq *rq)
202 */ 205 */
203 clear_exit_np(current); 206 clear_exit_np(current);
204 litmus_reschedule_local(); 207 litmus_reschedule_local();
208
209 TS_SYSCALL_OUT_START;
205} 210}
206 211
207/* Plugins are responsible for this. 212/* Plugins are responsible for this.
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
index 16f1065bbdca..6a89b003306c 100644
--- a/litmus/sched_pfair.c
+++ b/litmus/sched_pfair.c
@@ -254,7 +254,7 @@ static void check_preempt(struct task_struct* t)
254{ 254{
255 int cpu = NO_CPU; 255 int cpu = NO_CPU;
256 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on && 256 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
257 tsk_rt(t)->present) { 257 is_present(t)) {
258 /* the task can be scheduled and 258 /* the task can be scheduled and
259 * is not scheduled where it ought to be scheduled 259 * is not scheduled where it ought to be scheduled
260 */ 260 */
@@ -299,7 +299,7 @@ static void pfair_prepare_next_period(struct task_struct* t)
299 struct pfair_param* p = tsk_pfair(t); 299 struct pfair_param* p = tsk_pfair(t);
300 300
301 prepare_for_next_period(t); 301 prepare_for_next_period(t);
302 get_rt_flags(t) = RT_F_RUNNING; 302 tsk_rt(t)->completed = 0;
303 p->release += p->period; 303 p->release += p->period;
304} 304}
305 305
@@ -310,7 +310,7 @@ static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
310 int to_relq; 310 int to_relq;
311 p->cur = (p->cur + 1) % p->quanta; 311 p->cur = (p->cur + 1) % p->quanta;
312 if (!p->cur) { 312 if (!p->cur) {
313 if (tsk_rt(t)->present) { 313 if (is_present(t)) {
314 /* The job overran; we start a new budget allocation. */ 314 /* The job overran; we start a new budget allocation. */
315 pfair_prepare_next_period(t); 315 pfair_prepare_next_period(t);
316 } else { 316 } else {
@@ -598,7 +598,7 @@ static int safe_to_schedule(struct task_struct* t, int cpu)
598 "scheduled already on %d.\n", cpu, where); 598 "scheduled already on %d.\n", cpu, where);
599 return 0; 599 return 0;
600 } else 600 } else
601 return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING; 601 return is_present(t) && !is_completed(t);
602} 602}
603 603
604static struct task_struct* pfair_schedule(struct task_struct * prev) 604static struct task_struct* pfair_schedule(struct task_struct * prev)
@@ -621,7 +621,7 @@ static struct task_struct* pfair_schedule(struct task_struct * prev)
621 raw_spin_lock(cpu_lock(state)); 621 raw_spin_lock(cpu_lock(state));
622 622
623 blocks = is_realtime(prev) && !is_running(prev); 623 blocks = is_realtime(prev) && !is_running(prev);
624 completion = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP; 624 completion = is_realtime(prev) && is_completed(prev);
625 out_of_time = is_realtime(prev) && time_after(cur_release(prev), 625 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
626 state->local_tick); 626 state->local_tick);
627 627
@@ -720,7 +720,7 @@ static void pfair_task_wake_up(struct task_struct *t)
720 /* only add to ready queue if the task isn't still linked somewhere */ 720 /* only add to ready queue if the task isn't still linked somewhere */
721 if (requeue) { 721 if (requeue) {
722 TRACE_TASK(t, "requeueing required\n"); 722 TRACE_TASK(t, "requeueing required\n");
723 tsk_rt(t)->flags = RT_F_RUNNING; 723 tsk_rt(t)->completed = 0;
724 __add_ready(&cluster->pfair, t); 724 __add_ready(&cluster->pfair, t);
725 } 725 }
726 726
@@ -850,6 +850,13 @@ static long pfair_admit_task(struct task_struct* t)
850 cpu_cluster(pstate[task_cpu(t)])) 850 cpu_cluster(pstate[task_cpu(t)]))
851 return -EINVAL; 851 return -EINVAL;
852 852
853 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
854 printk(KERN_INFO "%s: Admission rejected. "
855 "Only implicit deadlines are currently supported.\n",
856 litmus->plugin_name);
857 return -EINVAL;
858 }
859
853 /* Pfair is a tick-based method, so the time 860 /* Pfair is a tick-based method, so the time
854 * of interest is jiffies. Calculate tick-based 861 * of interest is jiffies. Calculate tick-based
855 * times for everything. 862 * times for everything.
diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c
new file mode 100644
index 000000000000..0e875a3b5cba
--- /dev/null
+++ b/litmus/sched_pfp.c
@@ -0,0 +1,1711 @@
1/*
2 * litmus/sched_pfp.c
3 *
4 * Implementation of partitioned fixed-priority scheduling.
5 * Based on PSN-EDF.
6 */
7
8#include <linux/percpu.h>
9#include <linux/sched.h>
10#include <linux/list.h>
11#include <linux/spinlock.h>
12#include <linux/module.h>
13
14#include <litmus/litmus.h>
15#include <litmus/wait.h>
16#include <litmus/jobs.h>
17#include <litmus/preempt.h>
18#include <litmus/fp_common.h>
19#include <litmus/sched_plugin.h>
20#include <litmus/sched_trace.h>
21#include <litmus/trace.h>
22#include <litmus/budget.h>
23
24#include <linux/uaccess.h>
25
26
27typedef struct {
28 rt_domain_t domain;
29 struct fp_prio_queue ready_queue;
30 int cpu;
31 struct task_struct* scheduled; /* only RT tasks */
32/*
33 * scheduling lock slock
34 * protects the domain and serializes scheduling decisions
35 */
36#define slock domain.ready_lock
37
38} pfp_domain_t;
39
40DEFINE_PER_CPU(pfp_domain_t, pfp_domains);
41
42pfp_domain_t* pfp_doms[NR_CPUS];
43
44#define local_pfp (&__get_cpu_var(pfp_domains))
45#define remote_dom(cpu) (&per_cpu(pfp_domains, cpu).domain)
46#define remote_pfp(cpu) (&per_cpu(pfp_domains, cpu))
47#define task_dom(task) remote_dom(get_partition(task))
48#define task_pfp(task) remote_pfp(get_partition(task))
49
50/* we assume the lock is being held */
51static void preempt(pfp_domain_t *pfp)
52{
53 preempt_if_preemptable(pfp->scheduled, pfp->cpu);
54}
55
56static unsigned int priority_index(struct task_struct* t)
57{
58#ifdef CONFIG_LITMUS_LOCKING
59 if (unlikely(t->rt_param.inh_task))
60 /* use effective priority */
61 t = t->rt_param.inh_task;
62
63 if (is_priority_boosted(t)) {
64 /* zero is reserved for priority-boosted tasks */
65 return 0;
66 } else
67#endif
68 return get_priority(t);
69}
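Since lower indices mean higher priority in the fixed-priority ready queue, reserving index 0 for priority-boosted jobs lets any boosted lock holder outrank all regular priorities, and an inheriting task is indexed by its donor. A small worked example (illustrative values only):

/* Illustrative only:
 *   regular job with get_priority(t) == 5        -> index 5
 *   job inheriting from a donor with priority 2  -> index 2
 *   priority-boosted lock holder                 -> index 0 (beats both)
 */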
70
71
72static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks)
73{
74 pfp_domain_t *pfp = container_of(rt, pfp_domain_t, domain);
75 unsigned long flags;
76 struct task_struct* t;
77 struct bheap_node* hn;
78
79 raw_spin_lock_irqsave(&pfp->slock, flags);
80
81 while (!bheap_empty(tasks)) {
82 hn = bheap_take(fp_ready_order, tasks);
83 t = bheap2task(hn);
84 TRACE_TASK(t, "released (part:%d prio:%d)\n",
85 get_partition(t), get_priority(t));
86 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
87 }
88
89 /* do we need to preempt? */
90 if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) {
91 TRACE_CUR("preempted by new release\n");
92 preempt(pfp);
93 }
94
95 raw_spin_unlock_irqrestore(&pfp->slock, flags);
96}
97
98static void pfp_preempt_check(pfp_domain_t *pfp)
99{
100 if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
101 preempt(pfp);
102}
103
104static void pfp_domain_init(pfp_domain_t* pfp,
105 int cpu)
106{
107 fp_domain_init(&pfp->domain, NULL, pfp_release_jobs);
108 pfp->cpu = cpu;
109 pfp->scheduled = NULL;
110 fp_prio_queue_init(&pfp->ready_queue);
111}
112
113static void requeue(struct task_struct* t, pfp_domain_t *pfp)
114{
115 BUG_ON(!is_running(t));
116
117 tsk_rt(t)->completed = 0;
118 if (is_released(t, litmus_clock()))
119 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
120 else
121 add_release(&pfp->domain, t); /* it has got to wait */
122}
123
124static void job_completion(struct task_struct* t, int forced)
125{
126 sched_trace_task_completion(t,forced);
127 TRACE_TASK(t, "job_completion().\n");
128
129 tsk_rt(t)->completed = 1;
130 prepare_for_next_period(t);
131 if (is_released(t, litmus_clock()))
132 sched_trace_task_release(t);
133}
134
135static void pfp_tick(struct task_struct *t)
136{
137 pfp_domain_t *pfp = local_pfp;
138
139 /* Check for inconsistency. We don't need the lock for this since
140 * ->scheduled is only changed in schedule, which obviously is not
141 * executing in parallel on this CPU
142 */
143 BUG_ON(is_realtime(t) && t != pfp->scheduled);
144
145 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
146 if (!is_np(t)) {
147 litmus_reschedule_local();
148 TRACE("pfp_scheduler_tick: "
149 "%d is preemptable "
150 " => FORCE_RESCHED\n", t->pid);
151 } else if (is_user_np(t)) {
152 TRACE("pfp_scheduler_tick: "
153 "%d is non-preemptable, "
154 "preemption delayed.\n", t->pid);
155 request_exit_np(t);
156 }
157 }
158}
159
160static struct task_struct* pfp_schedule(struct task_struct * prev)
161{
162 pfp_domain_t* pfp = local_pfp;
163 struct task_struct* next;
164
165 int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate;
166
167 raw_spin_lock(&pfp->slock);
168
169 /* sanity checking
170 * unlike under G-EDF, when a task exits (is dead),
171 * pfp->scheduled may be NULL and prev _is_ realtime
172 */
173 BUG_ON(pfp->scheduled && pfp->scheduled != prev);
174 BUG_ON(pfp->scheduled && !is_realtime(prev));
175
176 /* (0) Determine state */
177 exists = pfp->scheduled != NULL;
178 blocks = exists && !is_running(pfp->scheduled);
179 out_of_time = exists &&
180 budget_enforced(pfp->scheduled) &&
181 budget_exhausted(pfp->scheduled);
182 np = exists && is_np(pfp->scheduled);
183 sleep = exists && is_completed(pfp->scheduled);
184 migrate = exists && get_partition(pfp->scheduled) != pfp->cpu;
185 preempt = !blocks && (migrate || fp_preemption_needed(&pfp->ready_queue, prev));
186
187 /* If we need to preempt do so.
188 * The following checks set resched to 1 in case of special
189 * circumstances.
190 */
191 resched = preempt;
192
193 /* If a task blocks we have no choice but to reschedule.
194 */
195 if (blocks)
196 resched = 1;
197
198 /* Request a sys_exit_np() call if we would like to preempt but cannot.
199 * Multiple calls to request_exit_np() don't hurt.
200 */
201 if (np && (out_of_time || preempt || sleep))
202 request_exit_np(pfp->scheduled);
203
204 /* Any task that is preemptable and either exhausts its execution
205 * budget or wants to sleep completes. We may have to reschedule after
206 * this.
207 */
208 if (!np && (out_of_time || sleep) && !blocks && !migrate) {
209 job_completion(pfp->scheduled, !sleep);
210 resched = 1;
211 }
212
213 /* The final scheduling decision. Do we need to switch for some reason?
214 * Switch if we are in RT mode and have no task or if we need to
215 * resched.
216 */
217 next = NULL;
218 if ((!np || blocks) && (resched || !exists)) {
219 /* When preempting a task that does not block, then
220 * re-insert it into either the ready queue or the
221 * release queue (if it completed). requeue() picks
222 * the appropriate queue.
223 */
224 if (pfp->scheduled && !blocks && !migrate)
225 requeue(pfp->scheduled, pfp);
226 next = fp_prio_take(&pfp->ready_queue);
227 if (next == prev) {
228 struct task_struct *t = fp_prio_peek(&pfp->ready_queue);
229 TRACE_TASK(next, "next==prev sleep=%d oot=%d np=%d preempt=%d migrate=%d "
230 "boost=%d empty=%d prio-idx=%u prio=%u\n",
231 sleep, out_of_time, np, preempt, migrate,
232 is_priority_boosted(next),
233 t == NULL,
234 priority_index(next),
235 get_priority(next));
236 if (t)
237 TRACE_TASK(t, "waiter boost=%d prio-idx=%u prio=%u\n",
238 is_priority_boosted(t),
239 priority_index(t),
240 get_priority(t));
241 }
242 /* If preempt is set, we should not see the same task again. */
243 BUG_ON(preempt && next == prev);
244 /* Similarly, if preempt is set, then next may not be NULL,
245 * unless it's a migration. */
246 BUG_ON(preempt && !migrate && next == NULL);
247 } else
248 /* Only override Linux scheduler if we have a real-time task
249 * scheduled that needs to continue.
250 */
251 if (exists)
252 next = prev;
253
254 if (next) {
255 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
256 tsk_rt(next)->completed = 0;
257 } else {
258 TRACE("becoming idle at %llu\n", litmus_clock());
259 }
260
261 pfp->scheduled = next;
262 sched_state_task_picked();
263 raw_spin_unlock(&pfp->slock);
264
265 return next;
266}
267
268#ifdef CONFIG_LITMUS_LOCKING
269
270/* prev is no longer scheduled --- see if it needs to migrate */
271static void pfp_finish_switch(struct task_struct *prev)
272{
273 pfp_domain_t *to;
274
275 if (is_realtime(prev) &&
276 is_running(prev) &&
277 get_partition(prev) != smp_processor_id()) {
278 TRACE_TASK(prev, "needs to migrate from P%d to P%d\n",
279 smp_processor_id(), get_partition(prev));
280
281 to = task_pfp(prev);
282
283 raw_spin_lock(&to->slock);
284
285 TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu);
286 requeue(prev, to);
287 if (fp_preemption_needed(&to->ready_queue, to->scheduled))
288 preempt(to);
289
290 raw_spin_unlock(&to->slock);
291
292 }
293}
294
295#endif
296
297/* Prepare a task for running in RT mode
298 */
299static void pfp_task_new(struct task_struct * t, int on_rq, int running)
300{
301 pfp_domain_t* pfp = task_pfp(t);
302 unsigned long flags;
303
304 TRACE_TASK(t, "P-FP: task new, cpu = %d\n",
305 t->rt_param.task_params.cpu);
306
307 /* setup job parameters */
308 release_at(t, litmus_clock());
309
310 /* The task should be running in the queue, otherwise signal
311 * code will try to wake it up with fatal consequences.
312 */
313 raw_spin_lock_irqsave(&pfp->slock, flags);
314 if (running) {
315 /* there shouldn't be anything else running at the time */
316 BUG_ON(pfp->scheduled);
317 pfp->scheduled = t;
318 } else {
319 requeue(t, pfp);
320 /* maybe we have to reschedule */
321 pfp_preempt_check(pfp);
322 }
323 raw_spin_unlock_irqrestore(&pfp->slock, flags);
324}
325
326static void pfp_task_wake_up(struct task_struct *task)
327{
328 unsigned long flags;
329 pfp_domain_t* pfp = task_pfp(task);
330 lt_t now;
331
332 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
333 raw_spin_lock_irqsave(&pfp->slock, flags);
334
335#ifdef CONFIG_LITMUS_LOCKING
336 /* Should only be queued when processing a fake-wake up due to a
337 * migration-related state change. */
338 if (unlikely(is_queued(task))) {
339 TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n");
340 goto out_unlock;
341 }
342#else
343 BUG_ON(is_queued(task));
344#endif
345 now = litmus_clock();
346 if (is_tardy(task, now)
347#ifdef CONFIG_LITMUS_LOCKING
348 /* We need to take suspensions because of semaphores into
349 * account! If a job resumes after being suspended due to acquiring
350 * a semaphore, it should never be treated as a new job release.
351 */
352 && !is_priority_boosted(task)
353#endif
354 ) {
355 /* new sporadic release */
356 release_at(task, now);
357 sched_trace_task_release(task);
358 }
359
360 /* Only add to ready queue if it is not the currently-scheduled
361 * task. This could be the case if a task was woken up concurrently
362 * on a remote CPU before the executing CPU got around to actually
363 * de-scheduling the task, i.e., wake_up() raced with schedule()
364 * and won. Also, don't requeue if it is still queued, which can
365 * happen under the DPCP due to wake-ups racing with migrations.
366 */
367 if (pfp->scheduled != task) {
368 requeue(task, pfp);
369 pfp_preempt_check(pfp);
370 }
371
372#ifdef CONFIG_LITMUS_LOCKING
373out_unlock:
374#endif
375 raw_spin_unlock_irqrestore(&pfp->slock, flags);
376 TRACE_TASK(task, "wake up done\n");
377}
378
379static void pfp_task_block(struct task_struct *t)
380{
381 /* only running tasks can block, thus t is in no queue */
382 TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
383
384 BUG_ON(!is_realtime(t));
385
386 /* If this task blocked normally, it shouldn't be queued. The exception is
387 * if this is a simulated block()/wakeup() pair from the pull-migration code path.
388 * This should only happen if the DPCP is being used.
389 */
390#ifdef CONFIG_LITMUS_LOCKING
391 if (unlikely(is_queued(t)))
392 TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n");
393#else
394 BUG_ON(is_queued(t));
395#endif
396}
397
398static void pfp_task_exit(struct task_struct * t)
399{
400 unsigned long flags;
401 pfp_domain_t* pfp = task_pfp(t);
402 rt_domain_t* dom;
403
404 raw_spin_lock_irqsave(&pfp->slock, flags);
405 if (is_queued(t)) {
406 BUG(); /* This currently doesn't work. */
407 /* dequeue */
408 dom = task_dom(t);
409 remove(dom, t);
410 }
411 if (pfp->scheduled == t) {
412 pfp->scheduled = NULL;
413 preempt(pfp);
414 }
415 TRACE_TASK(t, "RIP, now reschedule\n");
416
417 raw_spin_unlock_irqrestore(&pfp->slock, flags);
418}
419
420#ifdef CONFIG_LITMUS_LOCKING
421
422#include <litmus/fdso.h>
423#include <litmus/srp.h>
424
425static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t)
426{
427 BUG_ON(pfp->scheduled == t && is_queued(t));
428 if (is_queued(t))
429 fp_prio_remove(&pfp->ready_queue, t, priority_index(t));
430}
431
432static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t,
433 struct task_struct* prio_inh)
434{
435 int requeue;
436
437 if (!t || t->rt_param.inh_task == prio_inh) {
438 /* no update required */
439 if (t)
440 TRACE_TASK(t, "no prio-inh update required\n");
441 return;
442 }
443
444 requeue = is_queued(t);
445 TRACE_TASK(t, "prio-inh: is_queued:%d\n", requeue);
446
447 if (requeue)
448 /* first remove */
449 fp_dequeue(pfp, t);
450
451 t->rt_param.inh_task = prio_inh;
452
453 if (requeue)
454 /* add again to the right queue */
455 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
456}
457
458static int effective_agent_priority(int prio)
459{
460 /* make sure agents have higher priority */
461 return prio - LITMUS_MAX_PRIORITY;
462}
463
464static lt_t prio_point(int eprio)
465{
466 /* make sure we have non-negative prio points */
467 return eprio + LITMUS_MAX_PRIORITY;
468}
469
470static int prio_from_point(lt_t prio_point)
471{
472 return ((int) prio_point) - LITMUS_MAX_PRIORITY;
473}
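Agents (lock-servicing jobs) are pushed below the regular priority range by subtracting LITMUS_MAX_PRIORITY, while wait-queue entries need non-negative keys, so prio_point()/prio_from_point() shift the value back and forth. A worked example, assuming LITMUS_MAX_PRIORITY == 512 (the actual constant is defined in litmus.h and not shown in this excerpt):

/* Assuming LITMUS_MAX_PRIORITY == 512 (assumption):
 *   get_priority(t)              ==    7
 *   effective_agent_priority(7)  ==    7 - 512 == -505  (beats any normal prio)
 *   prio_point(-505)             == -505 + 512 ==    7  (non-negative queue key)
 *   prio_from_point(7)           ==    7 - 512 == -505  (recovered on dequeue)
 */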
474
475static void boost_priority(struct task_struct* t, lt_t priority_point)
476{
477 unsigned long flags;
478 pfp_domain_t* pfp = task_pfp(t);
479
480 raw_spin_lock_irqsave(&pfp->slock, flags);
481
482
483 TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock());
484
485 tsk_rt(t)->priority_boosted = 1;
486 /* tie-break by protocol-specific priority point */
487 tsk_rt(t)->boost_start_time = priority_point;
488
489 /* Priority boosting currently only takes effect for already-scheduled
490 * tasks. This is sufficient since priority boosting only kicks in as
491 * part of lock acquisitions. */
492 BUG_ON(pfp->scheduled != t);
493
494 raw_spin_unlock_irqrestore(&pfp->slock, flags);
495}
496
497static void unboost_priority(struct task_struct* t)
498{
499 unsigned long flags;
500 pfp_domain_t* pfp = task_pfp(t);
501 lt_t now;
502
503 raw_spin_lock_irqsave(&pfp->slock, flags);
504 now = litmus_clock();
505
506 /* assumption: this only happens when the job is scheduled */
507 BUG_ON(pfp->scheduled != t);
508
509 TRACE_TASK(t, "priority restored at %llu\n", now);
510
511 /* priority boosted jobs must be scheduled */
512 BUG_ON(pfp->scheduled != t);
513
514 tsk_rt(t)->priority_boosted = 0;
515 tsk_rt(t)->boost_start_time = 0;
516
517 /* check if this changes anything */
518 if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled))
519 preempt(pfp);
520
521 raw_spin_unlock_irqrestore(&pfp->slock, flags);
522}
523
524/* ******************** SRP support ************************ */
525
526static unsigned int pfp_get_srp_prio(struct task_struct* t)
527{
528 return get_priority(t);
529}
530
531/* ******************** FMLP support ********************** */
532
533struct fmlp_semaphore {
534 struct litmus_lock litmus_lock;
535
536 /* current resource holder */
537 struct task_struct *owner;
538
539 /* FIFO queue of waiting tasks */
540 wait_queue_head_t wait;
541};
542
543static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
544{
545 return container_of(lock, struct fmlp_semaphore, litmus_lock);
546}
547int pfp_fmlp_lock(struct litmus_lock* l)
548{
549 struct task_struct* t = current;
550 struct fmlp_semaphore *sem = fmlp_from_lock(l);
551 wait_queue_t wait;
552 unsigned long flags;
553 lt_t time_of_request;
554
555 if (!is_realtime(t))
556 return -EPERM;
557
558 spin_lock_irqsave(&sem->wait.lock, flags);
559
560 /* tie-break by this point in time */
561 time_of_request = litmus_clock();
562
563 /* Priority-boost ourself *before* we suspend so that
564 * our priority is boosted when we resume. */
565 boost_priority(t, time_of_request);
566
567 if (sem->owner) {
568 /* resource is not free => must suspend and wait */
569
570 init_waitqueue_entry(&wait, t);
571
572 /* FIXME: interruptible would be nice some day */
573 set_task_state(t, TASK_UNINTERRUPTIBLE);
574
575 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
576
577 TS_LOCK_SUSPEND;
578
579 /* release lock before sleeping */
580 spin_unlock_irqrestore(&sem->wait.lock, flags);
581
582 /* We depend on the FIFO order. Thus, we don't need to recheck
583 * when we wake up; we are guaranteed to have the lock since
584 * there is only one wake up per release.
585 */
586
587 schedule();
588
589 TS_LOCK_RESUME;
590
591 /* Since we hold the lock, no other task will change
592 * ->owner. We can thus check it without acquiring the spin
593 * lock. */
594 BUG_ON(sem->owner != t);
595 } else {
596 /* it's ours now */
597 sem->owner = t;
598
599 spin_unlock_irqrestore(&sem->wait.lock, flags);
600 }
601
602 return 0;
603}
604
605int pfp_fmlp_unlock(struct litmus_lock* l)
606{
607 struct task_struct *t = current, *next;
608 struct fmlp_semaphore *sem = fmlp_from_lock(l);
609 unsigned long flags;
610 int err = 0;
611
612 spin_lock_irqsave(&sem->wait.lock, flags);
613
614 if (sem->owner != t) {
615 err = -EINVAL;
616 goto out;
617 }
618
619 /* we lose the benefit of priority boosting */
620
621 unboost_priority(t);
622
623 /* check if there are jobs waiting for this resource */
624 next = __waitqueue_remove_first(&sem->wait);
625 if (next) {
626 /* next becomes the resource holder */
627 sem->owner = next;
628
629 /* Wake up next. The waiting job is already priority-boosted. */
630 wake_up_process(next);
631 } else
632 /* resource becomes available */
633 sem->owner = NULL;
634
635out:
636 spin_unlock_irqrestore(&sem->wait.lock, flags);
637 return err;
638}
639
640int pfp_fmlp_close(struct litmus_lock* l)
641{
642 struct task_struct *t = current;
643 struct fmlp_semaphore *sem = fmlp_from_lock(l);
644 unsigned long flags;
645
646 int owner;
647
648 spin_lock_irqsave(&sem->wait.lock, flags);
649
650 owner = sem->owner == t;
651
652 spin_unlock_irqrestore(&sem->wait.lock, flags);
653
654 if (owner)
655 pfp_fmlp_unlock(l);
656
657 return 0;
658}
659
660void pfp_fmlp_free(struct litmus_lock* lock)
661{
662 kfree(fmlp_from_lock(lock));
663}
664
665static struct litmus_lock_ops pfp_fmlp_lock_ops = {
666 .close = pfp_fmlp_close,
667 .lock = pfp_fmlp_lock,
668 .unlock = pfp_fmlp_unlock,
669 .deallocate = pfp_fmlp_free,
670};
671
672static struct litmus_lock* pfp_new_fmlp(void)
673{
674 struct fmlp_semaphore* sem;
675
676 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
677 if (!sem)
678 return NULL;
679
680 sem->owner = NULL;
681 init_waitqueue_head(&sem->wait);
682 sem->litmus_lock.ops = &pfp_fmlp_lock_ops;
683
684 return &sem->litmus_lock;
685}
686
687/* ******************** MPCP support ********************** */
688
689struct mpcp_semaphore {
690 struct litmus_lock litmus_lock;
691
692 /* current resource holder */
693 struct task_struct *owner;
694
695 /* priority queue of waiting tasks */
696 wait_queue_head_t wait;
697
698 /* priority ceiling per cpu */
699 unsigned int prio_ceiling[NR_CPUS];
700
701 /* should jobs spin "virtually" for this resource? */
702 int vspin;
703};
704
705#define OMEGA_CEILING UINT_MAX
706
707/* Since jobs spin "virtually" while waiting to acquire a lock,
708 * they must first acquire a local per-cpu resource.
709 */
710static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait);
711static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin);
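Under MPCP-VS, "spinning" is emulated by a per-CPU token: at most one job per processor may have an outstanding global request, and later local requesters suspend on mpcpvs_vspin_wait in priority order. The resulting call order in the lock/unlock paths below is, schematically:

/* Schematic call order (see pfp_mpcp_lock()/pfp_mpcp_unlock() below):
 *
 *   preempt_disable();
 *   mpcp_vspin_enter();        // acquire the per-CPU "virtual spin" token
 *   boost_priority(t, ceiling);
 *   ... suspend until sem->owner == t ...
 *   <critical section>
 *   unboost_priority(t);
 *   mpcp_vspin_exit();         // wake the next local requester, if any
 *   preempt_enable();
 */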
712
713/* called with preemptions off <=> no local modifications */
714static void mpcp_vspin_enter(void)
715{
716 struct task_struct* t = current;
717
718 while (1) {
719 if (__get_cpu_var(mpcpvs_vspin) == NULL) {
720 /* good, we get to issue our request */
721 __get_cpu_var(mpcpvs_vspin) = t;
722 break;
723 } else {
724 /* some job is spinning => enqueue in request queue */
725 prio_wait_queue_t wait;
726 wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
727 unsigned long flags;
728
729 /* ordered by regular priority */
730 init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
731
732 spin_lock_irqsave(&vspin->lock, flags);
733
734 set_task_state(t, TASK_UNINTERRUPTIBLE);
735
736 __add_wait_queue_prio_exclusive(vspin, &wait);
737
738 spin_unlock_irqrestore(&vspin->lock, flags);
739
740 TS_LOCK_SUSPEND;
741
742 preempt_enable_no_resched();
743
744 schedule();
745
746 preempt_disable();
747
748 TS_LOCK_RESUME;
749 /* Recheck if we got it --- some higher-priority process might
750 * have swooped in. */
751 }
752 }
753 /* ok, now it is ours */
754}
755
756/* called with preemptions off */
757static void mpcp_vspin_exit(void)
758{
759 struct task_struct* t = current, *next;
760 unsigned long flags;
761 wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
762
763 BUG_ON(__get_cpu_var(mpcpvs_vspin) != t);
764
765 /* no spinning job */
766 __get_cpu_var(mpcpvs_vspin) = NULL;
767
768 /* see if anyone is waiting for us to stop "spinning" */
769 spin_lock_irqsave(&vspin->lock, flags);
770 next = __waitqueue_remove_first(vspin);
771
772 if (next)
773 wake_up_process(next);
774
775 spin_unlock_irqrestore(&vspin->lock, flags);
776}
777
778static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock)
779{
780 return container_of(lock, struct mpcp_semaphore, litmus_lock);
781}
782
783int pfp_mpcp_lock(struct litmus_lock* l)
784{
785 struct task_struct* t = current;
786 struct mpcp_semaphore *sem = mpcp_from_lock(l);
787 prio_wait_queue_t wait;
788 unsigned long flags;
789
790 if (!is_realtime(t))
791 return -EPERM;
792
793 preempt_disable();
794
795 if (sem->vspin)
796 mpcp_vspin_enter();
797
798 /* Priority-boost ourself *before* we suspend so that
799 * our priority is boosted when we resume. Use the priority
800 * ceiling for the local partition. */
801 boost_priority(t, sem->prio_ceiling[get_partition(t)]);
802
803 spin_lock_irqsave(&sem->wait.lock, flags);
804
805 preempt_enable_no_resched();
806
807 if (sem->owner) {
808 /* resource is not free => must suspend and wait */
809
810 /* ordered by regular priority */
811 init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
812
813 /* FIXME: interruptible would be nice some day */
814 set_task_state(t, TASK_UNINTERRUPTIBLE);
815
816 __add_wait_queue_prio_exclusive(&sem->wait, &wait);
817
818 TS_LOCK_SUSPEND;
819
820 /* release lock before sleeping */
821 spin_unlock_irqrestore(&sem->wait.lock, flags);
822
823 /* We depend on the FIFO order. Thus, we don't need to recheck
824 * when we wake up; we are guaranteed to have the lock since
825 * there is only one wake up per release.
826 */
827
828 schedule();
829
830 TS_LOCK_RESUME;
831
832 /* Since we hold the lock, no other task will change
833 * ->owner. We can thus check it without acquiring the spin
834 * lock. */
835 BUG_ON(sem->owner != t);
836 } else {
837 /* it's ours now */
838 sem->owner = t;
839
840 spin_unlock_irqrestore(&sem->wait.lock, flags);
841 }
842
843 return 0;
844}
845
846int pfp_mpcp_unlock(struct litmus_lock* l)
847{
848 struct task_struct *t = current, *next;
849 struct mpcp_semaphore *sem = mpcp_from_lock(l);
850 unsigned long flags;
851 int err = 0;
852
853 spin_lock_irqsave(&sem->wait.lock, flags);
854
855 if (sem->owner != t) {
856 err = -EINVAL;
857 goto out;
858 }
859
860 /* we lose the benefit of priority boosting */
861
862 unboost_priority(t);
863
864 /* check if there are jobs waiting for this resource */
865 next = __waitqueue_remove_first(&sem->wait);
866 if (next) {
867 /* next becomes the resource holder */
868 sem->owner = next;
869
870 /* Wake up next. The waiting job is already priority-boosted. */
871 wake_up_process(next);
872 } else
873 /* resource becomes available */
874 sem->owner = NULL;
875
876out:
877 spin_unlock_irqrestore(&sem->wait.lock, flags);
878
879 if (sem->vspin && err == 0) {
880 preempt_disable();
881 mpcp_vspin_exit();
882 preempt_enable();
883 }
884
885 return err;
886}
887
888int pfp_mpcp_open(struct litmus_lock* l, void* config)
889{
890 struct task_struct *t = current;
891 struct mpcp_semaphore *sem = mpcp_from_lock(l);
892 int cpu, local_cpu;
893 unsigned long flags;
894
895 if (!is_realtime(t))
896 /* we need to know the real-time priority */
897 return -EPERM;
898
899 local_cpu = get_partition(t);
900
901 spin_lock_irqsave(&sem->wait.lock, flags);
902
903 for (cpu = 0; cpu < NR_CPUS; cpu++)
904 if (cpu != local_cpu)
905 {
906 sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu],
907 get_priority(t));
908 TRACE_CUR("priority ceiling for sem %p is now %d on cpu %d\n",
909 sem, sem->prio_ceiling[cpu], cpu);
910 }
911
912 spin_unlock_irqrestore(&sem->wait.lock, flags);
913
914 return 0;
915}
916
917int pfp_mpcp_close(struct litmus_lock* l)
918{
919 struct task_struct *t = current;
920 struct mpcp_semaphore *sem = mpcp_from_lock(l);
921 unsigned long flags;
922
923 int owner;
924
925 spin_lock_irqsave(&sem->wait.lock, flags);
926
927 owner = sem->owner == t;
928
929 spin_unlock_irqrestore(&sem->wait.lock, flags);
930
931 if (owner)
932 pfp_mpcp_unlock(l);
933
934 return 0;
935}
936
937void pfp_mpcp_free(struct litmus_lock* lock)
938{
939 kfree(mpcp_from_lock(lock));
940}
941
942static struct litmus_lock_ops pfp_mpcp_lock_ops = {
943 .close = pfp_mpcp_close,
944 .lock = pfp_mpcp_lock,
945 .open = pfp_mpcp_open,
946 .unlock = pfp_mpcp_unlock,
947 .deallocate = pfp_mpcp_free,
948};
949
950static struct litmus_lock* pfp_new_mpcp(int vspin)
951{
952 struct mpcp_semaphore* sem;
953 int cpu;
954
955 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
956 if (!sem)
957 return NULL;
958
959 sem->owner = NULL;
960 init_waitqueue_head(&sem->wait);
961 sem->litmus_lock.ops = &pfp_mpcp_lock_ops;
962
963 for (cpu = 0; cpu < NR_CPUS; cpu++)
964 sem->prio_ceiling[cpu] = OMEGA_CEILING;
965
966 /* mark as virtual spinning */
967 sem->vspin = vspin;
968
969 return &sem->litmus_lock;
970}
971
972
973/* ******************** PCP support ********************** */
974
975
976struct pcp_semaphore {
977 struct litmus_lock litmus_lock;
978
979 struct list_head ceiling;
980
981 /* current resource holder */
982 struct task_struct *owner;
983
984 /* priority ceiling --- can be negative due to DPCP support */
985 int prio_ceiling;
986
987 /* on which processor is this PCP semaphore allocated? */
988 int on_cpu;
989};
990
991static inline struct pcp_semaphore* pcp_from_lock(struct litmus_lock* lock)
992{
993 return container_of(lock, struct pcp_semaphore, litmus_lock);
994}
995
996
997struct pcp_state {
998 struct list_head system_ceiling;
999
1000 /* highest-priority waiting task */
1001 struct task_struct* hp_waiter;
1002
1003 /* list of jobs waiting to get past the system ceiling */
1004 wait_queue_head_t ceiling_blocked;
1005};
1006
1007static void pcp_init_state(struct pcp_state* s)
1008{
1009 INIT_LIST_HEAD(&s->system_ceiling);
1010 s->hp_waiter = NULL;
1011 init_waitqueue_head(&s->ceiling_blocked);
1012}
1013
1014static DEFINE_PER_CPU(struct pcp_state, pcp_state);
1015
1016/* assumes preemptions are off */
1017static struct pcp_semaphore* pcp_get_ceiling(void)
1018{
1019 struct list_head* top = __get_cpu_var(pcp_state).system_ceiling.next;
1020
1021 if (top)
1022 return list_entry(top, struct pcp_semaphore, ceiling);
1023 else
1024 return NULL;
1025}
1026
1027/* assumes preempt off */
1028static void pcp_add_ceiling(struct pcp_semaphore* sem)
1029{
1030 struct list_head *pos;
1031 struct list_head *in_use = &__get_cpu_var(pcp_state).system_ceiling;
1032 struct pcp_semaphore* held;
1033
1034 BUG_ON(sem->on_cpu != smp_processor_id());
1035 BUG_ON(in_list(&sem->ceiling));
1036
1037 list_for_each(pos, in_use) {
1038 held = list_entry(pos, struct pcp_semaphore, ceiling);
1039 if (held->prio_ceiling >= sem->prio_ceiling) {
1040 __list_add(&sem->ceiling, pos->prev, pos);
1041 return;
1042 }
1043 }
1044
1045 /* we hit the end of the list */
1046
1047 list_add_tail(&sem->ceiling, in_use);
1048}
1049
1050/* assumes preempt off */
1051static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling,
1052 struct task_struct* task,
1053 int effective_prio)
1054{
1055 return ceiling == NULL ||
1056 ceiling->prio_ceiling > effective_prio ||
1057 ceiling->owner == task;
1058}
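In other words, a job may lock a new PCP resource only if its effective priority is strictly higher (numerically lower) than the current system ceiling, or if it itself owns the resource defining that ceiling. A small worked example (illustrative priorities only; lower number == higher priority):

/* Illustrative only:
 *   system ceiling: sem S held by job A, S->prio_ceiling == 3
 *   job B, effective prio == 2  -> 3 > 2: exceeds the ceiling, may lock
 *   job C, effective prio == 4  -> blocked; may become hp_waiter
 *   job A itself                -> always passes (ceiling->owner == task)
 */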
1059
1060/* assumes preempt off */
1061static void pcp_priority_inheritance(void)
1062{
1063 unsigned long flags;
1064 pfp_domain_t* pfp = local_pfp;
1065
1066 struct pcp_semaphore* ceiling = pcp_get_ceiling();
1067 struct task_struct *blocker, *blocked;
1068
1069 blocker = ceiling ? ceiling->owner : NULL;
1070 blocked = __get_cpu_var(pcp_state).hp_waiter;
1071
1072 raw_spin_lock_irqsave(&pfp->slock, flags);
1073
1074 /* Current is no longer inheriting anything by default. This should be
1075 * the currently scheduled job, and hence not currently queued. */
1076 BUG_ON(current != pfp->scheduled);
1077
1078 fp_set_prio_inh(pfp, current, NULL);
1079 fp_set_prio_inh(pfp, blocked, NULL);
1080 fp_set_prio_inh(pfp, blocker, NULL);
1081
1082
1083 /* Let blocking job inherit priority of blocked job, if required. */
1084 if (blocker && blocked &&
1085 fp_higher_prio(blocked, blocker)) {
1086 TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n",
1087 blocked->comm, blocked->pid,
1088 get_priority(blocker), get_priority(blocked));
1089 fp_set_prio_inh(pfp, blocker, blocked);
1090 }
1091
1092 /* Check if anything changed. If the blocked job is current, then it is
1093 * just blocking and hence is going to call the scheduler anyway. */
1094 if (blocked != current &&
1095 fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
1096 preempt(pfp);
1097
1098 raw_spin_unlock_irqrestore(&pfp->slock, flags);
1099}
1100
1101/* called with preemptions off */
1102static void pcp_raise_ceiling(struct pcp_semaphore* sem,
1103 int effective_prio)
1104{
1105 struct task_struct* t = current;
1106 struct pcp_semaphore* ceiling;
1107 prio_wait_queue_t wait;
1108 unsigned int waiting_higher_prio;
1109
1110 do {
1111 ceiling = pcp_get_ceiling();
1112 if (pcp_exceeds_ceiling(ceiling, t, effective_prio))
1113 break;
1114
1115 TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n",
1116 sem, ceiling->owner->comm, ceiling->owner->pid);
1117
1118 /* we need to wait until the ceiling is lowered */
1119
1120 /* enqueue in priority order */
1121 init_prio_waitqueue_entry(&wait, t, prio_point(effective_prio));
1122 set_task_state(t, TASK_UNINTERRUPTIBLE);
1123 waiting_higher_prio = add_wait_queue_prio_exclusive(
1124 &__get_cpu_var(pcp_state).ceiling_blocked, &wait);
1125
1126 if (waiting_higher_prio == 0) {
1127 TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n");
1128
1129 /* we are the new highest-priority waiting job
1130 * => update inheritance */
1131 __get_cpu_var(pcp_state).hp_waiter = t;
1132 pcp_priority_inheritance();
1133 }
1134
1135 TS_LOCK_SUSPEND;
1136
1137 preempt_enable_no_resched();
1138 schedule();
1139 preempt_disable();
1140
1141 /* pcp_resume_unblocked() removed us from wait queue */
1142
1143 TS_LOCK_RESUME;
1144 } while(1);
1145
1146 TRACE_CUR("PCP got the ceiling and sem %p\n", sem);
1147
1148 /* We are good to go. The semaphore should be available. */
1149 BUG_ON(sem->owner != NULL);
1150
1151 sem->owner = t;
1152
1153 pcp_add_ceiling(sem);
1154}
1155
1156static void pcp_resume_unblocked(void)
1157{
1158 wait_queue_head_t *blocked = &__get_cpu_var(pcp_state).ceiling_blocked;
1159 unsigned long flags;
1160 prio_wait_queue_t* q;
1161 struct task_struct* t = NULL;
1162
1163 struct pcp_semaphore* ceiling = pcp_get_ceiling();
1164
1165 spin_lock_irqsave(&blocked->lock, flags);
1166
1167 while (waitqueue_active(blocked)) {
1168 /* check first == highest-priority waiting job */
1169 q = list_entry(blocked->task_list.next,
1170 prio_wait_queue_t, wq.task_list);
1171 t = (struct task_struct*) q->wq.private;
1172
1173 /* can it proceed now? => let it go */
1174 if (pcp_exceeds_ceiling(ceiling, t,
1175 prio_from_point(q->priority))) {
1176 __remove_wait_queue(blocked, &q->wq);
1177 wake_up_process(t);
1178 } else {
1179 /* We are done. Update highest-priority waiter. */
1180 __get_cpu_var(pcp_state).hp_waiter = t;
1181 goto out;
1182 }
1183 }
1184 /* If we get here, then there are no more waiting
1185 * jobs. */
1186 __get_cpu_var(pcp_state).hp_waiter = NULL;
1187out:
1188 spin_unlock_irqrestore(&blocked->lock, flags);
1189}
1190
1191/* assumes preempt off */
1192static void pcp_lower_ceiling(struct pcp_semaphore* sem)
1193{
1194 BUG_ON(!in_list(&sem->ceiling));
1195 BUG_ON(sem->owner != current);
1196 BUG_ON(sem->on_cpu != smp_processor_id());
1197
1198 /* remove from ceiling list */
1199 list_del(&sem->ceiling);
1200
1201 /* release */
1202 sem->owner = NULL;
1203
1204 TRACE_CUR("PCP released sem %p\n", sem);
1205
1206 pcp_priority_inheritance();
1207
1208 /* Wake up all ceiling-blocked jobs that now pass the ceiling. */
1209 pcp_resume_unblocked();
1210}
1211
1212static void pcp_update_prio_ceiling(struct pcp_semaphore* sem,
1213 int effective_prio)
1214{
1215 /* This needs to be synchronized on something.
1216 * Might as well use waitqueue lock for the processor.
1217 * We assume this happens only before the task set starts execution,
1218 * (i.e., during initialization), but it may happen on multiple processors
1219 * at the same time.
1220 */
1221 unsigned long flags;
1222
1223 struct pcp_state* s = &per_cpu(pcp_state, sem->on_cpu);
1224
1225 spin_lock_irqsave(&s->ceiling_blocked.lock, flags);
1226
1227 sem->prio_ceiling = min(sem->prio_ceiling, effective_prio);
1228
1229 spin_unlock_irqrestore(&s->ceiling_blocked.lock, flags);
1230}
1231
1232static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu)
1233{
1234 sem->owner = NULL;
1235 INIT_LIST_HEAD(&sem->ceiling);
1236 sem->prio_ceiling = INT_MAX;
1237 sem->on_cpu = cpu;
1238}
1239
1240int pfp_pcp_lock(struct litmus_lock* l)
1241{
1242 struct task_struct* t = current;
1243 struct pcp_semaphore *sem = pcp_from_lock(l);
1244
1245 int eprio = effective_agent_priority(get_priority(t));
1246 int from = get_partition(t);
1247 int to = sem->on_cpu;
1248
1249 if (!is_realtime(t) || from != to)
1250 return -EPERM;
1251
1252 preempt_disable();
1253
1254 pcp_raise_ceiling(sem, eprio);
1255
1256 preempt_enable();
1257
1258 return 0;
1259}
1260
1261int pfp_pcp_unlock(struct litmus_lock* l)
1262{
1263 struct task_struct *t = current;
1264 struct pcp_semaphore *sem = pcp_from_lock(l);
1265
1266 int err = 0;
1267
1268 preempt_disable();
1269
1270 if (sem->on_cpu != smp_processor_id() || sem->owner != t) {
1271 err = -EINVAL;
1272 goto out;
1273 }
1274
1275 /* give it back */
1276 pcp_lower_ceiling(sem);
1277
1278out:
1279 preempt_enable();
1280
1281 return err;
1282}
1283
1284int pfp_pcp_open(struct litmus_lock* l, void* __user config)
1285{
1286 struct task_struct *t = current;
1287 struct pcp_semaphore *sem = pcp_from_lock(l);
1288
1289 int cpu, eprio;
1290
1291 if (!is_realtime(t))
1292 /* we need to know the real-time priority */
1293 return -EPERM;
1294
1295 if (get_user(cpu, (int*) config))
1296 return -EFAULT;
1297
1298 /* make sure the resource location matches */
1299 if (cpu != sem->on_cpu)
1300 return -EINVAL;
1301
1302 eprio = effective_agent_priority(get_priority(t));
1303
1304 pcp_update_prio_ceiling(sem, eprio);
1305
1306 return 0;
1307}
1308
1309int pfp_pcp_close(struct litmus_lock* l)
1310{
1311 struct task_struct *t = current;
1312 struct pcp_semaphore *sem = pcp_from_lock(l);
1313
1314 int owner = 0;
1315
1316 preempt_disable();
1317
1318 if (sem->on_cpu == smp_processor_id())
1319 owner = sem->owner == t;
1320
1321 preempt_enable();
1322
1323 if (owner)
1324 pfp_pcp_unlock(l);
1325
1326 return 0;
1327}
1328
1329void pfp_pcp_free(struct litmus_lock* lock)
1330{
1331 kfree(pcp_from_lock(lock));
1332}
1333
1334
1335static struct litmus_lock_ops pfp_pcp_lock_ops = {
1336 .close = pfp_pcp_close,
1337 .lock = pfp_pcp_lock,
1338 .open = pfp_pcp_open,
1339 .unlock = pfp_pcp_unlock,
1340 .deallocate = pfp_pcp_free,
1341};
1342
1343
1344static struct litmus_lock* pfp_new_pcp(int on_cpu)
1345{
1346 struct pcp_semaphore* sem;
1347
1348 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1349 if (!sem)
1350 return NULL;
1351
1352 sem->litmus_lock.ops = &pfp_pcp_lock_ops;
1353 pcp_init_semaphore(sem, on_cpu);
1354
1355 return &sem->litmus_lock;
1356}
1357
1358/* ******************** DPCP support ********************** */
1359
1360struct dpcp_semaphore {
1361 struct litmus_lock litmus_lock;
1362 struct pcp_semaphore pcp;
1363 int owner_cpu;
1364};
1365
1366static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock)
1367{
1368 return container_of(lock, struct dpcp_semaphore, litmus_lock);
1369}
1370
1371/* called with preemptions disabled */
1372static void pfp_migrate_to(int target_cpu)
1373{
1374 struct task_struct* t = current;
1375 pfp_domain_t *from;
1376
1377 if (get_partition(t) == target_cpu)
1378 return;
1379
1380 /* make sure target_cpu makes sense */
1381 BUG_ON(!cpu_online(target_cpu));
1382
1383 local_irq_disable();
1384
1385 /* scheduled task should not be in any ready or release queue */
1386 BUG_ON(is_queued(t));
1387
1388 /* lock the source pfp domain */
1389 from = task_pfp(t);
1390
1391 raw_spin_lock(&from->slock);
1392
1393 /* switch partitions */
1394 tsk_rt(t)->task_params.cpu = target_cpu;
1395
1396 raw_spin_unlock(&from->slock);
1397
1398 /* Don't trace scheduler costs as part of
1399 * locking overhead. Scheduling costs are accounted for
1400 * explicitly. */
1401 TS_LOCK_SUSPEND;
1402
1403 local_irq_enable();
1404 preempt_enable_no_resched();
1405
1406 /* deschedule to be migrated */
1407 schedule();
1408
1409 /* we are now on the target processor */
1410 preempt_disable();
1411
1412 /* start recording costs again */
1413 TS_LOCK_RESUME;
1414
1415 BUG_ON(smp_processor_id() != target_cpu);
1416}
1417
1418int pfp_dpcp_lock(struct litmus_lock* l)
1419{
1420 struct task_struct* t = current;
1421 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1422 int eprio = effective_agent_priority(get_priority(t));
1423 int from = get_partition(t);
1424 int to = sem->pcp.on_cpu;
1425
1426 if (!is_realtime(t))
1427 return -EPERM;
1428
1429 preempt_disable();
1430
1431 /* Priority-boost ourself *before* we suspend so that
1432 * our priority is boosted when we resume. */
1433
1434 boost_priority(t, get_priority(t));
1435
1436 pfp_migrate_to(to);
1437
1438 pcp_raise_ceiling(&sem->pcp, eprio);
1439
1440 /* yep, we got it => execute request */
1441 sem->owner_cpu = from;
1442
1443 preempt_enable();
1444
1445 return 0;
1446}
1447
1448int pfp_dpcp_unlock(struct litmus_lock* l)
1449{
1450 struct task_struct *t = current;
1451 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1452 int err = 0;
1453 int home;
1454
1455 preempt_disable();
1456
1457 if (sem->pcp.on_cpu != smp_processor_id() || sem->pcp.owner != t) {
1458 err = -EINVAL;
1459 goto out;
1460 }
1461
1462 home = sem->owner_cpu;
1463
1464 /* give it back */
1465 pcp_lower_ceiling(&sem->pcp);
1466
1467 /* we lose the benefit of priority boosting */
1468 unboost_priority(t);
1469
1470 pfp_migrate_to(home);
1471
1472out:
1473 preempt_enable();
1474
1475 return err;
1476}
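Taken together, pfp_migrate_to(), pfp_dpcp_lock(), and pfp_dpcp_unlock() implement the DPCP's remote-execution structure: boost, migrate to the CPU that hosts the resource, raise the local priority ceiling, serve the request there, then lower the ceiling, unboost, and migrate back to the home partition recorded in owner_cpu. The following user-space analogy reproduces only the migrate, execute, return skeleton using Linux CPU affinity; it omits boosting and ceilings entirely, and migrate_self_to()/run_request_on() are invented helpers, not LITMUS^RT APIs.

#define _GNU_SOURCE
#include <assert.h>
#include <sched.h>
#include <stdio.h>

/* Pin the calling thread to a single CPU, mirroring the effect of
 * pfp_migrate_to(): after the call we are running on target_cpu. */
static void migrate_self_to(int target_cpu)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(target_cpu, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask) == 0)
		assert(sched_getcpu() == target_cpu);
}

/* DPCP-style invocation: execute the request on the resource's home
 * CPU, then return to the caller's home CPU (cf. sem->owner_cpu). */
static void run_request_on(int resource_cpu, void (*request)(void))
{
	int home = sched_getcpu();	/* remember where we came from */

	migrate_self_to(resource_cpu);
	request();			/* critical section runs remotely */
	migrate_self_to(home);
}

static void critical_section(void)
{
	printf("request served on CPU %d\n", sched_getcpu());
}

int main(void)
{
	run_request_on(0, critical_section);
	printf("back on CPU %d\n", sched_getcpu());
	return 0;
}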
1477
1478int pfp_dpcp_open(struct litmus_lock* l, void* __user config)
1479{
1480 struct task_struct *t = current;
1481 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1482 int cpu, eprio;
1483
1484 if (!is_realtime(t))
1485 /* we need to know the real-time priority */
1486 return -EPERM;
1487
1488 if (get_user(cpu, (int*) config))
1489 return -EFAULT;
1490
1491 /* make sure the resource location matches */
1492 if (cpu != sem->pcp.on_cpu)
1493 return -EINVAL;
1494
1495 eprio = effective_agent_priority(get_priority(t));
1496
1497 pcp_update_prio_ceiling(&sem->pcp, eprio);
1498
1499 return 0;
1500}
1501
1502int pfp_dpcp_close(struct litmus_lock* l)
1503{
1504 struct task_struct *t = current;
1505 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1506 int owner = 0;
1507
1508 preempt_disable();
1509
1510 if (sem->pcp.on_cpu == smp_processor_id())
1511 owner = sem->pcp.owner == t;
1512
1513 preempt_enable();
1514
1515 if (owner)
1516 pfp_dpcp_unlock(l);
1517
1518 return 0;
1519}
1520
1521void pfp_dpcp_free(struct litmus_lock* lock)
1522{
1523 kfree(dpcp_from_lock(lock));
1524}
1525
1526static struct litmus_lock_ops pfp_dpcp_lock_ops = {
1527 .close = pfp_dpcp_close,
1528 .lock = pfp_dpcp_lock,
1529 .open = pfp_dpcp_open,
1530 .unlock = pfp_dpcp_unlock,
1531 .deallocate = pfp_dpcp_free,
1532};
1533
1534static struct litmus_lock* pfp_new_dpcp(int on_cpu)
1535{
1536 struct dpcp_semaphore* sem;
1537
1538 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1539 if (!sem)
1540 return NULL;
1541
1542 sem->litmus_lock.ops = &pfp_dpcp_lock_ops;
1543 sem->owner_cpu = NO_CPU;
1544 pcp_init_semaphore(&sem->pcp, on_cpu);
1545
1546 return &sem->litmus_lock;
1547}
1548
1549
1550/* **** lock constructor **** */
1551
1552
1553static long pfp_allocate_lock(struct litmus_lock **lock, int type,
1554 void* __user config)
1555{
1556 int err = -ENXIO, cpu;
1557 struct srp_semaphore* srp;
1558
1559	/* P-FP supports the SRP and the PCP for local resources, and the FMLP,
1560	 * the MPCP (optionally with virtual spinning), and the DPCP for global resources. */
1561 switch (type) {
1562 case FMLP_SEM:
1563 /* FIFO Mutex Locking Protocol */
1564 *lock = pfp_new_fmlp();
1565 if (*lock)
1566 err = 0;
1567 else
1568 err = -ENOMEM;
1569 break;
1570
1571 case MPCP_SEM:
1572	/* Multiprocessor Priority Ceiling Protocol */
1573 *lock = pfp_new_mpcp(0);
1574 if (*lock)
1575 err = 0;
1576 else
1577 err = -ENOMEM;
1578 break;
1579
1580 case MPCP_VS_SEM:
1581	/* Multiprocessor Priority Ceiling Protocol with virtual spinning */
1582 *lock = pfp_new_mpcp(1);
1583 if (*lock)
1584 err = 0;
1585 else
1586 err = -ENOMEM;
1587 break;
1588
1589 case DPCP_SEM:
1590 /* Distributed Priority Ceiling Protocol */
1591 if (get_user(cpu, (int*) config))
1592 return -EFAULT;
1593
1594 if (!cpu_online(cpu))
1595 return -EINVAL;
1596
1597 *lock = pfp_new_dpcp(cpu);
1598 if (*lock)
1599 err = 0;
1600 else
1601 err = -ENOMEM;
1602 break;
1603
1604 case SRP_SEM:
1605 /* Baker's Stack Resource Policy */
1606 srp = allocate_srp_semaphore();
1607 if (srp) {
1608 *lock = &srp->litmus_lock;
1609 err = 0;
1610 } else
1611 err = -ENOMEM;
1612 break;
1613
1614 case PCP_SEM:
1615 /* Priority Ceiling Protocol */
1616 if (get_user(cpu, (int*) config))
1617 return -EFAULT;
1618
1619 if (!cpu_online(cpu))
1620 return -EINVAL;
1621
1622 *lock = pfp_new_pcp(cpu);
1623 if (*lock)
1624 err = 0;
1625 else
1626 err = -ENOMEM;
1627 break;
1628	}
1629
1630 return err;
1631}
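For the DPCP_SEM and PCP_SEM cases above, the config argument is interpreted as a pointer to a single user-space int naming the CPU that hosts the resource (read via get_user()). A minimal sketch of how a caller might supply that value; open_litmus_lock() and DPCP_SEM_DEMO are stand-ins for whatever user-space wrapper and constants are actually used, not definitions from this patch.

#include <stdio.h>

/* Hypothetical stand-in for a liblitmus-style wrapper: in reality the
 * config pointer ends up in pfp_allocate_lock(), which reads one int. */
static int open_litmus_lock(int protocol, int resource_id, void *config)
{
	int cpu = *(int *) config;

	printf("would allocate protocol %d, resource %d, hosted on CPU %d\n",
	       protocol, resource_id, cpu);
	return 0;	/* pretend object descriptor */
}

#define DPCP_SEM_DEMO 6	/* illustrative constant, not the real fdso value */

int main(void)
{
	int resource_cpu = 2;	/* CPU that hosts the shared resource */
	int od = open_litmus_lock(DPCP_SEM_DEMO, 0, &resource_cpu);

	return od < 0;
}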
1632
1633#endif
1634
1635static long pfp_admit_task(struct task_struct* tsk)
1636{
1637 if (task_cpu(tsk) == tsk->rt_param.task_params.cpu &&
1638#ifdef CONFIG_RELEASE_MASTER
1639	/* don't allow tasks on the release master CPU */
1640 task_cpu(tsk) != remote_dom(task_cpu(tsk))->release_master &&
1641#endif
1642 litmus_is_valid_fixed_prio(get_priority(tsk)))
1643 return 0;
1644 else
1645 return -EINVAL;
1646}
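pfp_admit_task() admits a task only if it is already executing on the partition named in its task parameters, is not assigned to the release-master CPU (when that option is configured), and has a valid fixed priority. A small stand-alone restatement of that predicate; the structure and helper names are simplified stand-ins, and the priority bound used here is an arbitrary placeholder.

#include <stdbool.h>
#include <stdio.h>

#define NO_CPU (-1)

struct demo_task {
	int current_cpu;	/* CPU the task currently runs on (task_cpu()) */
	int assigned_cpu;	/* task_params.cpu, its designated partition */
	unsigned int prio;	/* fixed priority */
};

/* Stand-in for litmus_is_valid_fixed_prio(); the real bound is a
 * configuration matter, 512 here is just a placeholder. */
static bool valid_fixed_prio(unsigned int prio)
{
	return prio > 0 && prio <= 512;
}

/* Mirrors the admission test: right partition, not the release
 * master (NO_CPU disables that check), and a usable priority. */
static bool admit(const struct demo_task *t, int release_master_cpu)
{
	return t->current_cpu == t->assigned_cpu &&
	       t->current_cpu != release_master_cpu &&
	       valid_fixed_prio(t->prio);
}

int main(void)
{
	struct demo_task ok    = { .current_cpu = 1, .assigned_cpu = 1, .prio = 10 };
	struct demo_task wrong = { .current_cpu = 0, .assigned_cpu = 1, .prio = 10 };

	printf("ok: %d, wrong partition: %d\n",
	       admit(&ok, NO_CPU), admit(&wrong, NO_CPU));
	return 0;
}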
1647
1648static long pfp_activate_plugin(void)
1649{
1650#if defined(CONFIG_RELEASE_MASTER) || defined(CONFIG_LITMUS_LOCKING)
1651 int cpu;
1652#endif
1653
1654#ifdef CONFIG_RELEASE_MASTER
1655 for_each_online_cpu(cpu) {
1656 remote_dom(cpu)->release_master = atomic_read(&release_master_cpu);
1657 }
1658#endif
1659
1660#ifdef CONFIG_LITMUS_LOCKING
1661 get_srp_prio = pfp_get_srp_prio;
1662
1663 for_each_online_cpu(cpu) {
1664 init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, cpu));
1665 per_cpu(mpcpvs_vspin, cpu) = NULL;
1666
1667 pcp_init_state(&per_cpu(pcp_state, cpu));
1668 pfp_doms[cpu] = remote_pfp(cpu);
1669 }
1670
1671#endif
1672
1673 return 0;
1674}
1675
1676
1677/* Plugin object */
1678static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = {
1679 .plugin_name = "P-FP",
1680 .tick = pfp_tick,
1681 .task_new = pfp_task_new,
1682 .complete_job = complete_job,
1683 .task_exit = pfp_task_exit,
1684 .schedule = pfp_schedule,
1685 .task_wake_up = pfp_task_wake_up,
1686 .task_block = pfp_task_block,
1687 .admit_task = pfp_admit_task,
1688 .activate_plugin = pfp_activate_plugin,
1689#ifdef CONFIG_LITMUS_LOCKING
1690 .allocate_lock = pfp_allocate_lock,
1691 .finish_switch = pfp_finish_switch,
1692#endif
1693};
1694
1695
1696static int __init init_pfp(void)
1697{
1698 int i;
1699
1700	/* CPU hotplug is not supported. If it ever were, this
1701	 * initialization loop could not safely rely on num_online_cpus(),
1702	 * since CPUs may come and go while we iterate.
1703	 */
1704 for (i = 0; i < num_online_cpus(); i++) {
1705 pfp_domain_init(remote_pfp(i), i);
1706 }
1707 return register_sched_plugin(&pfp_plugin);
1708}
1709
1710module_init(init_pfp);
1711
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
index 8e4a22dd8d6a..0e1675d2e572 100644
--- a/litmus/sched_psn_edf.c
+++ b/litmus/sched_psn_edf.c
@@ -17,6 +17,7 @@
17#include <litmus/litmus.h> 17#include <litmus/litmus.h>
18#include <litmus/jobs.h> 18#include <litmus/jobs.h>
19#include <litmus/preempt.h> 19#include <litmus/preempt.h>
20#include <litmus/budget.h>
20#include <litmus/sched_plugin.h> 21#include <litmus/sched_plugin.h>
21#include <litmus/edf_common.h> 22#include <litmus/edf_common.h>
22#include <litmus/sched_trace.h> 23#include <litmus/sched_trace.h>
@@ -59,7 +60,7 @@ static void requeue(struct task_struct* t, rt_domain_t *edf)
59 if (t->state != TASK_RUNNING) 60 if (t->state != TASK_RUNNING)
60 TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); 61 TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
61 62
62 set_rt_flags(t, RT_F_RUNNING); 63 tsk_rt(t)->completed = 0;
63 if (is_released(t, litmus_clock())) 64 if (is_released(t, litmus_clock()))
64 __add_ready(edf, t); 65 __add_ready(edf, t);
65 else 66 else
@@ -132,6 +133,15 @@ static void unboost_priority(struct task_struct* t)
132 133
133#endif 134#endif
134 135
136static int psnedf_preempt_check(psnedf_domain_t *pedf)
137{
138 if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) {
139 preempt(pedf);
140 return 1;
141 } else
142 return 0;
143}
144
135/* This check is trivial in partitioned systems as we only have to consider 145
136 * the CPU of the partition. 146 * the CPU of the partition.
137 */ 147 */
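psnedf_preempt_check() bundles the "is a preemption actually needed?" test with the preemption itself, so the callers added below (psnedf_task_new() and psnedf_task_wake_up()) only force a reschedule when the newly queued job really should run before the currently scheduled one. A stand-alone sketch of that decision with a simplified EDF comparison; all names and types here are illustrative.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long lt_t;

struct demo_job {
	lt_t deadline;
};

/* Simplified edf_preemption_needed(): preempt if something is ready
 * and either nothing is scheduled or the ready job's deadline is
 * earlier than the scheduled job's. */
static bool preemption_needed(const struct demo_job *ready_head,
			      const struct demo_job *scheduled)
{
	if (!ready_head)
		return false;
	if (!scheduled)
		return true;
	return ready_head->deadline < scheduled->deadline;
}

static int preempt_check(const struct demo_job *ready_head,
			 const struct demo_job *scheduled)
{
	if (preemption_needed(ready_head, scheduled)) {
		printf("preempt!\n");	/* stand-in for preempt(pedf) */
		return 1;
	}
	return 0;
}

int main(void)
{
	struct demo_job scheduled = { .deadline = 100 };
	struct demo_job earlier   = { .deadline =  50 };
	struct demo_job later     = { .deadline = 200 };

	preempt_check(&earlier, &scheduled);	/* triggers a preemption */
	preempt_check(&later, &scheduled);	/* does not */
	return 0;
}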
@@ -142,11 +152,7 @@ static int psnedf_check_resched(rt_domain_t *edf)
142 /* because this is a callback from rt_domain_t we already hold 152 /* because this is a callback from rt_domain_t we already hold
143 * the necessary lock for the ready queue 153 * the necessary lock for the ready queue
144 */ 154 */
145 if (edf_preemption_needed(edf, pedf->scheduled)) { 155 return psnedf_preempt_check(pedf);
146 preempt(pedf);
147 return 1;
148 } else
149 return 0;
150} 156}
151 157
152static void job_completion(struct task_struct* t, int forced) 158static void job_completion(struct task_struct* t, int forced)
@@ -154,7 +160,7 @@ static void job_completion(struct task_struct* t, int forced)
154 sched_trace_task_completion(t,forced); 160 sched_trace_task_completion(t,forced);
155 TRACE_TASK(t, "job_completion().\n"); 161 TRACE_TASK(t, "job_completion().\n");
156 162
157 set_rt_flags(t, RT_F_SLEEP); 163 tsk_rt(t)->completed = 1;
158 prepare_for_next_period(t); 164 prepare_for_next_period(t);
159} 165}
160 166
@@ -208,7 +214,7 @@ static struct task_struct* psnedf_schedule(struct task_struct * prev)
208 budget_enforced(pedf->scheduled) && 214 budget_enforced(pedf->scheduled) &&
209 budget_exhausted(pedf->scheduled); 215 budget_exhausted(pedf->scheduled);
210 np = exists && is_np(pedf->scheduled); 216 np = exists && is_np(pedf->scheduled);
211 sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP; 217 sleep = exists && is_completed(pedf->scheduled);
212 preempt = edf_preemption_needed(edf, prev); 218 preempt = edf_preemption_needed(edf, prev);
213 219
214 /* If we need to preempt do so. 220 /* If we need to preempt do so.
@@ -260,7 +266,7 @@ static struct task_struct* psnedf_schedule(struct task_struct * prev)
260 266
261 if (next) { 267 if (next) {
262 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); 268 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
263 set_rt_flags(next, RT_F_RUNNING); 269 tsk_rt(next)->completed = 0;
264 } else { 270 } else {
265 TRACE("becoming idle at %llu\n", litmus_clock()); 271 TRACE("becoming idle at %llu\n", litmus_clock());
266 } 272 }
@@ -298,7 +304,7 @@ static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
298 } else { 304 } else {
299 requeue(t, edf); 305 requeue(t, edf);
300 /* maybe we have to reschedule */ 306 /* maybe we have to reschedule */
301 preempt(pedf); 307 psnedf_preempt_check(pedf);
302 } 308 }
303 raw_spin_unlock_irqrestore(&pedf->slock, flags); 309 raw_spin_unlock_irqrestore(&pedf->slock, flags);
304} 310}
@@ -334,8 +340,10 @@ static void psnedf_task_wake_up(struct task_struct *task)
334 * de-scheduling the task, i.e., wake_up() raced with schedule() 340 * de-scheduling the task, i.e., wake_up() raced with schedule()
335 * and won. 341 * and won.
336 */ 342 */
337 if (pedf->scheduled != task) 343 if (pedf->scheduled != task) {
338 requeue(task, edf); 344 requeue(task, edf);
345 psnedf_preempt_check(pedf);
346 }
339 347
340 raw_spin_unlock_irqrestore(&pedf->slock, flags); 348 raw_spin_unlock_irqrestore(&pedf->slock, flags);
341 TRACE_TASK(task, "wake up done\n"); 349 TRACE_TASK(task, "wake up done\n");
diff --git a/litmus/sync.c b/litmus/sync.c
index bf75fde5450b..3e79e0a12a5a 100644
--- a/litmus/sync.c
+++ b/litmus/sync.c
@@ -16,63 +16,106 @@
16 16
17#include <litmus/sched_trace.h> 17#include <litmus/sched_trace.h>
18 18
19static DECLARE_COMPLETION(ts_release); 19struct ts_release_wait {
20 struct list_head list;
21 struct completion completion;
22 lt_t ts_release_time;
23};
24
25#define DECLARE_TS_RELEASE_WAIT(symb) \
26 struct ts_release_wait symb = \
27 { \
28 LIST_HEAD_INIT(symb.list), \
29 COMPLETION_INITIALIZER_ONSTACK(symb.completion), \
30 0 \
31 }
32
33static LIST_HEAD(task_release_list);
34static DEFINE_MUTEX(task_release_lock);
20 35
21static long do_wait_for_ts_release(void) 36static long do_wait_for_ts_release(void)
22{ 37{
23 long ret = 0; 38 DECLARE_TS_RELEASE_WAIT(wait);
39
40 long ret = -ERESTARTSYS;
41
42 if (mutex_lock_interruptible(&task_release_lock))
43 goto out;
44
45 list_add(&wait.list, &task_release_list);
24 46
25 /* If the interruption races with a release, the completion object 47 mutex_unlock(&task_release_lock);
26 * may have a non-zero counter. To avoid this problem, this should
27 * be replaced by wait_for_completion().
28 *
29 * For debugging purposes, this is interruptible for now.
30 */
31 ret = wait_for_completion_interruptible(&ts_release);
32 48
49	/* We are enqueued; now we wait for someone to wake us up. */
50 ret = wait_for_completion_interruptible(&wait.completion);
51
52 if (!ret) {
53	/* Completion succeeded; set up the release. */
54 litmus->release_at(current, wait.ts_release_time
55 + current->rt_param.task_params.phase
56 - current->rt_param.task_params.period);
57 /* trigger advance to next job release at the programmed time */
58 ret = complete_job();
59 } else {
60	/* We were interrupted; we must clean up the list. */
61 mutex_lock(&task_release_lock);
62 if (!wait.completion.done)
63 list_del(&wait.list);
64 mutex_unlock(&task_release_lock);
65 }
66
67out:
33 return ret; 68 return ret;
34} 69}
35 70
36int count_tasks_waiting_for_release(void) 71int count_tasks_waiting_for_release(void)
37{ 72{
38 unsigned long flags;
39 int task_count = 0; 73 int task_count = 0;
40 struct list_head *pos; 74 struct list_head *pos;
41 75
42 spin_lock_irqsave(&ts_release.wait.lock, flags); 76 mutex_lock(&task_release_lock);
43 list_for_each(pos, &ts_release.wait.task_list) { 77
78 list_for_each(pos, &task_release_list) {
44 task_count++; 79 task_count++;
45 } 80 }
46 spin_unlock_irqrestore(&ts_release.wait.lock, flags); 81
82 mutex_unlock(&task_release_lock);
83
47 84
48 return task_count; 85 return task_count;
49} 86}
50 87
51static long do_release_ts(lt_t start) 88static long do_release_ts(lt_t start)
52{ 89{
53 int task_count = 0; 90 long task_count = 0;
54 unsigned long flags;
55 struct list_head *pos;
56 struct task_struct *t;
57 91
92 struct list_head *pos, *safe;
93 struct ts_release_wait *wait;
58 94
59 spin_lock_irqsave(&ts_release.wait.lock, flags); 95 if (mutex_lock_interruptible(&task_release_lock)) {
60 TRACE("<<<<<< synchronous task system release >>>>>>\n"); 96 task_count = -ERESTARTSYS;
97 goto out;
98 }
61 99
100 TRACE("<<<<<< synchronous task system release >>>>>>\n");
62 sched_trace_sys_release(&start); 101 sched_trace_sys_release(&start);
63 list_for_each(pos, &ts_release.wait.task_list) { 102
64 t = (struct task_struct*) list_entry(pos, 103 task_count = 0;
65 struct __wait_queue, 104 list_for_each_safe(pos, safe, &task_release_list) {
66 task_list)->private; 105 wait = (struct ts_release_wait*)
106 list_entry(pos, struct ts_release_wait, list);
107
67 task_count++; 108 task_count++;
68 litmus->release_at(t, start + t->rt_param.task_params.phase); 109 wait->ts_release_time = start;
69 sched_trace_task_release(t); 110 complete(&wait->completion);
70 } 111 }
71 112
72 spin_unlock_irqrestore(&ts_release.wait.lock, flags); 113 /* clear stale list */
114 INIT_LIST_HEAD(&task_release_list);
73 115
74 complete_n(&ts_release, task_count); 116 mutex_unlock(&task_release_lock);
75 117
118out:
76 return task_count; 119 return task_count;
77} 120}
78 121
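The offset used in do_wait_for_ts_release() above deserves a second look: release_at() is given ts_release_time + phase - period, i.e. the task is set up as if its previous job had been released one period before the intended start, so that the subsequent complete_job() advances it by exactly one period and the first real job is released at ts_release_time + phase. A quick numeric check (values are arbitrary example parameters in nanoseconds):

#include <stdio.h>

typedef unsigned long long lt_t;	/* nanoseconds, as in LITMUS^RT */

int main(void)
{
	/* Hypothetical numbers: synchronous release at t = 1 s,
	 * task phase 2 ms, period 10 ms. */
	lt_t ts_release_time = 1000000000ULL;
	lt_t phase  =   2000000ULL;
	lt_t period =  10000000ULL;

	/* do_wait_for_ts_release() programs the task as if its previous
	 * job had been released one period before the intended start: */
	lt_t programmed = ts_release_time + phase - period;

	/* complete_job() then advances by one period, so the first real
	 * job release lands at ts_release_time + phase. */
	lt_t first_release = programmed + period;

	printf("programmed=%llu first_release=%llu (expected %llu)\n",
	       programmed, first_release, ts_release_time + phase);
	return 0;
}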
@@ -88,17 +131,22 @@ asmlinkage long sys_wait_for_ts_release(void)
88 return ret; 131 return ret;
89} 132}
90 133
134#define ONE_MS 1000000
91 135
92asmlinkage long sys_release_ts(lt_t __user *__delay) 136asmlinkage long sys_release_ts(lt_t __user *__delay)
93{ 137{
94 long ret; 138 long ret;
95 lt_t delay; 139 lt_t delay;
140 lt_t start_time;
96 141
97 /* FIXME: check capabilities... */ 142 /* FIXME: check capabilities... */
98 143
99 ret = copy_from_user(&delay, __delay, sizeof(delay)); 144 ret = copy_from_user(&delay, __delay, sizeof(delay));
100 if (ret == 0) 145 if (ret == 0) {
101 ret = do_release_ts(litmus_clock() + delay); 146 /* round up to next larger integral millisecond */
147 start_time = ((litmus_clock() / ONE_MS) + 1) * ONE_MS;
148 ret = do_release_ts(start_time + delay);
149 }
102 150
103 return ret; 151 return ret;
104} 152}
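The ONE_MS rounding in sys_release_ts() snaps the base time to the next full millisecond before adding the user-supplied delay: integer division truncates litmus_clock() to whole milliseconds, and the +1 always moves past the current one, so even a time that already sits on a boundary is advanced by a full millisecond. A quick numeric check:

#include <stdio.h>

#define ONE_MS 1000000ULL	/* ns per millisecond, as in sync.c */

static unsigned long long round_up_ms(unsigned long long now_ns)
{
	return ((now_ns / ONE_MS) + 1) * ONE_MS;
}

int main(void)
{
	/* 1234567 ns -> 2000000 ns; an exact boundary 3000000 ns -> 4000000 ns */
	printf("%llu -> %llu\n", 1234567ULL, round_up_ms(1234567ULL));
	printf("%llu -> %llu\n", 3000000ULL, round_up_ms(3000000ULL));
	return 0;
}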
diff --git a/litmus/trace.c b/litmus/trace.c
index 3c35c527e805..7dbb98e4a3cd 100644
--- a/litmus/trace.c
+++ b/litmus/trace.c
@@ -18,6 +18,15 @@ static unsigned int ts_seq_no = 0;
18 18
19DEFINE_PER_CPU(atomic_t, irq_fired_count); 19DEFINE_PER_CPU(atomic_t, irq_fired_count);
20 20
21void ft_irq_fired(void)
22{
23 /* Only called with preemptions disabled. */
24 atomic_inc(&__get_cpu_var(irq_fired_count));
25
26 if (has_control_page(current))
27 get_control_page(current)->irq_count++;
28}
29
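In addition to the per-CPU Feather-Trace counter, ft_irq_fired() now also increments the irq_count field of the current task's control page, which lets user-space measurement code detect whether a timed code section was disturbed by interrupts. A sketch of how such a counter could be used; demo_ctrl_page and the dummy mapping are placeholders, since how the control page is actually mapped is outside this patch.

#include <stdint.h>
#include <stdio.h>

/* Illustrative mirror of the relevant control-page field only. */
struct demo_ctrl_page {
	uint64_t irq_count;	/* incremented by ft_irq_fired() in the kernel */
};

/* In real code this would point at the memory-mapped control page;
 * here it is just a local dummy so the sketch compiles and runs. */
static struct demo_ctrl_page dummy_page;
static volatile struct demo_ctrl_page *ctrl_page = &dummy_page;

static void measured_section(void)
{
	/* ... code whose execution time is being measured ... */
}

int main(void)
{
	uint64_t before = ctrl_page->irq_count;
	uint64_t after;

	measured_section();
	after = ctrl_page->irq_count;

	if (after != before)
		printf("sample disturbed by %llu interrupt(s), discard it\n",
		       (unsigned long long)(after - before));
	else
		printf("sample is interrupt-free\n");
	return 0;
}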
21static inline void clear_irq_fired(void) 30static inline void clear_irq_fired(void)
22{ 31{
23 atomic_set(&__raw_get_cpu_var(irq_fired_count), 0); 32 atomic_set(&__raw_get_cpu_var(irq_fired_count), 0);
@@ -34,77 +43,119 @@ static inline unsigned int get_and_clear_irq_fired(void)
34 return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0); 43 return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0);
35} 44}
36 45
37static inline void __save_irq_flags(struct timestamp *ts) 46static inline void save_irq_flags(struct timestamp *ts, unsigned int irq_count)
38{ 47{
39 unsigned int irq_count;
40
41 irq_count = get_and_clear_irq_fired();
42 /* Store how many interrupts occurred. */ 48 /* Store how many interrupts occurred. */
43 ts->irq_count = irq_count; 49 ts->irq_count = irq_count;
44 /* Extra flag because ts->irq_count overflows quickly. */ 50 /* Extra flag because ts->irq_count overflows quickly. */
45 ts->irq_flag = irq_count > 0; 51 ts->irq_flag = irq_count > 0;
52
46} 53}
47 54
48static inline void __save_timestamp_cpu(unsigned long event, 55static inline void write_timestamp(uint8_t event,
49 uint8_t type, uint8_t cpu) 56 uint8_t type,
57 uint8_t cpu,
58 uint16_t pid_fragment,
59 unsigned int irq_count,
60 int record_irq,
61 int hide_irq,
62 uint64_t timestamp,
63 int record_timestamp)
50{ 64{
65 unsigned long flags;
51 unsigned int seq_no; 66 unsigned int seq_no;
52 struct timestamp *ts; 67 struct timestamp *ts;
68
69 /* Avoid preemptions while recording the timestamp. This reduces the
70 * number of "out of order" timestamps in the stream and makes
71 * post-processing easier. */
72
73 local_irq_save(flags);
74
53 seq_no = fetch_and_inc((int *) &ts_seq_no); 75 seq_no = fetch_and_inc((int *) &ts_seq_no);
54 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { 76 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
55 ts->event = event; 77 ts->event = event;
56 ts->seq_no = seq_no; 78 ts->seq_no = seq_no;
57 ts->cpu = cpu; 79
58 ts->task_type = type; 80 ts->task_type = type;
59 __save_irq_flags(ts); 81 ts->pid = pid_fragment;
60 barrier(); 82
61 /* prevent re-ordering of ft_timestamp() */ 83 ts->cpu = cpu;
62 ts->timestamp = ft_timestamp(); 84
85 if (record_irq)
86 irq_count = get_and_clear_irq_fired();
87
88 save_irq_flags(ts, irq_count - hide_irq);
89
90 if (record_timestamp)
91 timestamp = ft_timestamp();
92
93 ts->timestamp = timestamp;
63 ft_buffer_finish_write(trace_ts_buf, ts); 94 ft_buffer_finish_write(trace_ts_buf, ts);
64 } 95 }
96
97 local_irq_restore(flags);
65} 98}
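write_timestamp() folds all previous helpers into one function whose flag parameters select the data sources: record_irq decides between sampling and clearing the per-CPU interrupt counter or using a caller-supplied count, hide_irq subtracts one expected interrupt, and record_timestamp decides between reading ft_timestamp() and storing a caller-supplied value. A user-space restatement of that dispatch with stubbed data sources; all demo_* names are invented.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel-side sources of truth. */
static unsigned int demo_get_and_clear_irq_fired(void) { return 3; }
static uint64_t demo_ft_timestamp(void) { return 123456789ULL; }

/* Mirrors the parameter convention of write_timestamp(): callers either
 * ask the function to sample the IRQ count / cycle counter itself
 * (record_irq / record_timestamp set) or pass a pre-recorded value. */
static void demo_write_timestamp(unsigned int irq_count, int record_irq,
				 int hide_irq,
				 uint64_t timestamp, int record_timestamp)
{
	if (record_irq)
		irq_count = demo_get_and_clear_irq_fired();
	/* hide_irq removes one interrupt that the caller expected,
	 * e.g. the reschedule IPI around TS_SEND_RESCHED_END. */
	irq_count -= hide_irq;

	if (record_timestamp)
		timestamp = demo_ft_timestamp();

	printf("stored irq_count=%u irq_flag=%d timestamp=%llu\n",
	       irq_count, irq_count > 0, (unsigned long long)timestamp);
}

int main(void)
{
	/* save_timestamp()-style call: sample both in the kernel. */
	demo_write_timestamp(0, 1, 0, 0, 1);
	/* save_timestamp_time()-style call: user-supplied time value. */
	demo_write_timestamp(0, 1, 0, 42ULL, 0);
	return 0;
}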
66 99
67static void __add_timestamp_user(struct timestamp *pre_recorded) 100static void __add_timestamp_user(struct timestamp *pre_recorded)
68{ 101{
102 unsigned long flags;
69 unsigned int seq_no; 103 unsigned int seq_no;
70 struct timestamp *ts; 104 struct timestamp *ts;
105
106
107 local_irq_save(flags);
108
71 seq_no = fetch_and_inc((int *) &ts_seq_no); 109 seq_no = fetch_and_inc((int *) &ts_seq_no);
72 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { 110 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
73 *ts = *pre_recorded; 111 *ts = *pre_recorded;
74 ts->seq_no = seq_no; 112 ts->seq_no = seq_no;
75 __save_irq_flags(ts); 113 ts->cpu = raw_smp_processor_id();
114 save_irq_flags(ts, get_and_clear_irq_fired());
76 ft_buffer_finish_write(trace_ts_buf, ts); 115 ft_buffer_finish_write(trace_ts_buf, ts);
77 } 116 }
78}
79 117
80static inline void __save_timestamp(unsigned long event, 118 local_irq_restore(flags);
81 uint8_t type)
82{
83 __save_timestamp_cpu(event, type, raw_smp_processor_id());
84} 119}
85 120
86feather_callback void save_timestamp(unsigned long event) 121feather_callback void save_timestamp(unsigned long event)
87{ 122{
88 __save_timestamp(event, TSK_UNKNOWN); 123 write_timestamp(event, TSK_UNKNOWN,
124 raw_smp_processor_id(),
125 current->pid,
126 0, 1, 0,
127 0, 1);
89} 128}
90 129
91feather_callback void save_timestamp_def(unsigned long event, 130feather_callback void save_timestamp_def(unsigned long event,
92 unsigned long type) 131 unsigned long type)
93{ 132{
94 __save_timestamp(event, (uint8_t) type); 133 write_timestamp(event, type,
134 raw_smp_processor_id(),
135 current->pid,
136 0, 1, 0,
137 0, 1);
95} 138}
96 139
97feather_callback void save_timestamp_task(unsigned long event, 140feather_callback void save_timestamp_task(unsigned long event,
98 unsigned long t_ptr) 141 unsigned long t_ptr)
99{ 142{
100 int rt = is_realtime((struct task_struct *) t_ptr); 143 struct task_struct *t = (struct task_struct *) t_ptr;
101 __save_timestamp(event, rt ? TSK_RT : TSK_BE); 144 int rt = is_realtime(t);
145
146 write_timestamp(event, rt ? TSK_RT : TSK_BE,
147 raw_smp_processor_id(),
148 t->pid,
149 0, 1, 0,
150 0, 1);
102} 151}
103 152
104feather_callback void save_timestamp_cpu(unsigned long event, 153feather_callback void save_timestamp_cpu(unsigned long event,
105 unsigned long cpu) 154 unsigned long cpu)
106{ 155{
107 __save_timestamp_cpu(event, TSK_UNKNOWN, cpu); 156 write_timestamp(event, TSK_UNKNOWN, cpu, current->pid,
157 0, 1, 0,
158 0, 1);
108} 159}
109 160
110feather_callback void save_task_latency(unsigned long event, 161feather_callback void save_task_latency(unsigned long event,
@@ -112,20 +163,44 @@ feather_callback void save_task_latency(unsigned long event,
112{ 163{
113 lt_t now = litmus_clock(); 164 lt_t now = litmus_clock();
114 lt_t *when = (lt_t*) when_ptr; 165 lt_t *when = (lt_t*) when_ptr;
115 unsigned int seq_no;
116 int cpu = raw_smp_processor_id();
117 struct timestamp *ts;
118 166
119 seq_no = fetch_and_inc((int *) &ts_seq_no); 167 write_timestamp(event, TSK_RT, raw_smp_processor_id(), 0,
120 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { 168 0, 1, 0,
121 ts->event = event; 169 now - *when, 0);
122 ts->timestamp = now - *when; 170}
123 ts->seq_no = seq_no; 171
124 ts->cpu = cpu; 172/* fake timestamp to user-reported time */
125 ts->task_type = TSK_RT; 173feather_callback void save_timestamp_time(unsigned long event,
126 __save_irq_flags(ts); 174 unsigned long ptr)
127 ft_buffer_finish_write(trace_ts_buf, ts); 175{
128 } 176 uint64_t* time = (uint64_t*) ptr;
177
178 write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
179 raw_smp_processor_id(), current->pid,
180 0, 1, 0,
181 *time, 0);
182}
183
184/* Record user-reported IRQ count */
185feather_callback void save_timestamp_irq(unsigned long event,
186 unsigned long irq_counter_ptr)
187{
188 uint64_t* irqs = (uint64_t*) irq_counter_ptr;
189
190 write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
191 raw_smp_processor_id(), current->pid,
192 *irqs, 0, 0,
193 0, 1);
194}
195
196/* Suppress one IRQ from the irq count. Used by TS_SEND_RESCHED_END, which is
197 * called from within an interrupt that is expected. */
198feather_callback void save_timestamp_hide_irq(unsigned long event)
199{
200 write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
201 raw_smp_processor_id(), current->pid,
202 0, 1, 1,
203 0, 1);
129} 204}
130 205
131/******************************************************************************/ 206/******************************************************************************/