author     Nathan O <otternes@cs.unc.edu>    2019-12-09 14:59:56 -0500
committer  Nathan O <otternes@cs.unc.edu>    2019-12-09 14:59:56 -0500
commit     3c4abebc788e9d92d776d7bc8b778f398cdb4010 (patch)
tree       7392a57bb2d5e0e61cd3a03bae0e8ce79991f6d5
parent     2627f203874e04500ea80f6e588cd659bec5866b (diff)
Initial attempt to "connect the wires"
- This is my first attempt to re-add, on top of this version of the Linux
  kernel, all of the modifications that were present in the previous
  version of LITMUS.
- More notes on the changes will follow after testing; no guarantees that
  the code as it now stands will compile or run correctly.
32 files changed, 1413 insertions, 40 deletions
diff --git a/Makefile b/Makefile --- a/Makefile +++ b/Makefile | |||
@@ -1011,6 +1011,7 @@ export MODORDER := $(extmod-prefix)modules.order | |||
1011 | 1011 | ||
1012 | ifeq ($(KBUILD_EXTMOD),) | 1012 | ifeq ($(KBUILD_EXTMOD),) |
1013 | core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ | 1013 | core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ |
1014 | core-y += litmus/ | ||
1014 | 1015 | ||
1015 | vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ | 1016 | vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ |
1016 | $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ | 1017 | $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ |
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8a50efb559f3..3aaa81a3ae70 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig | |||
@@ -2094,3 +2094,12 @@ source "arch/arm/crypto/Kconfig" | |||
2094 | endif | 2094 | endif |
2095 | 2095 | ||
2096 | source "arch/arm/kvm/Kconfig" | 2096 | source "arch/arm/kvm/Kconfig" |
2097 | |||
2098 | config ARCH_HAS_FEATHER_TRACE | ||
2099 | def_bool n | ||
2100 | |||
2101 | config ARCH_CALLS_IRQ_ENTER_ON_RESCHED_IPI | ||
2102 | def_bool n | ||
2103 | |||
2104 | source "litmus/Kconfig" | ||
2105 | |||
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3f047afb982c..a6bf629e708c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig | |||
@@ -1680,3 +1680,12 @@ source "arch/arm64/kvm/Kconfig" | |||
1680 | if CRYPTO | 1680 | if CRYPTO |
1681 | source "arch/arm64/crypto/Kconfig" | 1681 | source "arch/arm64/crypto/Kconfig" |
1682 | endif | 1682 | endif |
1683 | |||
1684 | config ARCH_HAS_FEATHER_TRACE | ||
1685 | def_bool n | ||
1686 | |||
1687 | config ARCH_CALLS_IRQ_ENTER_ON_RESCHED_IPI | ||
1688 | def_bool n | ||
1689 | |||
1690 | source "litmus/Kconfig" | ||
1691 | |||
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ef85139553f..3765164809c5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -2972,3 +2972,12 @@ config X86_DEV_DMA_OPS | |||
2972 | source "drivers/firmware/Kconfig" | 2972 | source "drivers/firmware/Kconfig" |
2973 | 2973 | ||
2974 | source "arch/x86/kvm/Kconfig" | 2974 | source "arch/x86/kvm/Kconfig" |
2975 | |||
2976 | config ARCH_HAS_FEATHER_TRACE | ||
2977 | def_bool y | ||
2978 | |||
2979 | config ARCH_CALLS_IRQ_ENTER_ON_RESCHED_IPI | ||
2980 | def_bool y | ||
2981 | |||
2982 | source "litmus/Kconfig" | ||
2983 | |||
diff --git a/arch/x86/include/asm/feather_trace.h b/arch/x86/include/asm/feather_trace.h new file mode 100644 index 000000000000..4e732d4ea508 --- /dev/null +++ b/arch/x86/include/asm/feather_trace.h | |||
@@ -0,0 +1,18 @@ | |||
1 | #ifndef _ARCH_FEATHER_TRACE_H | ||
2 | #define _ARCH_FEATHER_TRACE_H | ||
3 | |||
4 | #include <asm/msr.h> | ||
5 | #include <asm/timex.h> | ||
6 | |||
7 | static inline unsigned long long ft_timestamp(void) | ||
8 | { | ||
9 | return get_cycles(); | ||
10 | } | ||
11 | |||
12 | #ifdef CONFIG_X86_32 | ||
13 | #include "feather_trace_32.h" | ||
14 | #else | ||
15 | #include "feather_trace_64.h" | ||
16 | #endif | ||
17 | |||
18 | #endif | ||
diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h new file mode 100644 index 000000000000..75e81a9f9382 --- /dev/null +++ b/arch/x86/include/asm/feather_trace_32.h | |||
@@ -0,0 +1,115 @@ | |||
1 | /* Copyright (c) 2007-2012 Björn Brandenburg, <bbb@mpi-sws.org> | ||
2 | * | ||
3 | * Permission is hereby granted, free of charge, to any person obtaining | ||
4 | * a copy of this software and associated documentation files (the | ||
5 | * "Software"), to deal in the Software without restriction, including | ||
6 | * without limitation the rights to use, copy, modify, merge, publish, | ||
7 | * distribute, sublicense, and/or sell copies of the Software, and to | ||
8 | * permit persons to whom the Software is furnished to do so, subject to | ||
9 | * the following conditions: | ||
10 | * | ||
11 | * The above copyright notice and this permission notice shall be | ||
12 | * included in all copies or substantial portions of the Software. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
15 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
16 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
17 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
18 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
19 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
20 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
21 | * SOFTWARE. | ||
22 | */ | ||
23 | |||
24 | /* Do not directly include this file. Include feather_trace.h instead */ | ||
25 | |||
26 | #define feather_callback __attribute__((regparm(3))) __attribute__((used)) | ||
27 | |||
28 | /* | ||
29 | * Make the compiler reload any register that is not saved in a cdecl function | ||
30 | * call (minus the registers that we explicitly clobber as output registers). | ||
31 | */ | ||
32 | #define __FT_CLOBBER_LIST0 "memory", "cc", "eax", "edx", "ecx" | ||
33 | #define __FT_CLOBBER_LIST1 "memory", "cc", "eax", "ecx" | ||
34 | #define __FT_CLOBBER_LIST2 "memory", "cc", "eax" | ||
35 | #define __FT_CLOBBER_LIST3 "memory", "cc", "eax" | ||
36 | |||
37 | #define __FT_TMP1(x) "=d" (x) | ||
38 | #define __FT_ARG1(x) "0" ((long) (x)) | ||
39 | #define __FT_TMP2(x) "=c" (x) | ||
40 | #define __FT_ARG2(x) "1" ((long) (x)) | ||
41 | |||
42 | #define __FT_ARG3(x) "r" ((long) (x)) | ||
43 | |||
44 | #define ft_event(id, callback) \ | ||
45 | __asm__ __volatile__( \ | ||
46 | "1: jmp 2f \n\t" \ | ||
47 | " call " #callback " \n\t" \ | ||
48 | ".section __event_table, \"aw\" \n\t" \ | ||
49 | ".long " #id ", 0, 1b, 2f \n\t" \ | ||
50 | ".previous \n\t" \ | ||
51 | "2: \n\t" \ | ||
52 | : : : __FT_CLOBBER_LIST0) | ||
53 | |||
54 | #define ft_event0(id, callback) \ | ||
55 | __asm__ __volatile__( \ | ||
56 | "1: jmp 2f \n\t" \ | ||
57 | " movl $" #id ", %%eax \n\t" \ | ||
58 | " call " #callback " \n\t" \ | ||
59 | ".section __event_table, \"aw\" \n\t" \ | ||
60 | ".long " #id ", 0, 1b, 2f \n\t" \ | ||
61 | ".previous \n\t" \ | ||
62 | "2: \n\t" \ | ||
63 | : : : __FT_CLOBBER_LIST0) | ||
64 | |||
65 | #define ft_event1(id, callback, param) \ | ||
66 | do { \ | ||
67 | long __ft_tmp1; \ | ||
68 | __asm__ __volatile__( \ | ||
69 | "1: jmp 2f \n\t" \ | ||
70 | " movl $" #id ", %%eax \n\t" \ | ||
71 | " call " #callback " \n\t" \ | ||
72 | ".section __event_table, \"aw\" \n\t" \ | ||
73 | ".long " #id ", 0, 1b, 2f \n\t" \ | ||
74 | ".previous \n\t" \ | ||
75 | "2: \n\t" \ | ||
76 | : __FT_TMP1(__ft_tmp1) \ | ||
77 | : __FT_ARG1(param) \ | ||
78 | : __FT_CLOBBER_LIST1); \ | ||
79 | } while (0); | ||
80 | |||
81 | #define ft_event2(id, callback, param, param2) \ | ||
82 | do { \ | ||
83 | long __ft_tmp1, __ft_tmp2; \ | ||
84 | __asm__ __volatile__( \ | ||
85 | "1: jmp 2f \n\t" \ | ||
86 | " movl $" #id ", %%eax \n\t" \ | ||
87 | " call " #callback " \n\t" \ | ||
88 | ".section __event_table, \"aw\" \n\t" \ | ||
89 | ".long " #id ", 0, 1b, 2f \n\t" \ | ||
90 | ".previous \n\t" \ | ||
91 | "2: \n\t" \ | ||
92 | : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \ | ||
93 | : __FT_ARG1(param), __FT_ARG2(param2) \ | ||
94 | : __FT_CLOBBER_LIST2); \ | ||
95 | } while (0); | ||
96 | |||
97 | |||
98 | #define ft_event3(id, callback, param, param2, param3) \ | ||
99 | do { \ | ||
100 | long __ft_tmp1, __ft_tmp2; \ | ||
101 | __asm__ __volatile__( \ | ||
102 | "1: jmp 2f \n\t" \ | ||
103 | " subl $4, %%esp \n\t" \ | ||
104 | " movl $" #id ", %%eax \n\t" \ | ||
105 | " movl %2, (%%esp) \n\t" \ | ||
106 | " call " #callback " \n\t" \ | ||
107 | " addl $4, %%esp \n\t" \ | ||
108 | ".section __event_table, \"aw\" \n\t" \ | ||
109 | ".long " #id ", 0, 1b, 2f \n\t" \ | ||
110 | ".previous \n\t" \ | ||
111 | "2: \n\t" \ | ||
112 | : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \ | ||
113 | : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3) \ | ||
114 | : __FT_CLOBBER_LIST3); \ | ||
115 | } while (0); | ||
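For orientation, here is a minimal usage sketch of the macros above. The event id, callback, and caller names are made up for illustration, and it assumes the generic <litmus/feather_trace.h> header (included by ft_event.c later in this patch) pulls in this architecture header. ft_event1() embeds a patchable short jump followed by a call to the named callback; under regparm(3), the id loaded into %eax arrives as the callback's first argument and the parameter pinned to %edx as the second.

    #include <litmus/feather_trace.h>

    /* Hypothetical callback for an ft_event1() site. feather_callback marks it
     * __attribute__((used)) because the macro calls it by name from inline asm. */
    feather_callback void save_timestamp(unsigned long id, unsigned long param)
    {
            unsigned long long now = ft_timestamp();
            (void) now; /* a real callback would buffer (id, param, now) somewhere */
    }

    void traced_path(void)
    {
            /* Event id 100 is made up for illustration. */
            ft_event1(100, save_timestamp, 42UL);
    }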
diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h new file mode 100644 index 000000000000..5ce49e2eebba --- /dev/null +++ b/arch/x86/include/asm/feather_trace_64.h | |||
@@ -0,0 +1,124 @@ | |||
1 | /* Copyright (c) 2010 Andrea Bastoni, <bastoni@cs.unc.edu> | ||
2 | * Copyright (c) 2012 Björn Brandenburg, <bbb@mpi-sws.org> | ||
3 | * | ||
4 | * Permission is hereby granted, free of charge, to any person obtaining | ||
5 | * a copy of this software and associated documentation files (the | ||
6 | * "Software"), to deal in the Software without restriction, including | ||
7 | * without limitation the rights to use, copy, modify, merge, publish, | ||
8 | * distribute, sublicense, and/or sell copies of the Software, and to | ||
9 | * permit persons to whom the Software is furnished to do so, subject to | ||
10 | * the following conditions: | ||
11 | * | ||
12 | * The above copyright notice and this permission notice shall be | ||
13 | * included in all copies or substantial portions of the Software. | ||
14 | * | ||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
18 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
19 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
20 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
21 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
22 | * SOFTWARE. | ||
23 | */ | ||
24 | |||
25 | /* Do not directly include this file. Include feather_trace.h instead */ | ||
26 | |||
27 | /* regparm is the default on x86_64 */ | ||
28 | #define feather_callback __attribute__((used)) | ||
29 | |||
30 | #define __FT_EVENT_TABLE(id,from,to) \ | ||
31 | ".section __event_table, \"aw\"\n\t" \ | ||
32 | ".balign 8\n\t" \ | ||
33 | ".quad " #id ", 0, " #from ", " #to " \n\t" \ | ||
34 | ".previous \n\t" | ||
35 | |||
36 | /* | ||
37 | * x86_64 caller only owns rbp, rbx, r12-r15; | ||
38 | * the callee can freely modify the others. | ||
39 | */ | ||
40 | #define __FT_CLOBBER_LIST0 "memory", "cc", "rdi", "rsi", "rdx", "rcx", \ | ||
41 | "r8", "r9", "r10", "r11", "rax" | ||
42 | |||
43 | #define __FT_CLOBBER_LIST1 "memory", "cc", "rdi", "rdx", "rcx", \ | ||
44 | "r8", "r9", "r10", "r11", "rax" | ||
45 | |||
46 | #define __FT_CLOBBER_LIST2 "memory", "cc", "rdi", "rcx", \ | ||
47 | "r8", "r9", "r10", "r11", "rax" | ||
48 | |||
49 | #define __FT_CLOBBER_LIST3 "memory", "cc", "rdi", \ | ||
50 | "r8", "r9", "r10", "r11", "rax" | ||
51 | |||
52 | /* The registers RDI, RSI, RDX, RCX, R8 and R9 are used for integer and pointer | ||
53 | * arguments. */ | ||
54 | |||
55 | /* RSI */ | ||
56 | #define __FT_TMP1(x) "=S" (x) | ||
57 | #define __FT_ARG1(x) "0" ((long) (x)) | ||
58 | |||
59 | /* RDX */ | ||
60 | #define __FT_TMP2(x) "=d" (x) | ||
61 | #define __FT_ARG2(x) "1" ((long) (x)) | ||
62 | |||
63 | /* RCX */ | ||
64 | #define __FT_TMP3(x) "=c" (x) | ||
65 | #define __FT_ARG3(x) "2" ((long) (x)) | ||
66 | |||
67 | #define ft_event(id, callback) \ | ||
68 | __asm__ __volatile__( \ | ||
69 | "1: jmp 2f \n\t" \ | ||
70 | " call " #callback " \n\t" \ | ||
71 | __FT_EVENT_TABLE(id,1b,2f) \ | ||
72 | "2: \n\t" \ | ||
73 | : : : __FT_CLOBBER_LIST0) | ||
74 | |||
75 | #define ft_event0(id, callback) \ | ||
76 | __asm__ __volatile__( \ | ||
77 | "1: jmp 2f \n\t" \ | ||
78 | " movq $" #id ", %%rdi \n\t" \ | ||
79 | " call " #callback " \n\t" \ | ||
80 | __FT_EVENT_TABLE(id,1b,2f) \ | ||
81 | "2: \n\t" \ | ||
82 | : : : __FT_CLOBBER_LIST0) | ||
83 | |||
84 | #define ft_event1(id, callback, param) \ | ||
85 | do { \ | ||
86 | long __ft_tmp1; \ | ||
87 | __asm__ __volatile__( \ | ||
88 | "1: jmp 2f \n\t" \ | ||
89 | " movq $" #id ", %%rdi \n\t" \ | ||
90 | " call " #callback " \n\t" \ | ||
91 | __FT_EVENT_TABLE(id,1b,2f) \ | ||
92 | "2: \n\t" \ | ||
93 | : __FT_TMP1(__ft_tmp1) \ | ||
94 | : __FT_ARG1(param) \ | ||
95 | : __FT_CLOBBER_LIST1); \ | ||
96 | } while (0); | ||
97 | |||
98 | #define ft_event2(id, callback, param, param2) \ | ||
99 | do { \ | ||
100 | long __ft_tmp1, __ft_tmp2; \ | ||
101 | __asm__ __volatile__( \ | ||
102 | "1: jmp 2f \n\t" \ | ||
103 | " movq $" #id ", %%rdi \n\t" \ | ||
104 | " call " #callback " \n\t" \ | ||
105 | __FT_EVENT_TABLE(id,1b,2f) \ | ||
106 | "2: \n\t" \ | ||
107 | : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \ | ||
108 | : __FT_ARG1(param), __FT_ARG2(param2) \ | ||
109 | : __FT_CLOBBER_LIST2); \ | ||
110 | } while (0); | ||
111 | |||
112 | #define ft_event3(id, callback, param, param2, param3) \ | ||
113 | do { \ | ||
114 | long __ft_tmp1, __ft_tmp2, __ft_tmp3; \ | ||
115 | __asm__ __volatile__( \ | ||
116 | "1: jmp 2f \n\t" \ | ||
117 | " movq $" #id ", %%rdi \n\t" \ | ||
118 | " call " #callback " \n\t" \ | ||
119 | __FT_EVENT_TABLE(id,1b,2f) \ | ||
120 | "2: \n\t" \ | ||
121 | : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2), __FT_TMP3(__ft_tmp3) \ | ||
122 | : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3) \ | ||
123 | : __FT_CLOBBER_LIST3); \ | ||
124 | } while (0); | ||
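The 64-bit variant routes everything through the System V integer-argument registers instead: the macros load the event id into %rdi and pin param, param2, and param3 to %rsi, %rdx, and %rcx, so they arrive as the callback's first through fourth arguments. A hypothetical ft_event2() callback (names illustrative, not part of this patch) would therefore be declared as:

    /* Hypothetical x86_64 callback for an ft_event2() site: the id lands in
     * %rdi and the two parameters in %rsi and %rdx, matching the constraint
     * macros above. */
    feather_callback void save_cpu_and_pid(unsigned long id, unsigned long cpu,
                                            unsigned long pid)
    {
            /* a real callback would record (id, cpu, pid, ft_timestamp()) */
    }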
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3578ad248bc9..5ee68d48e0a4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -140,6 +140,8 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o | |||
140 | obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o | 140 | obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o |
141 | obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o | 141 | obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o |
142 | 142 | ||
143 | obj-$(CONFIG_FEATHER_TRACE) += ft_event.o | ||
144 | |||
143 | ### | 145 | ### |
144 | # 64 bit specific files | 146 | # 64 bit specific files |
145 | ifeq ($(CONFIG_X86_64),y) | 147 | ifeq ($(CONFIG_X86_64),y) |
diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c new file mode 100644 index 000000000000..7aa3d0592ff2 --- /dev/null +++ b/arch/x86/kernel/ft_event.c | |||
@@ -0,0 +1,170 @@ | |||
1 | #include <linux/types.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <asm/cacheflush.h> | ||
4 | #include <asm/sections.h> | ||
5 | |||
6 | #include <litmus/feather_trace.h> | ||
7 | |||
8 | /* the feather trace management functions assume | ||
9 | * exclusive access to the event table | ||
10 | */ | ||
11 | |||
12 | #ifndef CONFIG_RELOCATABLE | ||
13 | |||
14 | #define BYTE_JUMP 0xeb | ||
15 | #define BYTE_JUMP_LEN 0x02 | ||
16 | |||
17 | /* for each event, there is an entry in the event table */ | ||
18 | struct trace_event { | ||
19 | long id; | ||
20 | long count; | ||
21 | long start_addr; | ||
22 | long end_addr; | ||
23 | }; | ||
24 | |||
25 | extern struct trace_event __start___event_table[]; | ||
26 | extern struct trace_event __stop___event_table[]; | ||
27 | |||
28 | |||
29 | /* NOTE: The following two functions have been stolen from ftrace.c */ | ||
30 | |||
31 | static inline int | ||
32 | within(unsigned long addr, unsigned long start, unsigned long end) | ||
33 | { | ||
34 | return addr >= start && addr < end; | ||
35 | } | ||
36 | |||
37 | static unsigned long text_ip_addr(unsigned long ip) | ||
38 | { | ||
39 | /* | ||
40 | * On x86_64, kernel text mappings are mapped read-only, so we use | ||
41 | * the kernel identity mapping instead of the kernel text mapping | ||
42 | * to modify the kernel text. | ||
43 | * | ||
44 | * For 32bit kernels, these mappings are the same and we can use | ||
45 | * kernel identity mapping to modify code. | ||
46 | */ | ||
47 | if (within(ip, (unsigned long)_text, (unsigned long)_etext)) | ||
48 | ip = (unsigned long)__va(__pa_symbol(ip)); | ||
49 | |||
50 | return ip; | ||
51 | } | ||
52 | |||
53 | /* Workaround: if no events are defined, then the event_table section does not | ||
54 | * exist and the above references cause linker errors. This could probably be | ||
55 | * fixed by adjusting the linker script, but it is easier to maintain for us if | ||
56 | * we simply create a dummy symbol in the event table section. | ||
57 | */ | ||
58 | int __event_table_dummy[0] __attribute__ ((section("__event_table"))); | ||
59 | |||
60 | int ft_enable_event(unsigned long id) | ||
61 | { | ||
62 | struct trace_event* te = __start___event_table; | ||
63 | int count = 0; | ||
64 | unsigned char* delta; | ||
65 | unsigned char* instr; | ||
66 | |||
67 | set_kernel_text_rw(); | ||
68 | set_all_modules_text_rw(); | ||
69 | |||
70 | while (te < __stop___event_table) { | ||
71 | if (te->id == id && ++te->count == 1) { | ||
72 | instr = (unsigned char*) te->start_addr; | ||
73 | /* make sure we don't clobber something wrong */ | ||
74 | if (*instr == BYTE_JUMP) { | ||
75 | delta = (unsigned char*) text_ip_addr( | ||
76 | ((unsigned long) te->start_addr) | ||
77 | + 1); | ||
78 | *delta = 0; | ||
79 | } | ||
80 | } | ||
81 | if (te->id == id) | ||
82 | count++; | ||
83 | te++; | ||
84 | } | ||
85 | |||
86 | set_all_modules_text_ro(); | ||
87 | set_kernel_text_ro(); | ||
88 | |||
89 | printk(KERN_DEBUG "ft_enable_event: enabled %d events\n", count); | ||
90 | return count; | ||
91 | } | ||
92 | |||
93 | int ft_disable_event(unsigned long id) | ||
94 | { | ||
95 | struct trace_event* te = __start___event_table; | ||
96 | int count = 0; | ||
97 | unsigned char* delta; | ||
98 | unsigned char* instr; | ||
99 | |||
100 | set_kernel_text_rw(); | ||
101 | set_all_modules_text_rw(); | ||
102 | |||
103 | while (te < __stop___event_table) { | ||
104 | if (te->id == id && --te->count == 0) { | ||
105 | instr = (unsigned char*) te->start_addr; | ||
106 | if (*instr == BYTE_JUMP) { | ||
107 | delta = (unsigned char*) text_ip_addr( | ||
108 | ((unsigned long) te->start_addr) | ||
109 | + 1); | ||
110 | *delta = te->end_addr - te->start_addr - | ||
111 | BYTE_JUMP_LEN; | ||
112 | } | ||
113 | } | ||
114 | if (te->id == id) | ||
115 | count++; | ||
116 | te++; | ||
117 | } | ||
118 | |||
119 | set_all_modules_text_ro(); | ||
120 | set_kernel_text_ro(); | ||
121 | |||
122 | printk(KERN_DEBUG "ft_disable_event: disabled %d events\n", count); | ||
123 | return count; | ||
124 | } | ||
125 | |||
126 | int ft_disable_all_events(void) | ||
127 | { | ||
128 | struct trace_event* te = __start___event_table; | ||
129 | int count = 0; | ||
130 | unsigned char* delta; | ||
131 | unsigned char* instr; | ||
132 | |||
133 | set_kernel_text_rw(); | ||
134 | set_all_modules_text_rw(); | ||
135 | |||
136 | while (te < __stop___event_table) { | ||
137 | if (te->count) { | ||
138 | instr = (unsigned char*) te->start_addr; | ||
139 | if (*instr == BYTE_JUMP) { | ||
140 | delta = (unsigned char*) text_ip_addr( | ||
141 | ((unsigned long) te->start_addr) | ||
142 | + 1); | ||
143 | *delta = te->end_addr - te->start_addr - | ||
144 | BYTE_JUMP_LEN; | ||
145 | te->count = 0; | ||
146 | count++; | ||
147 | } | ||
148 | } | ||
149 | te++; | ||
150 | } | ||
151 | |||
152 | set_all_modules_text_ro(); | ||
153 | set_kernel_text_ro(); | ||
154 | |||
155 | return count; | ||
156 | } | ||
157 | |||
158 | int ft_is_event_enabled(unsigned long id) | ||
159 | { | ||
160 | struct trace_event* te = __start___event_table; | ||
161 | |||
162 | while (te < __stop___event_table) { | ||
163 | if (te->id == id) | ||
164 | return te->count; | ||
165 | te++; | ||
166 | } | ||
167 | return 0; | ||
168 | } | ||
169 | |||
170 | #endif | ||
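Taken together with the headers above, the mechanism is: every ft_event site begins with a two-byte short jump (opcode 0xeb plus an 8-bit displacement) that normally skips the callback. ft_enable_event() zeroes the displacement so execution falls through into the call, and ft_disable_event() restores it to end_addr - start_addr - BYTE_JUMP_LEN. A hypothetical caller might drive this as follows (the id, function name, and error code are illustrative only):

    #include <linux/errno.h>
    #include <litmus/feather_trace.h>

    /* Hypothetical control path: patch every ft_event site registered for id
     * 100 so its callback runs, execute the workload, then restore the short
     * jumps so the callbacks are skipped again. */
    int run_with_event_100(void)
    {
            int sites = ft_enable_event(100);
            if (!sites)
                    return -ENODEV; /* no ft_event sites with this id compiled in */

            /* ... code containing ft_event*(100, ...) sites runs here ... */

            ft_disable_event(100);
            return 0;
    }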
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
@@ -64,6 +64,8 @@ | |||
64 | #include <linux/compat.h> | 64 | #include <linux/compat.h> |
65 | #include <linux/vmalloc.h> | 65 | #include <linux/vmalloc.h> |
66 | 66 | ||
67 | #include <litmus/litmus.h> | ||
68 | |||
67 | #include <linux/uaccess.h> | 69 | #include <linux/uaccess.h> |
68 | #include <asm/mmu_context.h> | 70 | #include <asm/mmu_context.h> |
69 | #include <asm/tlb.h> | 71 | #include <asm/tlb.h> |
@@ -1765,6 +1767,7 @@ static int __do_execve_file(int fd, struct filename *filename, | |||
1765 | goto out_unmark; | 1767 | goto out_unmark; |
1766 | 1768 | ||
1767 | sched_exec(); | 1769 | sched_exec(); |
1770 | litmus_exec(); | ||
1768 | 1771 | ||
1769 | bprm->file = file; | 1772 | bprm->file = file; |
1770 | if (!filename) { | 1773 | if (!filename) { |
diff --git a/fs/inode.c b/fs/inode.c index fef457a42882..abf61717d9db 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -394,6 +394,8 @@ void inode_init_once(struct inode *inode) | |||
394 | INIT_LIST_HEAD(&inode->i_lru); | 394 | INIT_LIST_HEAD(&inode->i_lru); |
395 | __address_space_init_once(&inode->i_data); | 395 | __address_space_init_once(&inode->i_data); |
396 | i_size_ordered_init(inode); | 396 | i_size_ordered_init(inode); |
397 | INIT_LIST_HEAD(&inode->i_obj_list); | ||
398 | mutex_init(&inode->i_obj_mutex); | ||
397 | } | 399 | } |
398 | EXPORT_SYMBOL(inode_init_once); | 400 | EXPORT_SYMBOL(inode_init_once); |
399 | 401 | ||
diff --git a/fs/select.c b/fs/select.c index 53a0c149f528..7a3745f8d17f 100644 --- a/fs/select.c +++ b/fs/select.c | |||
@@ -32,6 +32,8 @@ | |||
32 | #include <net/busy_poll.h> | 32 | #include <net/busy_poll.h> |
33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
34 | 34 | ||
35 | #include <litmus/litmus.h> | ||
36 | |||
35 | #include <linux/uaccess.h> | 37 | #include <linux/uaccess.h> |
36 | 38 | ||
37 | 39 | ||
@@ -80,9 +82,9 @@ u64 select_estimate_accuracy(struct timespec64 *tv) | |||
80 | /* | 82 | /* |
81 | * Realtime tasks get a slack of 0 for obvious reasons. | 83 | * Realtime tasks get a slack of 0 for obvious reasons. |
82 | */ | 84 | */ |
83 | 85 | if (rt_task(current) || is_realtime(current)) { | |
84 | if (rt_task(current)) | ||
85 | return 0; | 86 | return 0; |
87 | } | ||
86 | 88 | ||
87 | ktime_get_ts64(&now); | 89 | ktime_get_ts64(&now); |
88 | now = timespec64_sub(*tv, now); | 90 | now = timespec64_sub(*tv, now); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index e0d909d35763..d65e17d3d302 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -734,6 +734,9 @@ struct inode { | |||
734 | struct fsverity_info *i_verity_info; | 734 | struct fsverity_info *i_verity_info; |
735 | #endif | 735 | #endif |
736 | 736 | ||
737 | struct list_head i_obj_list; | ||
738 | struct mutex i_obj_mutex; | ||
739 | |||
737 | void *i_private; /* fs or device private pointer */ | 740 | void *i_private; /* fs or device private pointer */ |
738 | } __randomize_layout; | 741 | } __randomize_layout; |
739 | 742 | ||
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index da0af631ded5..35271458e22b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/vtime.h> | 8 | #include <linux/vtime.h> |
9 | #include <asm/hardirq.h> | 9 | #include <asm/hardirq.h> |
10 | 10 | ||
11 | #include <litmus/trace_irq.h> | ||
11 | 12 | ||
12 | extern void synchronize_irq(unsigned int irq); | 13 | extern void synchronize_irq(unsigned int irq); |
13 | extern bool synchronize_hardirq(unsigned int irq); | 14 | extern bool synchronize_hardirq(unsigned int irq); |
@@ -38,6 +39,7 @@ extern void rcu_nmi_exit(void); | |||
38 | account_irq_enter_time(current); \ | 39 | account_irq_enter_time(current); \ |
39 | preempt_count_add(HARDIRQ_OFFSET); \ | 40 | preempt_count_add(HARDIRQ_OFFSET); \ |
40 | trace_hardirq_enter(); \ | 41 | trace_hardirq_enter(); \ |
42 | ft_irq_fired(); \ | ||
41 | } while (0) | 43 | } while (0) |
42 | 44 | ||
43 | /* | 45 | /* |
@@ -75,6 +77,7 @@ extern void irq_exit(void); | |||
75 | preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ | 77 | preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ |
76 | rcu_nmi_enter(); \ | 78 | rcu_nmi_enter(); \ |
77 | trace_hardirq_enter(); \ | 79 | trace_hardirq_enter(); \ |
80 | ft_irq_fired(); \ | ||
78 | } while (0) | 81 | } while (0) |
79 | 82 | ||
80 | #define nmi_exit() \ | 83 | #define nmi_exit() \ |
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1b9a51a1bccb..a145e140d532 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h | |||
@@ -123,6 +123,9 @@ struct hrtimer { | |||
123 | u8 is_rel; | 123 | u8 is_rel; |
124 | u8 is_soft; | 124 | u8 is_soft; |
125 | u8 is_hard; | 125 | u8 is_hard; |
126 | #if defined(CONFIG_REPORT_TIMER_LATENCY) || defined(CONFIG_SCHED_OVERHEAD_TRACE) | ||
127 | ktime_t when_added; | ||
128 | #endif | ||
126 | }; | 129 | }; |
127 | 130 | ||
128 | /** | 131 | /** |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 67a1d86981a9..0a1b09305248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -32,6 +32,9 @@ | |||
32 | #include <linux/posix-timers.h> | 32 | #include <linux/posix-timers.h> |
33 | #include <linux/rseq.h> | 33 | #include <linux/rseq.h> |
34 | 34 | ||
35 | #include <litmus/rt_param.h> | ||
36 | #include <litmus/preempt.h> | ||
37 | |||
35 | /* task_struct member predeclarations (sorted alphabetically): */ | 38 | /* task_struct member predeclarations (sorted alphabetically): */ |
36 | struct audit_context; | 39 | struct audit_context; |
37 | struct backing_dev_info; | 40 | struct backing_dev_info; |
@@ -61,6 +64,8 @@ struct signal_struct; | |||
61 | struct task_delay_info; | 64 | struct task_delay_info; |
62 | struct task_group; | 65 | struct task_group; |
63 | 66 | ||
67 | struct od_table_entry; | ||
68 | |||
64 | /* | 69 | /* |
65 | * Task state bitmask. NOTE! These bits are also | 70 | * Task state bitmask. NOTE! These bits are also |
66 | * encoded in fs/proc/array.c: get_task_state(). | 71 | * encoded in fs/proc/array.c: get_task_state(). |
@@ -1158,6 +1163,10 @@ struct task_struct { | |||
1158 | /* Start of a write-and-pause period: */ | 1163 | /* Start of a write-and-pause period: */ |
1159 | unsigned long dirty_paused_when; | 1164 | unsigned long dirty_paused_when; |
1160 | 1165 | ||
1166 | /* LITMUS RT parameters and state */ | ||
1167 | struct rt_param rt_param; | ||
1168 | struct od_table_entry *od_table; | ||
1169 | |||
1161 | #ifdef CONFIG_LATENCYTOP | 1170 | #ifdef CONFIG_LATENCYTOP |
1162 | int latency_record_count; | 1171 | int latency_record_count; |
1163 | struct latency_record latency_record[LT_SAVECOUNT]; | 1172 | struct latency_record latency_record[LT_SAVECOUNT]; |
@@ -1741,6 +1750,7 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) | |||
1741 | static inline void set_tsk_need_resched(struct task_struct *tsk) | 1750 | static inline void set_tsk_need_resched(struct task_struct *tsk) |
1742 | { | 1751 | { |
1743 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); | 1752 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); |
1753 | sched_state_will_schedule(tsk); | ||
1744 | } | 1754 | } |
1745 | 1755 | ||
1746 | static inline void clear_tsk_need_resched(struct task_struct *tsk) | 1756 | static inline void clear_tsk_need_resched(struct task_struct *tsk) |
diff --git a/include/trace/events/litmus.h b/include/trace/events/litmus.h new file mode 100644 index 000000000000..0fffcee02be0 --- /dev/null +++ b/include/trace/events/litmus.h | |||
@@ -0,0 +1,231 @@ | |||
1 | /* | ||
2 | * LITMUS^RT kernel style scheduling tracepoints | ||
3 | */ | ||
4 | #undef TRACE_SYSTEM | ||
5 | #define TRACE_SYSTEM litmus | ||
6 | |||
7 | #if !defined(_SCHED_TASK_TRACEPOINT_H) || defined(TRACE_HEADER_MULTI_READ) | ||
8 | #define _SCHED_TASK_TRACEPOINT_H | ||
9 | |||
10 | #include <linux/tracepoint.h> | ||
11 | |||
12 | #include <litmus/litmus.h> | ||
13 | #include <litmus/rt_param.h> | ||
14 | |||
15 | /* | ||
16 | * Tracing task admission | ||
17 | */ | ||
18 | TRACE_EVENT(litmus_task_param, | ||
19 | |||
20 | TP_PROTO(struct task_struct *t), | ||
21 | |||
22 | TP_ARGS(t), | ||
23 | |||
24 | TP_STRUCT__entry( | ||
25 | __field( pid_t, pid ) | ||
26 | __field( unsigned int, job ) | ||
27 | __field( lt_t, wcet ) | ||
28 | __field( lt_t, period ) | ||
29 | __field( lt_t, phase ) | ||
30 | __field( int, partition ) | ||
31 | ), | ||
32 | |||
33 | TP_fast_assign( | ||
34 | __entry->pid = t ? t->pid : 0; | ||
35 | __entry->job = t ? t->rt_param.job_params.job_no : 0; | ||
36 | __entry->wcet = get_exec_cost(t); | ||
37 | __entry->period = get_rt_period(t); | ||
38 | __entry->phase = get_rt_phase(t); | ||
39 | __entry->partition = get_partition(t); | ||
40 | ), | ||
41 | |||
42 | TP_printk("period(%d, %Lu).\nwcet(%d, %Lu).\n", | ||
43 | __entry->pid, __entry->period, | ||
44 | __entry->pid, __entry->wcet) | ||
45 | ); | ||
46 | |||
47 | /* | ||
48 | * Tracing jobs release | ||
49 | */ | ||
50 | TRACE_EVENT(litmus_task_release, | ||
51 | |||
52 | TP_PROTO(struct task_struct *t), | ||
53 | |||
54 | TP_ARGS(t), | ||
55 | |||
56 | TP_STRUCT__entry( | ||
57 | __field( pid_t, pid ) | ||
58 | __field( unsigned int, job ) | ||
59 | __field( lt_t, release ) | ||
60 | __field( lt_t, deadline ) | ||
61 | ), | ||
62 | |||
63 | TP_fast_assign( | ||
64 | __entry->pid = t ? t->pid : 0; | ||
65 | __entry->job = t ? t->rt_param.job_params.job_no : 0; | ||
66 | __entry->release = get_release(t); | ||
67 | __entry->deadline = get_deadline(t); | ||
68 | ), | ||
69 | |||
70 | TP_printk("release(job(%u, %u)): %Lu\ndeadline(job(%u, %u)): %Lu\n", | ||
71 | __entry->pid, __entry->job, __entry->release, | ||
72 | __entry->pid, __entry->job, __entry->deadline) | ||
73 | ); | ||
74 | |||
75 | /* | ||
76 | * Tracepoint for switching to new task | ||
77 | */ | ||
78 | TRACE_EVENT(litmus_switch_to, | ||
79 | |||
80 | TP_PROTO(struct task_struct *t), | ||
81 | |||
82 | TP_ARGS(t), | ||
83 | |||
84 | TP_STRUCT__entry( | ||
85 | __field( pid_t, pid ) | ||
86 | __field( unsigned int, job ) | ||
87 | __field( lt_t, when ) | ||
88 | __field( lt_t, exec_time ) | ||
89 | ), | ||
90 | |||
91 | TP_fast_assign( | ||
92 | __entry->pid = is_realtime(t) ? t->pid : 0; | ||
93 | __entry->job = is_realtime(t) ? t->rt_param.job_params.job_no : 0; | ||
94 | __entry->when = litmus_clock(); | ||
95 | __entry->exec_time = get_exec_time(t); | ||
96 | ), | ||
97 | |||
98 | TP_printk("switch_to(job(%u, %u)): %Lu (exec: %Lu)\n", | ||
99 | __entry->pid, __entry->job, | ||
100 | __entry->when, __entry->exec_time) | ||
101 | ); | ||
102 | |||
103 | /* | ||
104 | * Tracepoint for switching away previous task | ||
105 | */ | ||
106 | TRACE_EVENT(litmus_switch_away, | ||
107 | |||
108 | TP_PROTO(struct task_struct *t), | ||
109 | |||
110 | TP_ARGS(t), | ||
111 | |||
112 | TP_STRUCT__entry( | ||
113 | __field( pid_t, pid ) | ||
114 | __field( unsigned int, job ) | ||
115 | __field( lt_t, when ) | ||
116 | __field( lt_t, exec_time ) | ||
117 | ), | ||
118 | |||
119 | TP_fast_assign( | ||
120 | __entry->pid = is_realtime(t) ? t->pid : 0; | ||
121 | __entry->job = is_realtime(t) ? t->rt_param.job_params.job_no : 0; | ||
122 | __entry->when = litmus_clock(); | ||
123 | __entry->exec_time = get_exec_time(t); | ||
124 | ), | ||
125 | |||
126 | TP_printk("switch_away(job(%u, %u)): %Lu (exec: %Lu)\n", | ||
127 | __entry->pid, __entry->job, | ||
128 | __entry->when, __entry->exec_time) | ||
129 | ); | ||
130 | |||
131 | /* | ||
132 | * Tracing jobs completion | ||
133 | */ | ||
134 | TRACE_EVENT(litmus_task_completion, | ||
135 | |||
136 | TP_PROTO(struct task_struct *t, unsigned long forced), | ||
137 | |||
138 | TP_ARGS(t, forced), | ||
139 | |||
140 | TP_STRUCT__entry( | ||
141 | __field( pid_t, pid ) | ||
142 | __field( unsigned int, job ) | ||
143 | __field( lt_t, when ) | ||
144 | __field( unsigned long, forced ) | ||
145 | ), | ||
146 | |||
147 | TP_fast_assign( | ||
148 | __entry->pid = t ? t->pid : 0; | ||
149 | __entry->job = t ? t->rt_param.job_params.job_no : 0; | ||
150 | __entry->when = litmus_clock(); | ||
151 | __entry->forced = forced; | ||
152 | ), | ||
153 | |||
154 | TP_printk("completed(job(%u, %u)): %Lu (forced: %lu)\n", | ||
155 | __entry->pid, __entry->job, | ||
156 | __entry->when, __entry->forced) | ||
157 | ); | ||
158 | |||
159 | /* | ||
160 | * Trace blocking tasks. | ||
161 | */ | ||
162 | TRACE_EVENT(litmus_task_block, | ||
163 | |||
164 | TP_PROTO(struct task_struct *t), | ||
165 | |||
166 | TP_ARGS(t), | ||
167 | |||
168 | TP_STRUCT__entry( | ||
169 | __field( pid_t, pid ) | ||
170 | __field( lt_t, when ) | ||
171 | ), | ||
172 | |||
173 | TP_fast_assign( | ||
174 | __entry->pid = t ? t->pid : 0; | ||
175 | __entry->when = litmus_clock(); | ||
176 | ), | ||
177 | |||
178 | TP_printk("(%u) blocks: %Lu\n", __entry->pid, __entry->when) | ||
179 | ); | ||
180 | |||
181 | /* | ||
182 | * Tracing jobs resume | ||
183 | */ | ||
184 | TRACE_EVENT(litmus_task_resume, | ||
185 | |||
186 | TP_PROTO(struct task_struct *t), | ||
187 | |||
188 | TP_ARGS(t), | ||
189 | |||
190 | TP_STRUCT__entry( | ||
191 | __field( pid_t, pid ) | ||
192 | __field( unsigned int, job ) | ||
193 | __field( lt_t, when ) | ||
194 | ), | ||
195 | |||
196 | TP_fast_assign( | ||
197 | __entry->pid = t ? t->pid : 0; | ||
198 | __entry->job = t ? t->rt_param.job_params.job_no : 0; | ||
199 | __entry->when = litmus_clock(); | ||
200 | ), | ||
201 | |||
202 | TP_printk("resume(job(%u, %u)): %Lu\n", | ||
203 | __entry->pid, __entry->job, __entry->when) | ||
204 | ); | ||
205 | |||
206 | /* | ||
207 | * Trace synchronous release | ||
208 | */ | ||
209 | TRACE_EVENT(litmus_sys_release, | ||
210 | |||
211 | TP_PROTO(lt_t *start), | ||
212 | |||
213 | TP_ARGS(start), | ||
214 | |||
215 | TP_STRUCT__entry( | ||
216 | __field( lt_t, rel ) | ||
217 | __field( lt_t, when ) | ||
218 | ), | ||
219 | |||
220 | TP_fast_assign( | ||
221 | __entry->rel = *start; | ||
222 | __entry->when = litmus_clock(); | ||
223 | ), | ||
224 | |||
225 | TP_printk("SynRelease(%Lu) at %Lu\n", __entry->rel, __entry->when) | ||
226 | ); | ||
227 | |||
228 | #endif /* _SCHED_TASK_TRACEPOINT_H */ | ||
229 | |||
230 | /* Must stay outside the protection */ | ||
231 | #include <trace/define_trace.h> | ||
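Each TRACE_EVENT(litmus_*, ...) definition above is expanded by <trace/define_trace.h> into a trace_litmus_*() inline that call sites can invoke unconditionally; it compiles to (almost) nothing unless the tracepoint is enabled. A hypothetical call site, not part of this patch, could look like:

    #include <linux/sched.h>
    #include <trace/events/litmus.h>

    /* Hypothetical helper in a scheduling plugin: emit the litmus_task_release
     * tracepoint for a job that has just been released. */
    static inline void note_release(struct task_struct *t)
    {
            trace_litmus_task_release(t);
    }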
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 25b4fa00bad1..f6e838d97ff3 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h | |||
@@ -87,6 +87,7 @@ struct clone_args { | |||
87 | /* SCHED_ISO: reserved but not implemented yet */ | 87 | /* SCHED_ISO: reserved but not implemented yet */ |
88 | #define SCHED_IDLE 5 | 88 | #define SCHED_IDLE 5 |
89 | #define SCHED_DEADLINE 6 | 89 | #define SCHED_DEADLINE 6 |
90 | #define SCHED_LITMUS 7 | ||
90 | 91 | ||
91 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ | 92 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ |
92 | #define SCHED_RESET_ON_FORK 0x40000000 | 93 | #define SCHED_RESET_ON_FORK 0x40000000 |
diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..6832c614c663 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -69,6 +69,10 @@ | |||
69 | #include <asm/pgtable.h> | 69 | #include <asm/pgtable.h> |
70 | #include <asm/mmu_context.h> | 70 | #include <asm/mmu_context.h> |
71 | 71 | ||
72 | #include <litmus/litmus.h> | ||
73 | |||
74 | extern void exit_od_table(struct task_struct *t); | ||
75 | |||
72 | static void __unhash_process(struct task_struct *p, bool group_dead) | 76 | static void __unhash_process(struct task_struct *p, bool group_dead) |
73 | { | 77 | { |
74 | nr_threads--; | 78 | nr_threads--; |
@@ -727,6 +731,14 @@ void __noreturn do_exit(long code) | |||
727 | if (unlikely(!tsk->pid)) | 731 | if (unlikely(!tsk->pid)) |
728 | panic("Attempted to kill the idle task!"); | 732 | panic("Attempted to kill the idle task!"); |
729 | 733 | ||
734 | if (unlikely(is_realtime(tsk))) { | ||
735 | /* We would like the task to be polite and transition out of | ||
736 | * RT mode first. | ||
737 | */ | ||
738 | litmus_do_exit(tsk); | ||
739 | BUG_ON(is_realtime(tsk)); | ||
740 | } | ||
741 | |||
730 | /* | 742 | /* |
731 | * If do_exit is called because this processes oopsed, it's possible | 743 | * If do_exit is called because this processes oopsed, it's possible |
732 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before | 744 | * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before |
@@ -798,6 +810,8 @@ void __noreturn do_exit(long code) | |||
798 | tty_audit_exit(); | 810 | tty_audit_exit(); |
799 | audit_free(tsk); | 811 | audit_free(tsk); |
800 | 812 | ||
813 | exit_od_table(tsk); | ||
814 | |||
801 | tsk->exit_code = code; | 815 | tsk->exit_code = code; |
802 | taskstats_exit(tsk, group_dead); | 816 | taskstats_exit(tsk, group_dead); |
803 | 817 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 55af6931c6ec..220211ef8946 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -104,6 +104,9 @@ | |||
104 | 104 | ||
105 | #include <trace/events/sched.h> | 105 | #include <trace/events/sched.h> |
106 | 106 | ||
107 | #include <litmus/litmus.h> | ||
108 | #include <litmus/sched_plugin.h> | ||
109 | |||
107 | #define CREATE_TRACE_POINTS | 110 | #define CREATE_TRACE_POINTS |
108 | #include <trace/events/task.h> | 111 | #include <trace/events/task.h> |
109 | 112 | ||
@@ -740,6 +743,9 @@ void __put_task_struct(struct task_struct *tsk) | |||
740 | cgroup_free(tsk); | 743 | cgroup_free(tsk); |
741 | task_numa_free(tsk, true); | 744 | task_numa_free(tsk, true); |
742 | security_task_free(tsk); | 745 | security_task_free(tsk); |
746 | |||
747 | exit_litmus(tsk); | ||
748 | |||
743 | exit_creds(tsk); | 749 | exit_creds(tsk); |
744 | delayacct_tsk_free(tsk); | 750 | delayacct_tsk_free(tsk); |
745 | put_signal_struct(tsk->signal); | 751 | put_signal_struct(tsk->signal); |
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index eef04551eae7..9adb95795f83 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include "rwsem.h" | 31 | #include "rwsem.h" |
32 | #include "lock_events.h" | 32 | #include "lock_events.h" |
33 | 33 | ||
34 | #include <litmus/litmus.h> | ||
35 | |||
34 | /* | 36 | /* |
35 | * The least significant 3 bits of the owner value has the following | 37 | * The least significant 3 bits of the owner value has the following |
36 | * meanings when set. | 38 | * meanings when set. |
@@ -886,11 +888,13 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) | |||
886 | * a writer, need_resched() check needs to be done here. | 888 | * a writer, need_resched() check needs to be done here. |
887 | */ | 889 | */ |
888 | if (owner_state != OWNER_WRITER) { | 890 | if (owner_state != OWNER_WRITER) { |
889 | if (need_resched()) | 891 | if (need_resched()) { |
890 | break; | 892 | break; |
891 | if (rt_task(current) && | 893 | } |
892 | (prev_owner_state != OWNER_WRITER)) | 894 | if ((rt_task(current) || is_realtime(current)) && |
895 | (prev_owner_state != OWNER_WRITER)) { | ||
893 | break; | 896 | break; |
897 | } | ||
894 | } | 898 | } |
895 | prev_owner_state = owner_state; | 899 | prev_owner_state = owner_state; |
896 | 900 | ||
@@ -1258,7 +1262,8 @@ wait: | |||
1258 | * until rwsem_try_write_lock() is called. | 1262 | * until rwsem_try_write_lock() is called. |
1259 | */ | 1263 | */ |
1260 | if ((wstate == WRITER_FIRST) && (rt_task(current) || | 1264 | if ((wstate == WRITER_FIRST) && (rt_task(current) || |
1261 | time_after(jiffies, waiter.timeout))) { | 1265 | is_realtime(current) || |
1266 | time_after(jiffies, waiter.timeout))) { | ||
1262 | wstate = WRITER_HANDOFF; | 1267 | wstate = WRITER_HANDOFF; |
1263 | lockevent_inc(rwsem_wlock_handoff); | 1268 | lockevent_inc(rwsem_wlock_handoff); |
1264 | break; | 1269 | break; |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ca65327a6de8..4c3d18d2587e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -68,6 +68,13 @@ int console_printk[4] = { | |||
68 | }; | 68 | }; |
69 | EXPORT_SYMBOL_GPL(console_printk); | 69 | EXPORT_SYMBOL_GPL(console_printk); |
70 | 70 | ||
71 | /* | ||
72 | * Divert printk() messages when there is a LITMUS^RT debug listener. | ||
73 | */ | ||
74 | #include <litmus/debug_trace.h> | ||
75 | int trace_override = 0; | ||
76 | int trace_recurse = 0; | ||
77 | |||
71 | atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); | 78 | atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); |
72 | EXPORT_SYMBOL(ignore_console_lock_warning); | 79 | EXPORT_SYMBOL(ignore_console_lock_warning); |
73 | 80 | ||
@@ -1916,6 +1923,11 @@ int vprintk_store(int facility, int level, | |||
1916 | */ | 1923 | */ |
1917 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | 1924 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); |
1918 | 1925 | ||
1926 | /* If the LITMUS^RT tracer is active then divert printk messages. */ | ||
1927 | if (trace_override && !trace_recurse) { | ||
1928 | TRACE("%s", text); | ||
1929 | } | ||
1930 | |||
1919 | /* mark and strip a trailing newline */ | 1931 | /* mark and strip a trailing newline */ |
1920 | if (text_len && text[text_len-1] == '\n') { | 1932 | if (text_len && text[text_len-1] == '\n') { |
1921 | text_len--; | 1933 | text_len--; |
@@ -2967,7 +2979,7 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { | |||
2967 | void wake_up_klogd(void) | 2979 | void wake_up_klogd(void) |
2968 | { | 2980 | { |
2969 | preempt_disable(); | 2981 | preempt_disable(); |
2970 | if (waitqueue_active(&log_wait)) { | 2982 | if (!trace_override && waitqueue_active(&log_wait)) { |
2971 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 2983 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
2972 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); | 2984 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); |
2973 | } | 2985 | } |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 21fb5a5662b5..95000e43fce7 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -30,3 +30,6 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o | |||
30 | obj-$(CONFIG_MEMBARRIER) += membarrier.o | 30 | obj-$(CONFIG_MEMBARRIER) += membarrier.o |
31 | obj-$(CONFIG_CPU_ISOLATION) += isolation.o | 31 | obj-$(CONFIG_CPU_ISOLATION) += isolation.o |
32 | obj-$(CONFIG_PSI) += psi.o | 32 | obj-$(CONFIG_PSI) += psi.o |
33 | |||
34 | obj-y += litmus.o | ||
35 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0f2eb3629070..917a374b616f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -20,6 +20,12 @@ | |||
20 | 20 | ||
21 | #include "pelt.h" | 21 | #include "pelt.h" |
22 | 22 | ||
23 | #include <litmus/litmus.h> | ||
24 | #include <litmus/debug_trace.h> | ||
25 | #include <litmus/trace.h> | ||
26 | #include <litmus/sched_trace.h> | ||
27 | #include <litmus/sched_plugin.h> | ||
28 | |||
23 | #define CREATE_TRACE_POINTS | 29 | #define CREATE_TRACE_POINTS |
24 | #include <trace/events/sched.h> | 30 | #include <trace/events/sched.h> |
25 | 31 | ||
@@ -520,6 +526,11 @@ void resched_curr(struct rq *rq) | |||
520 | set_tsk_need_resched(curr); | 526 | set_tsk_need_resched(curr); |
521 | set_preempt_need_resched(); | 527 | set_preempt_need_resched(); |
522 | return; | 528 | return; |
529 | } else if (is_realtime(curr)) { | ||
530 | /* Cannot call set_tsk_need_resched() on LITMUS tasks on a | ||
531 | * remote core. Only policy plugins may do this | ||
532 | * via litmus_reschedule(). */ | ||
533 | return; | ||
523 | } | 534 | } |
524 | 535 | ||
525 | if (set_nr_and_not_polling(curr)) | 536 | if (set_nr_and_not_polling(curr)) |
@@ -2317,9 +2328,17 @@ void scheduler_ipi(void) | |||
2317 | * this IPI. | 2328 | * this IPI. |
2318 | */ | 2329 | */ |
2319 | preempt_fold_need_resched(); | 2330 | preempt_fold_need_resched(); |
2320 | 2331 | /* Let LITMUS' preemption state machine know about this IPI. */ | |
2321 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) | 2332 | sched_state_ipi(); |
2333 | |||
2334 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) { | ||
2335 | #ifndef CONFIG_ARCH_CALLS_IRQ_ENTER_ON_RESCHED_IPI | ||
2336 | /* If we don't call irq_enter() then we need to trigger the | ||
2337 | * IRQ tracing manually. */ | ||
2338 | ft_irq_fired(); | ||
2339 | #endif | ||
2322 | return; | 2340 | return; |
2341 | } | ||
2323 | 2342 | ||
2324 | /* | 2343 | /* |
2325 | * Not all reschedule IPI handlers call irq_enter/irq_exit, since | 2344 | * Not all reschedule IPI handlers call irq_enter/irq_exit, since |
@@ -2397,7 +2416,12 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
2397 | struct rq_flags rf; | 2416 | struct rq_flags rf; |
2398 | 2417 | ||
2399 | #if defined(CONFIG_SMP) | 2418 | #if defined(CONFIG_SMP) |
2400 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { | 2419 | /* |
2420 | * In LITMUS, it is up to a plugin to determine whether to send an IPI | ||
2421 | * to a remote CPU. | ||
2422 | */ | ||
2423 | if (!is_realtime(p) && sched_feat(TTWU_QUEUE) && | ||
2424 | !cpus_share_cache(smp_processor_id(), cpu)) { | ||
2401 | sched_clock_cpu(cpu); /* Sync clocks across CPUs */ | 2425 | sched_clock_cpu(cpu); /* Sync clocks across CPUs */ |
2402 | ttwu_queue_remote(p, cpu, wake_flags); | 2426 | ttwu_queue_remote(p, cpu, wake_flags); |
2403 | return; | 2427 | return; |
@@ -2517,6 +2541,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2517 | { | 2541 | { |
2518 | unsigned long flags; | 2542 | unsigned long flags; |
2519 | int cpu, success = 0; | 2543 | int cpu, success = 0; |
2544 | if (is_realtime(p)) { | ||
2545 | TRACE_TASK(p, "try_to_wake_up() state: %d\n", p->state); | ||
2546 | } | ||
2520 | 2547 | ||
2521 | preempt_disable(); | 2548 | preempt_disable(); |
2522 | if (p == current) { | 2549 | if (p == current) { |
@@ -2616,6 +2643,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2616 | */ | 2643 | */ |
2617 | smp_cond_load_acquire(&p->on_cpu, !VAL); | 2644 | smp_cond_load_acquire(&p->on_cpu, !VAL); |
2618 | 2645 | ||
2646 | /* LITMUS: Once the task can be safely referenced by this CPU, don't | ||
2647 | * mess with further Linux load balancing stuff. | ||
2648 | */ | ||
2649 | if (is_realtime(p)) { | ||
2650 | goto litmus_out_activate; | ||
2651 | } | ||
2652 | |||
2619 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2653 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2620 | p->state = TASK_WAKING; | 2654 | p->state = TASK_WAKING; |
2621 | 2655 | ||
@@ -2631,6 +2665,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2631 | set_task_cpu(p, cpu); | 2665 | set_task_cpu(p, cpu); |
2632 | } | 2666 | } |
2633 | 2667 | ||
2668 | litmus_out_activate: | ||
2634 | #else /* CONFIG_SMP */ | 2669 | #else /* CONFIG_SMP */ |
2635 | 2670 | ||
2636 | if (p->in_iowait) { | 2671 | if (p->in_iowait) { |
@@ -2641,6 +2676,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2641 | #endif /* CONFIG_SMP */ | 2676 | #endif /* CONFIG_SMP */ |
2642 | 2677 | ||
2643 | ttwu_queue(p, cpu, wake_flags); | 2678 | ttwu_queue(p, cpu, wake_flags); |
2679 | |||
2680 | if (is_realtime(p)) { | ||
2681 | TRACE_TASK(p, "try_to_wake_up() done state: %d\n", p->state); | ||
2682 | } | ||
2644 | unlock: | 2683 | unlock: |
2645 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2684 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2646 | out: | 2685 | out: |
@@ -2853,13 +2892,16 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2853 | */ | 2892 | */ |
2854 | p->prio = current->normal_prio; | 2893 | p->prio = current->normal_prio; |
2855 | 2894 | ||
2895 | litmus_fork(p); | ||
2896 | |||
2856 | uclamp_fork(p); | 2897 | uclamp_fork(p); |
2857 | 2898 | ||
2858 | /* | 2899 | /* |
2859 | * Revert to default priority/policy on fork if requested. | 2900 | * Revert to default priority/policy on fork if requested. |
2860 | */ | 2901 | */ |
2861 | if (unlikely(p->sched_reset_on_fork)) { | 2902 | if (unlikely(p->sched_reset_on_fork)) { |
2862 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { | 2903 | if (task_has_dl_policy(p) || task_has_rt_policy(p) || |
2904 | p->policy == SCHED_LITMUS) { | ||
2863 | p->policy = SCHED_NORMAL; | 2905 | p->policy = SCHED_NORMAL; |
2864 | p->static_prio = NICE_TO_PRIO(0); | 2906 | p->static_prio = NICE_TO_PRIO(0); |
2865 | p->rt_priority = 0; | 2907 | p->rt_priority = 0; |
@@ -2876,12 +2918,15 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2876 | p->sched_reset_on_fork = 0; | 2918 | p->sched_reset_on_fork = 0; |
2877 | } | 2919 | } |
2878 | 2920 | ||
2879 | if (dl_prio(p->prio)) | 2921 | if (is_realtime(p)) { |
2922 | p->sched_class = &litmus_sched_class; | ||
2923 | } else if (dl_prio(p->prio)) { | ||
2880 | return -EAGAIN; | 2924 | return -EAGAIN; |
2881 | else if (rt_prio(p->prio)) | 2925 | } else if (rt_prio(p->prio)) { |
2882 | p->sched_class = &rt_sched_class; | 2926 | p->sched_class = &rt_sched_class; |
2883 | else | 2927 | } else { |
2884 | p->sched_class = &fair_sched_class; | 2928 | p->sched_class = &fair_sched_class; |
2929 | } | ||
2885 | 2930 | ||
2886 | init_entity_runnable_average(&p->se); | 2931 | init_entity_runnable_average(&p->se); |
2887 | 2932 | ||
@@ -2945,6 +2990,10 @@ void wake_up_new_task(struct task_struct *p) | |||
2945 | struct rq_flags rf; | 2990 | struct rq_flags rf; |
2946 | struct rq *rq; | 2991 | struct rq *rq; |
2947 | 2992 | ||
2993 | if (is_realtime(p)) { | ||
2994 | litmus->task_new(p, 1, 0); | ||
2995 | } | ||
2996 | |||
2948 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); | 2997 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); |
2949 | p->state = TASK_RUNNING; | 2998 | p->state = TASK_RUNNING; |
2950 | #ifdef CONFIG_SMP | 2999 | #ifdef CONFIG_SMP |
@@ -3218,6 +3267,8 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
3218 | */ | 3267 | */ |
3219 | prev_state = prev->state; | 3268 | prev_state = prev->state; |
3220 | vtime_task_switch(prev); | 3269 | vtime_task_switch(prev); |
3270 | litmus->finish_switch(prev); | ||
3271 | prev->rt_param.stack_in_use = NO_CPU; | ||
3221 | perf_event_task_sched_in(prev, current); | 3272 | perf_event_task_sched_in(prev, current); |
3222 | finish_task(prev); | 3273 | finish_task(prev); |
3223 | finish_lock_switch(rq); | 3274 | finish_lock_switch(rq); |
@@ -3317,6 +3368,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
3317 | */ | 3368 | */ |
3318 | 3369 | ||
3319 | rq = finish_task_switch(prev); | 3370 | rq = finish_task_switch(prev); |
3371 | |||
3372 | sched_trace_task_switch_to(current); | ||
3373 | if (unlikely(sched_state_validate_switch())) { | ||
3374 | litmus_reschedule_local(); | ||
3375 | } | ||
3376 | |||
3320 | balance_callback(rq); | 3377 | balance_callback(rq); |
3321 | preempt_enable(); | 3378 | preempt_enable(); |
3322 | 3379 | ||
@@ -3608,7 +3665,9 @@ void scheduler_tick(void) | |||
3608 | 3665 | ||
3609 | #ifdef CONFIG_SMP | 3666 | #ifdef CONFIG_SMP |
3610 | rq->idle_balance = idle_cpu(cpu); | 3667 | rq->idle_balance = idle_cpu(cpu); |
3611 | trigger_load_balance(rq); | 3668 | if (!is_realtime(current)) { |
3669 | trigger_load_balance(rq); | ||
3670 | } | ||
3612 | #endif | 3671 | #endif |
3613 | } | 3672 | } |
3614 | 3673 | ||
@@ -3910,9 +3969,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
3910 | /* | 3969 | /* |
3911 | * Optimization: we know that if all tasks are in the fair class we can | 3970 | * Optimization: we know that if all tasks are in the fair class we can |
3912 | * call that function directly, but only if the @prev task wasn't of a | 3971 | * call that function directly, but only if the @prev task wasn't of a |
3913 | * higher scheduling class, because otherwise those loose the | 3972 | * higher scheduling class, because otherwise those lose the |
3914 | * opportunity to pull in more work from other CPUs. | 3973 | * opportunity to pull in more work from other CPUs. |
3915 | */ | 3974 | * |
3975 | * We can't do this in LITMUS! | ||
3976 | * | ||
3977 | * This breaks many assumptions in the plugins. Do not uncomment | ||
3978 | * without considering how this affects global plugins such as GSN-EDF. | ||
3916 | if (likely((prev->sched_class == &idle_sched_class || | 3979 | if (likely((prev->sched_class == &idle_sched_class || |
3917 | prev->sched_class == &fair_sched_class) && | 3980 | prev->sched_class == &fair_sched_class) && |
3918 | rq->nr_running == rq->cfs.h_nr_running)) { | 3981 | rq->nr_running == rq->cfs.h_nr_running)) { |
@@ -3921,12 +3984,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
3921 | if (unlikely(p == RETRY_TASK)) | 3984 | if (unlikely(p == RETRY_TASK)) |
3922 | goto restart; | 3985 | goto restart; |
3923 | 3986 | ||
3924 | /* Assumes fair_sched_class->next == idle_sched_class */ | 3987 | // Assumes fair_sched_class->next == idle_sched_class |
3925 | if (unlikely(!p)) | 3988 | if (unlikely(!p)) |
3926 | p = idle_sched_class.pick_next_task(rq, prev, rf); | 3989 | p = idle_sched_class.pick_next_task(rq, prev, rf); |
3927 | 3990 | ||
3928 | return p; | 3991 | return p; |
3929 | } | 3992 | } |
3993 | */ | ||
3930 | 3994 | ||
3931 | restart: | 3995 | restart: |
3932 | #ifdef CONFIG_SMP | 3996 | #ifdef CONFIG_SMP |
@@ -4003,10 +4067,15 @@ static void __sched notrace __schedule(bool preempt) | |||
4003 | struct rq *rq; | 4067 | struct rq *rq; |
4004 | int cpu; | 4068 | int cpu; |
4005 | 4069 | ||
4070 | TS_SCHED_START; | ||
4071 | sched_state_entered_schedule(); | ||
4072 | |||
4006 | cpu = smp_processor_id(); | 4073 | cpu = smp_processor_id(); |
4007 | rq = cpu_rq(cpu); | 4074 | rq = cpu_rq(cpu); |
4008 | prev = rq->curr; | 4075 | prev = rq->curr; |
4009 | 4076 | ||
4077 | sched_trace_task_switch_away(prev); | ||
4078 | |||
4010 | schedule_debug(prev, preempt); | 4079 | schedule_debug(prev, preempt); |
4011 | 4080 | ||
4012 | if (sched_feat(HRTICK)) | 4081 | if (sched_feat(HRTICK)) |
@@ -4030,6 +4099,8 @@ static void __sched notrace __schedule(bool preempt) | |||
4030 | rq->clock_update_flags <<= 1; | 4099 | rq->clock_update_flags <<= 1; |
4031 | update_rq_clock(rq); | 4100 | update_rq_clock(rq); |
4032 | 4101 | ||
4102 | this_cpu_write(litmus_preemption_in_progress, preempt); | ||
4103 | |||
4033 | switch_count = &prev->nivcsw; | 4104 | switch_count = &prev->nivcsw; |
4034 | if (!preempt && prev->state) { | 4105 | if (!preempt && prev->state) { |
4035 | if (signal_pending_state(prev->state, prev)) { | 4106 | if (signal_pending_state(prev->state, prev)) { |
@@ -4049,6 +4120,8 @@ static void __sched notrace __schedule(bool preempt) | |||
4049 | clear_tsk_need_resched(prev); | 4120 | clear_tsk_need_resched(prev); |
4050 | clear_preempt_need_resched(); | 4121 | clear_preempt_need_resched(); |
4051 | 4122 | ||
4123 | this_cpu_write(litmus_preemption_in_progress, false); | ||
4124 | |||
4052 | if (likely(prev != next)) { | 4125 | if (likely(prev != next)) { |
4053 | rq->nr_switches++; | 4126 | rq->nr_switches++; |
4054 | /* | 4127 | /* |
@@ -4073,15 +4146,25 @@ static void __sched notrace __schedule(bool preempt) | |||
4073 | ++*switch_count; | 4146 | ++*switch_count; |
4074 | 4147 | ||
4075 | trace_sched_switch(preempt, prev, next); | 4148 | trace_sched_switch(preempt, prev, next); |
4076 | 4149 | TS_SCHED_END(next); | |
4150 | TS_CXS_START(next); | ||
4077 | /* Also unlocks the rq: */ | 4151 | /* Also unlocks the rq: */ |
4078 | rq = context_switch(rq, prev, next, &rf); | 4152 | rq = context_switch(rq, prev, next, &rf); |
4153 | TS_CXS_END(current); | ||
4079 | } else { | 4154 | } else { |
4080 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); | 4155 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); |
4156 | TS_SCHED_END(prev); | ||
4081 | rq_unlock_irq(rq, &rf); | 4157 | rq_unlock_irq(rq, &rf); |
4082 | } | 4158 | } |
4083 | 4159 | ||
4160 | TS_SCHED2_START(prev); | ||
4161 | sched_trace_task_switch_to(current); | ||
4162 | if (unlikely(sched_state_validate_switch())) { | ||
4163 | litmus_reschedule_local(); | ||
4164 | } | ||
4165 | |||
4084 | balance_callback(rq); | 4166 | balance_callback(rq); |
4167 | TS_SCHED2_END(prev); | ||
4085 | } | 4168 | } |
4086 | 4169 | ||
4087 | void __noreturn do_task_dead(void) | 4170 | void __noreturn do_task_dead(void) |
@@ -4513,7 +4596,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4513 | * it wont have any effect on scheduling until the task is | 4596 | * it wont have any effect on scheduling until the task is |
4514 | * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: | 4597 | * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: |
4515 | */ | 4598 | */ |
4516 | if (task_has_dl_policy(p) || task_has_rt_policy(p)) { | 4599 | if (task_has_dl_policy(p) || task_has_rt_policy(p) || is_realtime(p)) { |
4517 | p->static_prio = NICE_TO_PRIO(nice); | 4600 | p->static_prio = NICE_TO_PRIO(nice); |
4518 | goto out_unlock; | 4601 | goto out_unlock; |
4519 | } | 4602 | } |
@@ -4723,12 +4806,15 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, | |||
4723 | if (keep_boost) | 4806 | if (keep_boost) |
4724 | p->prio = rt_effective_prio(p, p->prio); | 4807 | p->prio = rt_effective_prio(p, p->prio); |
4725 | 4808 | ||
4726 | if (dl_prio(p->prio)) | 4809 | if (p->policy == SCHED_LITMUS) { |
4810 | p->sched_class = &litmus_sched_class; | ||
4811 | } else if (dl_prio(p->prio)) { | ||
4727 | p->sched_class = &dl_sched_class; | 4812 | p->sched_class = &dl_sched_class; |
4728 | else if (rt_prio(p->prio)) | 4813 | } else if (rt_prio(p->prio)) { |
4729 | p->sched_class = &rt_sched_class; | 4814 | p->sched_class = &rt_sched_class; |
4730 | else | 4815 | } else { |
4731 | p->sched_class = &fair_sched_class; | 4816 | p->sched_class = &fair_sched_class; |
4817 | } | ||
4732 | } | 4818 | } |
4733 | 4819 | ||
4734 | /* | 4820 | /* |
@@ -4760,6 +4846,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
4760 | int reset_on_fork; | 4846 | int reset_on_fork; |
4761 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; | 4847 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; |
4762 | struct rq *rq; | 4848 | struct rq *rq; |
4849 | int litmus_task = 0; | ||
4763 | 4850 | ||
4764 | /* The pi code expects interrupts enabled */ | 4851 | /* The pi code expects interrupts enabled */ |
4765 | BUG_ON(pi && in_interrupt()); | 4852 | BUG_ON(pi && in_interrupt()); |
@@ -4789,7 +4876,9 @@ recheck: | |||
4789 | if ((dl_policy(policy) && !__checkparam_dl(attr)) || | 4876 | if ((dl_policy(policy) && !__checkparam_dl(attr)) || |
4790 | (rt_policy(policy) != (attr->sched_priority != 0))) | 4877 | (rt_policy(policy) != (attr->sched_priority != 0))) |
4791 | return -EINVAL; | 4878 | return -EINVAL; |
4792 | 4879 | if ((policy == SCHED_LITMUS) && (policy == p->policy)) { | |
4880 | return -EINVAL; | ||
4881 | } | ||
4793 | /* | 4882 | /* |
4794 | * Allow unprivileged RT tasks to decrease priority: | 4883 | * Allow unprivileged RT tasks to decrease priority: |
4795 | */ | 4884 | */ |
@@ -4857,6 +4946,13 @@ recheck: | |||
4857 | return retval; | 4946 | return retval; |
4858 | } | 4947 | } |
4859 | 4948 | ||
4949 | if (policy == SCHED_LITMUS) { | ||
4950 | retval = litmus_admit_task(p); | ||
4951 | if (retval) { | ||
4952 | return retval; | ||
4953 | } | ||
4954 | } | ||
4955 | |||
4860 | if (pi) | 4956 | if (pi) |
4861 | cpuset_read_lock(); | 4957 | cpuset_read_lock(); |
4862 | 4958 | ||
@@ -4949,6 +5045,11 @@ change: | |||
4949 | goto unlock; | 5045 | goto unlock; |
4950 | } | 5046 | } |
4951 | 5047 | ||
5048 | if (is_realtime(p)) { | ||
5049 | litmus_exit_task(p); | ||
5050 | litmus_task = 1; | ||
5051 | } | ||
5052 | |||
4952 | p->sched_reset_on_fork = reset_on_fork; | 5053 | p->sched_reset_on_fork = reset_on_fork; |
4953 | oldprio = p->prio; | 5054 | oldprio = p->prio; |
4954 | 5055 | ||
@@ -4977,6 +5078,16 @@ change: | |||
4977 | __setscheduler(rq, p, attr, pi); | 5078 | __setscheduler(rq, p, attr, pi); |
4978 | __setscheduler_uclamp(p, attr); | 5079 | __setscheduler_uclamp(p, attr); |
4979 | 5080 | ||
5081 | if (litmus_policy(policy)) { | ||
5082 | #ifdef CONFIG_SMP | ||
5083 | p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU; | ||
5084 | #else | ||
5085 | p->rt_param.stack_in_use = running ? 0 : NO_CPU; | ||
5086 | #endif | ||
5087 | p->rt_param.present = running; | ||
5088 | litmus->task_new(p, queued, running); | ||
5089 | } | ||
5090 | |||
4980 | if (queued) { | 5091 | if (queued) { |
4981 | /* | 5092 | /* |
4982 | * We enqueue to tail when the priority of a task is | 5093 | * We enqueue to tail when the priority of a task is |
@@ -5005,6 +5116,10 @@ change: | |||
5005 | balance_callback(rq); | 5116 | balance_callback(rq); |
5006 | preempt_enable(); | 5117 | preempt_enable(); |
5007 | 5118 | ||
5119 | if (litmus_task) { | ||
5120 | litmus_dealloc(p); | ||
5121 | } | ||
5122 | |||
5008 | return 0; | 5123 | return 0; |
5009 | 5124 | ||
5010 | unlock: | 5125 | unlock: |
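Entering SCHED_LITMUS now passes through litmus_admit_task() before any runqueue manipulation, and leaving it runs litmus_exit_task() under the runqueue lock with litmus_dealloc() deferred until the lock and preemption have been released. From userspace the transition is an ordinary scheduler-policy change; a rough sketch of the lifecycle, assuming SCHED_LITMUS is exported by the patched uapi headers and the task's real-time parameters were configured beforehand (e.g., via liblitmus):

    #include <sched.h>
    #include <stdio.h>

    /* SCHED_LITMUS is not in glibc's headers; the value below is only a
     * placeholder and must match the patched <linux/sched.h>. */
    #ifndef SCHED_LITMUS
    #define SCHED_LITMUS 6
    #endif

    static int become_litmus_task(void)
    {
            struct sched_param param = { .sched_priority = 0 };

            /* Fails with EINVAL if the task is already SCHED_LITMUS, or with
             * whatever litmus_admit_task() returned if admission is refused. */
            if (sched_setscheduler(0, SCHED_LITMUS, &param)) {
                    perror("sched_setscheduler(SCHED_LITMUS)");
                    return -1;
            }
            return 0;
    }

    static int back_to_best_effort(void)
    {
            struct sched_param param = { .sched_priority = 0 };

            /* Leaving SCHED_LITMUS triggers litmus_exit_task()/litmus_dealloc(). */
            return sched_setscheduler(0, SCHED_OTHER, &param);
    }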
@@ -5391,9 +5506,9 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
5391 | rcu_read_lock(); | 5506 | rcu_read_lock(); |
5392 | 5507 | ||
5393 | p = find_process_by_pid(pid); | 5508 | p = find_process_by_pid(pid); |
5394 | if (!p) { | 5509 | if (!p || is_realtime(p)) { |
5395 | rcu_read_unlock(); | 5510 | rcu_read_unlock(); |
5396 | return -ESRCH; | 5511 | return p ? -EPERM : -ESRCH; |
5397 | } | 5512 | } |
5398 | 5513 | ||
5399 | /* Prevent p going away */ | 5514 | /* Prevent p going away */ |
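The sched_setaffinity() change keeps CPU placement of LITMUS^RT tasks entirely under plugin control: an affinity request against a task that is currently real-time under LITMUS now fails with EPERM instead of silently migrating it. A short illustration (the pid is hypothetical):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    static void try_to_pin(pid_t litmus_pid)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);
            /* Returns -1/EPERM for a SCHED_LITMUS task, -1/ESRCH if no such pid. */
            if (sched_setaffinity(litmus_pid, sizeof(set), &set))
                    perror("sched_setaffinity");
    }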
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a8a08030a8f7..1842c3e33476 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -18,6 +18,8 @@ | |||
18 | #include "sched.h" | 18 | #include "sched.h" |
19 | #include "pelt.h" | 19 | #include "pelt.h" |
20 | 20 | ||
21 | #include <litmus/litmus.h> | ||
22 | |||
21 | struct dl_bandwidth def_dl_bandwidth; | 23 | struct dl_bandwidth def_dl_bandwidth; |
22 | 24 | ||
23 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | 25 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) |
@@ -1049,17 +1051,21 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
1049 | #endif | 1051 | #endif |
1050 | 1052 | ||
1051 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 1053 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
1052 | if (dl_task(rq->curr)) | 1054 | if (dl_task(rq->curr)) { |
1053 | check_preempt_curr_dl(rq, p, 0); | 1055 | check_preempt_curr_dl(rq, p, 0); |
1054 | else | 1056 | } else if (!is_realtime(rq->curr)) { |
1055 | resched_curr(rq); | 1057 | resched_curr(rq); |
1058 | } | ||
1056 | 1059 | ||
1057 | #ifdef CONFIG_SMP | 1060 | #ifdef CONFIG_SMP |
1058 | /* | 1061 | /* |
1059 | * Queueing this task back might have overloaded rq, check if we need | 1062 | * Queueing this task back might have overloaded rq, check if we need |
1060 | * to kick someone away. | 1063 | * to kick someone away. |
1064 | * | ||
1065 | * LITMUS note: Don't incur this overhead if we are running a LITMUS | ||
1066 | * task. | ||
1061 | */ | 1067 | */ |
1062 | if (has_pushable_dl_tasks(rq)) { | 1068 | if (has_pushable_dl_tasks(rq) && (!is_realtime(rq->curr))) { |
1063 | /* | 1069 | /* |
1064 | * Nothing relies on rq->lock after this, so its safe to drop | 1070 | * Nothing relies on rq->lock after this, so its safe to drop |
1065 | * rq->lock. | 1071 | * rq->lock. |
@@ -2357,9 +2363,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
2357 | * Since this might be the only -deadline task on the rq, | 2363 | * Since this might be the only -deadline task on the rq, |
2358 | * this is the right place to try to pull some other one | 2364 | * this is the right place to try to pull some other one |
2359 | * from an overloaded CPU, if any. | 2365 | * from an overloaded CPU, if any. |
2366 | * | ||
2367 | * LITMUS note: also don't pull a task when we're running LITMUS tasks. | ||
2360 | */ | 2368 | */ |
2361 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) | 2369 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running || |
2370 | is_realtime(rq->curr)) { | ||
2362 | return; | 2371 | return; |
2372 | } | ||
2363 | 2373 | ||
2364 | deadline_queue_pull_task(rq); | 2374 | deadline_queue_pull_task(rq); |
2365 | } | 2375 | } |
@@ -2374,9 +2384,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
2374 | put_task_struct(p); | 2384 | put_task_struct(p); |
2375 | 2385 | ||
2376 | /* If p is not queued we will update its parameters at next wakeup. */ | 2386 | /* If p is not queued we will update its parameters at next wakeup. */ |
2377 | if (!task_on_rq_queued(p)) { | 2387 | if (!task_on_rq_queued(p) || is_realtime(rq->curr)) { |
2378 | add_rq_bw(&p->dl, &rq->dl); | 2388 | add_rq_bw(&p->dl, &rq->dl); |
2379 | |||
2380 | return; | 2389 | return; |
2381 | } | 2390 | } |
2382 | 2391 | ||
diff --git a/kernel/sched/litmus.c b/kernel/sched/litmus.c new file mode 100644 index 000000000000..d9c59998155b --- /dev/null +++ b/kernel/sched/litmus.c | |||
@@ -0,0 +1,386 @@ | |||
1 | /* This file is included from kernel/sched.c */ | ||
2 | |||
3 | #include "sched.h" | ||
4 | |||
5 | #include <litmus/trace.h> | ||
6 | #include <litmus/sched_trace.h> | ||
7 | |||
8 | #include <litmus/debug_trace.h> | ||
9 | #include <litmus/litmus.h> | ||
10 | #include <litmus/budget.h> | ||
11 | #include <litmus/sched_plugin.h> | ||
12 | #include <litmus/preempt.h> | ||
13 | #include <litmus/np.h> | ||
14 | |||
15 | static void update_time_litmus(struct rq *rq, struct task_struct *p) | ||
16 | { | ||
17 | u64 delta = rq->clock - p->se.exec_start; | ||
18 | if (unlikely((s64)delta < 0)) | ||
19 | delta = 0; | ||
20 | /* per job counter */ | ||
21 | p->rt_param.job_params.exec_time += delta; | ||
22 | /* task counter */ | ||
23 | p->se.sum_exec_runtime += delta; | ||
24 | if (delta) { | ||
25 | TRACE_TASK(p, "charged %llu exec time (total:%llu, rem:%llu)\n", | ||
26 | delta, p->rt_param.job_params.exec_time, budget_remaining(p)); | ||
27 | } | ||
28 | /* sched_clock() */ | ||
29 | p->se.exec_start = rq->clock; | ||
30 | cpuacct_charge(p, delta); | ||
31 | } | ||
32 | |||
33 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
34 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2); | ||
35 | |||
36 | static struct task_struct * | ||
37 | litmus_schedule(struct rq *rq, struct task_struct *prev) | ||
38 | { | ||
39 | struct task_struct *next; | ||
40 | |||
41 | #ifdef CONFIG_SMP | ||
42 | struct rq* other_rq; | ||
43 | long was_running; | ||
44 | int from_where; | ||
45 | lt_t _maybe_deadlock = 0; | ||
46 | #endif | ||
47 | |||
48 | /* let the plugin schedule */ | ||
49 | next = litmus->schedule(prev); | ||
50 | |||
51 | sched_state_plugin_check(); | ||
52 | |||
53 | #ifdef CONFIG_SMP | ||
54 | /* check if a global plugin pulled a task from a different RQ */ | ||
55 | if (next && task_rq(next) != rq) { | ||
56 | /* we need to migrate the task */ | ||
57 | other_rq = task_rq(next); | ||
58 | from_where = other_rq->cpu; | ||
59 | TRACE_TASK(next, "migrate from %d\n", from_where); | ||
60 | |||
61 | /* while we drop the lock, the prev task could change its | ||
62 | * state | ||
63 | */ | ||
64 | BUG_ON(prev != current); | ||
65 | was_running = is_current_running(); | ||
66 | |||
67 | /* Don't race with a concurrent switch. This could deadlock in | ||
68 | * the case of cross or circular migrations. It's the job of | ||
69 | * the plugin to make sure that doesn't happen. | ||
70 | */ | ||
71 | TRACE_TASK(next, "stack_in_use=%d\n", | ||
72 | next->rt_param.stack_in_use); | ||
73 | if (next->rt_param.stack_in_use != NO_CPU) { | ||
74 | TRACE_TASK(next, "waiting to deschedule\n"); | ||
75 | _maybe_deadlock = litmus_clock(); | ||
76 | } | ||
77 | |||
78 | raw_spin_unlock(&rq->lock); | ||
79 | |||
80 | while (next->rt_param.stack_in_use != NO_CPU) { | ||
81 | cpu_relax(); | ||
82 | mb(); | ||
83 | if (next->rt_param.stack_in_use == NO_CPU) | ||
84 | TRACE_TASK(next,"descheduled. Proceeding.\n"); | ||
85 | |||
86 | if (!litmus->should_wait_for_stack(next)) { | ||
87 | /* plugin aborted the wait */ | ||
88 | TRACE_TASK(next, | ||
89 | "plugin gave up waiting for stack\n"); | ||
90 | next = NULL; | ||
91 | /* Make sure plugin is given a chance to | ||
92 | * reconsider. */ | ||
93 | litmus_reschedule_local(); | ||
94 | /* give up */ | ||
95 | raw_spin_lock(&rq->lock); | ||
96 | goto out; | ||
97 | } | ||
98 | |||
99 | if (from_where != task_rq(next)->cpu) { | ||
100 | /* The plugin should not give us something | ||
101 | * that other cores are trying to pull, too */ | ||
102 | TRACE_TASK(next, "next invalid: task keeps " | ||
103 | "shifting around!? " | ||
104 | "(%d->%d)\n", | ||
105 | from_where, | ||
106 | task_rq(next)->cpu); | ||
107 | |||
108 | /* bail out */ | ||
109 | raw_spin_lock(&rq->lock); | ||
110 | litmus->next_became_invalid(next); | ||
111 | litmus_reschedule_local(); | ||
112 | next = NULL; | ||
113 | goto out; | ||
114 | } | ||
115 | |||
116 | if (lt_before(_maybe_deadlock + 1000000000L, | ||
117 | litmus_clock())) { | ||
118 | /* We've been spinning for 1s. | ||
119 | * Something can't be right! | ||
120 | * Let's abandon the task and bail out; at least | ||
121 | * we will have debug info instead of a hard | ||
122 | * deadlock. | ||
123 | */ | ||
124 | #ifdef CONFIG_BUG_ON_MIGRATION_DEADLOCK | ||
125 | BUG(); | ||
126 | #else | ||
127 | TRACE_TASK(next,"stack too long in use. " | ||
128 | "Deadlock?\n"); | ||
129 | next = NULL; | ||
130 | |||
131 | /* bail out */ | ||
132 | raw_spin_lock(&rq->lock); | ||
133 | goto out; | ||
134 | #endif | ||
135 | } | ||
136 | } | ||
137 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
138 | if (next->on_cpu) | ||
139 | TRACE_TASK(next, "waiting for !oncpu"); | ||
140 | while (next->on_cpu) { | ||
141 | cpu_relax(); | ||
142 | mb(); | ||
143 | } | ||
144 | #endif | ||
145 | double_rq_lock(rq, other_rq); | ||
146 | if (other_rq == task_rq(next) && | ||
147 | next->rt_param.stack_in_use == NO_CPU) { | ||
148 | /* ok, we can grab it */ | ||
149 | set_task_cpu(next, rq->cpu); | ||
150 | /* release the other CPU's runqueue, but keep ours */ | ||
151 | raw_spin_unlock(&other_rq->lock); | ||
152 | } else { | ||
153 | /* Either it moved or the stack was claimed; either way | ||
154 | * we must abort the migration. */ | ||
155 | TRACE_TASK(next, "next invalid: no longer available\n"); | ||
156 | raw_spin_unlock(&other_rq->lock); | ||
157 | litmus->next_became_invalid(next); | ||
158 | next = NULL; | ||
159 | goto out; | ||
160 | } | ||
161 | |||
162 | if (!litmus->post_migration_validate(next)) { | ||
163 | TRACE_TASK(next, "plugin deems task now invalid\n"); | ||
164 | litmus_reschedule_local(); | ||
165 | next = NULL; | ||
166 | } | ||
167 | } | ||
168 | #endif | ||
169 | |||
170 | /* check if the task became invalid while we dropped the lock */ | ||
171 | if (next && (!is_realtime(next) || !tsk_rt(next)->present)) { | ||
172 | TRACE_TASK(next, | ||
173 | "BAD: next (no longer?) valid\n"); | ||
174 | litmus->next_became_invalid(next); | ||
175 | litmus_reschedule_local(); | ||
176 | next = NULL; | ||
177 | } | ||
178 | |||
179 | if (next) { | ||
180 | #ifdef CONFIG_SMP | ||
181 | next->rt_param.stack_in_use = rq->cpu; | ||
182 | #else | ||
183 | next->rt_param.stack_in_use = 0; | ||
184 | #endif | ||
185 | update_rq_clock(rq); | ||
186 | next->se.exec_start = rq->clock; | ||
187 | } | ||
188 | |||
189 | out: | ||
190 | update_enforcement_timer(next); | ||
191 | return next; | ||
192 | } | ||
193 | |||
194 | static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, | ||
195 | int flags) | ||
196 | { | ||
197 | tsk_rt(p)->present = 1; | ||
198 | if (flags & ENQUEUE_WAKEUP) { | ||
199 | sched_trace_task_resume(p); | ||
200 | /* LITMUS^RT plugins need to update the state | ||
201 | * _before_ making it available in global structures. | ||
202 | * Linux gets away with being lazy about the task state | ||
203 | * update. We can't do that, hence we update the task | ||
204 | * state already here. | ||
205 | * | ||
206 | * WARNING: this needs to be re-evaluated when porting | ||
207 | * to newer kernel versions. | ||
208 | */ | ||
209 | p->state = TASK_RUNNING; | ||
210 | litmus->task_wake_up(p); | ||
211 | |||
212 | rq->litmus.nr_running++; | ||
213 | } else { | ||
214 | TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n"); | ||
215 | p->se.exec_start = rq->clock; | ||
216 | } | ||
217 | } | ||
218 | |||
219 | static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, | ||
220 | int flags) | ||
221 | { | ||
222 | if (flags & DEQUEUE_SLEEP) { | ||
223 | #ifdef CONFIG_SCHED_TASK_TRACE | ||
224 | tsk_rt(p)->job_params.last_suspension = litmus_clock(); | ||
225 | #endif | ||
226 | litmus->task_block(p); | ||
227 | tsk_rt(p)->present = 0; | ||
228 | sched_trace_task_block(p); | ||
229 | |||
230 | rq->litmus.nr_running--; | ||
231 | } else | ||
232 | TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n"); | ||
233 | } | ||
234 | |||
235 | static void yield_task_litmus(struct rq *rq) | ||
236 | { | ||
237 | TS_SYSCALL_IN_START; | ||
238 | TS_SYSCALL_IN_END; | ||
239 | |||
240 | BUG_ON(rq->curr != current); | ||
241 | /* sched_yield() is called to trigger delayed preemptions. | ||
242 | * Thus, mark the current task as needing to be rescheduled. | ||
243 | * This will cause the scheduler plugin to be invoked, which can | ||
244 | * then determine if a preemption is still required. | ||
245 | */ | ||
246 | clear_exit_np(current); | ||
247 | litmus_reschedule_local(); | ||
248 | |||
249 | TS_SYSCALL_OUT_START; | ||
250 | } | ||
251 | |||
252 | /* Plugins are responsible for this. | ||
253 | */ | ||
254 | static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags) | ||
255 | { | ||
256 | } | ||
257 | |||
258 | static void put_prev_task_litmus(struct rq *rq, struct task_struct *p) | ||
259 | { | ||
260 | } | ||
261 | |||
262 | /* pick_next_task_litmus() - wrapper around litmus_schedule() | ||
263 | * | ||
264 | * Returns the next task to be scheduled. | ||
265 | */ | ||
266 | static struct task_struct *pick_next_task_litmus(struct rq *rq, | ||
267 | struct task_struct *prev, struct rq_flags *rf) | ||
268 | { | ||
269 | struct task_struct *next; | ||
270 | |||
271 | if (is_realtime(prev)) | ||
272 | update_time_litmus(rq, prev); | ||
273 | |||
274 | rq_unpin_lock(rq, rf); | ||
275 | TS_PLUGIN_SCHED_START; | ||
276 | next = litmus_schedule(rq, prev); | ||
277 | TS_PLUGIN_SCHED_END; | ||
278 | rq_repin_lock(rq, rf); | ||
279 | |||
280 | /* This is a bit backwards: the other classes call put_prev_task() | ||
281 | * _after_ they've determined that the class has some queued tasks. | ||
282 | * We can't determine this easily because each plugin manages its own | ||
283 | * ready queues, and because in the case of globally shared queues, | ||
284 | * we really don't know whether we'll have something ready even if | ||
285 | * we test here. So we do it in reverse: first ask the plugin to | ||
286 | * provide a task, and if we find one, call put_prev_task() on the | ||
287 | * previously scheduled task. | ||
288 | */ | ||
289 | if (next) | ||
290 | put_prev_task(rq, prev); | ||
291 | |||
292 | return next; | ||
293 | } | ||
294 | |||
295 | static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued) | ||
296 | { | ||
297 | if (is_realtime(p) && !queued) { | ||
298 | update_time_litmus(rq, p); | ||
299 | /* budget check for QUANTUM_ENFORCEMENT tasks */ | ||
300 | if (budget_enforced(p) && budget_exhausted(p)) { | ||
301 | litmus_reschedule_local(); | ||
302 | } | ||
303 | } | ||
304 | } | ||
305 | |||
306 | static void switched_to_litmus(struct rq *rq, struct task_struct *p) | ||
307 | { | ||
308 | } | ||
309 | |||
310 | static void prio_changed_litmus(struct rq *rq, struct task_struct *p, | ||
311 | int oldprio) | ||
312 | { | ||
313 | } | ||
314 | |||
315 | static unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p) | ||
316 | { | ||
317 | /* return infinity */ | ||
318 | return 0; | ||
319 | } | ||
320 | |||
321 | /* This is called when a task became a real-time task, either due to a SCHED_* | ||
322 | * class transition or due to PI mutex inheritance. We don't handle Linux PI | ||
323 | * mutex inheritance yet (and probably never will). Use LITMUS provided | ||
324 | * synchronization primitives instead. | ||
325 | */ | ||
326 | static void set_next_task_litmus(struct rq *rq, struct task_struct *p) | ||
327 | { | ||
328 | p->se.exec_start = rq->clock; | ||
329 | } | ||
330 | |||
331 | |||
332 | #ifdef CONFIG_SMP | ||
333 | /* execve tries to rebalance the task in this scheduling domain. | ||
334 | * We don't care about the scheduling domain; this can get called from | ||
335 | * exec, fork, or wakeup. | ||
336 | */ | ||
337 | static int | ||
338 | select_task_rq_litmus(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
339 | { | ||
340 | /* preemption is already disabled. | ||
341 | * We don't want to change cpu here | ||
342 | */ | ||
343 | return task_cpu(p); | ||
344 | } | ||
345 | #endif | ||
346 | |||
347 | static void update_curr_litmus(struct rq *rq) | ||
348 | { | ||
349 | struct task_struct *p = rq->curr; | ||
350 | |||
351 | if (!is_realtime(p)) | ||
352 | return; | ||
353 | |||
354 | update_time_litmus(rq, p); | ||
355 | } | ||
356 | |||
357 | const struct sched_class litmus_sched_class = { | ||
358 | /* From 34f971f6 the stop/migrate worker threads have a class on | ||
359 | * their own, which is the highest prio class. We don't support | ||
360 | * cpu-hotplug or cpu throttling. Allows Litmus to use up to 1.0 | ||
361 | * CPU capacity. | ||
362 | */ | ||
363 | .next = &stop_sched_class, | ||
364 | .enqueue_task = enqueue_task_litmus, | ||
365 | .dequeue_task = dequeue_task_litmus, | ||
366 | .yield_task = yield_task_litmus, | ||
367 | |||
368 | .check_preempt_curr = check_preempt_curr_litmus, | ||
369 | |||
370 | .pick_next_task = pick_next_task_litmus, | ||
371 | .put_prev_task = put_prev_task_litmus, | ||
372 | |||
373 | #ifdef CONFIG_SMP | ||
374 | .select_task_rq = select_task_rq_litmus, | ||
375 | #endif | ||
376 | |||
377 | .set_next_task = set_next_task_litmus, | ||
378 | .task_tick = task_tick_litmus, | ||
379 | |||
380 | .get_rr_interval = get_rr_interval_litmus, | ||
381 | |||
382 | .prio_changed = prio_changed_litmus, | ||
383 | .switched_to = switched_to_litmus, | ||
384 | |||
385 | .update_curr = update_curr_litmus, | ||
386 | }; | ||
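kernel/sched/litmus.c is only glue between Linux's sched_class interface and whichever LITMUS^RT plugin is active: every real decision is delegated through the global litmus plugin pointer (litmus->schedule(), litmus->task_wake_up(), litmus->task_block(), litmus->task_new(), plus the migration helpers should_wait_for_stack(), next_became_invalid() and post_migration_validate() used by litmus_schedule() above). A skeletal plugin against that callback set might look as follows; the callback names follow LITMUS^RT's struct sched_plugin, which is not part of this hunk, so treat the exact signatures as assumptions and note that the scheduling logic itself is deliberately omitted:

    #include <litmus/sched_plugin.h>
    #include <litmus/litmus.h>

    static struct task_struct *demo_schedule(struct task_struct *prev)
    {
            /* Return the highest-priority ready LITMUS task for this CPU, or
             * NULL to fall through to the stop/dl/rt/fair classes. */
            return NULL;
    }

    static void demo_task_new(struct task_struct *t, int on_rq, int running) { }
    static void demo_task_wake_up(struct task_struct *t) { }
    static void demo_task_block(struct task_struct *t) { }
    static long demo_admit_task(struct task_struct *t) { return 0; }

    static struct sched_plugin demo_plugin = {
            .plugin_name    = "DEMO",
            .schedule       = demo_schedule,
            .task_new       = demo_task_new,
            .task_wake_up   = demo_task_wake_up,
            .task_block     = demo_task_block,
            .admit_task     = demo_admit_task,
    };

Registering demo_plugin during init would make it selectable from userspace, assuming the plugin-registration interface of earlier LITMUS^RT versions is carried over unchanged.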
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9b8adc01be3d..a48c98b950b3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -7,6 +7,8 @@ | |||
7 | 7 | ||
8 | #include "pelt.h" | 8 | #include "pelt.h" |
9 | 9 | ||
10 | #include <litmus/litmus.h> | ||
11 | |||
10 | int sched_rr_timeslice = RR_TIMESLICE; | 12 | int sched_rr_timeslice = RR_TIMESLICE; |
11 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | 13 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; |
12 | 14 | ||
@@ -499,8 +501,12 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
499 | else if (!on_rt_rq(rt_se)) | 501 | else if (!on_rt_rq(rt_se)) |
500 | enqueue_rt_entity(rt_se, 0); | 502 | enqueue_rt_entity(rt_se, 0); |
501 | 503 | ||
502 | if (rt_rq->highest_prio.curr < curr->prio) | 504 | /* LITMUS note: Don't subject LITMUS tasks to remote
505 | * reschedules. */ | ||
506 | if ((rt_rq->highest_prio.curr < curr->prio) && | ||
507 | !is_realtime(curr)) { | ||
503 | resched_curr(rq); | 508 | resched_curr(rq); |
509 | } | ||
504 | } | 510 | } |
505 | } | 511 | } |
506 | 512 | ||
@@ -589,8 +595,10 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
589 | { | 595 | { |
590 | struct rq *rq = rq_of_rt_rq(rt_rq); | 596 | struct rq *rq = rq_of_rt_rq(rt_rq); |
591 | 597 | ||
592 | if (!rt_rq->rt_nr_running) | 598 | if (!rt_rq->rt_nr_running || |
599 | is_realtime(rq_of_rt_rq(rt_rq)->current)) { | ||
593 | return; | 600 | return; |
601 | } | ||
594 | 602 | ||
595 | enqueue_top_rt_rq(rt_rq); | 603 | enqueue_top_rt_rq(rt_rq); |
596 | resched_curr(rq); | 604 | resched_curr(rq); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c8870c5bd7df..c4f7afbe90c0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -162,6 +162,11 @@ static inline int rt_policy(int policy) | |||
162 | return policy == SCHED_FIFO || policy == SCHED_RR; | 162 | return policy == SCHED_FIFO || policy == SCHED_RR; |
163 | } | 163 | } |
164 | 164 | ||
165 | static inline int litmus_policy(int policy) | ||
166 | { | ||
167 | return policy == SCHED_LITMUS; | ||
168 | } | ||
169 | |||
165 | static inline int dl_policy(int policy) | 170 | static inline int dl_policy(int policy) |
166 | { | 171 | { |
167 | return policy == SCHED_DEADLINE; | 172 | return policy == SCHED_DEADLINE; |
@@ -169,7 +174,8 @@ static inline int dl_policy(int policy) | |||
169 | static inline bool valid_policy(int policy) | 174 | static inline bool valid_policy(int policy) |
170 | { | 175 | { |
171 | return idle_policy(policy) || fair_policy(policy) || | 176 | return idle_policy(policy) || fair_policy(policy) || |
172 | rt_policy(policy) || dl_policy(policy); | 177 | rt_policy(policy) || dl_policy(policy) || |
178 | litmus_policy(policy); | ||
173 | } | 179 | } |
174 | 180 | ||
175 | static inline int task_has_idle_policy(struct task_struct *p) | 181 | static inline int task_has_idle_policy(struct task_struct *p) |
@@ -685,6 +691,10 @@ struct dl_rq { | |||
685 | u64 bw_ratio; | 691 | u64 bw_ratio; |
686 | }; | 692 | }; |
687 | 693 | ||
694 | struct litmus_rq { | ||
695 | unsigned long nr_running; | ||
696 | }; | ||
697 | |||
688 | #ifdef CONFIG_FAIR_GROUP_SCHED | 698 | #ifdef CONFIG_FAIR_GROUP_SCHED |
689 | /* An entity is a task if it doesn't "own" a runqueue */ | 699 | /* An entity is a task if it doesn't "own" a runqueue */ |
690 | #define entity_is_task(se) (!se->my_q) | 700 | #define entity_is_task(se) (!se->my_q) |
@@ -881,6 +891,7 @@ struct rq { | |||
881 | struct cfs_rq cfs; | 891 | struct cfs_rq cfs; |
882 | struct rt_rq rt; | 892 | struct rt_rq rt; |
883 | struct dl_rq dl; | 893 | struct dl_rq dl; |
894 | struct litmus_rq litmus; | ||
884 | 895 | ||
885 | #ifdef CONFIG_FAIR_GROUP_SCHED | 896 | #ifdef CONFIG_FAIR_GROUP_SCHED |
886 | /* list of leaf cfs_rq on this CPU: */ | 897 | /* list of leaf cfs_rq on this CPU: */ |
@@ -1783,11 +1794,19 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) | |||
1783 | next->sched_class->set_next_task(rq, next); | 1794 | next->sched_class->set_next_task(rq, next); |
1784 | } | 1795 | } |
1785 | 1796 | ||
1797 | /* FIXME: This is conceptually wrong; this should be below the stop-machine | ||
1798 | * class, but existing plugins (that predate the stop-machine class) depend on | ||
1799 | * the assumption that LITMUS^RT plugins are the top scheduling class. | ||
1800 | */ | ||
1801 | #define sched_class_highest (&litmus_sched_class) | ||
1802 | |||
1803 | /* | ||
1786 | #ifdef CONFIG_SMP | 1804 | #ifdef CONFIG_SMP |
1787 | #define sched_class_highest (&stop_sched_class) | 1805 | #define sched_class_highest (&stop_sched_class) |
1788 | #else | 1806 | #else |
1789 | #define sched_class_highest (&dl_sched_class) | 1807 | #define sched_class_highest (&dl_sched_class) |
1790 | #endif | 1808 | #endif |
1809 | */ | ||
1791 | 1810 | ||
1792 | #define for_class_range(class, _from, _to) \ | 1811 | #define for_class_range(class, _from, _to) \ |
1793 | for (class = (_from); class != (_to); class = class->next) | 1812 | for (class = (_from); class != (_to); class = class->next) |
@@ -1795,6 +1814,7 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) | |||
1795 | #define for_each_class(class) \ | 1814 | #define for_each_class(class) \ |
1796 | for_class_range(class, sched_class_highest, NULL) | 1815 | for_class_range(class, sched_class_highest, NULL) |
1797 | 1816 | ||
1817 | extern const struct sched_class litmus_sched_class; | ||
1798 | extern const struct sched_class stop_sched_class; | 1818 | extern const struct sched_class stop_sched_class; |
1799 | extern const struct sched_class dl_sched_class; | 1819 | extern const struct sched_class dl_sched_class; |
1800 | extern const struct sched_class rt_sched_class; | 1820 | extern const struct sched_class rt_sched_class; |
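With sched_class_highest redefined to &litmus_sched_class, every for_each_class() walk, including the pick_next_task() slow path in core.c, now consults the LITMUS^RT class before stop, dl, rt, fair and idle; that ordering is exactly what the FIXME above is about. Schematically, the existing slow path reduces to the shape below (an illustration of the loop already in core.c, not new code in this patch):

    /* Simplified shape of the pick_next_task() slow path with the LITMUS
     * class chained in first. */
    for_each_class(class) {                 /* starts at &litmus_sched_class */
            p = class->pick_next_task(rq, prev, rf);
            if (p)
                    return p;               /* LITMUS gets right of first refusal */
    }
    BUG();                                  /* the idle class always returns a task */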
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c0640739e05e..3bd42cf27d88 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -8,6 +8,7 @@ | |||
8 | * See kernel/stop_machine.c | 8 | * See kernel/stop_machine.c |
9 | */ | 9 | */ |
10 | #include "sched.h" | 10 | #include "sched.h" |
11 | #include <litmus/preempt.h> | ||
11 | 12 | ||
12 | #ifdef CONFIG_SMP | 13 | #ifdef CONFIG_SMP |
13 | static int | 14 | static int |
@@ -43,6 +44,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf | |||
43 | return NULL; | 44 | return NULL; |
44 | 45 | ||
45 | set_next_task_stop(rq, rq->stop); | 46 | set_next_task_stop(rq, rq->stop); |
47 | |||
48 | /* Let the LITMUS state machine know that a task was picked. This is | ||
49 | * needed because the LITMUS scheduling plugin will not be called if | ||
50 | * the stop-task class picks a task. | ||
51 | */ | ||
52 | sched_state_task_picked(); | ||
53 | |||
46 | return rq->stop; | 54 | return rq->stop; |
47 | } | 55 | } |
48 | 56 | ||
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..ce20111d3fe2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -42,6 +42,10 @@ | |||
42 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
43 | #include <linux/compat.h> | 43 | #include <linux/compat.h> |
44 | 44 | ||
45 | #include <litmus/litmus.h> | ||
46 | #include <litmus/debug_trace.h> | ||
47 | #include <litmus/trace.h> | ||
48 | |||
45 | #include <linux/uaccess.h> | 49 | #include <linux/uaccess.h> |
46 | 50 | ||
47 | #include <trace/events/timer.h> | 51 | #include <trace/events/timer.h> |
@@ -1092,6 +1096,10 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
1092 | 1096 | ||
1093 | tim = hrtimer_update_lowres(timer, tim, mode); | 1097 | tim = hrtimer_update_lowres(timer, tim, mode); |
1094 | 1098 | ||
1099 | #ifdef CONFIG_REPORT_TIMER_LATENCY | ||
1100 | timer->when_added = base->get_time(); | ||
1101 | #endif | ||
1102 | |||
1095 | hrtimer_set_expires_range_ns(timer, tim, delta_ns); | 1103 | hrtimer_set_expires_range_ns(timer, tim, delta_ns); |
1096 | 1104 | ||
1097 | /* Switch the timer base, if necessary: */ | 1105 | /* Switch the timer base, if necessary: */ |
@@ -1546,6 +1554,9 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, | |||
1546 | { | 1554 | { |
1547 | struct hrtimer_clock_base *base; | 1555 | struct hrtimer_clock_base *base; |
1548 | unsigned int active = cpu_base->active_bases & active_mask; | 1556 | unsigned int active = cpu_base->active_bases & active_mask; |
1557 | #ifdef CONFIG_REPORT_TIMER_LATENCY | ||
1558 | ktime_t was_exp_nxt = cpu_base->expires_next; | ||
1559 | #endif | ||
1549 | 1560 | ||
1550 | for_each_active_base(base, cpu_base, active) { | 1561 | for_each_active_base(base, cpu_base, active) { |
1551 | struct timerqueue_node *node; | 1562 | struct timerqueue_node *node; |
@@ -1573,6 +1584,26 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, | |||
1573 | if (basenow < hrtimer_get_softexpires_tv64(timer)) | 1584 | if (basenow < hrtimer_get_softexpires_tv64(timer)) |
1574 | break; | 1585 | break; |
1575 | 1586 | ||
1587 | #ifdef CONFIG_REPORT_TIMER_LATENCY | ||
1588 | if (cpu_base->hres_active && (basenow >= | ||
1589 | hrtimer_get_expires_tv64(timer) + | ||
1590 | ((s64) CONFIG_REPORT_TIMER_LATENCY_THRESHOLD))) { | ||
1591 | printk_ratelimited(KERN_WARNING "WARNING: " | ||
1592 | "P%d timer latency: %lld now: %lld " | ||
1593 | "basenow:%lld exp:%lld " | ||
1594 | "nxt:%lld added:%lld " | ||
1595 | "timer:%p fn:%p\n", | ||
1596 | smp_processor_id(), | ||
1597 | basenow - hrtimer_get_expires_tv64(timer), | ||
1598 | now, basenow, | ||
1599 | hrtimer_get_expires_tv64(timer), | ||
1600 | hrtimer_get_softexpires(timer), | ||
1601 | was_exp_nxt, | ||
1602 | timer->when_added, | ||
1603 | timer, timer->function); | ||
1604 | } | ||
1605 | #endif | ||
1606 | |||
1576 | __run_hrtimer(cpu_base, base, timer, &basenow, flags); | 1607 | __run_hrtimer(cpu_base, base, timer, &basenow, flags); |
1577 | if (active_mask == HRTIMER_ACTIVE_SOFT) | 1608 | if (active_mask == HRTIMER_ACTIVE_SOFT) |
1578 | hrtimer_sync_wait_running(cpu_base, flags); | 1609 | hrtimer_sync_wait_running(cpu_base, flags); |
@@ -1679,9 +1710,14 @@ retry: | |||
1679 | */ | 1710 | */ |
1680 | cpu_base->nr_hangs++; | 1711 | cpu_base->nr_hangs++; |
1681 | cpu_base->hang_detected = 1; | 1712 | cpu_base->hang_detected = 1; |
1713 | |||
1714 | TRACE("hrtimer hang detected on P%d: #%u\n", cpu_base->cpu, | ||
1715 | cpu_base->nr_hangs); | ||
1716 | |||
1682 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | 1717 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); |
1683 | 1718 | ||
1684 | delta = ktime_sub(now, entry_time); | 1719 | delta = ktime_sub(now, entry_time); |
1720 | TRACE("hrtimer hang delta.tv64:%u\n", (unsigned int) delta.tv64); | ||
1685 | if ((unsigned int)delta > cpu_base->max_hang_time) | 1721 | if ((unsigned int)delta > cpu_base->max_hang_time) |
1686 | cpu_base->max_hang_time = (unsigned int) delta; | 1722 | cpu_base->max_hang_time = (unsigned int) delta; |
1687 | /* | 1723 | /* |
@@ -1692,6 +1728,9 @@ retry: | |||
1692 | expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); | 1728 | expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); |
1693 | else | 1729 | else |
1694 | expires_next = ktime_add(now, delta); | 1730 | expires_next = ktime_add(now, delta); |
1731 | |||
1732 | TRACE("hrtimer expires_next:%llu\n", expires_next.tv64); | ||
1733 | |||
1695 | tick_program_event(expires_next, 1); | 1734 | tick_program_event(expires_next, 1); |
1696 | pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); | 1735 | pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); |
1697 | } | 1736 | } |
@@ -1762,8 +1801,21 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) | |||
1762 | struct task_struct *task = t->task; | 1801 | struct task_struct *task = t->task; |
1763 | 1802 | ||
1764 | t->task = NULL; | 1803 | t->task = NULL; |
1765 | if (task) | 1804 | if (task) { |
1805 | #ifdef CONFIG_SCHED_OVERHEAD_TRACE | ||
1806 | if (is_realtime(task)) { | ||
1807 | ktime_t expires = hrtimer_get_expires(timer); | ||
1808 | /* Fix up timers that were added past their due date, | ||
1809 | * because that's not really release latency. */ | ||
1810 | lt_t intended_release = max(expires, | ||
1811 | timer->when_added); | ||
1812 | TS_RELEASE_LATENCY(intended_release); | ||
1813 | } | ||
1814 | #endif | ||
1815 | TS_RELEASE_START; | ||
1766 | wake_up_process(task); | 1816 | wake_up_process(task); |
1817 | TS_RELEASE_END; | ||
1818 | } | ||
1767 | 1819 | ||
1768 | return HRTIMER_NORESTART; | 1820 | return HRTIMER_NORESTART; |
1769 | } | 1821 | } |
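The hrtimer_wakeup() instrumentation measures release latency from the intended release rather than from the timer's expiry: if the timer was armed after its own expiration (when_added later than expires, as happens when a job is released "in the past" under overload), measuring from expires would count arming delay as release latency. A small worked example, assuming TS_RELEASE_LATENCY(x) logs the difference between the current time and x (all values in ns and made up):

    /*   expires           = 1,000,000   (nominal release time)
     *   timer->when_added = 1,004,000   (timer armed 4 us late)
     *   wakeup runs at    = 1,009,000
     *
     * intended_release = max(expires, when_added) = 1,004,000
     * recorded latency = 1,009,000 - 1,004,000 = 5 us, not 9 us.
     */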
@@ -1916,9 +1968,19 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, | |||
1916 | u64 slack; | 1968 | u64 slack; |
1917 | 1969 | ||
1918 | slack = current->timer_slack_ns; | 1970 | slack = current->timer_slack_ns; |
1919 | if (dl_task(current) || rt_task(current)) | 1971 | if (dl_task(current) || rt_task(current) || is_realtime(current)) |
1920 | slack = 0; | 1972 | slack = 0; |
1921 | 1973 | ||
1974 | if (is_realtime(current) && (clockid == CLOCK_MONOTONIC) && | ||
1975 | (mode == HRTIMER_MODE_ABS)) { | ||
1976 | /* Special handling: to handle periodic activations correctly | ||
1977 | * despite timer jitter and overheads, the plugin might need to | ||
1978 | * know the time at which the task intends to wake up. */ | ||
1979 | tsk_rt(current)->doing_abs_nanosleep = 1; | ||
1980 | tsk_rt(current)->nanosleep_wakeup = ktime_to_ns( | ||
1981 | timespec64_to_ktime(*rqtp)); | ||
1982 | } | ||
1983 | |||
1922 | hrtimer_init_sleeper_on_stack(&t, clockid, mode); | 1984 | hrtimer_init_sleeper_on_stack(&t, clockid, mode); |
1923 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); | 1985 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); |
1924 | ret = do_nanosleep(&t, mode); | 1986 | ret = do_nanosleep(&t, mode); |
@@ -1937,6 +1999,9 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, | |||
1937 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); | 1999 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); |
1938 | out: | 2000 | out: |
1939 | destroy_hrtimer_on_stack(&t.timer); | 2001 | destroy_hrtimer_on_stack(&t.timer); |
2002 | |||
2003 | tsk_rt(current)->doing_abs_nanosleep = 0; | ||
2004 | |||
1940 | return ret; | 2005 | return ret; |
1941 | } | 2006 | } |
1942 | 2007 | ||
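The doing_abs_nanosleep/nanosleep_wakeup bookkeeping lets a plugin treat an absolute CLOCK_MONOTONIC sleep as "wait for the next release" and use the requested wake-up time, rather than the jitter-afflicted actual wake-up, as the next job's release time. The userspace pattern this supports is the standard absolute-time periodic loop, sketched here with a made-up period and job function:

    #include <time.h>

    #define PERIOD_NS 10000000L     /* 10 ms, for illustration */

    static void periodic_loop(void (*do_job)(void))
    {
            struct timespec next;

            clock_gettime(CLOCK_MONOTONIC, &next);
            for (;;) {
                    do_job();
                    next.tv_nsec += PERIOD_NS;
                    while (next.tv_nsec >= 1000000000L) {
                            next.tv_nsec -= 1000000000L;
                            next.tv_sec++;
                    }
                    /* Absolute sleep: the hook above records 'next' in
                     * tsk_rt(current)->nanosleep_wakeup for the plugin. */
                    clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
            }
    }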
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 50055d2e4ea8..1ad757848f69 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -42,6 +42,8 @@ | |||
42 | 42 | ||
43 | #include "internal.h" | 43 | #include "internal.h" |
44 | 44 | ||
45 | #include <litmus/litmus.h> | ||
46 | |||
45 | /* | 47 | /* |
46 | * Sleep at most 200ms at a time in balance_dirty_pages(). | 48 | * Sleep at most 200ms at a time in balance_dirty_pages(). |
47 | */ | 49 | */ |
@@ -436,7 +438,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) | |||
436 | if (bg_thresh >= thresh) | 438 | if (bg_thresh >= thresh) |
437 | bg_thresh = thresh / 2; | 439 | bg_thresh = thresh / 2; |
438 | tsk = current; | 440 | tsk = current; |
439 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | 441 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk) || |
442 | is_realtime(tsk)) { | ||
440 | bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; | 443 | bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; |
441 | thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; | 444 | thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; |
442 | } | 445 | } |
@@ -486,7 +489,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) | |||
486 | else | 489 | else |
487 | dirty = vm_dirty_ratio * node_memory / 100; | 490 | dirty = vm_dirty_ratio * node_memory / 100; |
488 | 491 | ||
489 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) | 492 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk) || is_realtime(tsk)) |
490 | dirty += dirty / 4; | 493 | dirty += dirty / 4; |
491 | 494 | ||
492 | return dirty; | 495 | return dirty; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f391c0c4ed1d..6d90a9ed20c4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -75,6 +75,8 @@ | |||
75 | #include "internal.h" | 75 | #include "internal.h" |
76 | #include "shuffle.h" | 76 | #include "shuffle.h" |
77 | 77 | ||
78 | #include <litmus/litmus.h> | ||
79 | |||
78 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ | 80 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ |
79 | static DEFINE_MUTEX(pcp_batch_high_lock); | 81 | static DEFINE_MUTEX(pcp_batch_high_lock); |
80 | #define MIN_PERCPU_PAGELIST_FRACTION (8) | 82 | #define MIN_PERCPU_PAGELIST_FRACTION (8) |
@@ -4208,8 +4210,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
4208 | * comment for __cpuset_node_allowed(). | 4210 | * comment for __cpuset_node_allowed(). |
4209 | */ | 4211 | */ |
4210 | alloc_flags &= ~ALLOC_CPUSET; | 4212 | alloc_flags &= ~ALLOC_CPUSET; |
4211 | } else if (unlikely(rt_task(current)) && !in_interrupt()) | 4213 | } else if (unlikely(rt_task(current) || is_realtime(current)) && |
4214 | !in_interrupt()) { | ||
4212 | alloc_flags |= ALLOC_HARDER; | 4215 | alloc_flags |= ALLOC_HARDER; |
4216 | } | ||
4213 | 4217 | ||
4214 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | 4218 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
4215 | alloc_flags |= ALLOC_KSWAPD; | 4219 | alloc_flags |= ALLOC_KSWAPD; |