author	Andrea Bastoni <bastoni@cs.unc.edu>	2009-12-17 21:23:36 -0500
committer	Andrea Bastoni <bastoni@cs.unc.edu>	2010-05-29 17:05:45 -0400
commit	4b38febbd59fd33542a343991262119eb9860f5e (patch)
tree	1af88a0d354abe344c2c2869631f76a1806d75c3
parent	22763c5cf3690a681551162c15d34d935308c8d7 (diff)
[ported from 2008.3] Core LITMUS^RT infrastructure
Port 2008.3 Core LITMUS^RT infrastructure to Linux 2.6.32.

litmus_sched_class implements 4 new methods:

- prio_changed: void
- switched_to: void
- get_rr_interval: return infinity (i.e., 0)
- select_task_rq: return current cpu
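The snippet below is a hedged sketch of how those four hooks behave as described in the message above; it is not the verbatim patch code (which lives in litmus/sched_litmus.c), and the prototypes only approximate the 2.6.32 sched_class conventions.

/* Hedged sketch (not the verbatim patch code) of the four new hooks named
 * above; prototypes are approximations of the 2.6.32 sched_class interface. */
static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
				int oldprio, int running)
{
	/* void: priority is managed entirely by the active LITMUS^RT plugin */
}

static void switched_to_litmus(struct rq *rq, struct task_struct *p,
			       int running)
{
	/* void: admission to SCHED_LITMUS is handled in __setscheduler() */
}

static unsigned int get_rr_interval_litmus(struct task_struct *p)
{
	/* "infinity", i.e., 0: no round-robin time slicing */
	return 0;
}

static int select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
{
	/* keep the task where it is; plugins decide migrations themselves */
	return task_cpu(p);
}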
-rw-r--r--	Makefile	4
-rw-r--r--	include/linux/sched.h	7
-rw-r--r--	include/litmus/feather_buffer.h	94
-rw-r--r--	include/litmus/feather_trace.h	36
-rw-r--r--	include/litmus/heap.h	77
-rw-r--r--	include/litmus/jobs.h	9
-rw-r--r--	include/litmus/litmus.h	177
-rw-r--r--	include/litmus/rt_param.h	175
-rw-r--r--	include/litmus/sched_plugin.h	159
-rw-r--r--	include/litmus/sched_trace.h	191
-rw-r--r--	include/litmus/trace.h	113
-rw-r--r--	kernel/fork.c	7
-rw-r--r--	kernel/sched.c	92
-rw-r--r--	kernel/sched_fair.c	2
-rw-r--r--	kernel/sched_rt.c	2
-rw-r--r--	litmus/Kconfig	50
-rw-r--r--	litmus/Makefile	12
-rw-r--r--	litmus/ft_event.c	43
-rw-r--r--	litmus/heap.c	314
-rw-r--r--	litmus/jobs.c	43
-rw-r--r--	litmus/litmus.c	654
-rw-r--r--	litmus/sched_litmus.c	275
-rw-r--r--	litmus/sched_plugin.c	199
23 files changed, 2723 insertions, 12 deletions
diff --git a/Makefile b/Makefile
index f5cdb72ba2ce..2603066a012d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 32
-EXTRAVERSION =
+EXTRAVERSION =-litmus2010
 NAME = Man-Eating Seals of Antiquity
 
 # *DOCUMENTATION*
@@ -644,7 +644,7 @@ export mod_strip_cmd
 
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y	+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y	+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60bf583..bb046c0adf99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -38,6 +38,7 @@
 #define SCHED_BATCH		3
 /* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
+#define SCHED_LITMUS		6
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK	0x40000000
 
@@ -94,6 +95,8 @@ struct sched_param {
 
 #include <asm/processor.h>
 
+#include <litmus/rt_param.h>
+
 struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
@@ -1505,6 +1508,10 @@ struct task_struct {
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+
+	/* LITMUS RT parameters and state */
+	struct rt_param rt_param;
+
 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
new file mode 100644
index 000000000000..6c18277fdfc9
--- /dev/null
+++ b/include/litmus/feather_buffer.h
@@ -0,0 +1,94 @@
1#ifndef _FEATHER_BUFFER_H_
2#define _FEATHER_BUFFER_H_
3
4/* requires UINT_MAX and memcpy */
5
6#define SLOT_FREE 0
7#define SLOT_BUSY 1
8#define SLOT_READY 2
9
10struct ft_buffer {
11 unsigned int slot_count;
12 unsigned int slot_size;
13
14 int free_count;
15 unsigned int write_idx;
16 unsigned int read_idx;
17
18 char* slots;
19 void* buffer_mem;
20 unsigned int failed_writes;
21};
22
23static inline int init_ft_buffer(struct ft_buffer* buf,
24 unsigned int slot_count,
25 unsigned int slot_size,
26 char* slots,
27 void* buffer_mem)
28{
29 int i = 0;
30 if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
31 /* The slot count must divide UINT_MAX + 1 so that when it
32 * wraps around, the index correctly points to 0.
33 */
34 return 0;
35 } else {
36 buf->slot_count = slot_count;
37 buf->slot_size = slot_size;
38 buf->slots = slots;
39 buf->buffer_mem = buffer_mem;
40 buf->free_count = slot_count;
41 buf->write_idx = 0;
42 buf->read_idx = 0;
43 buf->failed_writes = 0;
44 for (i = 0; i < slot_count; i++)
45 buf->slots[i] = SLOT_FREE;
46 return 1;
47 }
48}
49
50static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
51{
52 int free = fetch_and_dec(&buf->free_count);
53 unsigned int idx;
54 if (free <= 0) {
55 fetch_and_inc(&buf->free_count);
56 *ptr = 0;
57 fetch_and_inc(&buf->failed_writes);
58 return 0;
59 } else {
60 idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
61 buf->slots[idx] = SLOT_BUSY;
62 *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
63 return 1;
64 }
65}
66
67static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
68{
69 unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
70 buf->slots[idx] = SLOT_READY;
71}
72
73
74/* exclusive reader access is assumed */
75static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
76{
77 unsigned int idx;
78 if (buf->free_count == buf->slot_count)
79 /* nothing available */
80 return 0;
81 idx = buf->read_idx % buf->slot_count;
82 if (buf->slots[idx] == SLOT_READY) {
83 memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
84 buf->slot_size);
85 buf->slots[idx] = SLOT_FREE;
86 buf->read_idx++;
87 fetch_and_inc(&buf->free_count);
88 return 1;
89 } else
90 return 0;
91}
92
93
94#endif
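As a usage illustration only: a user-space sketch of one producer write and one consumer read with the API above. It assumes the ft_buffer definitions from this header are in scope and that fetch_and_inc()/fetch_and_dec(), which the header relies on, are mapped onto GCC's __sync builtins; struct event and the slot count are made up for the example.

#include <stdio.h>
#include <limits.h>
#include <string.h>

/* assumed atomic helpers required by feather_buffer.h (illustrative) */
static inline int fetch_and_inc(int *val) { return __sync_fetch_and_add(val, 1); }
static inline int fetch_and_dec(int *val) { return __sync_fetch_and_sub(val, 1); }

/* ... struct ft_buffer and the inline functions from the header above ... */

struct event { unsigned long id; unsigned long long when; };	/* made up */

#define N_SLOTS 256	/* must divide UINT_MAX + 1; powers of two qualify */

static char		slot_state[N_SLOTS];
static struct event	slot_mem[N_SLOTS];
static struct ft_buffer	buf;

int main(void)
{
	struct event *slot;
	struct event copy;

	if (!init_ft_buffer(&buf, N_SLOTS, sizeof(struct event),
			    slot_state, slot_mem))
		return 1;

	/* writer: reserve a slot, fill it, then mark it ready */
	if (ft_buffer_start_write(&buf, (void **) &slot)) {
		slot->id = 1;
		slot->when = 1000;
		ft_buffer_finish_write(&buf, slot);
	}

	/* single reader: copy out the oldest ready slot, if any */
	if (ft_buffer_read(&buf, &copy))
		printf("event %lu at %llu\n", copy.id, copy.when);
	return 0;
}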
diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
new file mode 100644
index 000000000000..3ac1ee5e0277
--- /dev/null
+++ b/include/litmus/feather_trace.h
@@ -0,0 +1,36 @@
1#ifndef _FEATHER_TRACE_H_
2#define _FEATHER_TRACE_H_
3
4
5int ft_enable_event(unsigned long id);
6int ft_disable_event(unsigned long id);
7int ft_is_event_enabled(unsigned long id);
8int ft_disable_all_events(void);
9
10#ifndef __ARCH_HAS_FEATHER_TRACE
11/* provide default implementation */
12
13#define feather_callback
14
15#define MAX_EVENTS 1024
16
17extern int ft_events[MAX_EVENTS];
18
19#define ft_event(id, callback) \
20 if (ft_events[id]) callback();
21
22#define ft_event0(id, callback) \
23 if (ft_events[id]) callback(id);
24
25#define ft_event1(id, callback, param) \
26 if (ft_events[id]) callback(id, param);
27
28#define ft_event2(id, callback, param, param2) \
29 if (ft_events[id]) callback(id, param, param2);
30
31#define ft_event3(id, callback, p, p2, p3) \
32 if (ft_events[id]) callback(id, p, p2, p3);
33#endif
34
35
36#endif
diff --git a/include/litmus/heap.h b/include/litmus/heap.h
new file mode 100644
index 000000000000..da959b0bec9c
--- /dev/null
+++ b/include/litmus/heap.h
@@ -0,0 +1,77 @@
1/* heaps.h -- Binomial Heaps
2 *
3 * (c) 2008, 2009 Bjoern Brandenburg
4 */
5
6#ifndef HEAP_H
7#define HEAP_H
8
9#define NOT_IN_HEAP UINT_MAX
10
11struct heap_node {
12 struct heap_node* parent;
13 struct heap_node* next;
14 struct heap_node* child;
15
16 unsigned int degree;
17 void* value;
18 struct heap_node** ref;
19};
20
21struct heap {
22 struct heap_node* head;
23 /* We cache the minimum of the heap.
24 * This speeds up repeated peek operations.
25 */
26 struct heap_node* min;
27};
28
29typedef int (*heap_prio_t)(struct heap_node* a, struct heap_node* b);
30
31void heap_init(struct heap* heap);
32void heap_node_init(struct heap_node** ref_to_heap_node_ptr, void* value);
33
34static inline int heap_node_in_heap(struct heap_node* h)
35{
36 return h->degree != NOT_IN_HEAP;
37}
38
39static inline int heap_empty(struct heap* heap)
40{
41 return heap->head == NULL && heap->min == NULL;
42}
43
44/* insert (and reinitialize) a node into the heap */
45void heap_insert(heap_prio_t higher_prio,
46 struct heap* heap,
47 struct heap_node* node);
48
49/* merge addition into target */
50void heap_union(heap_prio_t higher_prio,
51 struct heap* target,
52 struct heap* addition);
53
54struct heap_node* heap_peek(heap_prio_t higher_prio,
55 struct heap* heap);
56
57struct heap_node* heap_take(heap_prio_t higher_prio,
58 struct heap* heap);
59
60void heap_uncache_min(heap_prio_t higher_prio, struct heap* heap);
61int heap_decrease(heap_prio_t higher_prio, struct heap_node* node);
62
63void heap_delete(heap_prio_t higher_prio,
64 struct heap* heap,
65 struct heap_node* node);
66
67/* allocate from memcache */
68struct heap_node* heap_node_alloc(int gfp_flags);
69void heap_node_free(struct heap_node* hn);
70
71/* allocate a heap node for value and insert into the heap */
72int heap_add(heap_prio_t higher_prio, struct heap* heap,
73 void* value, int gfp_flags);
74
75void* heap_take_del(heap_prio_t higher_prio,
76 struct heap* heap);
77#endif
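For illustration, a hedged kernel-side sketch of ordering tasks by deadline with this heap; only the heap_* calls and heap2task()/earlier_deadline()/tsk_rt() come from the LITMUS headers, while edf_ready_order and the example_* names are invented for the example.

/* Illustrative only: an EDF-ordered ready queue built on the API above.
 * heap2task(), earlier_deadline() and tsk_rt() come from litmus/litmus.h. */
#include <linux/sched.h>
#include <litmus/litmus.h>
#include <litmus/heap.h>

static int edf_ready_order(struct heap_node *a, struct heap_node *b)
{
	return earlier_deadline(heap2task(a), heap2task(b));
}

static struct heap ready_queue;	/* assume heap_init(&ready_queue) was called */

static void example_add_ready(struct task_struct *t)
{
	/* tsk_rt(t)->heap_node must have been set up via heap_node_alloc() */
	heap_node_init(&tsk_rt(t)->heap_node, t);
	heap_insert(edf_ready_order, &ready_queue, tsk_rt(t)->heap_node);
}

static struct task_struct *example_take_ready(void)
{
	struct heap_node *hn = heap_take(edf_ready_order, &ready_queue);
	return hn ? heap2task(hn) : NULL;
}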
diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
new file mode 100644
index 000000000000..9bd361ef3943
--- /dev/null
+++ b/include/litmus/jobs.h
@@ -0,0 +1,9 @@
1#ifndef __LITMUS_JOBS_H__
2#define __LITMUS_JOBS_H__
3
4void prepare_for_next_period(struct task_struct *t);
5void release_at(struct task_struct *t, lt_t start);
6long complete_job(void);
7
8#endif
9
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
new file mode 100644
index 000000000000..380fcb8acb33
--- /dev/null
+++ b/include/litmus/litmus.h
@@ -0,0 +1,177 @@
1/*
2 * Constant definitions related to
3 * scheduling policy.
4 */
5
6#ifndef _LINUX_LITMUS_H_
7#define _LINUX_LITMUS_H_
8
9#include <linux/jiffies.h>
10#include <litmus/sched_trace.h>
11
12extern atomic_t release_master_cpu;
13
14extern atomic_t __log_seq_no;
15
16#define TRACE(fmt, args...) \
17 sched_trace_log_message("%d P%d: " fmt, atomic_add_return(1, &__log_seq_no), \
18 raw_smp_processor_id(), ## args)
19
20#define TRACE_TASK(t, fmt, args...) \
21 TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
22
23#define TRACE_CUR(fmt, args...) \
24 TRACE_TASK(current, fmt, ## args)
25
26#define TRACE_BUG_ON(cond) \
27 do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
28 "called from %p current=%s/%d state=%d " \
29 "flags=%x partition=%d cpu=%d rtflags=%d"\
30 " job=%u knp=%d timeslice=%u\n", \
31 #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
32 current->pid, current->state, current->flags, \
33 get_partition(current), smp_processor_id(), get_rt_flags(current), \
34 current->rt_param.job_params.job_no, current->rt_param.kernel_np, \
35 current->rt.time_slice\
36 ); } while(0);
37
38
39/* in_list - is a given list_head queued on some list?
40 */
41static inline int in_list(struct list_head* list)
42{
43 return !( /* case 1: deleted */
44 (list->next == LIST_POISON1 &&
45 list->prev == LIST_POISON2)
46 ||
47 /* case 2: initialized */
48 (list->next == list &&
49 list->prev == list)
50 );
51}
52
53#define NO_CPU 0xffffffff
54
55void litmus_fork(struct task_struct *tsk);
56void litmus_exec(void);
57/* clean up real-time state of a task */
58void exit_litmus(struct task_struct *dead_tsk);
59
60long litmus_admit_task(struct task_struct *tsk);
61void litmus_exit_task(struct task_struct *tsk);
62
63#define is_realtime(t) ((t)->policy == SCHED_LITMUS)
64#define rt_transition_pending(t) \
65 ((t)->rt_param.transition_pending)
66
67#define tsk_rt(t) (&(t)->rt_param)
68
69/* Realtime utility macros */
70#define get_rt_flags(t) (tsk_rt(t)->flags)
71#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f))
72#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost)
73#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time)
74#define get_rt_period(t) (tsk_rt(t)->task_params.period)
75#define get_rt_phase(t) (tsk_rt(t)->task_params.phase)
76#define get_partition(t) (tsk_rt(t)->task_params.cpu)
77#define get_deadline(t) (tsk_rt(t)->job_params.deadline)
78#define get_release(t) (tsk_rt(t)->job_params.release)
79#define get_class(t) (tsk_rt(t)->task_params.cls)
80
81inline static int budget_exhausted(struct task_struct* t)
82{
83 return get_exec_time(t) >= get_exec_cost(t);
84}
85
86
87#define is_hrt(t) \
88 (tsk_rt(t)->task_params.class == RT_CLASS_HARD)
89#define is_srt(t) \
90 (tsk_rt(t)->task_params.class == RT_CLASS_SOFT)
91#define is_be(t) \
92 (tsk_rt(t)->task_params.class == RT_CLASS_BEST_EFFORT)
93
94/* Our notion of time within LITMUS: kernel monotonic time. */
95static inline lt_t litmus_clock(void)
96{
97 return ktime_to_ns(ktime_get());
98}
99
100/* A macro to convert from nanoseconds to ktime_t. */
101#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t)
102
103#define get_domain(t) (tsk_rt(t)->domain)
104
105/* Honor the flag in the preempt_count variable that is set
106 * when scheduling is in progress.
107 */
108#define is_running(t) \
109 ((t)->state == TASK_RUNNING || \
110 task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
111
112#define is_blocked(t) \
113 (!is_running(t))
114#define is_released(t, now) \
115 (lt_before_eq(get_release(t), now))
116#define is_tardy(t, now) \
117 (lt_before_eq(tsk_rt(t)->job_params.deadline, now))
118
119/* real-time comparison macros */
120#define earlier_deadline(a, b) (lt_before(\
121 (a)->rt_param.job_params.deadline,\
122 (b)->rt_param.job_params.deadline))
123#define earlier_release(a, b) (lt_before(\
124 (a)->rt_param.job_params.release,\
125 (b)->rt_param.job_params.release))
126
127#define make_np(t) do {t->rt_param.kernel_np++;} while(0);
128#define take_np(t) do {t->rt_param.kernel_np--;} while(0);
129
130#ifdef CONFIG_SRP
131void srp_ceiling_block(void);
132#else
133#define srp_ceiling_block() /* nothing */
134#endif
135
136#define heap2task(hn) ((struct task_struct*) hn->value)
137
138static inline int is_np(struct task_struct *t)
139{
140 return tsk_rt(t)->kernel_np;
141}
142
143#define request_exit_np(t)
144
145static inline int is_present(struct task_struct* t)
146{
147 return t && tsk_rt(t)->present;
148}
149
150
151/* make the unit explicit */
152typedef unsigned long quanta_t;
153
154enum round {
155 FLOOR,
156 CEIL
157};
158
159
160/* Tick period is used to convert ns-specified execution
161 * costs and periods into tick-based equivalents.
162 */
163extern ktime_t tick_period;
164
165static inline quanta_t time2quanta(lt_t time, enum round round)
166{
167 s64 quantum_length = ktime_to_ns(tick_period);
168
169 if (do_div(time, quantum_length) && round == CEIL)
170 time++;
171 return (quanta_t) time;
172}
173
174/* By how much is cpu staggered behind CPU 0? */
175u64 cpu_stagger_offset(int cpu);
176
177#endif
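A short, hypothetical example of how plugin code tends to combine the helpers above; only set_tsk_need_resched() is from the stock kernel, and the function name is made up.

/* Hypothetical helper combining the macros above: log and request a
 * reschedule once a real-time task has used up its per-job budget. */
static void example_check_budget(struct task_struct *t)
{
	if (is_realtime(t) && budget_exhausted(t)) {
		TRACE_TASK(t, "budget exhausted at %llu (job %u)\n",
			   litmus_clock(),
			   tsk_rt(t)->job_params.job_no);
		set_tsk_need_resched(t);
	}
}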
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
new file mode 100644
index 000000000000..c599f848d1ed
--- /dev/null
+++ b/include/litmus/rt_param.h
@@ -0,0 +1,175 @@
1/*
2 * Definition of the scheduler plugin interface.
3 *
4 */
5#ifndef _LINUX_RT_PARAM_H_
6#define _LINUX_RT_PARAM_H_
7
8/* Litmus time type. */
9typedef unsigned long long lt_t;
10
11static inline int lt_after(lt_t a, lt_t b)
12{
13 return ((long long) b) - ((long long) a) < 0;
14}
15#define lt_before(a, b) lt_after(b, a)
16
17static inline int lt_after_eq(lt_t a, lt_t b)
18{
19 return ((long long) a) - ((long long) b) >= 0;
20}
21#define lt_before_eq(a, b) lt_after_eq(b, a)
22
23/* different types of clients */
24typedef enum {
25 RT_CLASS_HARD,
26 RT_CLASS_SOFT,
27 RT_CLASS_BEST_EFFORT
28} task_class_t;
29
30struct rt_task {
31 lt_t exec_cost;
32 lt_t period;
33 lt_t phase;
34 unsigned int cpu;
35 task_class_t cls;
36};
37
38/* don't export internal data structures to user space (liblitmus) */
39#ifdef __KERNEL__
40
41struct _rt_domain;
42struct heap_node;
43struct release_heap;
44
45struct rt_job {
46 /* Time instant the job was or will be released. */
47 lt_t release;
48 /* What is the current deadline? */
49 lt_t deadline;
50
51 /* How much service has this job received so far? */
52 lt_t exec_time;
53
54 /* Which job is this. This is used to let user space
55 * specify which job to wait for, which is important if jobs
56 * overrun. If we just call sys_sleep_next_period() then we
57 * will unintentionally miss jobs after an overrun.
58 *
59 * Increase this sequence number when a job is released.
60 */
61 unsigned int job_no;
62};
63
64
65struct pfair_param;
66
67/* RT task parameters for scheduling extensions
68 * These parameters are inherited during clone and therefore must
69 * be explicitly set up before the task set is launched.
70 */
71struct rt_param {
72 /* is the task sleeping? */
73 unsigned int flags:8;
74
75 /* do we need to check for srp blocking? */
76 unsigned int srp_non_recurse:1;
77
78 /* is the task present? (true if it can be scheduled) */
79 unsigned int present:1;
80
81 /* user controlled parameters */
82 struct rt_task task_params;
83
84 /* timing parameters */
85 struct rt_job job_params;
86
87 /* task representing the current "inherited" task
88 * priority, assigned by inherit_priority and
89 * return priority in the scheduler plugins.
90 * could point to self if PI does not result in
91 * an increased task priority.
92 */
93 struct task_struct* inh_task;
94
95 /* Don't just dereference this pointer in kernel space!
96 * It might very well point to junk or nothing at all.
97 * NULL indicates that the task has not requested any non-preemptable
98 * section support.
99 * Not inherited upon fork.
100 */
101 short* np_flag;
102
103 /* re-use unused counter in plugins that don't need it */
104 union {
105 /* For the FMLP under PSN-EDF, it is required to make the task
106 * non-preemptive from kernel space. In order not to interfere with
107 * user space, this counter indicates the kernel space np setting.
108 * kernel_np > 0 => task is non-preemptive
109 */
110 unsigned int kernel_np;
111
112 /* Used by GQ-EDF */
113 unsigned int last_cpu;
114 };
115
116 /* This field can be used by plugins to store where the task
117 * is currently scheduled. It is the responsibility of the
118 * plugin to avoid race conditions.
119 *
120 * This used by GSN-EDF and PFAIR.
121 */
122 volatile int scheduled_on;
123
124 /* Is the stack of the task currently in use? This is updated by
125 * the LITMUS core.
126 *
127 * Be careful to avoid deadlocks!
128 */
129 volatile int stack_in_use;
130
131 /* This field can be used by plugins to store where the task
132 * is currently linked. It is the responsibility of the plugin
133 * to avoid race conditions.
134 *
135 * Used by GSN-EDF.
136 */
137 volatile int linked_on;
138
139 /* PFAIR/PD^2 state. Allocated on demand. */
140 struct pfair_param* pfair;
141
142 /* Fields saved before BE->RT transition.
143 */
144 int old_policy;
145 int old_prio;
146
147 /* ready queue for this task */
148 struct _rt_domain* domain;
149
150 /* heap element for this task
151 *
152 * Warning: Don't statically allocate this node. The heap
153 * implementation swaps these between tasks, thus after
154 * dequeuing from a heap you may end up with a different node
155 * than the one you had when enqueuing the task. For the same
156 * reason, don't obtain and store references to this node
157 * other than this pointer (which is updated by the heap
158 * implementation).
159 */
160 struct heap_node* heap_node;
161 struct release_heap* rel_heap;
162
163 /* Used by rt_domain to queue task in release list.
164 */
165 struct list_head list;
166};
167
168/* Possible RT flags */
169#define RT_F_RUNNING 0x00000000
170#define RT_F_SLEEP 0x00000001
171#define RT_F_EXIT_SEM 0x00000008
172
173#endif
174
175#endif
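For orientation, a hedged user-space sketch of the one structure this header exports to liblitmus, struct rt_task; the MS_TO_NS helper and the concrete numbers are illustrative, and how the parameters reach the kernel (the sys_set_rt_task_param() syscall added by this patch) is outside this header.

/* Illustrative only: parameters for a task with a 10 ms budget every 100 ms
 * on CPU 0. lt_t values are in nanoseconds. */
#include <litmus/rt_param.h>

#define MS_TO_NS(ms) ((lt_t)(ms) * 1000000ULL)

static struct rt_task example_params = {
	.exec_cost = MS_TO_NS(10),	/* worst-case execution cost per job */
	.period    = MS_TO_NS(100),	/* one job released every 100 ms */
	.phase     = 0,			/* no release offset */
	.cpu       = 0,			/* partition, used by partitioned plugins */
	.cls       = RT_CLASS_SOFT,	/* soft real-time */
};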
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
new file mode 100644
index 000000000000..94952f6ccbfa
--- /dev/null
+++ b/include/litmus/sched_plugin.h
@@ -0,0 +1,159 @@
1/*
2 * Definition of the scheduler plugin interface.
3 *
4 */
5#ifndef _LINUX_SCHED_PLUGIN_H_
6#define _LINUX_SCHED_PLUGIN_H_
7
8#include <linux/sched.h>
9
10/* struct for semaphore with priority inheritance */
11struct pi_semaphore {
12 atomic_t count;
13 int sleepers;
14 wait_queue_head_t wait;
15 union {
16 /* highest-prio holder/waiter */
17 struct task_struct *task;
18 struct task_struct* cpu_task[NR_CPUS];
19 } hp;
20 /* current lock holder */
21 struct task_struct *holder;
22};
23
24/************************ setup/tear down ********************/
25
26typedef long (*activate_plugin_t) (void);
27typedef long (*deactivate_plugin_t) (void);
28
29
30
31/********************* scheduler invocation ******************/
32
33/* Plugin-specific realtime tick handler */
34typedef void (*scheduler_tick_t) (struct task_struct *cur);
35/* Main scheduling decision function: pick the next real-time task to run */
36typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
37/* Clean up after the task switch has occurred.
38 * This function is called after every (even non-rt) task switch.
39 */
40typedef void (*finish_switch_t)(struct task_struct *prev);
41
42
43/********************* task state changes ********************/
44
45/* Called to setup a new real-time task.
46 * Release the first job, enqueue, etc.
47 * Task may already be running.
48 */
49typedef void (*task_new_t) (struct task_struct *task,
50 int on_rq,
51 int running);
52
53/* Called to re-introduce a task after blocking.
54 * Can potentially be called multiple times.
55 */
56typedef void (*task_wake_up_t) (struct task_struct *task);
57/* called to notify the plugin of a blocking real-time task
58 * it will only be called for real-time tasks and before schedule is called */
59typedef void (*task_block_t) (struct task_struct *task);
60/* Called when a real-time task exits or changes to a different scheduling
61 * class.
62 * Free any allocated resources
63 */
64typedef void (*task_exit_t) (struct task_struct *);
65
66/* Called when the new_owner is released from the wait queue
67 * it should now inherit the priority from sem, _before_ it gets readded
68 * to any queue
69 */
70typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
71 struct task_struct *new_owner);
72
73/* Called when the current task releases a semaphore from which it might
74 * have inherited a priority.
75 */
76typedef long (*return_priority_t) (struct pi_semaphore *sem);
77
78/* Called when a task tries to acquire a semaphore and fails. Check if its
79 * priority is higher than that of the current holder.
80 */
81typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
82
83
84
85
86/********************* sys call backends ********************/
87/* This function causes the caller to sleep until the next release */
88typedef long (*complete_job_t) (void);
89
90typedef long (*admit_task_t)(struct task_struct* tsk);
91
92typedef void (*release_at_t)(struct task_struct *t, lt_t start);
93
94struct sched_plugin {
95 struct list_head list;
96 /* basic info */
97 char *plugin_name;
98
99 /* setup */
100 activate_plugin_t activate_plugin;
101 deactivate_plugin_t deactivate_plugin;
102
103#ifdef CONFIG_SRP
104 unsigned int srp_active;
105#endif
106
107 /* scheduler invocation */
108 scheduler_tick_t tick;
109 schedule_t schedule;
110 finish_switch_t finish_switch;
111
112 /* syscall backend */
113 complete_job_t complete_job;
114 release_at_t release_at;
115
116 /* task state changes */
117 admit_task_t admit_task;
118
119 task_new_t task_new;
120 task_wake_up_t task_wake_up;
121 task_block_t task_block;
122 task_exit_t task_exit;
123
124#ifdef CONFIG_FMLP
125 /* priority inheritance */
126 unsigned int fmlp_active;
127 inherit_priority_t inherit_priority;
128 return_priority_t return_priority;
129 pi_block_t pi_block;
130#endif
131} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
132
133
134extern struct sched_plugin *litmus;
135
136int register_sched_plugin(struct sched_plugin* plugin);
137struct sched_plugin* find_sched_plugin(const char* name);
138int print_sched_plugins(char* buf, int max);
139
140static inline int srp_active(void)
141{
142#ifdef CONFIG_SRP
143 return litmus->srp_active;
144#else
145 return 0;
146#endif
147}
148static inline int fmlp_active(void)
149{
150#ifdef CONFIG_FMLP
151 return litmus->fmlp_active;
152#else
153 return 0;
154#endif
155}
156
157extern struct sched_plugin linux_sched_plugin;
158
159#endif
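To show how the pieces fit together, a hedged skeleton of a do-nothing plugin written against the interface above; the DEMO plugin and its callbacks are invented for illustration, and a real plugin must also provide every hook the core invokes unconditionally (e.g. finish_switch(), tick()).

/* Invented example plugin: admits no tasks and never schedules anything.
 * Only struct sched_plugin and register_sched_plugin() come from the patch. */
#include <linux/module.h>
#include <linux/errno.h>
#include <litmus/litmus.h>
#include <litmus/sched_plugin.h>

static long demo_activate_plugin(void)
{
	return 0;			/* nothing to set up */
}

static struct task_struct *demo_schedule(struct task_struct *prev)
{
	return NULL;			/* never pick a real-time task */
}

static void demo_finish_switch(struct task_struct *prev)
{
	/* called after every context switch; nothing to do here */
}

static long demo_admit_task(struct task_struct *tsk)
{
	return -EINVAL;			/* refuse all admission requests */
}

static struct sched_plugin demo_plugin = {
	.plugin_name		= "DEMO",
	.activate_plugin	= demo_activate_plugin,
	.schedule		= demo_schedule,
	.finish_switch		= demo_finish_switch,
	.admit_task		= demo_admit_task,
};

static int __init init_demo_plugin(void)
{
	return register_sched_plugin(&demo_plugin);
}
module_init(init_demo_plugin);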
diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
new file mode 100644
index 000000000000..afd0391d127b
--- /dev/null
+++ b/include/litmus/sched_trace.h
@@ -0,0 +1,191 @@
1/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
2 */
3#ifndef _LINUX_SCHED_TRACE_H_
4#define _LINUX_SCHED_TRACE_H_
5
6/* all times in nanoseconds */
7
8struct st_trace_header {
9 u8 type; /* Of what type is this record? */
10 u8 cpu; /* On which CPU was it recorded? */
11 u16 pid; /* PID of the task. */
12 u32 job; /* The job sequence number. */
13};
14
15#define ST_NAME_LEN 16
16struct st_name_data {
17 char cmd[ST_NAME_LEN];/* The name of the executable of this process. */
18};
19
20struct st_param_data { /* regular params */
21 u32 wcet;
22 u32 period;
23 u32 phase;
24 u8 partition;
25 u8 __unused[3];
26};
27
28struct st_release_data { /* A job was or is going to be released. */
29 u64 release; /* What's the release time? */
30 u64 deadline; /* By when must it finish? */
31};
32
33struct st_assigned_data { /* A job was assigned to a CPU. */
34 u64 when;
35 u8 target; /* Where should it execute? */
36 u8 __unused[3];
37};
38
39struct st_switch_to_data { /* A process was switched to on a given CPU. */
40 u64 when; /* When did this occur? */
41 u32 exec_time; /* Time the current job has executed. */
42
43};
44
45struct st_switch_away_data { /* A process was switched away from on a given CPU. */
46 u64 when;
47 u64 exec_time;
48};
49
50struct st_completion_data { /* A job completed. */
51 u64 when;
52 u8 forced:1; /* Set to 1 if job overran and kernel advanced to the
53 * next task automatically; set to 0 otherwise.
54 */
55 u8 __uflags:7;
56 u8 __unused[3];
57};
58
59struct st_block_data { /* A task blocks. */
60 u64 when;
61 u64 __unused;
62};
63
64struct st_resume_data { /* A task resumes. */
65 u64 when;
66 u64 __unused;
67};
68
69struct st_sys_release_data {
70 u64 when;
71 u64 release;
72};
73
74#define DATA(x) struct st_ ## x ## _data x;
75
76typedef enum {
77 ST_NAME = 1, /* Start at one, so that we can spot
78 * uninitialized records. */
79 ST_PARAM,
80 ST_RELEASE,
81 ST_ASSIGNED,
82 ST_SWITCH_TO,
83 ST_SWITCH_AWAY,
84 ST_COMPLETION,
85 ST_BLOCK,
86 ST_RESUME,
87 ST_SYS_RELEASE,
88} st_event_record_type_t;
89
90struct st_event_record {
91 struct st_trace_header hdr;
92 union {
93 u64 raw[2];
94
95 DATA(name);
96 DATA(param);
97 DATA(release);
98 DATA(assigned);
99 DATA(switch_to);
100 DATA(switch_away);
101 DATA(completion);
102 DATA(block);
103 DATA(resume);
104 DATA(sys_release);
105
106 } data;
107};
108
109#undef DATA
110
111#ifdef __KERNEL__
112
113#include <linux/sched.h>
114#include <litmus/feather_trace.h>
115
116#ifdef CONFIG_SCHED_TASK_TRACE
117
118#define SCHED_TRACE(id, callback, task) \
119 ft_event1(id, callback, task)
120#define SCHED_TRACE2(id, callback, task, xtra) \
121 ft_event2(id, callback, task, xtra)
122
123/* provide prototypes; needed on sparc64 */
124#ifndef NO_TASK_TRACE_DECLS
125feather_callback void do_sched_trace_task_name(unsigned long id,
126 struct task_struct* task);
127feather_callback void do_sched_trace_task_param(unsigned long id,
128 struct task_struct* task);
129feather_callback void do_sched_trace_task_release(unsigned long id,
130 struct task_struct* task);
131feather_callback void do_sched_trace_task_switch_to(unsigned long id,
132 struct task_struct* task);
133feather_callback void do_sched_trace_task_switch_away(unsigned long id,
134 struct task_struct* task);
135feather_callback void do_sched_trace_task_completion(unsigned long id,
136 struct task_struct* task,
137 unsigned long forced);
138feather_callback void do_sched_trace_task_block(unsigned long id,
139 struct task_struct* task);
140feather_callback void do_sched_trace_task_resume(unsigned long id,
141 struct task_struct* task);
142feather_callback void do_sched_trace_sys_release(unsigned long id,
143 lt_t* start);
144#endif
145
146#else
147
148#define SCHED_TRACE(id, callback, task) /* no tracing */
149#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */
150
151#endif
152
153
154#define SCHED_TRACE_BASE_ID 500
155
156
157#define sched_trace_task_name(t) \
158 SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, do_sched_trace_task_name, t)
159#define sched_trace_task_param(t) \
160 SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, do_sched_trace_task_param, t)
161#define sched_trace_task_release(t) \
162 SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, do_sched_trace_task_release, t)
163#define sched_trace_task_switch_to(t) \
164 SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, do_sched_trace_task_switch_to, t)
165#define sched_trace_task_switch_away(t) \
166 SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, do_sched_trace_task_switch_away, t)
167#define sched_trace_task_completion(t, forced) \
168 SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, do_sched_trace_task_completion, t, \
169 forced)
170#define sched_trace_task_block(t) \
171 SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, do_sched_trace_task_block, t)
172#define sched_trace_task_resume(t) \
173 SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, do_sched_trace_task_resume, t)
174
175#define sched_trace_sys_release(when) \
176 SCHED_TRACE(SCHED_TRACE_BASE_ID + 9, do_sched_trace_sys_release, when)
177
178#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
179
180#ifdef CONFIG_SCHED_DEBUG_TRACE
181void sched_trace_log_message(const char* fmt, ...);
182void dump_trace_buffer(int max);
183#else
184
185#define sched_trace_log_message(fmt, ...)
186
187#endif
188
189#endif /* __KERNEL__ */
190
191#endif
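Because the header is 8 bytes and every payload is 16 bytes, an offline tool can consume the stream in fixed 24-byte records. The following is a hypothetical user-space decoder sketch; the structs merely mirror the kernel definitions above (u8 -> uint8_t and so on), and how the byte stream is exported to user space is not part of this header.

/* Hypothetical user-space decoder for the fixed-size records described above. */
#include <stdio.h>
#include <stdint.h>

struct rec_header  { uint8_t type; uint8_t cpu; uint16_t pid; uint32_t job; };
struct rec_release { uint64_t release; uint64_t deadline; };

struct rec {				/* mirrors struct st_event_record */
	struct rec_header hdr;
	union {
		uint64_t raw[2];
		struct rec_release release;
	} data;
};

#define ST_RELEASE 3			/* third entry of st_event_record_type_t */

static void print_record(const struct rec *r)
{
	if (r->hdr.type == ST_RELEASE)
		printf("P%u %u/%u released %llu, deadline %llu\n",
		       r->hdr.cpu, r->hdr.pid, r->hdr.job,
		       (unsigned long long) r->data.release.release,
		       (unsigned long long) r->data.release.deadline);
}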
diff --git a/include/litmus/trace.h b/include/litmus/trace.h
new file mode 100644
index 000000000000..e8e0c7b6cc6a
--- /dev/null
+++ b/include/litmus/trace.h
@@ -0,0 +1,113 @@
1#ifndef _SYS_TRACE_H_
2#define _SYS_TRACE_H_
3
4#ifdef CONFIG_SCHED_OVERHEAD_TRACE
5
6#include <litmus/feather_trace.h>
7#include <litmus/feather_buffer.h>
8
9
10/*********************** TIMESTAMPS ************************/
11
12enum task_type_marker {
13 TSK_BE,
14 TSK_RT,
15 TSK_UNKNOWN
16};
17
18struct timestamp {
19 uint64_t timestamp;
20 uint32_t seq_no;
21 uint8_t cpu;
22 uint8_t event;
23 uint8_t task_type;
24};
25
26/* tracing callbacks */
27feather_callback void save_timestamp(unsigned long event);
28feather_callback void save_timestamp_def(unsigned long event, unsigned long type);
29feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr);
30feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu);
31
32
33#define TIMESTAMP(id) ft_event0(id, save_timestamp)
34
35#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, def)
36
37#define TTIMESTAMP(id, task) \
38 ft_event1(id, save_timestamp_task, (unsigned long) task)
39
40#define CTIMESTAMP(id, cpu) \
41 ft_event1(id, save_timestamp_cpu, cpu)
42
43#else /* !CONFIG_SCHED_OVERHEAD_TRACE */
44
45#define TIMESTAMP(id) /* no tracing */
46
47#define DTIMESTAMP(id, def) /* no tracing */
48
49#define TTIMESTAMP(id, task) /* no tracing */
50
51#define CTIMESTAMP(id, cpu) /* no tracing */
52
53#endif
54
55
56/* Convention for timestamps
57 * =========================
58 *
59 * In order to process the trace files with a common tool, we use the following
60 * convention to measure execution times: The end time id of a code segment is
61 * always the next number after the start time event id.
62 */
63
64#define TS_SCHED_START DTIMESTAMP(100, TSK_UNKNOWN) /* we only
65 * care
66 * about
67 * next */
68#define TS_SCHED_END(t) TTIMESTAMP(101, t)
69#define TS_SCHED2_START(t) TTIMESTAMP(102, t)
70#define TS_SCHED2_END(t) TTIMESTAMP(103, t)
71
72#define TS_CXS_START(t) TTIMESTAMP(104, t)
73#define TS_CXS_END(t) TTIMESTAMP(105, t)
74
75#define TS_RELEASE_START DTIMESTAMP(106, TSK_RT)
76#define TS_RELEASE_END DTIMESTAMP(107, TSK_RT)
77
78#define TS_TICK_START(t) TTIMESTAMP(110, t)
79#define TS_TICK_END(t) TTIMESTAMP(111, t)
80
81
82#define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */
83#define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */
84
85#define TS_PLUGIN_TICK_START /* TIMESTAMP(130) */
86#define TS_PLUGIN_TICK_END /* TIMESTAMP(131) */
87
88#define TS_ENTER_NP_START TIMESTAMP(140)
89#define TS_ENTER_NP_END TIMESTAMP(141)
90
91#define TS_EXIT_NP_START TIMESTAMP(150)
92#define TS_EXIT_NP_END TIMESTAMP(151)
93
94#define TS_SRP_UP_START TIMESTAMP(160)
95#define TS_SRP_UP_END TIMESTAMP(161)
96#define TS_SRP_DOWN_START TIMESTAMP(162)
97#define TS_SRP_DOWN_END TIMESTAMP(163)
98
99#define TS_PI_UP_START TIMESTAMP(170)
100#define TS_PI_UP_END TIMESTAMP(171)
101#define TS_PI_DOWN_START TIMESTAMP(172)
102#define TS_PI_DOWN_END TIMESTAMP(173)
103
104#define TS_FIFO_UP_START TIMESTAMP(180)
105#define TS_FIFO_UP_END TIMESTAMP(181)
106#define TS_FIFO_DOWN_START TIMESTAMP(182)
107#define TS_FIFO_DOWN_END TIMESTAMP(183)
108
109#define TS_SEND_RESCHED_START(c) CTIMESTAMP(190, c)
110#define TS_SEND_RESCHED_END DTIMESTAMP(191, TSK_UNKNOWN)
111
112
113#endif /* !_SYS_TRACE_H_ */
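The start/end convention above (end event id = start id + 1) is what makes offline matching simple. Below is a hypothetical user-space sketch of the pairing pass an analysis tool could run over an array of struct timestamp records; the units are whatever the timestamp source provides, typically cycles.

/* Hypothetical pairing pass: for each start/end pair (end id == start id + 1)
 * on the same CPU, report the measured overhead in timestamp units. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct timestamp {		/* mirrors include/litmus/trace.h */
	uint64_t	timestamp;
	uint32_t	seq_no;
	uint8_t		cpu;
	uint8_t		event;
	uint8_t		task_type;
};

static void report_overheads(const struct timestamp *ts, size_t n,
			     uint8_t start_id)
{
	size_t i, j;
	for (i = 0; i < n; i++) {
		if (ts[i].event != start_id)
			continue;
		/* find the matching end event on the same CPU */
		for (j = i + 1; j < n; j++) {
			if (ts[j].cpu == ts[i].cpu &&
			    ts[j].event == start_id + 1) {
				printf("cpu %u: event %u took %llu\n",
				       ts[i].cpu, start_id,
				       (unsigned long long)
				       (ts[j].timestamp - ts[i].timestamp));
				break;
			}
		}
	}
}

For example, report_overheads(buf, n, 100) pairs TS_SCHED_START (id 100) with TS_SCHED_END (id 101).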
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c49257c..889730cce3ad 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,9 @@
 
 #include <trace/events/sched.h>
 
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -162,6 +165,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	exit_litmus(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 
@@ -244,6 +248,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	tsk->stack = ti;
 
+	/* Don't let the new task be a real-time task. */
+	memset(&tsk->rt_param, 0, sizeof(struct rt_task));
+
 	err = prop_local_init_single(&tsk->dirties);
 	if (err)
 		goto out;
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0a948d..fcaed6b96442 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,9 @@
 
 #include "sched_cpupri.h"
 
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
@@ -571,6 +574,8 @@ struct rq {
 
 	atomic_t nr_iowait;
 
+	struct task_struct *litmus_next;
+
 #ifdef CONFIG_SMP
 	struct root_domain *rd;
 	struct sched_domain *sd;
@@ -1815,11 +1820,12 @@ static void calc_load_account_active(struct rq *this_rq);
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "../litmus/sched_litmus.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&litmus_sched_class)
 #define for_each_class(class) \
 	for (class = sched_class_highest; class; class = class->next)
 
@@ -2343,6 +2349,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	unsigned long flags;
 	struct rq *rq, *orig_rq;
 
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
+
 	if (!sched_feat(SYNC_WAKEUPS))
 		wake_flags &= ~WF_SYNC;
 
@@ -2361,7 +2370,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	orig_cpu = cpu;
 
 #ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
+	if (unlikely(task_running(rq, p)) || is_realtime(p))
 		goto out_activate;
 
 	/*
@@ -2442,6 +2451,8 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
 	task_rq_unlock(rq, &flags);
 	put_cpu();
 
@@ -2750,6 +2761,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	litmus->finish_switch(prev);
+	prev->rt_param.stack_in_use = NO_CPU;
 	perf_event_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 
@@ -5232,18 +5245,31 @@ void scheduler_tick(void)
 
 	sched_clock_tick();
 
+	TS_TICK_START(current);
+
 	spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+
+	/*
+	 * LITMUS_TODO: can we move litmus_tick inside task_tick
+	 * or will it deadlock?
+	 */
+	TS_PLUGIN_TICK_START;
+	litmus_tick(rq, curr);
+	TS_PLUGIN_TICK_END;
+
 	spin_unlock(&rq->lock);
 
 	perf_event_task_tick(curr, cpu);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
-	trigger_load_balance(rq, cpu);
+	if (!is_realtime(current))
+		trigger_load_balance(rq, cpu);
 #endif
+	TS_TICK_END(current);
 }
 
 notrace unsigned long get_parent_ip(unsigned long addr)
@@ -5387,11 +5413,17 @@ pick_next_task(struct rq *rq)
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
+	/*
+	 * LITMUS_TODO: can we move processes out of fair class?
+	 * i.e., create a litmus_rq
+	 */
+	/* Don't do this for LITMUS
 	if (likely(rq->nr_running == rq->cfs.nr_running)) {
 		p = fair_sched_class.pick_next_task(rq);
 		if (likely(p))
 			return p;
 	}
+	*/
 
 	class = sched_class_highest;
 	for ( ; ; ) {
@@ -5426,6 +5458,8 @@ need_resched:
 
 	release_kernel_lock(prev);
 need_resched_nonpreemptible:
+	TS_SCHED_START;
+	sched_trace_task_switch_away(prev);
 
 	schedule_debug(prev);
 
@@ -5436,6 +5470,14 @@ need_resched_nonpreemptible:
 	update_rq_clock(rq);
 	clear_tsk_need_resched(prev);
 
+	/*
+	 * LITMUS_TODO: can we integrate litmus_schedule in
+	 * pick_next_task?
+	 */
+	TS_PLUGIN_SCHED_START;
+	litmus_schedule(rq, prev);
+	TS_PLUGIN_SCHED_END;
+
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely(signal_pending_state(prev->state, prev)))
 			prev->state = TASK_RUNNING;
@@ -5460,22 +5502,35 @@ need_resched_nonpreemptible:
 		rq->curr = next;
 		++*switch_count;
 
+		TS_SCHED_END(next);
+		TS_CXS_START(next);
 		context_switch(rq, prev, next); /* unlocks the rq */
+		TS_CXS_END(current);
 		/*
 		 * the context switch might have flipped the stack from under
 		 * us, hence refresh the local variables.
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
-	} else
+	} else {
+		TS_SCHED_END(prev);
 		spin_unlock_irq(&rq->lock);
+	}
+
+	TS_SCHED2_START(current);
+	sched_trace_task_switch_to(current);
 
 	post_schedule(rq);
 
-	if (unlikely(reacquire_kernel_lock(current) < 0))
+	if (unlikely(reacquire_kernel_lock(current) < 0)) {
+		TS_SCHED2_END(current);
 		goto need_resched_nonpreemptible;
+	}
 
 	preempt_enable_no_resched();
+
+	TS_SCHED2_END(current);
+
 	if (need_resched())
 		goto need_resched;
 }
@@ -6185,6 +6240,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	case SCHED_RR:
 		p->sched_class = &rt_sched_class;
 		break;
+	case SCHED_LITMUS:
+		p->sched_class = &litmus_sched_class;
+		break;
 	}
 
 	p->rt_priority = prio;
@@ -6232,7 +6290,7 @@ recheck:
 
 	if (policy != SCHED_FIFO && policy != SCHED_RR &&
 			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-			policy != SCHED_IDLE)
+			policy != SCHED_IDLE && policy != SCHED_LITMUS)
 		return -EINVAL;
 	}
 
@@ -6247,6 +6305,8 @@ recheck:
 		return -EINVAL;
 	if (rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
+	if (policy == SCHED_LITMUS && policy == p->policy)
+		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
@@ -6301,6 +6361,12 @@ recheck:
 			return retval;
 	}
 
+	if (policy == SCHED_LITMUS) {
+		retval = litmus_admit_task(p);
+		if (retval)
+			return retval;
+	}
+
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
@@ -6328,9 +6394,18 @@ recheck:
 
 	p->sched_reset_on_fork = reset_on_fork;
 
+	if (p->policy == SCHED_LITMUS)
+		litmus_exit_task(p);
+
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (policy == SCHED_LITMUS) {
+		p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
+		p->rt_param.present = running;
+		litmus->task_new(p, on_rq, running);
+	}
+
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
@@ -6500,10 +6575,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	read_lock(&tasklist_lock);
 
 	p = find_process_by_pid(pid);
-	if (!p) {
+	/* Don't set affinity if task not found and for LITMUS tasks */
+	if (!p || is_realtime(p)) {
 		read_unlock(&tasklist_lock);
 		put_online_cpus();
-		return -ESRCH;
+		return p ? -EPERM : -ESRCH;
 	}
 
 	/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..ef43ff95999d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1598,7 +1598,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 
 	update_curr(cfs_rq);
 
-	if (unlikely(rt_prio(p->prio))) {
+	if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) {
 		resched_task(curr);
 		return;
 	}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..f622880e918f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1004,7 +1004,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (p->prio < rq->curr->prio) {
+	if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) {
 		resched_task(rq->curr);
 		return;
 	}
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 000000000000..f8c642658a2f
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,50 @@
1menu "LITMUS^RT"
2
3menu "Tracing"
4
5config FEATHER_TRACE
6 bool "Feather-Trace Infrastructure"
7 default y
8 help
9 Feather-Trace basic tracing infrastructure. Includes device file
10 driver and instrumentation point support.
11
12
13config SCHED_TASK_TRACE
14 bool "Trace real-time tasks"
15 depends on FEATHER_TRACE
16 default y
17 help
18 Include support for the sched_trace_XXX() tracing functions. This
19 allows the collection of real-time task events such as job
20 completions, job releases, early completions, etc. This results in a
21 small overhead in the scheduling code. Disable if the overhead is not
22 acceptable (e.g., benchmarking).
23
24 Say Yes for debugging.
25 Say No for overhead tracing.
26
27config SCHED_OVERHEAD_TRACE
28 bool "Record timestamps for overhead measurements"
29 depends on FEATHER_TRACE
30 default n
31 help
32 Export event stream for overhead tracing.
33 Say Yes for overhead tracing.
34
35config SCHED_DEBUG_TRACE
36 bool "TRACE() debugging"
37 default y
38 help
39 Include support for sched_trace_log_message(), which is used to
40 implement TRACE(). If disabled, no TRACE() messages will be included
41 in the kernel, and no overheads due to debugging statements will be
42 incurred by the scheduler. Disable if the overhead is not acceptable
43 (e.g. benchmarking).
44
45 Say Yes for debugging.
46 Say No for overhead tracing.
47
48endmenu
49
50endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 000000000000..f4c2d564cd0b
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for LITMUS^RT
3#
4
5obj-y = sched_plugin.o litmus.o \
6 jobs.o \
7 heap.o
8
9obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
10obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
11obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
12obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 000000000000..6084b6d6b364
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,43 @@
1#include <linux/types.h>
2
3#include <litmus/feather_trace.h>
4
5#ifndef __ARCH_HAS_FEATHER_TRACE
6/* provide dummy implementation */
7
8int ft_events[MAX_EVENTS];
9
10int ft_enable_event(unsigned long id)
11{
12 if (id < MAX_EVENTS) {
13 ft_events[id]++;
14 return 1;
15 } else
16 return 0;
17}
18
19int ft_disable_event(unsigned long id)
20{
21 if (id < MAX_EVENTS && ft_events[id]) {
22 ft_events[id]--;
23 return 1;
24 } else
25 return 0;
26}
27
28int ft_disable_all_events(void)
29{
30 int i;
31
32 for (i = 0; i < MAX_EVENTS; i++)
33 ft_events[i] = 0;
34
35 return MAX_EVENTS;
36}
37
38int ft_is_event_enabled(unsigned long id)
39{
40 return id < MAX_EVENTS && ft_events[id];
41}
42
43#endif
diff --git a/litmus/heap.c b/litmus/heap.c
new file mode 100644
index 000000000000..112d14da46c3
--- /dev/null
+++ b/litmus/heap.c
@@ -0,0 +1,314 @@
1#include "linux/kernel.h"
2#include "litmus/heap.h"
3
4void heap_init(struct heap* heap)
5{
6 heap->head = NULL;
7 heap->min = NULL;
8}
9
10void heap_node_init(struct heap_node** _h, void* value)
11{
12 struct heap_node* h = *_h;
13 h->parent = NULL;
14 h->next = NULL;
15 h->child = NULL;
16 h->degree = NOT_IN_HEAP;
17 h->value = value;
18 h->ref = _h;
19}
20
21
22/* make child a subtree of root */
23static void __heap_link(struct heap_node* root,
24 struct heap_node* child)
25{
26 child->parent = root;
27 child->next = root->child;
28 root->child = child;
29 root->degree++;
30}
31
32/* merge root lists */
33static struct heap_node* __heap_merge(struct heap_node* a,
34 struct heap_node* b)
35{
36 struct heap_node* head = NULL;
37 struct heap_node** pos = &head;
38
39 while (a && b) {
40 if (a->degree < b->degree) {
41 *pos = a;
42 a = a->next;
43 } else {
44 *pos = b;
45 b = b->next;
46 }
47 pos = &(*pos)->next;
48 }
49 if (a)
50 *pos = a;
51 else
52 *pos = b;
53 return head;
54}
55
56/* reverse a linked list of nodes. also clears parent pointer */
57static struct heap_node* __heap_reverse(struct heap_node* h)
58{
59 struct heap_node* tail = NULL;
60 struct heap_node* next;
61
62 if (!h)
63 return h;
64
65 h->parent = NULL;
66 while (h->next) {
67 next = h->next;
68 h->next = tail;
69 tail = h;
70 h = next;
71 h->parent = NULL;
72 }
73 h->next = tail;
74 return h;
75}
76
77static void __heap_min(heap_prio_t higher_prio, struct heap* heap,
78 struct heap_node** prev, struct heap_node** node)
79{
80 struct heap_node *_prev, *cur;
81 *prev = NULL;
82
83 if (!heap->head) {
84 *node = NULL;
85 return;
86 }
87
88 *node = heap->head;
89 _prev = heap->head;
90 cur = heap->head->next;
91 while (cur) {
92 if (higher_prio(cur, *node)) {
93 *node = cur;
94 *prev = _prev;
95 }
96 _prev = cur;
97 cur = cur->next;
98 }
99}
100
101static void __heap_union(heap_prio_t higher_prio, struct heap* heap,
102 struct heap_node* h2)
103{
104 struct heap_node* h1;
105 struct heap_node *prev, *x, *next;
106 if (!h2)
107 return;
108 h1 = heap->head;
109 if (!h1) {
110 heap->head = h2;
111 return;
112 }
113 h1 = __heap_merge(h1, h2);
114 prev = NULL;
115 x = h1;
116 next = x->next;
117 while (next) {
118 if (x->degree != next->degree ||
119 (next->next && next->next->degree == x->degree)) {
120 /* nothing to do, advance */
121 prev = x;
122 x = next;
123 } else if (higher_prio(x, next)) {
124 /* x becomes the root of next */
125 x->next = next->next;
126 __heap_link(x, next);
127 } else {
128 /* next becomes the root of x */
129 if (prev)
130 prev->next = next;
131 else
132 h1 = next;
133 __heap_link(next, x);
134 x = next;
135 }
136 next = x->next;
137 }
138 heap->head = h1;
139}
140
141static struct heap_node* __heap_extract_min(heap_prio_t higher_prio,
142 struct heap* heap)
143{
144 struct heap_node *prev, *node;
145 __heap_min(higher_prio, heap, &prev, &node);
146 if (!node)
147 return NULL;
148 if (prev)
149 prev->next = node->next;
150 else
151 heap->head = node->next;
152 __heap_union(higher_prio, heap, __heap_reverse(node->child));
153 return node;
154}
155
156/* insert (and reinitialize) a node into the heap */
157void heap_insert(heap_prio_t higher_prio, struct heap* heap,
158 struct heap_node* node)
159{
160 struct heap_node *min;
161 node->child = NULL;
162 node->parent = NULL;
163 node->next = NULL;
164 node->degree = 0;
165 if (heap->min && higher_prio(node, heap->min)) {
166 /* swap min cache */
167 min = heap->min;
168 min->child = NULL;
169 min->parent = NULL;
170 min->next = NULL;
171 min->degree = 0;
172 __heap_union(higher_prio, heap, min);
173 heap->min = node;
174 } else
175 __heap_union(higher_prio, heap, node);
176}
177
178void heap_uncache_min(heap_prio_t higher_prio, struct heap* heap)
179{
180 struct heap_node* min;
181 if (heap->min) {
182 min = heap->min;
183 heap->min = NULL;
184 heap_insert(higher_prio, heap, min);
185 }
186}
187
188/* merge addition into target */
189void heap_union(heap_prio_t higher_prio,
190 struct heap* target, struct heap* addition)
191{
192 /* first insert any cached minima, if necessary */
193 heap_uncache_min(higher_prio, target);
194 heap_uncache_min(higher_prio, addition);
195 __heap_union(higher_prio, target, addition->head);
196 /* this is a destructive merge */
197 addition->head = NULL;
198}
199
200struct heap_node* heap_peek(heap_prio_t higher_prio,
201 struct heap* heap)
202{
203 if (!heap->min)
204 heap->min = __heap_extract_min(higher_prio, heap);
205 return heap->min;
206}
207
208struct heap_node* heap_take(heap_prio_t higher_prio,
209 struct heap* heap)
210{
211 struct heap_node *node;
212 if (!heap->min)
213 heap->min = __heap_extract_min(higher_prio, heap);
214 node = heap->min;
215 heap->min = NULL;
216 if (node)
217 node->degree = NOT_IN_HEAP;
218 return node;
219}
220
221int heap_decrease(heap_prio_t higher_prio, struct heap_node* node)
222{
223 struct heap_node *parent;
224 struct heap_node** tmp_ref;
225 void* tmp;
226
227 /* bubble up */
228 parent = node->parent;
229 while (parent && higher_prio(node, parent)) {
230 /* swap parent and node */
231 tmp = parent->value;
232 parent->value = node->value;
233 node->value = tmp;
234 /* swap references */
235 *(parent->ref) = node;
236 *(node->ref) = parent;
237 tmp_ref = parent->ref;
238 parent->ref = node->ref;
239 node->ref = tmp_ref;
240 /* step up */
241 node = parent;
242 parent = node->parent;
243 }
244
245 return parent != NULL;
246}
247
248void heap_delete(heap_prio_t higher_prio, struct heap* heap,
249 struct heap_node* node)
250{
251 struct heap_node *parent, *prev, *pos;
252 struct heap_node** tmp_ref;
253 void* tmp;
254
255 if (heap->min != node) {
256 /* bubble up */
257 parent = node->parent;
258 while (parent) {
259 /* swap parent and node */
260 tmp = parent->value;
261 parent->value = node->value;
262 node->value = tmp;
263 /* swap references */
264 *(parent->ref) = node;
265 *(node->ref) = parent;
266 tmp_ref = parent->ref;
267 parent->ref = node->ref;
268 node->ref = tmp_ref;
269 /* step up */
270 node = parent;
271 parent = node->parent;
272 }
273 /* now delete:
274 * first find prev */
275 prev = NULL;
276 pos = heap->head;
277 while (pos != node) {
278 prev = pos;
279 pos = pos->next;
280 }
281 /* we have prev, now remove node */
282 if (prev)
283 prev->next = node->next;
284 else
285 heap->head = node->next;
286 __heap_union(higher_prio, heap, __heap_reverse(node->child));
287 } else
288 heap->min = NULL;
289 node->degree = NOT_IN_HEAP;
290}
291
292/* allocate a heap node for value and insert into the heap */
293int heap_add(heap_prio_t higher_prio, struct heap* heap,
294 void* value, int gfp_flags)
295{
296 struct heap_node* hn = heap_node_alloc(gfp_flags);
297 if (likely(hn)) {
298 heap_node_init(&hn, value);
299 heap_insert(higher_prio, heap, hn);
300 }
301 return hn != NULL;
302}
303
304void* heap_take_del(heap_prio_t higher_prio,
305 struct heap* heap)
306{
307 struct heap_node* hn = heap_take(higher_prio, heap);
308 void* ret = NULL;
309 if (hn) {
310 ret = hn->value;
311 heap_node_free(hn);
312 }
313 return ret;
314}
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 000000000000..36e314625d86
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,43 @@
1/* litmus/jobs.c - common job control code
2 */
3
4#include <linux/sched.h>
5
6#include <litmus/litmus.h>
7#include <litmus/jobs.h>
8
9void prepare_for_next_period(struct task_struct *t)
10{
11 BUG_ON(!t);
12 /* prepare next release */
13 t->rt_param.job_params.release = t->rt_param.job_params.deadline;
14 t->rt_param.job_params.deadline += get_rt_period(t);
15 t->rt_param.job_params.exec_time = 0;
16 /* update job sequence number */
17 t->rt_param.job_params.job_no++;
18
19 /* don't confuse Linux */
20 t->rt.time_slice = 1;
21}
22
23void release_at(struct task_struct *t, lt_t start)
24{
25 t->rt_param.job_params.deadline = start;
26 prepare_for_next_period(t);
27 set_rt_flags(t, RT_F_RUNNING);
28}
29
30
31/*
32 * Deactivate current task until the beginning of the next period.
33 */
34long complete_job(void)
35{
36	/* Mark that we do not execute anymore */
37 set_rt_flags(current, RT_F_SLEEP);
38 /* call schedule, this will return when a new job arrives
39 * it also takes care of preparing for the next release
40 */
41 schedule();
42 return 0;
43}
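
For reference, the arithmetic above yields the usual implicit-deadline timeline: release_at() anchors the first release, and every prepare_for_next_period() slides the window forward by one period. A small sketch, assuming a task t with period P (litmus_clock() is used the same way in litmus/sched_litmus.c below):

    lt_t now = litmus_clock();
    release_at(t, now);            /* job 1: release = now,     deadline = now + P  */
    prepare_for_next_period(t);    /* job 2: release = now + P, deadline = now + 2P */
    /* complete_job() reaches the same state implicitly: per the comment above,
     * schedule() only returns once the next job has been released and prepared. */
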
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 000000000000..eb0d17e298d7
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,654 @@
1/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS initialization code,
2 * and the procfs interface.
3 */
4#include <asm/uaccess.h>
5#include <linux/uaccess.h>
6#include <linux/sysrq.h>
7
8#include <linux/module.h>
9#include <linux/proc_fs.h>
10#include <linux/slab.h>
11
12#include <litmus/litmus.h>
13#include <linux/sched.h>
14#include <litmus/sched_plugin.h>
15
16#include <litmus/heap.h>
17
18#include <litmus/trace.h>
19
20/* Number of RT tasks that exist in the system */
21atomic_t rt_task_count = ATOMIC_INIT(0);
22static DEFINE_SPINLOCK(task_transition_lock);
23
24/* Give log messages sequential IDs. */
25atomic_t __log_seq_no = ATOMIC_INIT(0);
26
27/* current master CPU for handling timer IRQs */
28atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
29
30static struct kmem_cache * heap_node_cache;
31
32struct heap_node* heap_node_alloc(int gfp_flags)
33{
34 return kmem_cache_alloc(heap_node_cache, gfp_flags);
35}
36
37void heap_node_free(struct heap_node* hn)
38{
39 kmem_cache_free(heap_node_cache, hn);
40}
41
42/*
43 * sys_set_rt_task_param
44 * @pid: Pid of the task whose scheduling parameters must be changed
45 * @param: New real-time extension parameters such as the execution cost and
46 * period
47 * Syscall for manipulating a task's RT extension params
48 * Returns EFAULT if copying param from user space fails,
49 * ESRCH if pid does not correspond to a valid task,
50 * EINVAL if pid is negative, param is NULL, or
51 * period or execution cost is <= 0,
52 * EBUSY if pid is already a real-time task,
53 * 0 on success.
54 *
55 * Only non-real-time tasks may be configured with this system call
56 * to avoid races with the scheduler. In practice, this means that a
57 * task's parameters must be set _before_ calling sys_prepare_rt_task()
58 *
59 * find_task_by_vpid() assumes that we are in the same namespace as the
60 * target.
61 */
62asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
63{
64 struct rt_task tp;
65 struct task_struct *target;
66 int retval = -EINVAL;
67
68 printk("Setting up rt task parameters for process %d.\n", pid);
69
70 if (pid < 0 || param == 0) {
71 goto out;
72 }
73 if (copy_from_user(&tp, param, sizeof(tp))) {
74 retval = -EFAULT;
75 goto out;
76 }
77
78 /* Task search and manipulation must be protected */
79 read_lock_irq(&tasklist_lock);
80 if (!(target = find_task_by_vpid(pid))) {
81 retval = -ESRCH;
82 goto out_unlock;
83 }
84
85 if (is_realtime(target)) {
86 /* The task is already a real-time task.
87 * We cannot allow parameter changes at this point.
88 */
89 retval = -EBUSY;
90 goto out_unlock;
91 }
92
93 if (tp.exec_cost <= 0)
94 goto out_unlock;
95 if (tp.period <= 0)
96 goto out_unlock;
97 if (!cpu_online(tp.cpu))
98 goto out_unlock;
99 if (tp.period < tp.exec_cost)
100 {
101 printk(KERN_INFO "litmus: real-time task %d rejected "
102 "because wcet > period\n", pid);
103 goto out_unlock;
104 }
105
106 target->rt_param.task_params = tp;
107
108 retval = 0;
109 out_unlock:
110 read_unlock_irq(&tasklist_lock);
111 out:
112 return retval;
113}
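
The admission checks above translate directly into what user space has to supply: a positive execution cost that does not exceed the period, and an online CPU. A hedged userspace sketch follows; the wrapper name set_rt_task_param() and the concrete time values are assumptions, only the struct rt_task fields exec_cost, period and cpu come from this patch.

    /* Illustrative userspace snippet; set_rt_task_param() stands in for
     * whatever library wrapper issues sys_set_rt_task_param(). */
    struct rt_task tp = {
            .exec_cost = 10,        /* must be > 0 and no larger than period */
            .period    = 100,       /* must be > 0                           */
            .cpu       = 0,         /* must be an online CPU                 */
    };

    if (set_rt_task_param(getpid(), &tp) < 0)
            perror("set_rt_task_param");

This must happen before the task becomes real-time; otherwise the call fails with EBUSY, as checked above.
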
114
115/*
116 * Getter of task's RT params
117 * returns EINVAL if param is NULL or pid is negative
118 * returns ESRCH if pid does not correspond to a valid task
119 * returns EFAULT if copying of parameters has failed.
120 *
121 * find_task_by_vpid() assumes that we are in the same namespace as the
122 * target.
123 */
124asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
125{
126 int retval = -EINVAL;
127 struct task_struct *source;
128 struct rt_task lp;
129 if (param == 0 || pid < 0)
130 goto out;
131 read_lock(&tasklist_lock);
132 if (!(source = find_task_by_vpid(pid))) {
133 retval = -ESRCH;
134 goto out_unlock;
135 }
136 lp = source->rt_param.task_params;
137 read_unlock(&tasklist_lock);
138 /* Do copying outside the lock */
139 retval =
140 copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
141 return retval;
142 out_unlock:
143 read_unlock(&tasklist_lock);
144 out:
145 return retval;
146
147}
148
149/*
150 * This is the crucial function for the periodic task implementation.
151 * It checks whether the task is periodic, whether this kind of sleep
152 * is permitted, and calls the plugin-specific sleep, which puts the
153 * task into a wait array.
154 * returns 0 on successful wakeup
155 * returns EPERM if current conditions do not permit such sleep
156 * returns EINVAL if current task is not able to go to sleep
157 */
158asmlinkage long sys_complete_job(void)
159{
160 int retval = -EPERM;
161 if (!is_realtime(current)) {
162 retval = -EINVAL;
163 goto out;
164 }
165 /* Task with negative or zero period cannot sleep */
166 if (get_rt_period(current) <= 0) {
167 retval = -EINVAL;
168 goto out;
169 }
170 /* The plugin has to put the task into an
171 * appropriate queue and call schedule
172 */
173 retval = litmus->complete_job();
174 out:
175 return retval;
176}
177
178/* This is an "improved" version of sys_complete_job that
179 * addresses the problem of unintentionally missing a job after
180 * an overrun.
181 *
182 * returns 0 on successful wakeup
183 * returns EPERM if current conditions do not permit such sleep
184 * returns EINVAL if current task is not able to go to sleep
185 */
186asmlinkage long sys_wait_for_job_release(unsigned int job)
187{
188 int retval = -EPERM;
189 if (!is_realtime(current)) {
190 retval = -EINVAL;
191 goto out;
192 }
193
194 /* Task with negative or zero period cannot sleep */
195 if (get_rt_period(current) <= 0) {
196 retval = -EINVAL;
197 goto out;
198 }
199
200 retval = 0;
201
202 /* first wait until we have "reached" the desired job
203 *
204 * This implementation has at least two problems:
205 *
206 * 1) It doesn't gracefully handle the wrap around of
207 * job_no. Since LITMUS is a prototype, this is not much
208 * of a problem right now.
209 *
210 * 2) It is theoretically racy if a job release occurs
211 * between checking job_no and calling complete_job().
212 * A proper solution would require adding another callback
213 * in the plugin structure and testing the condition with
214 * interrupts disabled.
215 *
216 * FIXME: At least problem 2 should be taken care of eventually.
217 */
218 while (!retval && job > current->rt_param.job_params.job_no)
219 /* If the last job overran then job <= job_no and we
220 * don't send the task to sleep.
221 */
222 retval = litmus->complete_job();
223 out:
224 return retval;
225}
226
227/* This is a helper syscall to query the current job sequence number.
228 *
229 * returns 0 on successful query
230 * returns EPERM if task is not a real-time task.
231 * returns EFAULT if &job is not a valid pointer.
232 */
233asmlinkage long sys_query_job_no(unsigned int __user *job)
234{
235 int retval = -EPERM;
236 if (is_realtime(current))
237 retval = put_user(current->rt_param.job_params.job_no, job);
238
239 return retval;
240}
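
Taken together, sys_complete_job(), sys_wait_for_job_release() and sys_query_job_no() support the classic overrun-tolerant main loop sketched below. The names do_work(), get_job_no() and wait_for_job_release() are placeholders for application code and library wrappers, not interfaces defined by this patch.

    /* Hedged sketch of a periodic job body built on the syscalls above. */
    unsigned int job_no;

    for (;;) {
            do_work();                        /* one job's worth of computation   */
            get_job_no(&job_no);              /* sys_query_job_no()               */
            wait_for_job_release(job_no + 1); /* sys_wait_for_job_release(); if   */
                                              /* the next job was already released */
                                              /* (overrun), it returns without     */
                                              /* sleeping                          */
    }
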
241
242/* sys_null_call() is only used for determining raw system call
243 * overheads (kernel entry, kernel exit). It has no useful side effects.
244 * If ts is non-NULL, then the current Feather-Trace time is recorded.
245 */
246asmlinkage long sys_null_call(cycles_t __user *ts)
247{
248 long ret = 0;
249 cycles_t now;
250
251 if (ts) {
252 now = get_cycles();
253 ret = put_user(now, ts);
254 }
255
256 return ret;
257}
258
259/* p is a real-time task. Re-init its state as a best-effort task. */
260static void reinit_litmus_state(struct task_struct* p, int restore)
261{
262 struct rt_task user_config = {};
263 __user short *np_flag = NULL;
264
265 if (restore) {
266 /* Save user-space provided configuration data. */
267 user_config = p->rt_param.task_params;
268 np_flag = p->rt_param.np_flag;
269 }
270
271 /* We probably should not be inheriting any task's priority
272 * at this point in time.
273 */
274 WARN_ON(p->rt_param.inh_task);
275
276 /* We need to restore the priority of the task. */
277// __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio);
278
279 /* Cleanup everything else. */
280 memset(&p->rt_param, 0, sizeof(p->rt_param));
281
282 /* Restore preserved fields. */
283 if (restore) {
284 p->rt_param.task_params = user_config;
285 p->rt_param.np_flag = np_flag;
286 }
287}
288
289long litmus_admit_task(struct task_struct* tsk)
290{
291 long retval = 0;
292 unsigned long flags;
293
294 BUG_ON(is_realtime(tsk));
295
296 if (get_rt_period(tsk) == 0 ||
297 get_exec_cost(tsk) > get_rt_period(tsk)) {
298 TRACE_TASK(tsk, "litmus admit: invalid task parameters "
299 "(%lu, %lu)\n",
300 get_exec_cost(tsk), get_rt_period(tsk));
301 return -EINVAL;
302 }
303
304 if (!cpu_online(get_partition(tsk)))
305 {
306 TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
307 get_partition(tsk));
308 return -EINVAL;
309 }
310
311 INIT_LIST_HEAD(&tsk_rt(tsk)->list);
312
313 /* avoid scheduler plugin changing underneath us */
314 spin_lock_irqsave(&task_transition_lock, flags);
315
316 /* allocate heap node for this task */
317 tsk_rt(tsk)->heap_node = heap_node_alloc(GFP_ATOMIC);
318 if (!tsk_rt(tsk)->heap_node ||
319 !tsk_rt(tsk)->rel_heap) {
320 printk(KERN_WARNING "litmus: no more heap node memory!?\n");
321 retval = -ENOMEM;
322 heap_node_free(tsk_rt(tsk)->heap_node);
323 } else
324 heap_node_init(&tsk_rt(tsk)->heap_node, tsk);
325
326 if (!retval)
327 retval = litmus->admit_task(tsk);
328
329 if (!retval) {
330 sched_trace_task_name(tsk);
331 sched_trace_task_param(tsk);
332 atomic_inc(&rt_task_count);
333 }
334
335 spin_unlock_irqrestore(&task_transition_lock, flags);
336
337 return retval;
338}
339
340void litmus_exit_task(struct task_struct* tsk)
341{
342 if (is_realtime(tsk)) {
343 sched_trace_task_completion(tsk, 1);
344 litmus->task_exit(tsk);
345 BUG_ON(heap_node_in_heap(tsk_rt(tsk)->heap_node));
346 heap_node_free(tsk_rt(tsk)->heap_node);
347 atomic_dec(&rt_task_count);
348 reinit_litmus_state(tsk, 1);
349 }
350}
351
352/* Switching a plugin in use is tricky.
353 * We must watch out that no real-time tasks exist
354 * (and that none is created in parallel) and that the plugin is not
355 * currently in use on any processor (in theory).
356 *
357 * For now, we don't enforce the second part since it is unlikely to cause
358 * any trouble by itself as long as we don't unload modules.
359 */
360int switch_sched_plugin(struct sched_plugin* plugin)
361{
362 unsigned long flags;
363 int ret = 0;
364
365 BUG_ON(!plugin);
366
367 /* stop task transitions */
368 spin_lock_irqsave(&task_transition_lock, flags);
369
370 /* don't switch if there are active real-time tasks */
371 if (atomic_read(&rt_task_count) == 0) {
372 ret = litmus->deactivate_plugin();
373 if (0 != ret)
374 goto out;
375 ret = plugin->activate_plugin();
376 if (0 != ret) {
377 printk(KERN_INFO "Can't activate %s (%d).\n",
378 plugin->plugin_name, ret);
379 plugin = &linux_sched_plugin;
380 }
381 printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
382 litmus = plugin;
383 } else
384 ret = -EBUSY;
385out:
386 spin_unlock_irqrestore(&task_transition_lock, flags);
387 return ret;
388}
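
In practice, switch_sched_plugin() is driven from user space by writing a plugin name to /proc/litmus/active_plugin, which is created by init_litmus_proc() below and handled by proc_write_curr(). A hedged userspace sketch; note that the switch only succeeds while rt_task_count is zero.

    /* Illustrative: select the active LITMUS^RT plugin from user space. */
    #include <stdio.h>

    static int set_active_plugin(const char *name)
    {
            FILE *f = fopen("/proc/litmus/active_plugin", "w");
            if (!f)
                    return -1;
            /* a trailing newline is chomped by proc_write_curr() */
            fprintf(f, "%s\n", name);
            return fclose(f);
    }

For example, set_active_plugin("Linux") reverts to the no-op plugin that _init_litmus() registers at boot.
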
389
390/* Called upon fork.
391 * p is the newly forked task.
392 */
393void litmus_fork(struct task_struct* p)
394{
395 if (is_realtime(p))
396 /* clean out any litmus related state, don't preserve anything*/
397 reinit_litmus_state(p, 0);
398}
399
400/* Called upon execve().
401 * current is doing the exec.
402 * Don't let address space specific stuff leak.
403 */
404void litmus_exec(void)
405{
406 struct task_struct* p = current;
407
408 if (is_realtime(p)) {
409 WARN_ON(p->rt_param.inh_task);
410 p->rt_param.np_flag = NULL;
411 }
412}
413
414void exit_litmus(struct task_struct *dead_tsk)
415{
416 if (is_realtime(dead_tsk))
417 litmus_exit_task(dead_tsk);
418}
419
420
421#ifdef CONFIG_MAGIC_SYSRQ
422int sys_kill(int pid, int sig);
423
424static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
425{
426 struct task_struct *t;
427 read_lock(&tasklist_lock);
428 for_each_process(t) {
429 if (is_realtime(t)) {
430 sys_kill(t->pid, SIGKILL);
431 }
432 }
433 read_unlock(&tasklist_lock);
434}
435
436static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
437 .handler = sysrq_handle_kill_rt_tasks,
438 .help_msg = "quit-rt-tasks(X)",
439 .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks",
440};
441
442
443#endif
444
445
446static int proc_read_stats(char *page, char **start,
447 off_t off, int count,
448 int *eof, void *data)
449{
450 int len;
451
452 len = snprintf(page, PAGE_SIZE,
453 "real-time tasks = %d\n"
454 "ready for release = %d\n",
455 atomic_read(&rt_task_count),
456 0);
457 return len;
458}
459
460static int proc_read_plugins(char *page, char **start,
461 off_t off, int count,
462 int *eof, void *data)
463{
464 int len;
465
466 len = print_sched_plugins(page, PAGE_SIZE);
467 return len;
468}
469
470static int proc_read_curr(char *page, char **start,
471 off_t off, int count,
472 int *eof, void *data)
473{
474 int len;
475
476 len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
477 return len;
478}
479
480static int proc_write_curr(struct file *file,
481 const char *buffer,
482 unsigned long count,
483 void *data)
484{
485 int len, ret;
486 char name[65];
487 struct sched_plugin* found;
488
489 if (count > 64)
490 len = 64;
491 else
492 len = count;
493
494 if (copy_from_user(name, buffer, len))
495 return -EFAULT;
496
497 name[len] = '\0';
498 /* chomp name */
499 if (len > 1 && name[len - 1] == '\n')
500 name[len - 1] = '\0';
501
502 found = find_sched_plugin(name);
503
504 if (found) {
505 ret = switch_sched_plugin(found);
506 if (ret != 0)
507 printk(KERN_INFO "Could not switch plugin: %d\n", ret);
508 } else
509 printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
510
511 return len;
512}
513
514
515static int proc_read_release_master(char *page, char **start,
516 off_t off, int count,
517 int *eof, void *data)
518{
519 int len, master;
520 master = atomic_read(&release_master_cpu);
521 if (master == NO_CPU)
522 len = snprintf(page, PAGE_SIZE, "NO_CPU\n");
523 else
524 len = snprintf(page, PAGE_SIZE, "%d\n", master);
525 return len;
526}
527
528static int proc_write_release_master(struct file *file,
529 const char *buffer,
530 unsigned long count,
531 void *data)
532{
533 int cpu, err, online = 0;
534 char msg[64];
535
536 if (count > 63)
537 return -EINVAL;
538
539 if (copy_from_user(msg, buffer, count))
540 return -EFAULT;
541
542 /* terminate */
543 msg[count] = '\0';
544 /* chomp */
545 if (count > 1 && msg[count - 1] == '\n')
546 msg[count - 1] = '\0';
547
548 if (strcmp(msg, "NO_CPU") == 0) {
549 atomic_set(&release_master_cpu, NO_CPU);
550 return count;
551 } else {
552 err = sscanf(msg, "%d", &cpu);
553 if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
554 atomic_set(&release_master_cpu, cpu);
555 return count;
556 } else {
557 TRACE("invalid release master: '%s' "
558 "(err:%d cpu:%d online:%d)\n",
559 msg, err, cpu, online);
560 return -EINVAL;
561 }
562 }
563}
564
565static struct proc_dir_entry *litmus_dir = NULL,
566 *curr_file = NULL,
567 *stat_file = NULL,
568 *plugs_file = NULL,
569 *release_master_file = NULL;
570
571static int __init init_litmus_proc(void)
572{
573 litmus_dir = proc_mkdir("litmus", NULL);
574 if (!litmus_dir) {
575 printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
576 return -ENOMEM;
577 }
578
579 curr_file = create_proc_entry("active_plugin",
580 0644, litmus_dir);
581 if (!curr_file) {
582 printk(KERN_ERR "Could not allocate active_plugin "
583 "procfs entry.\n");
584 return -ENOMEM;
585 }
586 curr_file->read_proc = proc_read_curr;
587 curr_file->write_proc = proc_write_curr;
588
589 release_master_file = create_proc_entry("release_master",
590 0644, litmus_dir);
591 if (!release_master_file) {
592 printk(KERN_ERR "Could not allocate release_master "
593 "procfs entry.\n");
594 return -ENOMEM;
595 }
596 release_master_file->read_proc = proc_read_release_master;
597 release_master_file->write_proc = proc_write_release_master;
598
599 stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
600 proc_read_stats, NULL);
601
602 plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir,
603 proc_read_plugins, NULL);
604
605 return 0;
606}
607
608static void exit_litmus_proc(void)
609{
610 if (plugs_file)
611 remove_proc_entry("plugins", litmus_dir);
612 if (stat_file)
613 remove_proc_entry("stats", litmus_dir);
614 if (curr_file)
615 remove_proc_entry("active_plugin", litmus_dir);
616 if (litmus_dir)
617 remove_proc_entry("litmus", NULL);
618}
619
620extern struct sched_plugin linux_sched_plugin;
621
622static int __init _init_litmus(void)
623{
624 /* Common initializers,
625 * mode change lock is used to enforce single mode change
626 * operation.
627 */
628 printk("Starting LITMUS^RT kernel\n");
629
630 register_sched_plugin(&linux_sched_plugin);
631
632 heap_node_cache = KMEM_CACHE(heap_node, SLAB_PANIC);
633
634#ifdef CONFIG_MAGIC_SYSRQ
635 /* offer some debugging help */
636 if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
637 printk("Registered kill rt tasks magic sysrq.\n");
638 else
639 printk("Could not register kill rt tasks magic sysrq.\n");
640#endif
641
642 init_litmus_proc();
643
644 return 0;
645}
646
647static void _exit_litmus(void)
648{
649 exit_litmus_proc();
650 kmem_cache_destroy(heap_node_cache);
651}
652
653module_init(_init_litmus);
654module_exit(_exit_litmus);
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
new file mode 100644
index 000000000000..ccedd3670ac5
--- /dev/null
+++ b/litmus/sched_litmus.c
@@ -0,0 +1,275 @@
1/* This file is included from kernel/sched.c */
2
3#include <litmus/litmus.h>
4#include <litmus/sched_plugin.h>
5
6static void update_time_litmus(struct rq *rq, struct task_struct *p)
7{
8 u64 delta = rq->clock - p->se.exec_start;
9 if (unlikely((s64)delta < 0))
10 delta = 0;
11 /* per job counter */
12 p->rt_param.job_params.exec_time += delta;
13 /* task counter */
14 p->se.sum_exec_runtime += delta;
15 /* sched_clock() */
16 p->se.exec_start = rq->clock;
17 cpuacct_charge(p, delta);
18}
19
20static void double_rq_lock(struct rq *rq1, struct rq *rq2);
21static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
22
23static void litmus_tick(struct rq *rq, struct task_struct *p)
24{
25 if (is_realtime(p))
26 update_time_litmus(rq, p);
27 litmus->tick(p);
28}
29
30static void litmus_schedule(struct rq *rq, struct task_struct *prev)
31{
32 struct rq* other_rq;
33 long was_running;
34 lt_t _maybe_deadlock = 0;
35 /* WARNING: rq is _not_ locked! */
36 if (is_realtime(prev)) {
37 update_time_litmus(rq, prev);
38 if (!is_running(prev))
39 tsk_rt(prev)->present = 0;
40 }
41
42 /* let the plugin schedule */
43 rq->litmus_next = litmus->schedule(prev);
44
45 /* check if a global plugin pulled a task from a different RQ */
46 if (rq->litmus_next && task_rq(rq->litmus_next) != rq) {
47 /* we need to migrate the task */
48 other_rq = task_rq(rq->litmus_next);
49 TRACE_TASK(rq->litmus_next, "migrate from %d\n", other_rq->cpu);
50
51 /* while we drop the lock, the prev task could change its
52 * state
53 */
54 was_running = is_running(prev);
55 mb();
56 spin_unlock(&rq->lock);
57
58 /* Don't race with a concurrent switch. This could deadlock in
59 * the case of cross or circular migrations. It's the job of
60 * the plugin to make sure that doesn't happen.
61 */
62 TRACE_TASK(rq->litmus_next, "stack_in_use=%d\n",
63 rq->litmus_next->rt_param.stack_in_use);
64 if (rq->litmus_next->rt_param.stack_in_use != NO_CPU) {
65 TRACE_TASK(rq->litmus_next, "waiting to deschedule\n");
66 _maybe_deadlock = litmus_clock();
67 }
68 while (rq->litmus_next->rt_param.stack_in_use != NO_CPU) {
69 cpu_relax();
70 mb();
71 if (rq->litmus_next->rt_param.stack_in_use == NO_CPU)
72 TRACE_TASK(rq->litmus_next,
73 "descheduled. Proceeding.\n");
74 if (lt_before(_maybe_deadlock + 10000000,
75 litmus_clock())) {
76 /* We've been spinning for 10ms.
77 * Something can't be right!
78 * Let's abandon the task and bail out; at least
79 * we will have debug info instead of a hard
80 * deadlock.
81 */
82 TRACE_TASK(rq->litmus_next,
83 "stack too long in use. "
84 "Deadlock?\n");
85 rq->litmus_next = NULL;
86
87 /* bail out */
88 spin_lock(&rq->lock);
89 return;
90 }
91 }
92#ifdef __ARCH_WANT_UNLOCKED_CTXSW
93 if (rq->litmus_next->oncpu)
94 TRACE_TASK(rq->litmus_next, "waiting for !oncpu");
95 while (rq->litmus_next->oncpu) {
96 cpu_relax();
97 mb();
98 }
99#endif
100 double_rq_lock(rq, other_rq);
101 mb();
102 if (is_realtime(prev) && is_running(prev) != was_running) {
103 TRACE_TASK(prev,
104 "state changed while we dropped"
105 " the lock: is_running=%d, was_running=%d\n",
106 is_running(prev), was_running);
107 if (is_running(prev) && !was_running) {
108 /* prev task became unblocked
109 * we need to simulate normal sequence of events
110 * to scheduler plugins.
111 */
112 litmus->task_block(prev);
113 litmus->task_wake_up(prev);
114 }
115 }
116
117 set_task_cpu(rq->litmus_next, smp_processor_id());
118
119 /* DEBUG: now that we have the lock we need to make sure a
120 * couple of things still hold:
121 * - it is still a real-time task
122 * - it is still runnable (could have been stopped)
123 * If either is violated, then the active plugin is
124 * doing something wrong.
125 */
126 if (!is_realtime(rq->litmus_next) ||
127 !is_running(rq->litmus_next)) {
128 /* BAD BAD BAD */
129 TRACE_TASK(rq->litmus_next,
130 "BAD: migration invariant FAILED: "
131 "rt=%d running=%d\n",
132 is_realtime(rq->litmus_next),
133 is_running(rq->litmus_next));
134 /* drop the task */
135 rq->litmus_next = NULL;
136 }
137 /* release the other CPU's runqueue, but keep ours */
138 spin_unlock(&other_rq->lock);
139 }
140 if (rq->litmus_next)
141 rq->litmus_next->rt_param.stack_in_use = rq->cpu;
142}
143
144static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
145 int wakeup)
146{
147 if (wakeup) {
148 sched_trace_task_resume(p);
149 tsk_rt(p)->present = 1;
150 litmus->task_wake_up(p);
151 } else
152 TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
153}
154
155static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, int sleep)
156{
157 if (sleep) {
158 litmus->task_block(p);
159 tsk_rt(p)->present = 0;
160 sched_trace_task_block(p);
161 } else
162 TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
163}
164
165static void yield_task_litmus(struct rq *rq)
166{
167 BUG_ON(rq->curr != current);
168 litmus->complete_job();
169}
170
171/* Plugins are responsible for this.
172 */
173static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
174{
175}
176
177/* has already been taken care of */
178static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
179{
180}
181
182static struct task_struct *pick_next_task_litmus(struct rq *rq)
183{
184 struct task_struct* picked = rq->litmus_next;
185 rq->litmus_next = NULL;
186 if (picked)
187 picked->se.exec_start = rq->clock;
188 return picked;
189}
190
191static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
192{
193}
194
195static void switched_to_litmus(struct rq *rq, struct task_struct *p, int running)
196{
197}
198
199static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
200 int oldprio, int running)
201{
202}
203
204unsigned int get_rr_interval_litmus(struct task_struct *p)
205{
206 /* return infinity */
207 return 0;
208}
209
210/* This is called when a task has become a real-time task, either due to a SCHED_*
211 * class transition or due to PI mutex inheritance. We don't handle Linux PI
212 * mutex inheritance yet (and probably never will). Use LITMUS provided
213 * synchronization primitives instead.
214 */
215static void set_curr_task_litmus(struct rq *rq)
216{
217 rq->curr->se.exec_start = rq->clock;
218}
219
220
221#ifdef CONFIG_SMP
222/* execve tries to rebalance task in this scheduling domain */
223static int select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
224{
225 /* preemption is already disabled.
226 * We don't want to change cpu here
227 */
228 return smp_processor_id();
229}
230
231/* we don't repartition at runtime */
232
233static unsigned long
234load_balance_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
235 unsigned long max_load_move,
236 struct sched_domain *sd, enum cpu_idle_type idle,
237 int *all_pinned, int *this_best_prio)
238{
239 return 0;
240}
241
242static int
243move_one_task_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
244 struct sched_domain *sd, enum cpu_idle_type idle)
245{
246 return 0;
247}
248#endif
249
250const struct sched_class litmus_sched_class = {
251 .next = &rt_sched_class,
252 .enqueue_task = enqueue_task_litmus,
253 .dequeue_task = dequeue_task_litmus,
254 .yield_task = yield_task_litmus,
255
256 .check_preempt_curr = check_preempt_curr_litmus,
257
258 .pick_next_task = pick_next_task_litmus,
259 .put_prev_task = put_prev_task_litmus,
260
261#ifdef CONFIG_SMP
262 .select_task_rq = select_task_rq_litmus,
263
264 .load_balance = load_balance_litmus,
265 .move_one_task = move_one_task_litmus,
266#endif
267
268 .set_curr_task = set_curr_task_litmus,
269 .task_tick = task_tick_litmus,
270
271 .get_rr_interval = get_rr_interval_litmus,
272
273 .prio_changed = prio_changed_litmus,
274 .switched_to = switched_to_litmus,
275};
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 000000000000..0be091ece569
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,199 @@
1/* sched_plugin.c -- core infrastructure for the scheduler plugin system
2 *
3 * This file includes the initialization of the plugin system, the no-op Linux
4 * scheduler plugin and some dummy functions.
5 */
6
7#include <linux/list.h>
8#include <linux/spinlock.h>
9
10#include <litmus/litmus.h>
11#include <litmus/sched_plugin.h>
12
13#include <litmus/jobs.h>
14
15/*************************************************************
16 * Dummy plugin functions *
17 *************************************************************/
18
19static void litmus_dummy_finish_switch(struct task_struct * prev)
20{
21}
22
23static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
24{
25 return NULL;
26}
27
28static void litmus_dummy_tick(struct task_struct* tsk)
29{
30}
31
32static long litmus_dummy_admit_task(struct task_struct* tsk)
33{
34 printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
35 tsk->comm, tsk->pid);
36 return -EINVAL;
37}
38
39static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
40{
41}
42
43static void litmus_dummy_task_wake_up(struct task_struct *task)
44{
45}
46
47static void litmus_dummy_task_block(struct task_struct *task)
48{
49}
50
51static void litmus_dummy_task_exit(struct task_struct *task)
52{
53}
54
55static long litmus_dummy_complete_job(void)
56{
57 return -ENOSYS;
58}
59
60static long litmus_dummy_activate_plugin(void)
61{
62 return 0;
63}
64
65static long litmus_dummy_deactivate_plugin(void)
66{
67 return 0;
68}
69
70#ifdef CONFIG_FMLP
71
72static long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
73 struct task_struct *new_owner)
74{
75 return -ENOSYS;
76}
77
78static long litmus_dummy_return_priority(struct pi_semaphore *sem)
79{
80 return -ENOSYS;
81}
82
83static long litmus_dummy_pi_block(struct pi_semaphore *sem,
84 struct task_struct *new_waiter)
85{
86 return -ENOSYS;
87}
88
89#endif
90
91
92/* The default scheduler plugin. It doesn't do anything and lets Linux do its
93 * job.
94 */
95struct sched_plugin linux_sched_plugin = {
96 .plugin_name = "Linux",
97 .tick = litmus_dummy_tick,
98 .task_new = litmus_dummy_task_new,
99 .task_exit = litmus_dummy_task_exit,
100 .task_wake_up = litmus_dummy_task_wake_up,
101 .task_block = litmus_dummy_task_block,
102 .complete_job = litmus_dummy_complete_job,
103 .schedule = litmus_dummy_schedule,
104 .finish_switch = litmus_dummy_finish_switch,
105 .activate_plugin = litmus_dummy_activate_plugin,
106 .deactivate_plugin = litmus_dummy_deactivate_plugin,
107#ifdef CONFIG_FMLP
108 .inherit_priority = litmus_dummy_inherit_priority,
109 .return_priority = litmus_dummy_return_priority,
110 .pi_block = litmus_dummy_pi_block,
111#endif
112 .admit_task = litmus_dummy_admit_task
113};
114
115/*
116 * The reference to the current plugin that is used to schedule tasks within
117 * the system. It stores references to the actual function implementations.
118 * It should be initialized by calling "init_***_plugin()".
119 */
120struct sched_plugin *litmus = &linux_sched_plugin;
121
122/* the list of registered scheduling plugins */
123static LIST_HEAD(sched_plugins);
124static DEFINE_SPINLOCK(sched_plugins_lock);
125
126#define CHECK(func) {\
127 if (!plugin->func) \
128 plugin->func = litmus_dummy_ ## func;}
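
For clarity, CHECK(func) backfills a missing callback with the matching dummy via token pasting; modulo the surrounding braces, CHECK(schedule) expands to:

    if (!plugin->schedule)
            plugin->schedule = litmus_dummy_schedule;
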
129
130/* FIXME: get reference to module */
131int register_sched_plugin(struct sched_plugin* plugin)
132{
133 printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
134 plugin->plugin_name);
135
136 /* make sure we don't trip over null pointers later */
137 CHECK(finish_switch);
138 CHECK(schedule);
139 CHECK(tick);
140 CHECK(task_wake_up);
141 CHECK(task_exit);
142 CHECK(task_block);
143 CHECK(task_new);
144 CHECK(complete_job);
145 CHECK(activate_plugin);
146 CHECK(deactivate_plugin);
147#ifdef CONFIG_FMLP
148 CHECK(inherit_priority);
149 CHECK(return_priority);
150 CHECK(pi_block);
151#endif
152 CHECK(admit_task);
153
154 if (!plugin->release_at)
155 plugin->release_at = release_at;
156
157 spin_lock(&sched_plugins_lock);
158 list_add(&plugin->list, &sched_plugins);
159 spin_unlock(&sched_plugins_lock);
160
161 return 0;
162}
163
164
165/* FIXME: reference counting, etc. */
166struct sched_plugin* find_sched_plugin(const char* name)
167{
168 struct list_head *pos;
169 struct sched_plugin *plugin;
170
171 spin_lock(&sched_plugins_lock);
172 list_for_each(pos, &sched_plugins) {
173 plugin = list_entry(pos, struct sched_plugin, list);
174 if (!strcmp(plugin->plugin_name, name))
175 goto out_unlock;
176 }
177 plugin = NULL;
178
179out_unlock:
180 spin_unlock(&sched_plugins_lock);
181 return plugin;
182}
183
184int print_sched_plugins(char* buf, int max)
185{
186 int count = 0;
187 struct list_head *pos;
188 struct sched_plugin *plugin;
189
190 spin_lock(&sched_plugins_lock);
191 list_for_each(pos, &sched_plugins) {
192 plugin = list_entry(pos, struct sched_plugin, list);
193 count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
194 if (max - count <= 0)
195 break;
196 }
197 spin_unlock(&sched_plugins_lock);
198 return count;
199}
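
Because register_sched_plugin() backfills every unset callback with a dummy, a new plugin only has to provide the hooks it actually implements. A hedged skeleton follows; the demo_* names are hypothetical, and a useful plugin would of course implement schedule() and admit_task() meaningfully.

    /* Illustrative skeleton of a LITMUS^RT scheduler plugin. */
    static long demo_admit_task(struct task_struct *tsk)
    {
            return 0;       /* accept every task (illustration only) */
    }

    static struct task_struct *demo_schedule(struct task_struct *prev)
    {
            return NULL;    /* never selects a real-time task to run */
    }

    static struct sched_plugin demo_plugin = {
            .plugin_name = "DEMO",
            .admit_task  = demo_admit_task,
            .schedule    = demo_schedule,
            /* all other callbacks are backfilled with litmus_dummy_* by CHECK() */
    };

    static int __init init_demo_plugin(void)
    {
            return register_sched_plugin(&demo_plugin);
    }
    module_init(init_demo_plugin);
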