Diffstat (limited to 'kernel')
-rw-r--r--  kernel/exit.c               4
-rw-r--r--  kernel/fork.c               7
-rw-r--r--  kernel/hrtimer.c           95
-rw-r--r--  kernel/printk.c            14
-rw-r--r--  kernel/sched.c            130
-rw-r--r--  kernel/sched_fair.c         3
-rw-r--r--  kernel/sched_rt.c           2
-rw-r--r--  kernel/time/tick-sched.c   47
8 files changed, 289 insertions(+), 13 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index f2b321bae440..64879bdff921 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -57,6 +57,8 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+extern void exit_od_table(struct task_struct *t);
+
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -980,6 +982,8 @@ NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
 
+	exit_od_table(tsk);
+
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..25c6111fe3a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,9 @@
 
 #include <trace/events/sched.h>
 
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -191,6 +194,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	exit_litmus(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
@@ -275,6 +279,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	tsk->stack = ti;
 
+	/* Don't let the new task be a real-time task. */
+	litmus_fork(tsk);
+
 	err = prop_local_init_single(&tsk->dirties);
 	if (err)
 		goto out;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index a9205e32a059..11e896903828 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,8 @@
 #include <linux/sched.h>
 #include <linux/timer.h>
 
+#include <litmus/litmus.h>
+
 #include <asm/uaccess.h>
 
 #include <trace/events/timer.h>
@@ -1026,6 +1028,98 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
+#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
+
+/**
+ * hrtimer_start_on_info_init - Initialize hrtimer_start_on_info
+ */
+void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info)
+{
+	memset(info, 0, sizeof(struct hrtimer_start_on_info));
+	atomic_set(&info->state, HRTIMER_START_ON_INACTIVE);
+}
+
+/**
+ * hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu
+ */
+void hrtimer_pull(void)
+{
+	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+	struct hrtimer_start_on_info *info;
+	struct list_head *pos, *safe, list;
+
+	raw_spin_lock(&base->lock);
+	list_replace_init(&base->to_pull, &list);
+	raw_spin_unlock(&base->lock);
+
+	list_for_each_safe(pos, safe, &list) {
+		info = list_entry(pos, struct hrtimer_start_on_info, list);
+		TRACE("pulled timer 0x%x\n", info->timer);
+		list_del(pos);
+		hrtimer_start(info->timer, info->time, info->mode);
+	}
+}
+
+/**
+ * hrtimer_start_on - trigger timer arming on remote cpu
+ * @cpu:	remote cpu
+ * @info:	save timer information for enqueuing on remote cpu
+ * @timer:	timer to be pulled
+ * @time:	expire time
+ * @mode:	timer mode
+ */
+int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
+		struct hrtimer *timer, ktime_t time,
+		const enum hrtimer_mode mode)
+{
+	unsigned long flags;
+	struct hrtimer_cpu_base* base;
+	int in_use = 0, was_empty;
+
+	/* serialize access to info through the timer base */
+	lock_hrtimer_base(timer, &flags);
+
+	in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE);
+	if (!in_use) {
+		INIT_LIST_HEAD(&info->list);
+		info->timer = timer;
+		info->time = time;
+		info->mode = mode;
+		/* mark as in use */
+		atomic_set(&info->state, HRTIMER_START_ON_QUEUED);
+	}
+
+	unlock_hrtimer_base(timer, &flags);
+
+	if (!in_use) {
+		/* initiate pull */
+		preempt_disable();
+		if (cpu == smp_processor_id()) {
+			/* start timer locally; we may get called
+			 * with rq->lock held, do not wake up anything
+			 */
+			TRACE("hrtimer_start_on: starting on local CPU\n");
+			__hrtimer_start_range_ns(info->timer, info->time,
+						 0, info->mode, 0);
+		} else {
+			TRACE("hrtimer_start_on: pulling to remote CPU\n");
+			base = &per_cpu(hrtimer_bases, cpu);
+			raw_spin_lock_irqsave(&base->lock, flags);
+			was_empty = list_empty(&base->to_pull);
+			list_add(&info->list, &base->to_pull);
+			raw_spin_unlock_irqrestore(&base->lock, flags);
+			if (was_empty)
+				/* only send an IPI if no one else
+				 * has done so already
+				 */
+				smp_send_pull_timers(cpu);
+		}
+		preempt_enable();
+	}
+	return in_use;
+}
+
+#endif
 
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
@@ -1625,6 +1719,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	}
 
 	hrtimer_init_hres(cpu_base);
+	INIT_LIST_HEAD(&cpu_base->to_pull);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
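
Note on the interface added above: hrtimer_start_on() hands a caller-provided hrtimer_start_on_info to the timer base so the timer can be armed either locally or, via the PULL_TIMERS_VECTOR IPI serviced by hrtimer_pull(), on a remote CPU. Below is a minimal caller sketch under stated assumptions; the per-CPU variables and the callback my_release_fn are hypothetical and not part of this patch.

/* Hypothetical caller sketch -- not part of this patch; assumes
 * <linux/hrtimer.h>, <linux/percpu.h> and the LITMUS^RT headers. */
static DEFINE_PER_CPU(struct hrtimer, release_timer);
static DEFINE_PER_CPU(struct hrtimer_start_on_info, release_info);

static enum hrtimer_restart my_release_fn(struct hrtimer *t)
{
	/* plugin-specific release handling would go here */
	return HRTIMER_NORESTART;
}

static void release_timer_setup(int cpu)	/* once per CPU */
{
	struct hrtimer *t = &per_cpu(release_timer, cpu);

	hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	t->function = my_release_fn;
	hrtimer_start_on_info_init(&per_cpu(release_info, cpu));
}

static void arm_release_timer_on(int cpu, u64 when_ns)
{
	/* hrtimer_start_on() returns nonzero if the info block is still
	 * queued from an earlier request; the caller must cope with that. */
	if (hrtimer_start_on(cpu, &per_cpu(release_info, cpu),
			     &per_cpu(release_timer, cpu),
			     ns_to_ktime(when_ns), HRTIMER_MODE_ABS))
		TRACE("release timer for CPU %d still in use\n", cpu);
}
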
diff --git a/kernel/printk.c b/kernel/printk.c
index 35185392173f..b799a2ee96e5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -70,6 +70,13 @@ int console_printk[4] = {
 };
 
 /*
+ * divert printk() messages when there is a LITMUS^RT debug listener
+ */
+#include <litmus/litmus.h>
+int trace_override = 0;
+int trace_recurse = 0;
+
+/*
  * Low level drivers may need that to know if they can schedule in
  * their unblank() callback or not. So let's export it.
  */
@@ -871,6 +878,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	/* Emit the output into the temporary buffer */
 	printed_len += vscnprintf(printk_buf + printed_len,
 				  sizeof(printk_buf) - printed_len, fmt, args);
+	/* if LITMUS^RT tracer is active divert printk() msgs */
+	if (trace_override && !trace_recurse)
+		TRACE("%s", printk_buf);
 
 	p = printk_buf;
 
@@ -947,7 +957,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	 * Try to acquire and then immediately release the
 	 * console semaphore. The release will do all the
 	 * actual magic (print out buffers, wake up klogd,
-	 * etc). 
+	 * etc).
 	 *
 	 * The console_trylock_for_printk() function
 	 * will release 'logbuf_lock' regardless of whether it
@@ -1220,7 +1230,7 @@ int printk_needs_cpu(int cpu)
 
 void wake_up_klogd(void)
 {
-	if (waitqueue_active(&log_wait))
+	if (!trace_override && waitqueue_active(&log_wait))
 		this_cpu_write(printk_pending, 1);
 }
 
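
The trace_override/trace_recurse pair introduced above lets a LITMUS^RT debug listener capture printk() output while avoiding self-recursion: trace_override is presumably incremented while a listener is attached (which also suppresses klogd wakeups, per the wake_up_klogd() change), and trace_recurse presumably guards the tracer's own logging path. A sketch of that model follows; the function names are illustrative assumptions, not the actual litmus/ implementation.

/* Illustrative sketch only -- not the actual LITMUS^RT tracer code. */
extern int trace_override;
extern int trace_recurse;

static void debug_log_open(void)	/* hypothetical: listener attaches */
{
	trace_override++;		/* start diverting printk() output */
}

static void debug_log_close(void)	/* hypothetical: listener detaches */
{
	trace_override--;
}

static void emit_trace_message(const char *msg)	/* hypothetical */
{
	trace_recurse = 1;	/* printk() calls made here are not re-diverted */
	/* ... copy msg into the trace ring buffer ... */
	trace_recurse = 0;
}
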
diff --git a/kernel/sched.c b/kernel/sched.c
index fde6ff903525..935f8e8e6160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -80,6 +80,11 @@
 #include "workqueue_sched.h"
 #include "sched_autogroup.h"
 
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
+static void litmus_tick(struct rq*, struct task_struct*);
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
@@ -410,6 +415,12 @@ struct rt_rq {
 #endif
 };
 
+/* Litmus related fields in a runqueue */
+struct litmus_rq {
+	unsigned long nr_running;
+	struct task_struct *prev;
+};
+
 #ifdef CONFIG_SMP
 
 /*
@@ -475,6 +486,7 @@ struct rq {
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
+	struct litmus_rq litmus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -1045,6 +1057,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+	litmus_tick(rq, rq->curr);
 	raw_spin_unlock(&rq->lock);
 
 	return HRTIMER_NORESTART;
@@ -1773,7 +1786,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 static const struct sched_class rt_sched_class;
 
-#define sched_class_highest (&stop_sched_class)
+#define sched_class_highest (&litmus_sched_class)
 #define for_each_class(class) \
 	for (class = sched_class_highest; class; class = class->next)
 
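
With sched_class_highest redefined, every for_each_class() walk, in particular the fallback loop in pick_next_task() further down, now asks the LITMUS^RT class first. This assumes litmus_sched_class chains to the stock classes through its ->next pointer; that linkage lives in litmus/sched_litmus.c and is not part of this hunk.

/* Assumed class ordering after this change (linkage not shown here):
 *   litmus_sched_class -> stop_sched_class -> rt_sched_class
 *     -> fair_sched_class -> idle_sched_class -> NULL
 * pick_next_task() walks the chain and takes the first hit:
 */
for_each_class(class) {
	p = class->pick_next_task(rq);
	if (p)
		return p;	/* highest class with a runnable task wins */
}
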
@@ -2031,6 +2044,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #include "sched_rt.c"
 #include "sched_autogroup.c"
 #include "sched_stoptask.c"
+#include "../litmus/sched_litmus.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
@@ -2153,6 +2167,10 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule. In
 	 * this case, we can save a useless back to back clock update.
 	 */
+	/* LITMUS^RT:
+	 * The "disable-clock-update" approach was buggy in Linux 2.6.36.
+	 * The issue has been solved in 2.6.37.
+	 */
 	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
@@ -2676,6 +2694,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	unsigned long flags;
 	int cpu, success = 0;
 
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
+
 	smp_wmb();
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	if (!(p->state & state))
@@ -2712,6 +2733,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_rmb();
 
+	/* LITMUS^RT: once the task can be safely referenced by this
+	 * CPU, don't mess with the Linux load-balancing code.
+	 */
+	if (is_realtime(p))
+		goto litmus_out_activate;
+
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
@@ -2723,12 +2750,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
 	}
+
+litmus_out_activate:
 #endif /* CONFIG_SMP */
 
 	ttwu_queue(p, cpu);
 stat:
 	ttwu_stat(p, cpu, wake_flags);
 out:
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	return success;
@@ -2839,7 +2870,8 @@ void sched_fork(struct task_struct *p)
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR ||
+		    p->policy == SCHED_LITMUS) {
 			p->policy = SCHED_NORMAL;
 			p->normal_prio = p->static_prio;
 		}
@@ -3050,6 +3082,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	litmus->finish_switch(prev);
+	prev->rt_param.stack_in_use = NO_CPU;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	local_irq_disable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -3079,6 +3113,15 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
 {
 	if (prev->sched_class->pre_schedule)
 		prev->sched_class->pre_schedule(rq, prev);
+
+	/* LITMUS^RT not-so-clean hack: we need to save the prev task
+	 * as our scheduling decisions rely on it (as we drop the rq lock,
+	 * something in prev can change...); there is no way to escape
+	 * this hack apart from modifying pick_next_task(rq, _prev_) or
+	 * falling back on the previous solution of decoupling
+	 * scheduling decisions.
+	 */
+	rq->litmus.prev = prev;
 }
 
 /* rq->lock is NOT held, but preemption is disabled */
@@ -4094,18 +4137,26 @@ void scheduler_tick(void)
 
 	sched_clock_tick();
 
+	TS_TICK_START(current);
+
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	update_cpu_load_active(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+
+	/* litmus_tick may force current to resched */
+	litmus_tick(rq, curr);
+
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
-	trigger_load_balance(rq, cpu);
+	if (!is_realtime(current))
+		trigger_load_balance(rq, cpu);
 #endif
+	TS_TICK_END(current);
 }
 
 notrace unsigned long get_parent_ip(unsigned long addr)
@@ -4225,12 +4276,20 @@ pick_next_task(struct rq *rq)
 	/*
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
-	 */
-	if (likely(rq->nr_running == rq->cfs.nr_running)) {
+
+	 * NOT IN LITMUS^RT!
+
+	 * This breaks many assumptions in the plugins.
+	 * Do not uncomment without thinking long and hard
+	 * about how this affects global plugins such as GSN-EDF.
+
+	if (rq->nr_running == rq->cfs.nr_running) {
+		TRACE("taking shortcut in pick_next_task()\n");
 		p = fair_sched_class.pick_next_task(rq);
 		if (likely(p))
 			return p;
 	}
+	*/
 
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
@@ -4253,11 +4312,19 @@ asmlinkage void __sched schedule(void)
 
 need_resched:
 	preempt_disable();
+	sched_state_entered_schedule();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 
+	/* LITMUS^RT: quickly re-evaluate the scheduling decision
+	 * if the previous one is no longer valid after CTX.
+	 */
+litmus_need_resched_nonpreemptible:
+	TS_SCHED_START;
+	sched_trace_task_switch_away(prev);
+
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
@@ -4314,7 +4381,10 @@ need_resched:
 		rq->curr = next;
 		++*switch_count;
 
+		TS_SCHED_END(next);
+		TS_CXS_START(next);
 		context_switch(rq, prev, next); /* unlocks the rq */
+		TS_CXS_END(current);
 		/*
 		 * The context switch have flipped the stack from under us
 		 * and restored the local variables which were saved when
@@ -4323,14 +4393,23 @@ need_resched:
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
-	} else
+	} else {
+		TS_SCHED_END(prev);
 		raw_spin_unlock_irq(&rq->lock);
+	}
+
+	sched_trace_task_switch_to(current);
 
 	post_schedule(rq);
 
+	if (sched_state_validate_switch())
+		goto litmus_need_resched_nonpreemptible;
+
 	preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
+
+	srp_ceiling_block();
 }
 EXPORT_SYMBOL(schedule);
 
@@ -4600,6 +4679,17 @@ void complete_all(struct completion *x)
 }
 EXPORT_SYMBOL(complete_all);
 
+void complete_n(struct completion *x, int n)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done += n;
+	__wake_up_common(&x->wait, TASK_NORMAL, n, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_n);
+
 static inline long __sched
 do_wait_for_common(struct completion *x, long timeout, int state)
 {
@@ -5039,7 +5129,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
-	if (rt_prio(p->prio))
+	if (p->policy == SCHED_LITMUS)
+		p->sched_class = &litmus_sched_class;
+	else if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
@@ -5087,7 +5179,7 @@ recheck:
 
 		if (policy != SCHED_FIFO && policy != SCHED_RR &&
 				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-				policy != SCHED_IDLE)
+				policy != SCHED_IDLE && policy != SCHED_LITMUS)
 			return -EINVAL;
 	}
 
@@ -5102,6 +5194,8 @@ recheck:
 		return -EINVAL;
 	if (rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
+	if (policy == SCHED_LITMUS && policy == p->policy)
+		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
@@ -5145,6 +5239,12 @@ recheck:
 			return retval;
 	}
 
+	if (policy == SCHED_LITMUS) {
+		retval = litmus_admit_task(p);
+		if (retval)
+			return retval;
+	}
+
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
@@ -5203,10 +5303,19 @@ recheck:
 
 	p->sched_reset_on_fork = reset_on_fork;
 
+	if (p->policy == SCHED_LITMUS)
+		litmus_exit_task(p);
+
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (policy == SCHED_LITMUS) {
+		p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
+		p->rt_param.present = running;
+		litmus->task_new(p, on_rq, running);
+	}
+
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq)
@@ -5374,10 +5483,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	rcu_read_lock();
 
 	p = find_process_by_pid(pid);
-	if (!p) {
+	/* Don't set affinity if the task was not found or is a LITMUS^RT task. */
+	if (!p || is_realtime(p)) {
 		rcu_read_unlock();
 		put_online_cpus();
-		return -ESRCH;
+		return p ? -EPERM : -ESRCH;
 	}
 
 	/* Prevent p going away */
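
complete_n(), added in the sched.c hunk above, generalizes complete()/complete_all(): it credits the completion with n "done" tokens and wakes at most n waiters in one call. A minimal usage sketch follows; the synchronized-release scenario and the names are illustrative, not taken from this patch.

/* Illustrative: release exactly nr_waiters tasks blocked on a barrier.
 * Assumes <linux/completion.h> and the complete_n() added above. */
static DECLARE_COMPLETION(release_barrier);

static void wait_for_release(void)	/* called by each waiting task */
{
	wait_for_completion(&release_barrier);
}

static void release_all(int nr_waiters)	/* called by the releasing task */
{
	complete_n(&release_barrier, nr_waiters);
}
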
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c768588e180b..334eb474af93 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1890,6 +1890,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 	int next_buddy_marked = 0;
 
+	if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
+		goto preempt;
+
 	if (unlikely(se == pse))
 		return;
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 10d018212bab..58cf5d18dfdc 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1078,7 +1078,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (p->prio < rq->curr->prio) {
+	if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) {
 		resched_task(rq->curr);
 		return;
 	}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c44b407..0c0e02f1b819 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -766,12 +766,53 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 }
 
 /**
+ * tick_set_quanta_type - get the quanta type as a boot option
+ * Default is standard setup with ticks staggered over first
+ * half of tick period.
+ */
+int quanta_type = LINUX_DEFAULT_TICKS;
+static int __init tick_set_quanta_type(char *str)
+{
+	if (strcmp("aligned", str) == 0) {
+		quanta_type = LITMUS_ALIGNED_TICKS;
+		printk(KERN_INFO "LITMUS^RT: setting aligned quanta\n");
+	}
+	else if (strcmp("staggered", str) == 0) {
+		quanta_type = LITMUS_STAGGERED_TICKS;
+		printk(KERN_INFO "LITMUS^RT: setting staggered quanta\n");
+	}
+	return 1;
+}
+__setup("quanta=", tick_set_quanta_type);
+
+u64 cpu_stagger_offset(int cpu)
+{
+	u64 offset = 0;
+	switch (quanta_type) {
+	case LITMUS_ALIGNED_TICKS:
+		offset = 0;
+		break;
+	case LITMUS_STAGGERED_TICKS:
+		offset = ktime_to_ns(tick_period);
+		do_div(offset, num_possible_cpus());
+		offset *= cpu;
+		break;
+	default:
+		offset = ktime_to_ns(tick_period) >> 1;
+		do_div(offset, num_possible_cpus());
+		offset *= cpu;
+	}
+	return offset;
+}
+
+/**
  * tick_setup_sched_timer - setup the tick emulation timer
  */
 void tick_setup_sched_timer(void)
 {
 	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 	ktime_t now = ktime_get();
+	u64 offset;
 
 	/*
 	 * Emulate tick processing via per-CPU hrtimers:
@@ -782,6 +823,12 @@ void tick_setup_sched_timer(void)
 	/* Get the next period (per cpu) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 
+	/* Offset must be set correctly to achieve desired quanta type. */
+	offset = cpu_stagger_offset(smp_processor_id());
+
+	/* Add the correct offset to expiration time */
+	hrtimer_add_expires_ns(&ts->sched_timer, offset);
+
 	for (;;) {
 		hrtimer_forward(&ts->sched_timer, now, tick_period);
 		hrtimer_start_expires(&ts->sched_timer,
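
For reference, the staggering arithmetic in cpu_stagger_offset(): with a 1 ms tick (HZ = 1000) and 4 possible CPUs, quanta=aligned yields offset 0 on every CPU, quanta=staggered spreads ticks across the full period (0, 250, 500, 750 us for CPUs 0-3), and the Linux default spreads them across only the first half of the period (0, 125, 250, 375 us). A small standalone sketch of the same computation, assuming those HZ and CPU-count values rather than reading tick_period and num_possible_cpus():

#include <stdio.h>
#include <stdint.h>

/* Mirrors cpu_stagger_offset() for illustration only. */
int main(void)
{
	const uint64_t tick_period_ns = 1000000;	/* HZ = 1000 */
	const unsigned int ncpus = 4;
	unsigned int cpu;

	for (cpu = 0; cpu < ncpus; cpu++) {
		uint64_t staggered = tick_period_ns / ncpus * cpu;
		uint64_t linux_dflt = (tick_period_ns >> 1) / ncpus * cpu;
		printf("cpu%u: staggered=%llu ns, default=%llu ns\n",
		       cpu, (unsigned long long)staggered,
		       (unsigned long long)linux_dflt);
	}
	return 0;
}
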