8 files changed, 248 insertions, 17 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac4..256ce8c2ebc8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -57,6 +57,8 @@
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
+extern void exit_od_table(struct task_struct *t);
 static void exit_mm(struct task_struct * tsk);
 static void __unhash_process(struct task_struct *p)
@@ -968,6 +970,8 @@ NORET_TYPE void do_exit(long code)
        if (unlikely(tsk->audit_context))
                audit_free(tsk);
+        exit_od_table(tsk);
        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c14942a0ee3..166eb780dd7d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -75,6 +75,9 @@
 #include <trace/events/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
 /*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
@@ -171,6 +174,7 @@ void __put_task_struct(struct task_struct *tsk)
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
+        exit_litmus(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
@@ -253,6 +257,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        tsk->stack = ti;
+        /* Don't let the new task be a real-time task. */
+        litmus_fork(tsk);
        err = prop_local_init_single(&tsk->dirties);
        if (err)
                goto out;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..c0b440b1f6ee 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,8 @@
 #include <linux/sched.h>
 #include <linux/timer.h>
+#include <litmus/litmus.h>
 #include <asm/uaccess.h>
 #include <trace/events/timer.h>
@@ -1041,6 +1043,85 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
+/**
+ *  hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu
+ *
+ *  Detaches every request queued on this CPU's to_pull list while holding
+ *  the base lock, then starts each requested timer with the lock released.
+ */
+void hrtimer_pull(void)
+{
+        struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+        struct hrtimer_start_on_info *info;
+        struct list_head *pos, *safe, list;
+        /* NOTE(review): plain spin_lock() - presumably interrupts are
+         * already disabled in this callback; confirm against the caller
+         */
+        spin_lock(&base->lock);
+        /* move the pending requests to a private list head so that they
+         * can be processed without holding the base lock
+         */
+        list_replace_init(&base->to_pull, &list);
+        spin_unlock(&base->lock);
+        list_for_each_safe(pos, safe, &list) {
+                info = list_entry(pos, struct hrtimer_start_on_info, list);
+                /* info->timer is a pointer, so it must be printed with %p */
+                TRACE("pulled timer 0x%p\n", info->timer);
+                list_del(pos);
+                hrtimer_start(info->timer, info->time, info->mode);
+        }
+}
+/**
+ *  hrtimer_start_on - trigger timer arming on remote cpu
+ *  @cpu:       remote cpu
+ *  @info:      save timer information for enqueuing on remote cpu
+ *  @timer:     timer to be pulled
+ *  @time:      expire time
+ *  @mode:      timer mode
+ *
+ *  If @info is not in the HRTIMER_START_ON_INACTIVE state, nothing is
+ *  queued or armed. Otherwise the request is recorded in @info and the
+ *  timer is either started directly (when @cpu is the current CPU) or
+ *  added to the to_pull list of @cpu.
+ *
+ *  Returns non-zero if @info was already in use, 0 otherwise.
+ */
+int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
+                struct hrtimer *timer, ktime_t time,
+                const enum hrtimer_mode mode)
+{
+        unsigned long flags;
+        struct hrtimer_cpu_base* base;
+        int in_use = 0, was_empty;
+        /* serialize access to info through the timer base */
+        lock_hrtimer_base(timer, &flags);
+        in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE);
+        if (!in_use) {
+                INIT_LIST_HEAD(&info->list);
+                info->timer = timer;
+                info->time  = time;
+                info->mode  = mode;
+                /* mark as in use */
+                atomic_set(&info->state, HRTIMER_START_ON_QUEUED);
+        }
+        unlock_hrtimer_base(timer, &flags);
+        if (!in_use) {
+                /* initiate pull  */
+                /* keep smp_processor_id() stable while choosing a path */
+                preempt_disable();
+                if (cpu == smp_processor_id()) {
+                        /* start timer locally; we may get called
+                         * with rq->lock held, do not wake up anything
+                         */
+                        TRACE("hrtimer_start_on: starting on local CPU\n");
+                        __hrtimer_start_range_ns(info->timer, info->time,
+                                                 0, info->mode, 0);
+                } else {
+                        TRACE("hrtimer_start_on: pulling to remote CPU\n");
+                        base = &per_cpu(hrtimer_bases, cpu);
+                        spin_lock_irqsave(&base->lock, flags);
+                        was_empty = list_empty(&base->to_pull);
+                        list_add(&info->list, &base->to_pull);
+                        spin_unlock_irqrestore(&base->lock, flags);
+                        if (was_empty)
+                                /* only send an IPI if no one else
+                                 * has done so already
+                                 */
+                                smp_send_pull_timers(cpu);
+                }
+                preempt_enable();
+        }
+        return in_use;
+}
 /**
 * hrtimer_try_to_cancel - try to deactivate a timer
@@ -1631,6 +1712,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
                cpu_base->clock_base[i].cpu_base = cpu_base;
        hrtimer_init_hres(cpu_base);
+        INIT_LIST_HEAD(&cpu_base->to_pull);
 }
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/printk.c b/kernel/printk.c
index 75077ad0b537..ee54355cfdf1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -71,6 +71,13 @@ int console_printk[4] = {
 };
 /*
+ * divert printk() messages when there is a LITMUS^RT debug listener
+ */
+#include <litmus/litmus.h>
+int trace_override = 0;
+int trace_recurse  = 0;
+/*
 * Low level drivers may need that to know if they can schedule in
 * their unblank() callback or not. So let's export it.
 */
@@ -708,6 +715,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
        /* Emit the output into the temporary buffer */
        printed_len += vscnprintf(printk_buf + printed_len,
                                  sizeof(printk_buf) - printed_len, fmt, args);
+        /* if LITMUS^RT tracer is active divert printk() msgs */
+        if (trace_override && !trace_recurse)
+                TRACE("%s", printk_buf);
        p = printk_buf;
@@ -777,7 +787,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
         * Try to acquire and then immediately release the
         * console semaphore. The release will do all the
         * actual magic (print out buffers, wake up klogd,
-         * etc). 
+         * etc).
         *
         * The acquire_console_semaphore_for_printk() function
         * will release 'logbuf_lock' regardless of whether it
@@ -1014,7 +1024,7 @@ int printk_needs_cpu(int cpu)
 void wake_up_klogd(void)
 {
-        if (waitqueue_active(&log_wait))
+        if (!trace_override && waitqueue_active(&log_wait))
                __raw_get_cpu_var(printk_pending) = 1;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c2a54f70ffe..5e3c509e0efe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -78,6 +78,9 @@
 #include "sched_cpupri.h"
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -450,6 +453,12 @@ struct rt_rq {
 #endif
 };
+/* Litmus related fields in a runqueue */
+struct litmus_rq {
+        /* NOTE(review): presumably the number of LITMUS^RT tasks on this
+         * runqueue; it is not updated in the visible hunks - confirm
+         */
+        unsigned long nr_running;
+        /* previous task, stored by pre_schedule() */
+        struct task_struct *prev;
+};
 #ifdef CONFIG_SMP
 /*
@@ -512,6 +521,7 @@ struct rq {
        struct cfs_rq cfs;
        struct rt_rq rt;
+        struct litmus_rq litmus;
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* list of leaf cfs_rq on this cpu: */
@@ -1833,7 +1843,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 static const struct sched_class rt_sched_class;
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&litmus_sched_class)
 #define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
@@ -1932,6 +1942,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "../litmus/sched_litmus.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
@@ -2372,6 +2383,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        unsigned long flags;
        struct rq *rq;
+        if (is_realtime(p))
+                TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
        if (!sched_feat(SYNC_WAKEUPS))
                wake_flags &= ~WF_SYNC;
@@ -2390,7 +2404,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        orig_cpu = cpu;
 #ifdef CONFIG_SMP
-        if (unlikely(task_running(rq, p)))
+        if (unlikely(task_running(rq, p)) || is_realtime(p))
                goto out_activate;
        /*
@@ -2497,6 +2511,8 @@ out_running:
        }
 #endif
 out:
+        if (is_realtime(p))
+                TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
        task_rq_unlock(rq, &flags);
        put_cpu();
@@ -2814,6 +2830,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
+        litmus->finish_switch(prev);
+        prev->rt_param.stack_in_use = NO_CPU;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_disable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -2843,6 +2861,15 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
 {
        if (prev->sched_class->pre_schedule)
                prev->sched_class->pre_schedule(rq, prev);
+        /* LITMUS^RT not very clean hack: we need to save the prev task
+         * as our scheduling decisions rely on it (as we drop the rq lock
+         * something in prev can change...); there is no way to escape
+         * this hack apart from modifying pick_next_task(rq, _prev_) or
+         * falling back on the previous solution of decoupling
+         * scheduling decisions
+         */
+        rq->litmus.prev = prev;
 }
 /* rq->lock is NOT held, but preemption is disabled */
@@ -3520,18 +3547,26 @@ void scheduler_tick(void)
        sched_clock_tick();
+        TS_TICK_START(current);
        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
+        /* litmus_tick may force current to resched */
+        litmus_tick(rq, curr);
        raw_spin_unlock(&rq->lock);
        perf_event_task_tick(curr);
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
-        trigger_load_balance(rq, cpu);
+        if (!is_realtime(current))
+                trigger_load_balance(rq, cpu);
 #endif
+        TS_TICK_END(current);
 }
 notrace unsigned long get_parent_ip(unsigned long addr)
@@ -3672,12 +3707,20 @@ pick_next_task(struct rq *rq)
        /*
         * Optimization: we know that if all tasks are in
         * the fair class we can call that function directly:
-         */
-        if (likely(rq->nr_running == rq->cfs.nr_running)) {
+         * NOT IN LITMUS^RT!
+         * This breaks many assumptions in the plugins.
+         * Do not uncomment without thinking long and hard
+         * about how this affects global plugins such as GSN-EDF.
+        if (rq->nr_running == rq->cfs.nr_running) {
+                TRACE("taking shortcut in pick_next_task()\n");
                p = fair_sched_class.pick_next_task(rq);
                if (likely(p))
                        return p;
        }
+        */
        class = sched_class_highest;
        for ( ; ; ) {
@@ -3712,6 +3755,8 @@ need_resched:
        release_kernel_lock(prev);
 need_resched_nonpreemptible:
+        TS_SCHED_START;
+        sched_trace_task_switch_away(prev);
        schedule_debug(prev);
@@ -3746,15 +3791,22 @@ need_resched_nonpreemptible:
                rq->curr = next;
                ++*switch_count;
+                TS_SCHED_END(next);
+                TS_CXS_START(next);
                context_switch(rq, prev, next); /* unlocks the rq */
+                TS_CXS_END(current);
                /*
                 * the context switch might have flipped the stack from under
                 * us, hence refresh the local variables.
                 */
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
-        } else
+        } else {
+                TS_SCHED_END(prev);
                raw_spin_unlock_irq(&rq->lock);
+        }
+        sched_trace_task_switch_to(current);
        post_schedule(rq);
@@ -3767,6 +3819,9 @@ need_resched_nonpreemptible:
        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
+        if (srp_active())
+                srp_ceiling_block();
 }
 EXPORT_SYMBOL(schedule);
@@ -4043,6 +4098,17 @@ void complete_all(struct completion *x)
 }
 EXPORT_SYMBOL(complete_all);
+/*
+ * complete_n - credit a completion with n events at once
+ * @x: the completion to signal
+ * @n: amount added to x->done and passed to __wake_up_common()
+ *
+ * The counter update and the wake-up are both performed while holding
+ * x->wait.lock with interrupts disabled.
+ */
+void complete_n(struct completion *x, int n)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&x->wait.lock, flags);
+        x->done += n;
+        __wake_up_common(&x->wait, TASK_NORMAL, n, 0, NULL);
+        spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_n);
 static inline long __sched
 do_wait_for_common(struct completion *x, long timeout, int state)
 {
@@ -4471,7 +4537,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
        p->normal_prio = normal_prio(p);
        /* we are holding p->pi_lock already */
        p->prio = rt_mutex_getprio(p);
-        if (rt_prio(p->prio))
+        if (p->policy == SCHED_LITMUS)
+                p->sched_class = &litmus_sched_class;
+        else if (rt_prio(p->prio))
                p->sched_class = &rt_sched_class;
        else
                p->sched_class = &fair_sched_class;
@@ -4516,7 +4584,7 @@ recheck:
                if (policy != SCHED_FIFO && policy != SCHED_RR &&
                                policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                                policy != SCHED_IDLE)
+                                policy != SCHED_IDLE && policy != SCHED_LITMUS)
                        return -EINVAL;
        }
@@ -4531,6 +4599,8 @@ recheck:
                return -EINVAL;
        if (rt_policy(policy) != (param->sched_priority != 0))
                return -EINVAL;
+        if (policy == SCHED_LITMUS && policy == p->policy)
+                return -EINVAL;
        /*
         * Allow unprivileged RT tasks to decrease priority:
@@ -4585,6 +4655,12 @@ recheck:
                        return retval;
        }
+        if (policy == SCHED_LITMUS) {
+                retval = litmus_admit_task(p);
+                if (retval)
+                        return retval;
+        }
        /*
         * make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
@@ -4612,10 +4688,19 @@ recheck:
        p->sched_reset_on_fork = reset_on_fork;
+        if (p->policy == SCHED_LITMUS)
+                litmus_exit_task(p);
        oldprio = p->prio;
        prev_class = p->sched_class;
        __setscheduler(rq, p, policy, param->sched_priority);
+        if (policy == SCHED_LITMUS) {
+                p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
+                p->rt_param.present = running;
+                litmus->task_new(p, on_rq, running);
+        }
        if (running)
                p->sched_class->set_curr_task(rq);
        if (on_rq) {
@@ -4785,10 +4870,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        rcu_read_lock();
        p = find_process_by_pid(pid);
-        if (!p) {
+        /* Don't set affinity if task not found and for LITMUS tasks */
+        if (!p || is_realtime(p)) {
                rcu_read_unlock();
                put_online_cpus();
-                return -ESRCH;
+                return p ? -EPERM : -ESRCH;
        }
        /* Prevent p going away */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..b1af6d42c024 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1708,7 +1708,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
-        if (unlikely(rt_prio(p->prio)))
+        if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
                goto preempt;
        if (unlikely(p->sched_class != &fair_sched_class))
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea7..c2fbb02c1b54 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1014,7 +1014,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
-        if (p->prio < rq->curr->prio) {
+        if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) {
                resched_task(rq->curr);
                return;
        }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..0adc54bd7c7c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -721,6 +721,46 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 }
 /**
+ * tick_set_quanta_type - get the quanta type as a boot option
+ * Default is standard setup with ticks staggered over first
+ * half of tick period.
+ */
+int quanta_type = LINUX_DEFAULT_TICKS;
+/*
+ * Parse the "quanta=" boot option. "aligned" and "staggered" select the
+ * corresponding LITMUS^RT quanta type; any other value leaves quanta_type
+ * unchanged and is reported, so that a mistyped option does not go
+ * unnoticed. Always returns 1.
+ */
+static int __init tick_set_quanta_type(char *str)
+{
+        if (strcmp("aligned", str) == 0) {
+                quanta_type = LITMUS_ALIGNED_TICKS;
+                printk(KERN_INFO "LITMUS^RT: setting aligned quanta\n");
+        } else if (strcmp("staggered", str) == 0) {
+                quanta_type = LITMUS_STAGGERED_TICKS;
+                printk(KERN_INFO "LITMUS^RT: setting staggered quanta\n");
+        } else {
+                printk(KERN_WARNING "LITMUS^RT: unknown quanta type '%s', "
+                       "keeping current setting\n", str);
+        }
+        return 1;
+}
+__setup("quanta=", tick_set_quanta_type);
+/*
+ * Offset (in ns) added to the first tick expiry of @cpu, depending on
+ * quanta_type:
+ *   LITMUS_ALIGNED_TICKS   - 0 for every CPU
+ *   LITMUS_STAGGERED_TICKS - cpu * (tick_period / num_possible_cpus())
+ *   anything else          - cpu * ((tick_period / 2) / num_possible_cpus())
+ */
+u64 cpu_stagger_offset(int cpu)
+{
+        u64 span;
+
+        /* aligned quanta: all CPUs tick at the same instant */
+        if (quanta_type == LITMUS_ALIGNED_TICKS)
+                return 0;
+
+        span = ktime_to_ns(tick_period);
+        /* unless fully staggered, spread over the first half period only */
+        if (quanta_type != LITMUS_STAGGERED_TICKS)
+                span >>= 1;
+
+        /* per-CPU step, scaled by this CPU's index */
+        do_div(span, num_possible_cpus());
+        return span * cpu;
+}
+/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
 void tick_setup_sched_timer(void)
@@ -737,9 +777,11 @@ void tick_setup_sched_timer(void)
        /* Get the next period (per cpu) */
        hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
-        offset = ktime_to_ns(tick_period) >> 1;
-        do_div(offset, num_possible_cpus());
+        /* Offset must be set correctly to achieve desired quanta type. */
-        offset *= smp_processor_id();
+        offset = cpu_stagger_offset(smp_processor_id());
+        /* Add the correct offset to expiration time */
        hrtimer_add_expires_ns(&ts->sched_timer, offset);
        for (;;) {