path: root/litmus/sched_mc.c
author     Jonathan Herman <hermanjl@cs.unc.edu>   2011-09-07 18:03:33 -0400
committer  Jonathan Herman <hermanjl@cs.unc.edu>   2011-09-07 18:03:33 -0400
commit     0b096fbe159a60c56190f8a627d764051e1e52ea (patch)
tree       89535a50e48ae92d0add444684ef28603ea0bf3f /litmus/sched_mc.c
parent     d5e965b0074d6ef10f5a77112fc3671613a2150f (diff)
Refactor to allow generic domains
Diffstat (limited to 'litmus/sched_mc.c')
-rw-r--r--  litmus/sched_mc.c  1624
1 file changed, 581 insertions, 1043 deletions
diff --git a/litmus/sched_mc.c b/litmus/sched_mc.c
index dcf86d60275a..bc4b46165b64 100644
--- a/litmus/sched_mc.c
+++ b/litmus/sched_mc.c
@@ -8,12 +8,6 @@
8 * This version uses the simple approach and serializes all scheduling 8 * This version uses the simple approach and serializes all scheduling
9 * decisions by the use of a queue lock. This is probably not the 9 * decisions by the use of a queue lock. This is probably not the
10 * best way to do it, but it should suffice for now. 10 * best way to do it, but it should suffice for now.
11 *
12 * --Todo--
13 * Timer Accounting: adjusting the clock values of the ghost timer using
14 * the _tick() method is relatively expensive. This should be changed.
15 * Locks: Accesses to domains should be serialized using a per-domain lock.
16 * CPU locks should be properly used e.g. wip-semi-part
17 */ 11 */
18 12
19#include <linux/spinlock.h> 13#include <linux/spinlock.h>
@@ -21,1106 +15,533 @@
21#include <linux/sched.h> 15#include <linux/sched.h>
22#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
23#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/module.h>
24 19
25#include <litmus/litmus.h> 20#include <litmus/litmus.h>
26#include <litmus/jobs.h> 21#include <litmus/jobs.h>
27#include <litmus/sched_plugin.h> 22#include <litmus/sched_plugin.h>
28#include <litmus/edf_common.h> 23#include <litmus/edf_common.h>
29#include <litmus/sched_trace.h> 24#include <litmus/sched_trace.h>
30 25#include <litmus/domain.h>
31#include <litmus/bheap.h> 26#include <litmus/bheap.h>
32 27
33#include <linux/module.h>
34
35#include <litmus/sched_mc.h> 28#include <litmus/sched_mc.h>
36 29
37/* Overview of MC operations. 30/* Per CPU per criticality level state */
38 * 31typedef struct {
39 * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage 32 enum crit_level level;
40 * structure (NOT the actually scheduled 33 struct task_struct* linked; /* Logically running task */
41 * task). If there is another linked task To 34 domain_t* domain;
42 * already it will set To->linked_on = NO_CPU
43 * (thereby removing its association with this
44 * CPU). However, it will not requeue the
45 * previously linked task (if any). It will set
46 * T's state to RT_F_RUNNING and check whether
47 * it is already running somewhere else. If T
48 * is scheduled somewhere else it will link
49 * it to that CPU instead (and pull the linked
50 * task to cpu). T may be NULL.
51 *
52 * unlink(T) - Unlink removes T from all scheduler data
53 * structures. If it is linked to some CPU it
54 * will link NULL to that CPU. If it is
55 * currently queued in the mc queue it will
56 * be removed from the rt_domain. It is safe to
57 * call unlink(T) if T is not linked. T may not
58 * be NULL.
59 *
60 * requeue(T) - Requeue will insert T into the appropriate
61 * queue. If the system is in real-time mode and
62 * the T is released already, it will go into the
63 * ready queue. If the system is not in
64 * real-time mode is T, then T will go into the
65 * release queue. If T's release time is in the
66 * future, it will go into the release
67 * queue. That means that T's release time/job
68 * no/etc. has to be updated before requeu(T) is
69 * called. It is not safe to call requeue(T)
70 * when T is already queued. T may not be NULL.
71 *
72 * mc_job_arrival(T) - This is the catch all function when T enters
73 * the system after either a suspension or at a
74 * job release. It will queue T (which means it
75 * is not safe to call mc_job_arrival(T) if
76 * T is already queued) and then check whether a
77 * preemption is necessary. If a preemption is
78 * necessary it will update the linkage
79 * accordingly and cause scheduled to be called
80 * (either with an IPI or need_resched). It is
81 * safe to call mc_job_arrival(T) if T's
82 * next job has not been actually released yet
83 * (releast time in the future). T will be put
84 * on the release queue in that case.
85 *
86 * job_completion(T) - Take care of everything that needs to be done
87 * to prepare T for its next release and place
88 * it in the right queue with
89 * mc_job_arrival().
90 *
91 *
92 * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
93 * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
94 * the functions will automatically propagate pending task from the ready queue
95 * to a linked task. This is the job of the calling function ( by means of
96 * __take_ready).
97 */
98
99
100/* cpu_entry_t - maintain the linked and scheduled state
101 */
102typedef struct {
103 int cpu;
104 struct task_struct* linked; /* only RT tasks */
105 struct task_struct* scheduled; /* only RT tasks */
106 atomic_t will_schedule; /* prevent unneeded IPIs */
107 struct bheap_node* hn_c;
108 struct bheap_node* hn_d;
109 struct task_struct* ghost_tasks[NUM_CRIT_LEVELS];
110} cpu_entry_t;
111
112/*This code is heavily based on Bjoern's budget enforcement code. */
113struct watchdog_timer {
114 /* The watchdog timers determine when ghost jobs finish. */
115 struct hrtimer timer;
116 struct task_struct* task;
117};
118
119DEFINE_PER_CPU(struct watchdog_timer[NUM_CRIT_LEVELS], ghost_timers);
120#define ghost_timer(cpu, crit) (&(per_cpu(ghost_timers, cpu)[crit]))
121
122DEFINE_PER_CPU(cpu_entry_t, mc_cpu_entries);
123 35
124cpu_entry_t* mc_cpus[NR_CPUS]; 36 struct hrtimer timer; /* For ghost task budget enforcement */
37 struct bheap_node* node; /* For membership in global domains */
38} crit_cpu_entry_t;
125 39
126#define set_will_schedule() \ 40/* Per CPU state */
127 (atomic_set(&__get_cpu_var(mc_cpu_entries).will_schedule, 1)) 41typedef struct {
128#define clear_will_schedule() \ 42 int cpu;
129 (atomic_set(&__get_cpu_var(mc_cpu_entries).will_schedule, 0)) 43 struct task_struct* scheduled; /* Task that is physically running */
130#define test_will_schedule(cpu) \ 44 struct task_struct* linked; /* Task that is logically running */
131 (atomic_read(&per_cpu(mc_cpu_entries, cpu).will_schedule))
132#define remote_cpu_entry(cpu) (&per_cpu(mc_cpu_entries, cpu))
133 45
134#define tsk_mc_data(t) (tsk_rt(t)->mc_data) 46 crit_cpu_entry_t crit_entries[NUM_CRIT_LEVELS];
135#define tsk_mc_crit(t) (tsk_mc_data(t)->mc_task.crit) 47} cpu_entry_t;
136#define TRACE_TASK(t, fmt, args...) \
137 TRACE("(%s/%d:%d:%d) " fmt, (t)->comm, (t)->pid, \
138 (t)->rt_param.job_params.job_no, \
139 (tsk_mc_data(t)) ? tsk_mc_crit(t) : -1, ##args)
140
141/* need to do a short-circuit null check on mc_data before checking is_ghost */
142static inline int is_ghost(struct task_struct *t)
143{
144 struct mc_data *mc_data = tsk_mc_data(t);
145 return mc_data && mc_data->mc_job.is_ghost;
146}
147
148/* the cpus queue themselves according to priority in here */
149static struct bheap_node mc_heap_node_c[NR_CPUS], mc_heap_node_d[NR_CPUS];
150static struct bheap mc_cpu_heap_c, mc_cpu_heap_d;
151
152/* Create per-CPU domains for criticality A */
153DEFINE_PER_CPU(rt_domain_t, crit_a);
154#define remote_a_queue(cpu) (&per_cpu(crit_a, cpu))
155#define local_a_queue (&__get_cpu_var(crit_a))
156
157/* Create per-CPU domains for criticality B */
158DEFINE_PER_CPU(rt_domain_t, crit_b);
159#define remote_b_queue(cpu) (&per_cpu(crit_b, cpu))
160#define local_b_queue (&__get_cpu_var(crit_b))
161
162/* Create global domains for criticalities C and D */
163static rt_domain_t crit_c;
164static rt_domain_t crit_d;
165
166/* We use crit_c for shared globals */
167#define global_lock (crit_c.ready_lock)
168#define mc_release_master (crit_c.release_master)
169
170/* BEGIN clone of edf_common.c to allow shared C/D run queue*/
171
172static int mc_edf_higher_prio(struct task_struct* first, struct task_struct*
173 second)
174{
175 /*Only differs from normal EDF when two tasks of differing criticality
176 are compared.*/
177 if (first && second) {
178 enum crit_level first_crit = tsk_mc_crit(first);
179 enum crit_level second_crit = tsk_mc_crit(second);
180 /*Lower criticality numbers are higher priority*/
181 if (first_crit < second_crit) {
182 return 1;
183 }
184 else if (second_crit < first_crit) {
185 return 0;
186 }
187 }
188 return edf_higher_prio(first, second);
189}
190 48
191static int mc_edf_entry_higher_prio(cpu_entry_t* first, cpu_entry_t* second, 49/* Wrapper necessary until cpu linking code is moved into header file */
192 enum crit_level crit) 50typedef struct domain_data {
193{ 51 domain_t domain;
194 struct task_struct *first_active, *second_active; 52 struct bheap* heap; /* For global domains */
195 first_active = first->linked; 53 crit_cpu_entry_t* crit_entry; /* For partitioned domains */
196 second_active = second->linked; 54} domain_data_t;
197 if (first->ghost_tasks[crit]) { 55
198 first_active = first->ghost_tasks[crit]; 56static cpu_entry_t* cpus[NR_CPUS];
199 } 57static raw_spinlock_t global_lock;
200 if (second->ghost_tasks[crit]) { 58
201 second_active = second->ghost_tasks[crit]; 59#define domain_data(dom) (container_of(dom, domain_data_t, domain))
202 } 60#define is_global(dom) (domain_data(dom)->heap)
203 return mc_edf_higher_prio(first_active, second_active); 61#define is_global_task(t) (is_global(get_task_domain(t)))
204} 62#define crit_cpu(ce) \
63 (container_of((void*)(ce - ce->level), cpu_entry_t, crit_entries))
64
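The crit_cpu() macro just above recovers the owning cpu_entry_t from a crit_cpu_entry_t pointer by exploiting the fact that each entry records its own index (level) into the crit_entries array. In case the pointer arithmetic is not obvious, here is a standalone sketch of the same idiom using hypothetical stand-in types (the real ones are cpu_entry_t and crit_cpu_entry_t):

#include <stddef.h>

/* Hypothetical stand-in types for illustration only. */
struct inner { int index;                /* this element's slot in outer.items */ };
struct outer { int cpu; struct inner items[4]; };

/* Step back from one array element (which knows its own index) to
 * items[0], then subtract the member offset to reach the enclosing
 * structure -- the same trick crit_cpu() plays with ce->level and
 * cpu_entry_t.crit_entries via container_of().
 */
static struct outer *owner_of(struct inner *e)
{
        struct inner *base = e - e->index;
        return (struct outer *)((char *)base - offsetof(struct outer, items));
}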
65#define TS "(%s/%d:%d:%s)"
66#define TA(t) (t) ? (is_ghost(t)) ? "ghost" : t->comm : "NULL", (t) ? t->pid : 1, \
67 (t) ? t->rt_param.job_params.job_no : 1, \
68 (t && get_task_domain(t)) ? get_task_domain(t)->name : ""
69#define TRACE_ENTRY(e, fmt, args...) \
70 TRACE("P%d, linked=" TS " " fmt "\n", \
71 e->cpu, TA(e->linked), ##args)
72#define TRACE_CRIT_ENTRY(ce, fmt, args...) \
73 TRACE("%s P%d, linked=" TS " " fmt "\n", \
74 ce->domain->name, crit_cpu(ce)->cpu, TA(ce->linked), ##args)
75#define TRACE_TASK(t, fmt, args...) \
76 TRACE(TS " " fmt "\n", TA(t), ##args)
205 77
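The domain_data_t wrapper and the macros above program against the generic domain interface introduced by this refactor. The following is only a reconstruction of that interface as inferred from the call sites in this file (dom->requeue(), dom->peek_ready(), dom->take_ready(), dom->higher_prio(), dom->preempt_needed(), dom->name, dom->data); the actual definition lives in <litmus/domain.h> and may differ:

/* NOT the real <litmus/domain.h> definition -- a sketch for orientation. */
struct task_struct;                     /* kernel task, declared elsewhere */

typedef struct domain_sketch {
        const char *name;               /* printed by the TRACE macros above */
        void       *data;               /* backing rt_domain_t, see remove_from_all() */

        void (*requeue)(struct domain_sketch *dom, struct task_struct *t);
        struct task_struct *(*peek_ready)(struct domain_sketch *dom);
        struct task_struct *(*take_ready)(struct domain_sketch *dom);
        int  (*higher_prio)(struct task_struct *a, struct task_struct *b);
        int  (*preempt_needed)(struct domain_sketch *dom, struct task_struct *curr);
} domain_sketch_t;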
206/* need_to_preempt - check whether the task t needs to be preempted 78/*
207 * call only with irqs disabled and with ready_lock acquired 79 * Sort CPUs within a global domain by the domain's priority function.
208 * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
209 */ 80 */
210static int mc_edf_preemption_needed(rt_domain_t* rt, enum crit_level crit, 81static int cpu_lower_prio(struct bheap_node *a, struct bheap_node *b)
211 cpu_entry_t* entry)
212{ 82{
213 struct task_struct *active_task; 83 domain_t *domain;
84 crit_cpu_entry_t *first, *second;
85 struct task_struct *first_link, *second_link;
214 86
215 /* we need the read lock for edf_ready_queue */ 87 first = a->value;
216 /* no need to preempt if there is nothing pending */ 88 second = b->value;
217 if (!__jobs_pending(rt)) 89 first_link = first->linked;
218 return 0; 90 second_link = second->linked;
219 91
220 active_task = entry->linked; 92 if (!first_link || !second_link) {
221 /* A ghost task can only exist if we haven't scheduled something above 93 return second_link && !first_link;
222 * its level 94 } else {
223 */ 95 domain = get_task_domain(first_link);
224 if (entry->ghost_tasks[crit]) { 96 BUG_ON(domain != get_task_domain(second_link));
225 active_task = entry->ghost_tasks[crit]; 97 return domain->higher_prio(second_link, first_link);
226 } 98 }
227 /* we need to reschedule if t doesn't exist */
228 if (!active_task)
229 return 1;
230
231 /* NOTE: We cannot check for non-preemptibility since we
232 * don't know what address space we're currently in.
233 */
234
235 /* make sure to get non-rt stuff out of the way */
236 return !is_realtime(active_task) ||
237 mc_edf_higher_prio(__next_ready(rt), active_task);
238}
239
240static int mc_edf_ready_order(struct bheap_node* a, struct bheap_node* b)
241{
242 return mc_edf_higher_prio(bheap2task(a), bheap2task(b));
243}
244
245static void mc_edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
246 release_jobs_t release)
247{
248 rt_domain_init(rt, mc_edf_ready_order, resched, release);
249}
250
251#define WANT_ALL_SCHED_EVENTS
252
253/* Called by update_cpu_position and lowest_prio_cpu in bheap operations
254 * Callers always have global lock
255*/
256static int cpu_lower_prio_c(struct bheap_node *_a, struct bheap_node *_b)
257{
258 cpu_entry_t *a, *b;
259 a = _a->value;
260 b = _b->value;
261 /* Note that a and b are inverted: we want the lowest-priority CPU at
262 * the top of the heap.
263 */
264 return mc_edf_entry_higher_prio(b, a, CRIT_LEVEL_C);
265}
266
267/* Called by update_cpu_position and lowest_prio_cpu in bheap operations
268 * Callers always have global lock
269*/
270static int cpu_lower_prio_d(struct bheap_node *_a, struct bheap_node *_b)
271{
272 cpu_entry_t *a, *b;
273 a = _a->value;
274 b = _b->value;
275 /* Note that a and b are inverted: we want the lowest-priority CPU at
276 * the top of the heap.
277 */
278 return mc_edf_entry_higher_prio(b, a, CRIT_LEVEL_D);
279}
280
281/* update_cpu_position - Move the cpu entry to the correct place to maintain
282 * order in the cpu queue. Caller must hold global lock.
283 * Called from link_task_to_cpu, which holds global lock
284 * link_task_to_cpu is the only way a CPU can get a new task, and hence have its
285 * priority change.
286 */
287static void update_cpu_position(cpu_entry_t *entry)
288{
289 if (likely(bheap_node_in_heap(entry->hn_c)))
290 bheap_delete(cpu_lower_prio_c, &mc_cpu_heap_c, entry->hn_c);
291 if (likely(bheap_node_in_heap(entry->hn_d)))
292 bheap_delete(cpu_lower_prio_d, &mc_cpu_heap_d, entry->hn_d);
293 bheap_insert(cpu_lower_prio_c, &mc_cpu_heap_c, entry->hn_c);
294 bheap_insert(cpu_lower_prio_d, &mc_cpu_heap_d, entry->hn_d);
295} 99}
296 100
297/* caller must hold global lock 101/*
298 * Only called when checking for gedf preemptions by check_for_gedf_preemptions, 102 * Return next CPU which should preempted or NULL if the domain has no
299 * which always has global lock 103 * preemptable CPUs.
300 */
301static cpu_entry_t* lowest_prio_cpu_c(void)
302{
303 struct bheap_node* hn;
304 hn = bheap_peek(cpu_lower_prio_c, &mc_cpu_heap_c);
305 return hn->value;
306}
307
308/* caller must hold global lock
309 * Only called when checking for gedf preemptions by check_for_gedf_preemptions,
310 * which always has global lock
311 */ 104 */
312static cpu_entry_t* lowest_prio_cpu_d(void) 105static inline crit_cpu_entry_t* lowest_prio_cpu(domain_t *dom)
313{ 106{
107 struct bheap *heap = domain_data(dom)->heap;
314 struct bheap_node* hn; 108 struct bheap_node* hn;
315 hn = bheap_peek(cpu_lower_prio_d, &mc_cpu_heap_d); 109 hn = bheap_peek(cpu_lower_prio, heap);
316 return hn->value; 110 return (hn) ? hn->value : NULL;
317} 111}
318 112
319/* Forward Declarations*/ 113/*
320static noinline void unlink(struct task_struct* t); 114 * Time accounting for ghost tasks. Called during ticks and linking.
321static noinline void job_completion(struct task_struct *t, int forced);
322
323/* update_ghost_time - Do time accounting for a ghost job.
324 * Updates ghost budget and handles expired ghost budget.
325 * Called from unlink(), mc_tick().
326 * Caller holds global lock.
327 */ 115 */
328static void update_ghost_time(struct task_struct *p) 116static void update_ghost_time(struct task_struct *p)
329{ 117{
330 u64 delta; 118 u64 delta, clock;
331 u64 clock;
332 119
333 BUG_ON(!is_ghost(p)); 120 BUG_ON(!is_ghost(p));
334 clock = litmus_clock(); 121 clock = litmus_clock();
335 delta = clock - p->se.exec_start; 122 delta = clock - p->se.exec_start;
336 if (unlikely ((s64)delta < 0)) { 123 if (unlikely ((s64)delta < 0)) {
337 delta = 0; 124 delta = 0;
338 TRACE_TASK(p, "WARNING: negative time delta.\n"); 125 TRACE_TASK(p, "WARNING: negative time delta");
339 } 126 }
340 if (tsk_mc_data(p)->mc_job.ghost_budget <= delta) { 127 if (tsk_mc_data(p)->mc_job.ghost_budget <= delta) {
341 /* Currently will just set ghost budget to zero since 128 TRACE_TASK(p, "Ghost job could have ended");
342 * task has already been queued. Could probably do
343 * more efficiently with significant reworking.
344 */
345 TRACE_TASK(p, "Ghost job could have ended\n");
346 tsk_mc_data(p)->mc_job.ghost_budget = 0; 129 tsk_mc_data(p)->mc_job.ghost_budget = 0;
347 p->se.exec_start = clock; 130 p->se.exec_start = clock;
348 } else { 131 } else {
349 TRACE_TASK(p, "Ghost jub updated, but didn't finish\n"); 132 TRACE_TASK(p, "Ghost job updated, but didn't finish");
350 tsk_mc_data(p)->mc_job.ghost_budget -= delta; 133 tsk_mc_data(p)->mc_job.ghost_budget -= delta;
351 p->se.exec_start = clock; 134 p->se.exec_start = clock;
352 } 135 }
353} 136}
354 137
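update_ghost_time() charges elapsed wall-clock time against the linked ghost job's remaining budget, clamping negative deltas and saturating at zero. A minimal user-space analogue of that bookkeeping, with hypothetical names (the kernel code uses litmus_clock() and p->se.exec_start for the same purpose):

static void charge_ghost(unsigned long long *budget_ns,
                         unsigned long long *last_update_ns,
                         unsigned long long now_ns)
{
        long long delta = (long long)(now_ns - *last_update_ns);

        if (delta < 0)
                delta = 0;                      /* clock anomaly: charge nothing */
        if (*budget_ns <= (unsigned long long)delta)
                *budget_ns = 0;                 /* ghost execution is exhausted */
        else
                *budget_ns -= (unsigned long long)delta;
        *last_update_ns = now_ns;               /* start of the next charging span */
}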
355/* 138/*
356 * 139 * Logically set running task for a domain on a CPU.
357 */ 140 */
358static void cancel_watchdog_timer(struct watchdog_timer* wt) 141static void link_task_to_crit(crit_cpu_entry_t *ce,
142 struct task_struct *task)
359{ 143{
360 int ret; 144 lt_t when_to_fire;
361 145 struct bheap *heap;
362 if (wt->task) { 146
363 TRACE_TASK(wt->task, "Cancelling watchdog timer.\n"); 147 TRACE_TASK(task, "Linking to P%d", crit_cpu(ce)->cpu);
364 ret = hrtimer_try_to_cancel(&wt->timer); 148 BUG_ON(task && tsk_rt(task)->linked_on != NO_CPU);
365 /*Should never be inactive.*/ 149 BUG_ON(task && is_global(ce->domain) &&
366 BUG_ON(ret == 0); 150 !bheap_node_in_heap(ce->node));
367 /*Running concurrently is an unusual situation - log it. */ 151
368 /*TODO: is there a way to prevent this? This probably means 152 /* Unlink last task */
369 * the timer task is waiting to acquire the lock while the 153 if (ce->linked) {
370 * cancellation attempt is happening. 154 TRACE_TASK(ce->linked, "Unlinking");
371 * 155 ce->linked->rt_param.linked_on = NO_CPU;
372 * And are we even in a correct state when this happens? 156 if (is_ghost(ce->linked)) {
373 */ 157 hrtimer_try_to_cancel(&ce->timer);
374 if (ret == -1) 158 if (tsk_mc_data(ce->linked)->mc_job.ghost_budget > 0) {
375 TRACE_TASK(wt->task, "Timer cancellation " 159 /* Job isn't finished, so do accounting */
376 "attempted while task completing\n"); 160 update_ghost_time(ce->linked);
161 }
162 }
163 }
164
165 /* Actually link task */
166 ce->linked = task;
167 if (task) {
168 task->rt_param.linked_on = crit_cpu(ce)->cpu;
169 if (is_ghost(task)) {
170 /* Reset budget timer */
171 task->se.exec_start = litmus_clock();
172 when_to_fire = litmus_clock() +
173 tsk_mc_data(task)->mc_job.ghost_budget;
174 __hrtimer_start_range_ns(&ce->timer,
175 ns_to_ktime(when_to_fire),
176 0 /* delta */,
177 HRTIMER_MODE_ABS_PINNED,
178 0 /* no wakeup */);
179 }
180 }
377 181
378 wt->task = NULL; 182 /* Update global heap node position */
183 if (is_global(ce->domain) && bheap_node_in_heap(ce->node)) {
184 heap = domain_data(ce->domain)->heap;
185 bheap_delete(cpu_lower_prio, heap, ce->node);
186 bheap_insert(cpu_lower_prio, heap, ce->node);
379 } 187 }
380} 188}
381 189
382/* link_task_to_cpu - Update the link of a CPU. 190static void check_for_preempt(domain_t*);
383 * Handles the case where the to-be-linked task is already 191
384 * scheduled on a different CPU. 192/*
385 * Also handles ghost jobs and preemption of ghost jobs. 193 * Catch all function for when a task enters the system after a suspension
386 * Called from unlink(), prepare_preemption(), and mc_schedule() 194 * or a release. Requeues the task and causes a preemption, if necessary.
387 * Callers hold global lock
388 */ 195 */
389static noinline void link_task_to_cpu(struct task_struct* linked, 196static void job_arrival(struct task_struct* task)
390 cpu_entry_t *entry)
391{ 197{
392 cpu_entry_t *sched; 198 domain_t *dom = get_task_domain(task);
393 struct task_struct* tmp;
394 int on_cpu;
395 int i;
396 struct watchdog_timer* timer;
397 lt_t when_to_fire;
398 199
399 BUG_ON(linked && !is_realtime(linked)); 200 TRACE_TASK(task, "Job arriving");
400 BUG_ON(linked && is_realtime(linked) && 201 BUG_ON(!task);
401 (tsk_mc_crit(linked) < CRIT_LEVEL_C) &&
402 (tsk_rt(linked)->task_params.cpu != entry->cpu));
403
404 if (linked && is_ghost(linked)) {
405 TRACE_TASK(linked, "Linking ghost job to CPU %d.\n",
406 entry->cpu);
407 BUG_ON(entry->linked &&
408 tsk_mc_crit(entry->linked) < tsk_mc_crit(linked));
409 202
410 tmp = entry->ghost_tasks[tsk_mc_crit(linked)]; 203 if (!is_global(dom) || tsk_rt(task)->scheduled_on == NO_CPU) {
411 if (tmp) { 204 dom->requeue(dom, task);
412 unlink(tmp); 205 check_for_preempt(dom);
413 } 206 } else {
414 /* We shouldn't link a ghost job that is already somewhere 207 /* If a global task is scheduled on one cpu, it CANNOT
415 * else (or here) - the caller is responsible for unlinking] 208 * be requeued into a global domain. Another cpu might
416 * first. 209 * dequeue the global task before it is descheduled,
210 * causing the system to crash when the task is scheduled
211 * in two places simultaneously.
417 */ 212 */
418 BUG_ON(linked->rt_param.linked_on != NO_CPU); 213 TRACE_TASK(task, "Delayed arrival of scheduled task");
419 linked->rt_param.linked_on = entry->cpu;
420 linked->se.exec_start = litmus_clock();
421 entry->ghost_tasks[tsk_mc_crit(linked)] = linked;
422 /* Set up the watchdog timer. */
423 timer = ghost_timer(entry->cpu, tsk_mc_crit(linked));
424 if (timer->task) {
425 cancel_watchdog_timer(timer);
426 }
427 when_to_fire = litmus_clock() +
428 tsk_mc_data(linked)->mc_job.ghost_budget;
429 timer->task = linked;
430 __hrtimer_start_range_ns(&timer->timer,
431 ns_to_ktime(when_to_fire),
432 0 /* delta */,
433 HRTIMER_MODE_ABS_PINNED,
434 0 /* no wakeup */);
435 } 214 }
436 else{ 215}
437 /* Currently linked task is set to be unlinked. */ 216
438 if (entry->linked) { 217/*
439 entry->linked->rt_param.linked_on = NO_CPU; 218 * Logically run a task on a CPU. The task must first have been linked
219 * to one of the criticalities running on this CPU.
220 */
221static void link_task_to_cpu(cpu_entry_t *entry, struct task_struct *task)
222{
223 int i, in_heap;
224 crit_cpu_entry_t *ce;
225 struct bheap *heap;
226 struct task_struct *tmp;
227 enum crit_level last, next;
228
229 next = (task) ? tsk_mc_crit(task) : NUM_CRIT_LEVELS - 1;
230 last = (entry->linked) ? tsk_mc_crit(entry->linked) :
231 NUM_CRIT_LEVELS - 1;
232
233 TRACE_TASK(task, "Linking to P%d", entry->cpu);
234 BUG_ON(task && tsk_rt(task)->linked_on != entry->cpu);
235 BUG_ON(task && is_ghost(task));
236 BUG_ON(entry->linked && task && tsk_mc_crit(entry->linked) < next);
237
238 /* Actually link task */
239 if (task && !is_ghost(task)) {
240 set_rt_flags(task, RT_F_RUNNING);
241 entry->linked = task;
242 } else {
243 entry->linked = NULL;
244 }
245
246 /* Update CPU states */
247 for (i = ((next < last) ? next : last);
248 i <= ((next > last) ? next : last); i++) {
249 ce = &entry->crit_entries[i];
250
251 /* Put CPU only in heaps which can preempt the linked task */
252 if (is_global(ce->domain)) {
253 heap = domain_data(ce->domain)->heap;
254 in_heap = bheap_node_in_heap(ce->node);
255 if (ce->level > next && in_heap) {
256 bheap_delete(cpu_lower_prio, heap, ce->node);
257 } else if ((ce->level < next || !task) && !in_heap) {
258 bheap_insert(cpu_lower_prio, heap, ce->node);
259 }
440 } 260 }
441 261
442 /* Link new task to CPU. */ 262 /* Remove and requeue lower priority tasks on this CPU */
443 if (linked) { 263 if (ce->linked && ce->level > next) {
444 set_rt_flags(linked, RT_F_RUNNING); 264 TRACE_TASK(ce->linked, "Removed by higher priority");
445 /* handle task is already scheduled somewhere! */ 265 tmp = ce->linked;
446 on_cpu = linked->rt_param.scheduled_on; 266 link_task_to_crit(ce, NULL);
447 if (on_cpu != NO_CPU) { 267 if (is_global(ce->domain)) {
448 sched = &per_cpu(mc_cpu_entries, on_cpu); 268 /* Need to check for a preemption.
449 /* this should only happen if not linked 269 * We know this CPU is no longer in the heap
450 * already 270 * so it cannot get re-preempted here.
451 */
452 BUG_ON(sched->linked == linked);
453
454 /* If we are already scheduled on the CPU to
455 * which we wanted to link, we don't need to do
456 * the swap -- we just link ourselves to the
457 * CPU and depend on the caller to get things
458 * right.
459 *
460 * Also, we can only safely swap if neither
461 * task is partitioned.
462 */ 271 */
463 tmp = sched->linked; 272 job_arrival(tmp);
464 if (entry != sched && tsk_mc_crit(linked) > 273 } else {
465 CRIT_LEVEL_B && 274 ce->domain->requeue(ce->domain, tmp);
466 (!tmp || tsk_mc_crit(tmp)
467 > CRIT_LEVEL_B)) {
468 TRACE_TASK(linked,
469 "already scheduled on %d, updating link.\n",
470 sched->cpu);
471 linked->rt_param.linked_on = sched->cpu;
472 sched->linked = linked;
473 for (i = tsk_mc_crit(linked);
474 i < NUM_CRIT_LEVELS; i++) {
475 if (sched->ghost_tasks[i]) {
476 unlink(sched->
477 ghost_tasks[i]);
478 }
479 }
480 update_cpu_position(sched);
481 linked = tmp;
482 }
483 }
484 if (linked) { /* might be NULL due to swap */
485 linked->rt_param.linked_on = entry->cpu;
486 for (i = tsk_mc_crit(linked);
487 i < NUM_CRIT_LEVELS; i++) {
488 if (entry->ghost_tasks[i]) {
489 unlink(entry->ghost_tasks[i]);
490 /* WARNING: it is up to the
491 * caller to requeue ghost jobs
492 */
493 }
494 }
495 } 275 }
496 } 276 }
497 entry->linked = linked;
498 } 277 }
499#ifdef WANT_ALL_SCHED_EVENTS
500 if (linked)
501 TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
502 else
503 TRACE("NULL linked to %d.\n", entry->cpu);
504#endif
505 update_cpu_position(entry);
506} 278}
507 279
508/* unlink - Make sure a task is not linked any longer to a cpu entry 280/*
509 * where it was linked before. 281 * Preempt logically running task in a domain. If the preempting task should be
510 * Can handle ghost jobs. 282 * running on the domain's CPU, also links the task to the CPU and causes
511 * Called by schedule, task_block, task_exit, and job_completion 283 * a physical preemption.
512 * Caller assumed to hold global lock
513 */ 284 */
514static noinline void unlink(struct task_struct* t) 285static void preempt(domain_t *dom, crit_cpu_entry_t *ce)
515{ 286{
516 int cpu; 287 struct task_struct *task = dom->take_ready(dom);
517 cpu_entry_t *entry; 288 cpu_entry_t *entry = crit_cpu(ce);
518 struct watchdog_timer *timer; 289
290 TRACE_CRIT_ENTRY(ce, "Preempted by " TS, TA(task));
291 BUG_ON(!task);
519 292
520 if (unlikely(!t)) { 293 /* Per-domain preemption */
521 BUG_ON(1); 294 if (ce->linked) {
522 return; 295 dom->requeue(dom, ce->linked);
523 } 296 }
297 link_task_to_crit(ce, task);
524 298
525 cpu = t->rt_param.linked_on; 299 /* Preempt actual execution if this is a running task */
526 if (cpu != NO_CPU) { 300 if (!is_ghost(task)) {
527 /* unlink */ 301 link_task_to_cpu(entry, task);
528 entry = remote_cpu_entry(cpu); 302 preempt_if_preemptable(entry->scheduled, entry->cpu);
529 t->rt_param.linked_on = NO_CPU;
530 if (is_ghost(t)) {
531 /* Clear the timer if it's set.
532 * It may be unset if we are called as a result of
533 * the watchdog timer triggering.
534 */
535 timer = ghost_timer(cpu, tsk_mc_crit(t));
536 if (timer->task) {
537 /* Should already be watching task.*/
538 BUG_ON(timer->task != t);
539 cancel_watchdog_timer(timer);
540 }
541 if (tsk_mc_data(t)->mc_job.ghost_budget > 0) {
542 /* Job isn't finished, so do accounting. */
543 update_ghost_time(t);
544 /* Just remove from CPU, even in the rare case
545 * of zero time left - it will be scheduled
546 * with an immediate timer fire.
547 */
548 entry->ghost_tasks[tsk_mc_crit(t)] = NULL;
549 /*TODO: maybe make more efficient by
550 * only updating on C/D completion?
551 */
552 update_cpu_position(entry);
553 }
554 else{
555 /* Job finished, so just remove */
556 entry->ghost_tasks[tsk_mc_crit(t)] = NULL;
557 update_cpu_position(entry);
558 }
559 }
560 else {
561 link_task_to_cpu(NULL, entry);
562 }
563 } else if (is_queued(t)) {
564 /* This is an interesting situation: t is scheduled,
565 * but was just recently unlinked. It cannot be
566 * linked anywhere else (because then it would have
567 * been relinked to this CPU), thus it must be in some
568 * queue. We must remove it from the list in this
569 * case.
570 */
571 TRACE("Weird is_queued situation happened\n");
572 remove(tsk_rt(t)->domain, t);
573 } 303 }
574} 304}
575 305
576 306/*
577/* preempt - force a CPU to reschedule 307 * Causes a logical preemption if the domain has a higher-priority ready task.
578 * Just sets a Linux scheduler flag.
579 */ 308 */
580static void preempt(cpu_entry_t *entry) 309static void check_for_preempt(domain_t *dom)
581{ 310{
582 preempt_if_preemptable(entry->scheduled, entry->cpu); 311 int lower_prio;
312 cpu_entry_t *entry;
313 crit_cpu_entry_t *ce;
314
315 if (is_global(dom)) {
316 /* If a higher priority task is running on a CPU,
317 * it will not be present in the domain heap.
318 */
319 for (ce = lowest_prio_cpu(dom);
320 ce && dom->preempt_needed(dom, ce->linked);
321 ce = lowest_prio_cpu(dom)) {
322
323 preempt(dom, ce);
324 }
325 } else /* Partitioned */ {
326 ce = domain_data(dom)->crit_entry;
327 entry = crit_cpu(ce);
328 /* A higher priority task might be running, in which case
329 * this level cannot link any task.
330 */
331 lower_prio = entry->linked &&
332 tsk_mc_crit(entry->linked) < ce->level;
333 if (!lower_prio && dom->preempt_needed(dom, ce->linked)) {
334 preempt(dom, ce);
335 }
336 }
583} 337}
584 338
585/* requeue - Put an unlinked task into the proper domain. 339/*
586 * Caller holds global lock. 340 * Remove a running task from all structures.
587 * Called by mc_job_arrival() and prepare_preemption().
588 */ 341 */
589static noinline void requeue(struct task_struct* task) 342static void remove_from_all(struct task_struct* task)
590{ 343{
591 BUG_ON(!task || !is_realtime(task)); 344 int cpu, level;
592 /* sanity check before insertion */ 345 cpu_entry_t *entry;
593 BUG_ON(is_queued(task)); 346 crit_cpu_entry_t *ce;
594 347
595 if (is_released(task, litmus_clock())) { 348 TRACE_TASK(task, "Removing from everything");
596 __add_ready(tsk_rt(task)->domain, task); 349 BUG_ON(!task);
597 } else {
598 /* it has got to wait */
599 add_release(tsk_rt(task)->domain, task);
600 }
601}
602 350
603static void prepare_preemption(rt_domain_t *dom, cpu_entry_t *cpu, 351 cpu = task->rt_param.linked_on;
604 enum crit_level crit) { 352 level = tsk_mc_crit(task);
605 struct task_struct* task; 353 if (cpu != NO_CPU) {
606 int i; 354 /* Unlink */
607 task = __take_ready(dom); 355 entry = cpus[cpu];
608 TRACE("prepare_preemption: attempting to link task %d to %d\n", 356 ce = &entry->crit_entries[level];
609 task->pid, cpu->cpu); 357 link_task_to_crit(ce, NULL);
610 if (is_ghost(task)) { 358 if (!is_ghost(task)) {
611 /* Changing ghost task only affects linked task at our level */ 359 link_task_to_cpu(entry, NULL);
612 if (cpu->linked && tsk_mc_crit(cpu->linked) == crit)
613 requeue(cpu->linked);
614 /* Can change ghost task at our level as well. */
615 if (cpu->ghost_tasks[crit])
616 requeue(cpu->ghost_tasks[crit]);
617 }
618 else {
619 /* Changing linked tasks could affect both real and ghost
620 * tasks at multiple levels
621 */
622 if (cpu->linked)
623 requeue(cpu->linked);
624 for (i = crit; i < NUM_CRIT_LEVELS; i++) {
625 if (cpu->ghost_tasks[i])
626 requeue(cpu->ghost_tasks[i]);
627 } 360 }
361 BUG_ON(is_queued(task));
362 } else if (is_queued(task)) {
363 /* This is an interesting situation: t is scheduled,
364 * but was just recently unlinked. It cannot be
365 * linked anywhere else (because then it would have
366 * been relinked to this CPU), thus it must be in some
367 * queue. We must remove it from the list in this
368 * case.
369 */
370 TRACE_TASK(task, "Weird is_queued situation happened");
371 remove((rt_domain_t*)get_task_domain(task)->data, task);
628 } 372 }
629 link_task_to_cpu(task, cpu);
630 preempt(cpu);
631} 373}
632 374
633/* Callers always have global lock for functions in this section*/ 375/*
634static noinline void check_for_c_preemptions(rt_domain_t *dom) { 376 * Prepares a task for its next period and causes a preemption, if necessary.
635 cpu_entry_t* last; 377 * Converts tasks which completed their execution early into ghost tasks.
636 TRACE("Checking for c preempt"); 378 */
637 for (last = lowest_prio_cpu_c(); 379static void job_completion(struct task_struct *task, int forced)
638 mc_edf_preemption_needed(dom, CRIT_LEVEL_C, 380{
639 last); 381 TRACE_TASK(task, "Completed");
640 last = lowest_prio_cpu_c()) { 382 sched_trace_task_completion(task, forced);
641 prepare_preemption(dom, last, CRIT_LEVEL_C); 383 BUG_ON(!task);
642 }
643}
644 384
645static noinline void check_for_d_preemptions(rt_domain_t *dom) { 385 /* Logically stop the task execution */
646 cpu_entry_t* last; 386 set_rt_flags(task, RT_F_SLEEP);
647 TRACE("Checking for d preempt"); 387 remove_from_all(task);
648 for (last = lowest_prio_cpu_d();
649 mc_edf_preemption_needed(dom, CRIT_LEVEL_D,
650 last);
651 last = lowest_prio_cpu_d()) {
652 prepare_preemption(dom, last, CRIT_LEVEL_D);
653 }
654}
655 388
656static noinline void check_for_a_preemption(rt_domain_t *dom, cpu_entry_t *cpu) { 389 /* If it's not a ghost job, do ghost job conversion */
657 TRACE("Checking for a preempt"); 390 if (!is_ghost(task)) {
658 if (mc_edf_preemption_needed(dom, CRIT_LEVEL_A, cpu)) { 391 tsk_mc_data(task)->mc_job.ghost_budget = budget_remaining(task);
659 prepare_preemption(dom, cpu, CRIT_LEVEL_A); 392 tsk_mc_data(task)->mc_job.is_ghost = 1;
660 } 393 }
661}
662 394
663static noinline void check_for_b_preemption(rt_domain_t *dom, cpu_entry_t *cpu) { 395 /* If the task is a ghost job with no budget, it either exhausted
664 TRACE("Checking for b preempt"); 396 * its ghost budget or there was no ghost budget after the job
665 if (mc_edf_preemption_needed(dom, CRIT_LEVEL_B, cpu)) { 397 * conversion. Revert back to a normal task and complete the period.
666 prepare_preemption(dom, cpu, CRIT_LEVEL_B); 398 */
399 if (tsk_mc_data(task)->mc_job.ghost_budget == 0) {
400 tsk_mc_data(task)->mc_job.is_ghost = 0;
401 prepare_for_next_period(task);
402 if (is_released(task, litmus_clock())) {
403 sched_trace_task_release(task);
404 }
667 } 405 }
406
407 /* Requeue non-blocking tasks */
408 if (is_running(task))
409 job_arrival(task);
668} 410}
669 411
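The conversion in job_completion() above can be read as: a job that signals completion with budget left over keeps that remainder as a ghost budget and re-enters the system as a ghost job; once the ghost budget reaches zero (immediately, or later via the budget timer), it reverts to a normal task and is prepared for its next period. A rough user-space sketch of just that state bookkeeping, with illustrative names and numbers (not the kernel code path):

struct mc_job_sketch {
        int                is_ghost;
        unsigned long long ghost_budget;        /* ns of unused execution budget */
};

static void complete_job_sketch(struct mc_job_sketch *job,
                                unsigned long long remaining_ns)
{
        if (!job->is_ghost) {
                /* e.g. provisioned 10 ms, finished after 6 ms: 4 ms remain */
                job->ghost_budget = remaining_ns;
                job->is_ghost = 1;
        }
        if (job->ghost_budget == 0) {
                /* Budget already exhausted (or none was left): revert to a
                 * normal job and set up the next period, as the real code
                 * does with prepare_for_next_period().
                 */
                job->is_ghost = 0;
        }
}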
670/* mc_job_arrival: task is either resumed or released 412/*
671 * Called from job_completion(), mc_task_new(), and mc_task_wake_up(), all 413 * Return true if the domain has a higher priority ready task. The curr
672 * of which have the global lock 414 * task must belong to the domain.
673 * Requeues task and checks for/triggers preemptions
674 */ 415 */
675static noinline void mc_job_arrival(struct task_struct* task) 416static noinline int mc_preempt_needed(domain_t *dom, struct task_struct* curr)
676{ 417{
677 enum crit_level task_crit_level; 418 struct task_struct *next = dom->peek_ready(dom);
678 BUG_ON(!task);
679 419
680 TRACE("mc_job_arrival triggered\n"); 420 if (!next || !curr) {
681 task_crit_level = tsk_mc_crit(task); 421 return next && !curr;
682 requeue(task); 422 } else {
683 if (task_crit_level == CRIT_LEVEL_A) { 423 BUG_ON(tsk_mc_crit(next) != tsk_mc_crit(curr));
684 check_for_a_preemption(remote_a_queue(get_partition(task)), 424 return get_task_domain(next)->higher_prio(next, curr);
685 remote_cpu_entry(get_partition(task)));
686 } else if (task_crit_level == CRIT_LEVEL_B) {
687 check_for_b_preemption(remote_b_queue(get_partition(task)),
688 remote_cpu_entry(get_partition(task)));
689 } else if (task_crit_level == CRIT_LEVEL_C) {
690 check_for_c_preemptions(&crit_c);
691 } else if (task_crit_level == CRIT_LEVEL_D) {
692 check_for_d_preemptions(&crit_d);
693 } 425 }
694} 426}
695 427
696/* Called by the domain 428/*
697 * Obtains global lock, merges ready tasks, checks for/triggers preemptions, 429 * Completes a logically (but not physically) running ghost task.
698 * and releases global lock 430 */
699*/ 431static enum hrtimer_restart mc_ghost_exhausted(struct hrtimer *timer)
700static void mc_release_jobs(rt_domain_t* rt, struct bheap* tasks)
701{ 432{
702 unsigned long flags; 433 unsigned long flags;
703 int i; 434 crit_cpu_entry_t *ce;
704 435
705 raw_spin_lock_irqsave(&global_lock, flags); 436 raw_spin_lock_irqsave(&global_lock, flags);
706 TRACE("mc_release_jobs triggered\n");
707 437
708 __merge_ready(rt, tasks); 438 ce = container_of(timer, crit_cpu_entry_t, timer);
439 TRACE_CRIT_ENTRY(ce, "Ghost exhausted firing");
709 440
710 for (i = 0; i < NR_CPUS; i++) { 441 /* Due to race conditions, we cannot just set the linked
711 if (rt == remote_b_queue(i)) { 442 * task's budget to 0 as it may no longer be the task
712 check_for_b_preemption(rt, remote_cpu_entry(i)); 443 * for which this timer was armed.
713 } 444 */
714 else if (rt == remote_a_queue(i)) { 445 if (ce->linked && is_ghost(ce->linked)) {
715 check_for_a_preemption(rt, remote_cpu_entry(i)); 446 update_ghost_time(ce->linked);
447 if (tsk_mc_data(ce->linked)->mc_job.ghost_budget == 0) {
448 job_completion(ce->linked, 0);
449 goto out;
716 } 450 }
717 } 451 }
718 if (rt == &crit_c) {
719 check_for_c_preemptions(rt);
720 }
721 if (rt == &crit_d) {
722 check_for_d_preemptions(rt);
723 }
724 452
453 TRACE_TASK(ce->linked, "Was not exhausted");
454 out:
725 raw_spin_unlock_irqrestore(&global_lock, flags); 455 raw_spin_unlock_irqrestore(&global_lock, flags);
456 return HRTIMER_NORESTART;
726} 457}
727 458
728/* caller holds global_lock 459/*
729 * Called only by mc_schedule() which holds global lock 460 * Adds released jobs to a domain and causes a preemption, if necessary.
730 * Prepares task for next period, unlinks it, and calls mc_job_arrival 461 */
731 * Converts jobs to ghost jobs as necessary, or finishes end of ghost jobs. 462static void mc_release_jobs(rt_domain_t* rt, struct bheap* tasks)
732*/
733static noinline void job_completion(struct task_struct *t, int forced)
734{ 463{
735 cpu_entry_t *cpu; 464 unsigned long flags;
736 BUG_ON(!t); 465 struct task_struct *first;
737 466
738 sched_trace_task_completion(t, forced); 467 raw_spin_lock_irqsave(&global_lock, flags);
739 468
740 TRACE_TASK(t, "job_completion().\n"); 469 first = bheap_peek(rt->order, tasks)->value;
470 TRACE_TASK(first, "Jobs released");
741 471
742 /* set flags */ 472 __merge_ready(rt, tasks);
743 set_rt_flags(t, RT_F_SLEEP); 473 check_for_preempt(get_task_domain(first));
744 /* If it's not a ghost job, do ghost job conversion and return if
745 * needed.
746 */
747 if (!is_ghost(t)) {
748 TRACE_TASK(t, "Converting to ghost from %d.\n", t->rt_param.scheduled_on);
749 cpu = remote_cpu_entry(t->rt_param.scheduled_on);
750 /*Unlink first while it's not a ghost job.*/
751 unlink(t);
752 tsk_mc_data(t)->mc_job.ghost_budget = budget_remaining(t);
753 tsk_mc_data(t)->mc_job.is_ghost = 1;
754
755 /* If we did just convert the job to ghost, we can safely
756 * reschedule it and then let schedule() determine a new
757 * job to run in the slack.
758 *
759 * If it actually needs to run as a ghost job, we'll do so
760 * here.
761 *
762 * If it doesn't need to, it will fall through and be handled
763 * properly as well.
764 */
765 if (tsk_mc_data(t)->mc_job.ghost_budget > 0) {
766 link_task_to_cpu(t, cpu);
767 preempt(cpu);
768 return;
769 }
770 }
771 /* prepare for next period - we either just became ghost but with no
772 * budget left, or we were already ghost and the ghost job expired*/
773 if (is_ghost(t)) {
774 tsk_mc_data(t)->mc_job.ghost_budget = 0;
775 /*Need to unlink here so prepare_for_next_period doesn't try
776 * to unlink us
777 */
778 unlink(t);
779 tsk_mc_data(t)->mc_job.is_ghost = 0;
780 tsk_mc_data(t)->mc_job.ghost_budget = 0;
781 prepare_for_next_period(t);
782 }
783 if (is_released(t, litmus_clock()))
784 sched_trace_task_release(t);
785 /* requeue
786 * But don't requeue a blocking task. */
787 if (is_running(t))
788 mc_job_arrival(t);
789}
790
791/* watchdog_timeout - this function is called when a watchdog timer expires.
792 *
793 * Acquires global lock
794 */
795 474
796static enum hrtimer_restart watchdog_timeout(struct hrtimer *timer)
797{
798 struct watchdog_timer* wt = container_of(timer,
799 struct watchdog_timer,
800 timer);
801 unsigned long flags;
802 struct task_struct* task = wt->task;
803 raw_spin_lock_irqsave(&global_lock, flags);
804 /*If we have triggered, we know the budget must have expired.*/
805 /*This needs to run first, so it doesn't look to job_completion like
806 * we have an active timer.
807 */
808 wt->task = NULL;
809 tsk_mc_data(task)->mc_job.ghost_budget = 0;
810 job_completion(task, 0);
811 TRACE_TASK(task, "Watchdog timeout\n");
812 raw_spin_unlock_irqrestore(&global_lock, flags); 475 raw_spin_unlock_irqrestore(&global_lock, flags);
813 return HRTIMER_NORESTART;
814} 476}
815 477
816 478/*
817/* mc_tick - this function is called for every local timer 479 * Ghost time accounting.
818 * interrupt. 480 * TODO: remove
819 *
820 * checks whether the current task has expired and checks
821 * whether we need to preempt it if it has not expired
822 * Called from LITMUS core
823 * Locks when calling update_ghost_time(t)
824 * Just sets reschedule flags on task and CPU and request_exit_np flag on task
825 */ 481 */
826static void mc_tick(struct task_struct* t) 482static void mc_tick(struct task_struct* t)
827{ 483{
828 unsigned long flags; 484 unsigned long flags;
829 if (is_ghost(t)) { 485 if (is_realtime(t) && is_ghost(t)) {
830 raw_spin_lock_irqsave(&global_lock, flags); 486 raw_spin_lock_irqsave(&global_lock, flags);
831 update_ghost_time(t); 487 update_ghost_time(t);
832 raw_spin_unlock_irqrestore(&global_lock, flags); 488 raw_spin_unlock_irqrestore(&global_lock, flags);
833 } 489 }
834 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
835 if (!is_np(t)) {
836 /* np tasks will be preempted when they become
837 * preemptable again
838 */
839 set_tsk_need_resched(t);
840 set_will_schedule();
841 TRACE("mc_scheduler_tick: "
842 "%d is preemptable "
843 " => FORCE_RESCHED\n", t->pid);
844 } else if (is_user_np(t)) {
845 TRACE("mc_scheduler_tick: "
846 "%d is non-preemptable, "
847 "preemption delayed.\n", t->pid);
848 request_exit_np(t);
849 }
850 }
851}
852
853/* Getting schedule() right is a bit tricky. schedule() may not make any
854 * assumptions on the state of the current task since it may be called for a
855 * number of reasons. The reasons include a scheduler_tick() determined that it
856 * was necessary, because sys_exit_np() was called, because some Linux
857 * subsystem determined so, or even (in the worst case) because there is a bug
858 * hidden somewhere. Thus, we must take extreme care to determine what the
859 * current state is.
860 *
861 * The CPU could currently be scheduling a task (or not), be linked (or not).
862 *
863 * The following assertions for the scheduled task could hold:
864 *
865 * - !is_running(scheduled) // the job blocks
866 * - scheduled->timeslice == 0 // the job completed (forcefully)
867 * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
868 * - linked != scheduled // we need to reschedule (for any reason)
869 * - is_np(scheduled) // rescheduling must be delayed,
870 * sys_exit_np must be requested
871 *
872 * Any of these can occur together.
873 *
874 *
875 * Called by LITMUS core
876 * No lock required by caller
877 * Obtains global lock
878 * can call unlink(), request_exit_np(), job_completion(), __take_ready()
879 * modifies next, scheduled->scheduled_on, linked->scheduled_on
880 * Releases global lock
881 */
882static struct task_struct* mc_schedule(struct task_struct * prev)
883{
884 cpu_entry_t* entry = &__get_cpu_var(mc_cpu_entries);
885 int out_of_time, sleep, preempt, np, exists, blocks;
886 struct task_struct* next = NULL;
887 struct task_struct* ready_task = NULL;
888 enum crit_level ready_crit;
889 int i;
890
891#ifdef CONFIG_RELEASE_MASTER
892 /* Bail out early if we are the release master.
893 * The release master never schedules any real-time tasks.
894 */
895 if (mc_release_master == entry->cpu) {
896 sched_state_task_picked();
897 return NULL;
898 }
899#endif
900
901 raw_spin_lock(&global_lock);
902 clear_will_schedule();
903
904 /* sanity checking */
905 BUG_ON(entry->scheduled && entry->scheduled != prev);
906 BUG_ON(entry->scheduled && !is_realtime(prev));
907 BUG_ON(is_realtime(prev) && !entry->scheduled);
908
909 /* (0) Determine state */
910 exists = entry->scheduled != NULL;
911 blocks = exists && !is_running(entry->scheduled);
912 out_of_time = exists && budget_enforced(entry->scheduled) &&
913 budget_exhausted(entry->scheduled);
914 np = exists && is_np(entry->scheduled);
915 sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
916 preempt = entry->scheduled != entry->linked;
917
918#ifdef WANT_ALL_SCHED_EVENTS
919 TRACE_TASK(prev, "invoked mc_schedule.\n");
920#endif
921
922 if (exists)
923 TRACE_TASK(prev,
924 "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
925 "state:%d sig:%d\n",
926 blocks, out_of_time, np, sleep, preempt,
927 prev->state, signal_pending(prev));
928 if (entry->linked && preempt)
929 TRACE_TASK(prev, "will be preempted by %s/%d\n",
930 entry->linked->comm, entry->linked->pid);
931
932
933 /* If a task blocks we have no choice but to reschedule.
934 */
935 if (blocks)
936 unlink(entry->scheduled);
937
938 /* Request a sys_exit_np() call if we would like to preempt but cannot.
939 * We need to make sure to update the link structure anyway in case
940 * that we are still linked. Multiple calls to request_exit_np() don't
941 * hurt.
942 */
943 if (np && (out_of_time || preempt || sleep)) {
944 unlink(entry->scheduled);
945 request_exit_np(entry->scheduled);
946 }
947
948 /* Any task that is preemptable and either exhausts its execution
949 * budget or wants to sleep completes. We may have to reschedule after
950 * this. Don't do a job completion if we block (can't have timers running
951 * for blocked jobs). Preemption go first for the same reason.
952 */
953 if (!np && (out_of_time || sleep) && !blocks && !preempt)
954 job_completion(entry->scheduled, !sleep);
955
956 /* Link pending task if we became unlinked.
957 */
958 if (!entry->linked) {
959 if (!entry->ghost_tasks[CRIT_LEVEL_A]) {
960 ready_task = __take_ready(local_a_queue);
961 ready_crit = CRIT_LEVEL_A;
962 if (ready_task && is_ghost(ready_task)) {
963 link_task_to_cpu(ready_task, entry);
964 ready_task = NULL;
965 }
966 }
967 if (!ready_task && !entry->ghost_tasks[CRIT_LEVEL_B]) {
968 ready_task = __take_ready(local_b_queue);
969 ready_crit = CRIT_LEVEL_B;
970 if (ready_task && is_ghost(ready_task)) {
971 link_task_to_cpu(ready_task, entry);
972 ready_task = NULL;
973 }
974 }
975 if (!ready_task && !entry->ghost_tasks[CRIT_LEVEL_C]) {
976 ready_task = __take_ready(&crit_c);
977 ready_crit = CRIT_LEVEL_C;
978 if (ready_task && is_ghost(ready_task)) {
979 link_task_to_cpu(ready_task, entry);
980 ready_task = NULL;
981 }
982 }
983 if (!ready_task && !entry->ghost_tasks[CRIT_LEVEL_D]) {
984 ready_task = __take_ready(&crit_d);
985 ready_crit = CRIT_LEVEL_D;
986 if (ready_task && is_ghost(ready_task)) {
987 link_task_to_cpu(ready_task, entry);
988 ready_task = NULL;
989 }
990 }
991 if (!ready_task) {
992 /* set to something invalid? */
993 ready_crit = NUM_CRIT_LEVELS;
994 }
995 for (i = ready_crit; i < NUM_CRIT_LEVELS; i++) {
996 if (entry->ghost_tasks[i])
997 requeue(entry->ghost_tasks[i]);
998 }
999 link_task_to_cpu(ready_task, entry);
1000 if (ready_task)
1001 TRACE_TASK(ready_task,
1002 "Linked task inside scheduler\n");
1003 }
1004
1005 /* The final scheduling decision. Do we need to switch for some reason?
1006 * If linked is different from scheduled, then select linked as next.
1007 */
1008 if ((!np || blocks) &&
1009 entry->linked != entry->scheduled) {
1010 /* Schedule a linked job? */
1011 if (entry->linked) {
1012 entry->linked->rt_param.scheduled_on = entry->cpu;
1013 next = entry->linked;
1014 }
1015 if (entry->scheduled) {
1016 /* not gonna be scheduled soon */
1017 entry->scheduled->rt_param.scheduled_on = NO_CPU;
1018 TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
1019 }
1020 } else
1021 /* Only override Linux scheduler if we have a real-time task
1022 * scheduled that needs to continue.
1023 */
1024 if (exists)
1025 next = prev;
1026
1027 sched_state_task_picked();
1028
1029 /*TODO: Update name of locking, reflect that we're locking all queues*/
1030 raw_spin_unlock(&global_lock);
1031
1032#ifdef WANT_ALL_SCHED_EVENTS
1033 TRACE("global_lock released, next=0x%p\n", next);
1034
1035 if (next)
1036 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
1037 else if (exists && !next)
1038 TRACE("becomes idle at %llu.\n", litmus_clock());
1039#endif
1040
1041
1042 return next;
1043}
1044
1045
1046/* _finish_switch - we just finished the switch away from prev
1047 * Called by LITMUS core
1048 * No locks
1049 */
1050static void mc_finish_switch(struct task_struct *prev)
1051{
1052 cpu_entry_t* entry = &__get_cpu_var(mc_cpu_entries);
1053
1054 entry->scheduled = is_realtime(current) ? current : NULL;
1055#ifdef WANT_ALL_SCHED_EVENTS
1056 TRACE_TASK(prev, "switched away from\n");
1057#endif
1058} 490}
1059 491
1060 492/*
1061/* Prepare a task for running in RT mode 493 * Setup new mixed-criticality task.
1062 * Called by LITMUS core
1063 * No lock required by caller
1064 * Obtains lock and calls mc_job_arrival before releasing lock
1065 */ 494 */
1066static void mc_task_new(struct task_struct *t, int on_rq, int running) 495static void mc_task_new(struct task_struct *t, int on_rq, int running)
1067{ 496{
1068 unsigned long flags; 497 unsigned long flags;
1069 cpu_entry_t* entry; 498 cpu_entry_t* entry;
1070 enum crit_level lvl; 499 enum crit_level level;
1071 500
1072 TRACE("mixed crit: task new %d\n", t->pid); 501 TRACE("New mixed criticality task %d\n", t->pid);
1073 502
1074 raw_spin_lock_irqsave(&global_lock, flags); 503 raw_spin_lock_irqsave(&global_lock, flags);
1075 504
1076 lvl = tsk_mc_crit(t); 505 /* Assign domain */
1077 t->rt_param.domain = 506 level = tsk_mc_crit(t);
1078 (lvl == CRIT_LEVEL_A) ? remote_a_queue(get_partition(t)) : 507 if (level < CRIT_LEVEL_C) {
1079 (lvl == CRIT_LEVEL_B) ? remote_b_queue(get_partition(t)) : 508 entry = cpus[get_partition(t)];
1080 (lvl == CRIT_LEVEL_C) ? &crit_c : &crit_d; 509 } else {
510 entry = cpus[task_cpu(t)];
511 }
512 level = tsk_mc_crit(t);
513 t->rt_param._domain = entry->crit_entries[level].domain;
1081 514
1082 /* setup job params */ 515 /* Setup job params */
1083 release_at(t, litmus_clock()); 516 release_at(t, litmus_clock());
1084 tsk_mc_data(t)->mc_job.ghost_budget = 0; 517 tsk_mc_data(t)->mc_job.ghost_budget = 0;
1085 tsk_mc_data(t)->mc_job.is_ghost = 0; 518 tsk_mc_data(t)->mc_job.is_ghost = 0;
1086 519
1087 if (running) { 520 if (running) {
1088 entry = &per_cpu(mc_cpu_entries, task_cpu(t));
1089 BUG_ON(entry->scheduled); 521 BUG_ON(entry->scheduled);
1090 522 entry->scheduled = t;
1091#ifdef CONFIG_RELEASE_MASTER 523 tsk_rt(t)->scheduled_on = entry->cpu;
1092 if (entry->cpu != mc_release_master) {
1093#endif
1094 entry->scheduled = t;
1095 tsk_rt(t)->scheduled_on = task_cpu(t);
1096#ifdef CONFIG_RELEASE_MASTER
1097 } else {
1098 /* do not schedule on release master */
1099 preempt(entry); /* force resched */
1100 tsk_rt(t)->scheduled_on = NO_CPU;
1101 }
1102#endif
1103 } else { 524 } else {
1104 t->rt_param.scheduled_on = NO_CPU; 525 t->rt_param.scheduled_on = NO_CPU;
1105 } 526 }
1106 t->rt_param.linked_on = NO_CPU; 527 t->rt_param.linked_on = NO_CPU;
528
529 job_arrival(t);
1107 530
1108 mc_job_arrival(t);
1109 raw_spin_unlock_irqrestore(&global_lock, flags); 531 raw_spin_unlock_irqrestore(&global_lock, flags);
1110} 532}
1111 533
1112/* Called by LITMUS core 534/*
1113 * No lock required by caller 535 * Add task back into its domain and cause any necessary preemptions.
1114 * Obtains lock and calls mc_job_arrival before releasing lock
1115 */ 536 */
1116static void mc_task_wake_up(struct task_struct *task) 537static void mc_task_wake_up(struct task_struct *task)
1117{ 538{
1118 unsigned long flags; 539 unsigned long flags;
1119 lt_t now; 540 lt_t now;
1120 541
1121 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
1122
1123 raw_spin_lock_irqsave(&global_lock, flags); 542 raw_spin_lock_irqsave(&global_lock, flags);
543 TRACE_TASK(task, "Wakes up");
544
1124 /* We need to take suspensions because of semaphores into 545 /* We need to take suspensions because of semaphores into
1125 * account! If a job resumes after being suspended due to acquiring 546 * account! If a job resumes after being suspended due to acquiring
1126 * a semaphore, it should never be treated as a new job release. 547 * a semaphore, it should never be treated as a new job release.
@@ -1130,114 +551,167 @@ static void mc_task_wake_up(struct task_struct *task)
1130 } else { 551 } else {
1131 now = litmus_clock(); 552 now = litmus_clock();
1132 if (is_tardy(task, now)) { 553 if (is_tardy(task, now)) {
1133 /* new sporadic release */ 554 /* New sporadic release */
1134 release_at(task, now); 555 release_at(task, now);
1135 sched_trace_task_release(task); 556 sched_trace_task_release(task);
1136 } 557 } else {
1137 else {
1138 if (task->rt.time_slice) { 558 if (task->rt.time_slice) {
1139 /* came back in time before deadline 559 /* Came back in time before deadline */
1140 */
1141 set_rt_flags(task, RT_F_RUNNING); 560 set_rt_flags(task, RT_F_RUNNING);
1142 } 561 }
1143 } 562 }
1144 } 563 }
1145 /*Delay job arrival if we still have an active ghost job*/ 564
1146 if (!is_ghost(task)) 565 if (!is_ghost(task))
1147 mc_job_arrival(task); 566 job_arrival(task);
567
1148 raw_spin_unlock_irqrestore(&global_lock, flags); 568 raw_spin_unlock_irqrestore(&global_lock, flags);
1149} 569}
1150 570
1151/* Called by LITMUS core 571/*
1152 * No lock required by caller 572 * Remove task from global state to prevent it from being linked / run
1153 * Obtains and releases global lock 573 * on any CPU.
1154 */ 574 */
1155static void mc_task_block(struct task_struct *t) 575static void mc_task_block(struct task_struct *task)
1156{ 576{
1157 unsigned long flags; 577 unsigned long flags;
578 raw_spin_lock_irqsave(&global_lock, flags);
579 TRACE_TASK(task, "Block at %llu", litmus_clock());
1158 580
1159 TRACE_TASK(t, "block at %llu\n", litmus_clock()); 581 remove_from_all(task);
1160 582
1161 /* unlink if necessary */
1162 raw_spin_lock_irqsave(&global_lock, flags);
1163 unlink(t);
1164 raw_spin_unlock_irqrestore(&global_lock, flags); 583 raw_spin_unlock_irqrestore(&global_lock, flags);
1165
1166 BUG_ON(!is_realtime(t));
1167} 584}
1168 585
1169 586/*
1170/* Called by LITMUS core 587 * Remove task from the system.
1171 * No lock required by caller
1172 * Obtains and releases global lock
1173 */ 588 */
1174static void mc_task_exit(struct task_struct * t) 589static void mc_task_exit(struct task_struct *task)
1175{ 590{
1176 unsigned long flags; 591 unsigned long flags;
1177 592
1178 /* unlink if necessary */ 593 BUG_ON(!is_realtime(task));
594 TRACE_TASK(task, "RIP");
595
1179 raw_spin_lock_irqsave(&global_lock, flags); 596 raw_spin_lock_irqsave(&global_lock, flags);
1180 unlink(t); 597 remove_from_all(task);
1181 if (tsk_rt(t)->scheduled_on != NO_CPU) { 598 if (tsk_rt(task)->scheduled_on != NO_CPU) {
1182 mc_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; 599 cpus[tsk_rt(task)->scheduled_on]->scheduled = NULL;
1183 tsk_rt(t)->scheduled_on = NO_CPU; 600 tsk_rt(task)->scheduled_on = NO_CPU;
1184 } 601 }
1185 raw_spin_unlock_irqrestore(&global_lock, flags); 602 raw_spin_unlock_irqrestore(&global_lock, flags);
1186
1187 BUG_ON(!is_realtime(t));
1188 TRACE_TASK(t, "RIP\n");
1189} 603}
1190 604
1191static long mc_admit_task(struct task_struct* tsk) 605/*
606 * Return true if the task is a valid mixed-criticality task.
607 */
608static long mc_admit_task(struct task_struct* task)
1192{ 609{
1193 if (!tsk_mc_data(tsk)) 610 if (!tsk_mc_data(task)) {
1194 { 611 printk(KERN_WARNING "Tried to admit task with no criticality "
1195 printk(KERN_WARNING "tried to admit task with no criticality "
1196 "level\n"); 612 "level\n");
1197 return -EINVAL; 613 return -EINVAL;
1198 } 614 }
1199 printk(KERN_INFO "admitted task with criticality level %d\n", 615 if (tsk_mc_crit(task) < CRIT_LEVEL_C && get_partition(task) == NO_CPU) {
1200 tsk_mc_crit(tsk)); 616 printk(KERN_WARNING "Tried to admit partitioned task with no "
617 "partition\n");
618 return -EINVAL;
619 }
620 printk(KERN_INFO "Admitted task with criticality level %d\n",
621 tsk_mc_crit(task));
1201 return 0; 622 return 0;
1202} 623}
1203 624
1204static long mc_activate_plugin(void) 625/*
626 * Return next task which should be scheduled.
627 */
628static struct task_struct* mc_schedule(struct task_struct * prev)
1205{ 629{
1206 int cpu; 630 domain_t *dom;
1207 cpu_entry_t *entry; 631 crit_cpu_entry_t *ce;
632 cpu_entry_t* entry = cpus[smp_processor_id()];
633 int i, out_of_time, sleep, preempt, exists, blocks, global;
634 struct task_struct *dtask = NULL, *ready_task = NULL, *next = NULL;
1208 635
1209 bheap_init(&mc_cpu_heap_c); 636 raw_spin_lock(&global_lock);
1210 bheap_init(&mc_cpu_heap_d);
1211#ifdef CONFIG_RELEASE_MASTER
1212 crit_c.release_master = atomic_read(&release_master_cpu);
1213 crit_d.release_master = crit_c.release_master;
1214#endif
1215 637
1216 for_each_online_cpu(cpu) { 638 /* Sanity checking */
1217 entry = &per_cpu(mc_cpu_entries, cpu); 639 BUG_ON(entry->scheduled && entry->scheduled != prev);
1218 bheap_node_init(&entry->hn_c, entry); 640 BUG_ON(entry->scheduled && !is_realtime(prev));
1219 bheap_node_init(&entry->hn_d, entry); 641 BUG_ON(is_realtime(prev) && !entry->scheduled);
1220 atomic_set(&entry->will_schedule, 0); 642
1221 entry->linked = NULL; 643 /* Determine state */
1222 entry->scheduled = NULL; 644 exists = entry->scheduled != NULL;
1223#ifdef CONFIG_RELEASE_MASTER 645 blocks = exists && !is_running(entry->scheduled);
1224 if (cpu != mc_release_master) { 646 out_of_time = exists && budget_enforced(entry->scheduled) &&
1225#endif 647 budget_exhausted(entry->scheduled);
1226 TRACE("MC: Initializing CPU #%d.\n", cpu); 648 sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
1227 update_cpu_position(entry); 649 global = exists && is_global_task(entry->scheduled);
1228#ifdef CONFIG_RELEASE_MASTER 650 preempt = entry->scheduled != entry->linked;
1229 } else { 651
1230 TRACE("MC: CPU %d is release master.\n", cpu); 652 if (exists) {
653 entry->scheduled->rt_param.scheduled_on = NO_CPU;
654 TRACE_TASK(prev,
655 "blocks:%d out_of_time:%d sleep:%d preempt:%d "
656 "state:%d sig:%d global:%d",
657 blocks, out_of_time, sleep, preempt,
658 prev->state, signal_pending(prev), global);
659 }
660
661 /* If a task blocks we have no choice but to reschedule */
662 if (blocks)
663 remove_from_all(entry->scheduled);
664 /* Any task which exhausts its budget or sleeps waiting for its next
665 * period completes unless its execution has been forcibly stopped.
666 */
667 if ((out_of_time || sleep) && !blocks && !preempt)
668 job_completion(entry->scheduled, !sleep);
669 /* Global scheduled tasks must wait for a deschedule before they
670 * can rejoin a global domain. See comment in job_arrival.
671 */
672 else if (global && preempt && !blocks)
673 job_arrival(entry->scheduled);
674
675 /* Pick next task if none is linked */
676 if (!entry->linked) {
677 for (i = 0; i < NUM_CRIT_LEVELS && !ready_task; i++) {
678 ce = &entry->crit_entries[i];
679 dom = ce->domain;
680 dtask = dom->peek_ready(dom);
681 if (!ce->linked && dtask) {
682 dom->take_ready(dom);
683 link_task_to_crit(ce, dtask);
684 ready_task = (is_ghost(dtask)) ? NULL : dtask;
685 }
1231 } 686 }
1232#endif 687 if (ready_task)
688 link_task_to_cpu(entry, ready_task);
1233 } 689 }
1234 return 0; 690
691 /* Schedule next task */
692 next = entry->linked;
693 entry->scheduled = next;
694 if (entry->scheduled)
695 entry->scheduled->rt_param.scheduled_on = entry->cpu;
696
697 sched_state_task_picked();
698
699 raw_spin_unlock(&global_lock);
700
701 if (next)
702 TRACE_TASK(next, "Scheduled at %llu", litmus_clock());
703 else if (exists && !next)
704 TRACE("Becomes idle at %llu\n", litmus_clock());
705
706 return next;
1235} 707}
1236 708
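When nothing is linked, mc_schedule() walks the criticality levels from A (highest) downward, links a ready task at each unlinked level it visits, and hands a task to the CPU only if it is not a ghost job (a ghost job is tracked at its criticality level but not actually run). The selection loop can be sketched in isolation as below; struct task, struct domain, and struct crit_entry are simplified stand-ins for the real LITMUS types, and the stub domain always offers a single non-ghost task.

#include <stdio.h>

enum crit_level { CRIT_LEVEL_A, CRIT_LEVEL_B, CRIT_LEVEL_C,
                  CRIT_LEVEL_D, NUM_CRIT_LEVELS };

struct task { int is_ghost; };

/* Stand-in for domain_t: peek at / remove the highest-priority ready task. */
struct domain {
        struct task *(*peek_ready)(struct domain *);
        void         (*take_ready)(struct domain *);
};

/* Stand-in for crit_cpu_entry_t. */
struct crit_entry {
        struct task   *linked;
        struct domain *domain;
};

/* Mirrors the loop in mc_schedule(): scan levels from highest to lowest,
 * link the first ready task found at an unlinked level, and stop once a
 * non-ghost task can actually be scheduled on the CPU. */
static struct task *pick_ready_task(struct crit_entry entries[NUM_CRIT_LEVELS])
{
        struct task *ready = NULL, *dtask;
        int i;

        for (i = 0; i < NUM_CRIT_LEVELS && !ready; i++) {
                struct crit_entry *ce = &entries[i];
                dtask = ce->domain->peek_ready(ce->domain);
                if (!ce->linked && dtask) {
                        ce->domain->take_ready(ce->domain);
                        ce->linked = dtask;
                        ready = dtask->is_ghost ? NULL : dtask;
                }
        }
        return ready;
}

static struct task the_task; /* one non-ghost ready task */

static struct task *stub_peek(struct domain *d) { (void)d; return &the_task; }
static void stub_take(struct domain *d) { (void)d; }

int main(void)
{
        struct domain dom = { stub_peek, stub_take };
        struct crit_entry entries[NUM_CRIT_LEVELS] = {
                { NULL, &dom }, { NULL, &dom }, { NULL, &dom }, { NULL, &dom },
        };
        struct task *picked = pick_ready_task(entries);

        printf("picked %s task\n", picked ? "a real" : "no");
        return 0;
}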
1237/* Plugin object */ 709/* **************************************************************************
710 * Initialization
711 * ************************************************************************** */
712
1238static struct sched_plugin mc_plugin __cacheline_aligned_in_smp = { 713static struct sched_plugin mc_plugin __cacheline_aligned_in_smp = {
1239 .plugin_name = "MC", 714 .plugin_name = "MC",
1240 .finish_switch = mc_finish_switch,
1241 .tick = mc_tick, 715 .tick = mc_tick,
1242 .task_new = mc_task_new, 716 .task_new = mc_task_new,
1243 .complete_job = complete_job, 717 .complete_job = complete_job,
@@ -1246,48 +720,112 @@ static struct sched_plugin mc_plugin __cacheline_aligned_in_smp = {
1246 .task_wake_up = mc_task_wake_up, 720 .task_wake_up = mc_task_wake_up,
1247 .task_block = mc_task_block, 721 .task_block = mc_task_block,
1248 .admit_task = mc_admit_task, 722 .admit_task = mc_admit_task,
1249 .activate_plugin = mc_activate_plugin,
1250}; 723};
1251 724
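For comparison, the overall shape of a LITMUS plugin registration is small: fill in a struct sched_plugin with the callbacks the plugin provides and pass it to register_sched_plugin(), just as mc_plugin and init_mc() do in this file. The stripped-down sketch below uses only callbacks that appear here; the demo_* names are hypothetical, and it assumes callbacks left unset fall back to the core's defaults and that sched_state_task_picked() is declared in <litmus/preempt.h>.

#include <linux/module.h>
#include <litmus/litmus.h>
#include <litmus/sched_plugin.h>
#include <litmus/preempt.h>

/* Trivial schedule callback: never picks a real-time task. */
static struct task_struct* demo_schedule(struct task_struct *prev)
{
        sched_state_task_picked();
        return NULL;
}

/* Accept every real-time task unconditionally. */
static long demo_admit_task(struct task_struct *task)
{
        return 0;
}

static struct sched_plugin demo_plugin __cacheline_aligned_in_smp = {
        .plugin_name = "DEMO",
        .schedule    = demo_schedule,
        .admit_task  = demo_admit_task,
};

static int __init init_demo(void)
{
        return register_sched_plugin(&demo_plugin);
}

module_init(init_demo);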
725/* Initialize values here so that they are allocated with the module
726 * and destroyed when the module is unloaded.
727 */
728DEFINE_PER_CPU(cpu_entry_t, _mc_cpus);
729DEFINE_PER_CPU(domain_data_t, _mc_crit_a);
730DEFINE_PER_CPU(domain_data_t, _mc_crit_b);
731static domain_data_t _mc_crit_c, _mc_crit_d;
732struct bheap _mc_heap_c, _mc_heap_d;
733struct bheap_node _mc_nodes_c[NR_CPUS], _mc_nodes_d[NR_CPUS];
734
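These DEFINE_PER_CPU and static definitions mean each CPU's state and every domain are allocated along with the module image and go away when it is unloaded, with no kmalloc()/kfree() at plugin-switch time. The generic kernel pattern, independent of this scheduler, looks like the sketch below; struct demo_state and demo_states are hypothetical names used only for illustration.

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

/* One copy of this state exists per CPU, allocated statically with the
 * module image rather than dynamically at activation time. */
struct demo_state {
        int cpu;
        void *linked;
};

DEFINE_PER_CPU(struct demo_state, demo_states);

static void demo_init_states(void)
{
        int cpu;
        struct demo_state *st;

        for_each_online_cpu(cpu) {
                /* per_cpu() resolves CPU cpu's copy of the variable. */
                st = &per_cpu(demo_states, cpu);
                st->cpu = cpu;
                st->linked = NULL;
        }
}

This is the same access pattern init_mc() below relies on when it takes &per_cpu(_mc_cpus, cpu) and the per-CPU level-A/B domain data.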
735static void init_crit_entry(crit_cpu_entry_t *ce, enum crit_level level,
736 domain_data_t *dom_data,
737 struct bheap_node *node)
738{
739 ce->level = level;
740 ce->linked = NULL;
741 ce->node = node;
742 ce->domain = &dom_data->domain;
1252 743
1253static int __init init_mc(void) 744 hrtimer_init(&ce->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
745 ce->timer.function = mc_ghost_exhausted;
746}
747
748static void init_local_domain(cpu_entry_t *entry, domain_data_t *dom_data,
749 enum crit_level level)
750{
751 dom_data->heap = NULL;
752 dom_data->crit_entry = &entry->crit_entries[level];
753 init_crit_entry(dom_data->crit_entry, level, dom_data, NULL);
754}
755
756static void init_global_domain(domain_data_t *dom_data, enum crit_level level,
757 struct bheap *heap, struct bheap_node *nodes)
1254{ 758{
1255 int cpu; 759 int cpu;
1256 int i;
1257 cpu_entry_t *entry; 760 cpu_entry_t *entry;
1258 struct watchdog_timer *timer; 761 crit_cpu_entry_t *ce;
1259 762 struct bheap_node *node;
1260 bheap_init(&mc_cpu_heap_c); 763
1261 bheap_init(&mc_cpu_heap_d); 764 dom_data->crit_entry = NULL;
1262 /* initialize CPU state */ 765 dom_data->heap = heap;
1263 for (cpu = 0; cpu < NR_CPUS; cpu++) { 766 bheap_init(heap);
1264 entry = &per_cpu(mc_cpu_entries, cpu); 767
1265 mc_cpus[cpu] = entry; 768 for_each_online_cpu(cpu) {
1266 atomic_set(&entry->will_schedule, 0); 769 entry = cpus[cpu];
1267 entry->cpu = cpu; 770 node = &nodes[cpu];
1268 entry->hn_c = &mc_heap_node_c[cpu]; 771 ce = &entry->crit_entries[level];
1269 entry->hn_d = &mc_heap_node_d[cpu]; 772
1270 bheap_node_init(&entry->hn_c, entry); 773 init_crit_entry(ce, level, dom_data, node);
1271 bheap_node_init(&entry->hn_d, entry); 774
1272 for (i = CRIT_LEVEL_A; i < NUM_CRIT_LEVELS; i++) { 775 bheap_node_init(&ce->node, ce);
1273 timer = ghost_timer(cpu, i); 776 bheap_insert(cpu_lower_prio, heap, node);
1274 hrtimer_init(&timer->timer, CLOCK_MONOTONIC,
1275 HRTIMER_MODE_ABS);
1276 timer->timer.function = watchdog_timeout;
1277 }
1278 }
1279 mc_edf_domain_init(&crit_c, NULL, mc_release_jobs);
1280 mc_edf_domain_init(&crit_d, NULL, mc_release_jobs);
1281 for (i = 0; i < NR_CPUS; i++) {
1282 mc_edf_domain_init(remote_b_queue(i), NULL,
1283 mc_release_jobs);
1284 } 777 }
1285 for (i = 0; i < NR_CPUS; i++) { 778}
1286 mc_edf_domain_init(remote_a_queue(i), NULL, 779
1287 mc_release_jobs); 780static inline void init_edf_domain(domain_t *dom)
781{
782 pd_domain_init(dom, edf_ready_order, NULL,
783 mc_release_jobs, mc_preempt_needed,
784 edf_higher_prio);
785}
786
787static int __init init_mc(void)
788{
789 int cpu;
790 cpu_entry_t *entry;
791 domain_data_t *dom_data;
792
793 raw_spin_lock_init(&global_lock);
794
795 for_each_online_cpu(cpu) {
796 entry = &per_cpu(_mc_cpus, cpu);
797 cpus[cpu] = entry;
798
799 entry->cpu = cpu;
800 entry->scheduled = NULL;
801 entry->linked = NULL;
802
803 /* CRIT_LEVEL_A */
804 dom_data = &per_cpu(_mc_crit_a, cpu);
805 init_local_domain(entry, dom_data, CRIT_LEVEL_A);
806 init_edf_domain(&dom_data->domain);
807 dom_data->domain.name = "LVL-A";
808
809 /* CRIT_LEVEL_B */
810 dom_data = &per_cpu(_mc_crit_b, cpu);
811 init_local_domain(entry, dom_data, CRIT_LEVEL_B);
812 init_edf_domain(&dom_data->domain);
813 dom_data->domain.name = "LVL-B";
1288 } 814 }
815
816 /* CRIT_LEVEL_C */
817 init_global_domain(&_mc_crit_c, CRIT_LEVEL_C,
818 &_mc_heap_c, _mc_nodes_c);
819 init_edf_domain(&_mc_crit_c.domain);
820 _mc_crit_c.domain.name = "LVL-C";
821
822 /* CRIT_LEVEL_D */
823 init_global_domain(&_mc_crit_d, CRIT_LEVEL_D,
824 &_mc_heap_d, _mc_nodes_d);
825 init_edf_domain(&_mc_crit_d.domain);
826 _mc_crit_d.domain.name = "LVL-D";
827
1289 return register_sched_plugin(&mc_plugin); 828 return register_sched_plugin(&mc_plugin);
1290} 829}
1291 830
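To summarize the layout built by init_mc(): each CPU owns a private EDF domain for level A and another for level B, while levels C and D each get one global EDF domain (backed by a CPU heap) shared by all CPUs. A tiny, runnable illustration of that layout, with NR_DEMO_CPUS as a made-up CPU count:

#include <stdio.h>

#define NR_DEMO_CPUS 4

enum crit_level { CRIT_LEVEL_A, CRIT_LEVEL_B, CRIT_LEVEL_C,
                  CRIT_LEVEL_D, NUM_CRIT_LEVELS };

/* Conceptual domain layout: levels A and B are partitioned (one EDF
 * domain per CPU), levels C and D are global (one shared domain each). */
int main(void)
{
        int cpu;
        enum crit_level lvl;

        for (lvl = CRIT_LEVEL_A; lvl < NUM_CRIT_LEVELS; lvl++) {
                if (lvl < CRIT_LEVEL_C) {
                        for (cpu = 0; cpu < NR_DEMO_CPUS; cpu++)
                                printf("LVL-%c: private domain on CPU %d\n",
                                       'A' + lvl, cpu);
                } else {
                        printf("LVL-%c: one global domain shared by %d CPUs\n",
                               'A' + lvl, NR_DEMO_CPUS);
                }
        }
        return 0;
}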
1292
1293module_init(init_mc); 831module_init(init_mc);