author     Bjoern Brandenburg <bbb@mpi-sws.org>    2015-08-09 07:18:56 -0400
committer  Bjoern Brandenburg <bbb@mpi-sws.org>    2017-05-26 17:12:42 -0400
commit     55b4b8689a88d6cb457ecfaabbccc09d5f7c121a (patch)
tree       e4f28b33dd7a0cb2f18bd0a6dd4aeb2af3504e65 /litmus
parent     d3976b0260a5241d7f5461d1a12b51b53c833a91 (diff)
Add PD^2 scheduler plugin
PFAIR: fix wrong memset()

PFAIR: don't arm timer for tardy tasks

If the system is overloaded, tasks may be tardy. In that case, do not
arm timers. Rather, add tardy tasks back into the ready queue right away.

PFAIR: use sched_trace_last_suspension_as_completion()

PFAIR: use inferred_sporadic_job_release_at()

rebase fix: use new hrtimer API in PFAIR plugin
Diffstat (limited to 'litmus')
-rw-r--r--    litmus/Kconfig          13
-rw-r--r--    litmus/Makefile          1
-rw-r--r--    litmus/sched_pfair.c  1230
3 files changed, 1244 insertions, 0 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 5916db3973ca..9fc402e79cfd 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,19 @@ config PLUGIN_CEDF
 	  On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
 	  makes little sense since there aren't any shared caches.
 
+config PLUGIN_PFAIR
+	bool "PFAIR"
+	default y
+	help
+	  Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
+	  The PFAIR plugin requires high resolution timers (for staggered
+	  quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
+	  even if a processor is idle, as quanta could be missed otherwise).
+	  Further, the PFAIR plugin uses the system tick and thus requires
+	  HZ=1000 to achieve reasonable granularity.
+
+	  If unsure, say Yes.
+
 config RELEASE_MASTER
 	bool "Release-master Support"
 	depends on SMP
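For reference, the requirements spelled out in the help text above translate into a kernel configuration roughly like the following fragment (a sketch only: CONFIG_PLUGIN_PFAIR is introduced by this patch, while the timer and HZ symbols are standard kernel options whose exact selection may differ between trees):

    CONFIG_PLUGIN_PFAIR=y
    CONFIG_HIGH_RES_TIMERS=y
    CONFIG_HZ_PERIODIC=y
    CONFIG_HZ_1000=y
    CONFIG_HZ=1000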
diff --git a/litmus/Makefile b/litmus/Makefile
index 86b865d26563..c969ce59db67 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -24,6 +24,7 @@ obj-y = sched_plugin.o litmus.o \
 	   sched_pfp.o
 
 obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
+obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
 
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 000000000000..f75cc8c7e441
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1230 @@
1/*
2 * kernel/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24#include <litmus/trace.h>
25
26#include <litmus/bheap.h>
27
28/* to configure the cluster size */
29#include <litmus/litmus_proc.h>
30
31#include <litmus/clustered.h>
32
33static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
34
35struct subtask {
36 /* measured in quanta relative to job release */
37 quanta_t release;
38 quanta_t deadline;
39 quanta_t overlap; /* called "b bit" by PD^2 */
40 quanta_t group_deadline;
41};
42
43struct pfair_param {
44 quanta_t quanta; /* number of subtasks */
45 quanta_t cur; /* index of current subtask */
46
47 quanta_t release; /* in quanta */
48 quanta_t period; /* in quanta */
49
50 quanta_t last_quantum; /* when scheduled last */
51 int last_cpu; /* where scheduled last */
52
53 unsigned int needs_requeue:1;
54
55 struct pfair_cluster* cluster; /* where this task is scheduled */
56
57 struct subtask subtasks[0]; /* allocate together with pfair_param */
58};
59
60#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
61
62struct pfair_state {
63 struct cluster_cpu topology;
64
65 struct hrtimer quantum_timer;
66
67 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
68 * the time */
69 volatile quanta_t local_tick; /* What tick is the local CPU currently
70 * executing? Updated only by the local
71 * CPU. In QEMU, this may lag behind the
72 * current tick. In a real system, with
73 * proper timers and aligned quanta,
74 * that should only be the case for a
75 * very short time after the time
76 * advanced. With staggered quanta, it
77 * will lag for the duration of the
78 * offset.
79 */
80
81 struct task_struct* linked; /* the task that should be executing */
82 struct task_struct* local; /* the local copy of linked */
83 struct task_struct* scheduled; /* what is actually scheduled */
84
85 struct list_head out_of_budget; /* list of tasks that exhausted their allocation */
86
87 lt_t offset; /* stagger offset */
88 unsigned int missed_updates;
89 unsigned int missed_quanta;
90};
91
92struct pfair_cluster {
93 struct scheduling_cluster topology;
94
95 /* The "global" time in this cluster. */
96 quanta_t pfair_time; /* the "official" PFAIR clock */
97
98 /* The ready queue for this cluster. */
99 rt_domain_t pfair;
100
101 /* The set of jobs that should have their release enacted at the next
102 * quantum boundary.
103 */
104 struct bheap release_queue;
105 raw_spinlock_t release_lock;
106};
107
108static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
109{
110 return container_of(state->topology.cluster, struct pfair_cluster, topology);
111}
112
113static inline int cpu_id(struct pfair_state* state)
114{
115 return state->topology.id;
116}
117
118static inline struct pfair_state* from_cluster_list(struct list_head* pos)
119{
120 return list_entry(pos, struct pfair_state, topology.cluster_list);
121}
122
123static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
124{
125 return container_of(rt, struct pfair_cluster, pfair);
126}
127
128static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
129{
130 /* The ready_lock is used to serialize all scheduling events. */
131 return &cluster->pfair.ready_lock;
132}
133
134static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
135{
136 return cluster_lock(cpu_cluster(state));
137}
138
139DEFINE_PER_CPU(struct pfair_state, pfair_state);
141struct pfair_state **pstate; /* shortcut */
141
142static struct pfair_cluster* pfair_clusters;
143static int num_pfair_clusters;
144
145/* Enable for lots of trace info.
146 * #define PFAIR_DEBUG
147 */
148
149#ifdef PFAIR_DEBUG
150#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
151#define PTRACE(f, args...) TRACE(f, ## args)
152#else
153#define PTRACE_TASK(t, f, args...)
154#define PTRACE(f, args...)
155#endif
156
157/* gcc will inline all of these accessor functions... */
158static struct subtask* cur_subtask(struct task_struct* t)
159{
160 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
161}
162
163static quanta_t cur_deadline(struct task_struct* t)
164{
165 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
166}
167
168static quanta_t cur_release(struct task_struct* t)
169{
170 /* This is early releasing: only the release of the first subtask
171 * counts. */
172 return tsk_pfair(t)->release;
173}
174
175static quanta_t cur_overlap(struct task_struct* t)
176{
177 return cur_subtask(t)->overlap;
178}
179
180static quanta_t cur_group_deadline(struct task_struct* t)
181{
182 quanta_t gdl = cur_subtask(t)->group_deadline;
183 if (gdl)
184 return gdl + tsk_pfair(t)->release;
185 else
186 return gdl;
187}
188
189
190static int pfair_higher_prio(struct task_struct* first,
191 struct task_struct* second)
192{
193 return /* first task must exist */
194 first && (
195 /* Does the second task exist and is it a real-time task? If
196 * not, the first task (which is a RT task) has higher
197 * priority.
198 */
199 !second || !is_realtime(second) ||
200
201 /* Is the (subtask) deadline of the first task earlier?
202 * Then it has higher priority.
203 */
204 time_before(cur_deadline(first), cur_deadline(second)) ||
205
206 /* Do we have a deadline tie?
207 * Then break by B-bit.
208 */
209 (cur_deadline(first) == cur_deadline(second) &&
210 (cur_overlap(first) > cur_overlap(second) ||
211
212 /* Do we have a B-bit tie?
213 * Then break by group deadline.
214 */
215 (cur_overlap(first) == cur_overlap(second) &&
216 (time_after(cur_group_deadline(first),
217 cur_group_deadline(second)) ||
218
219 /* Do we have a group deadline tie?
220 * Then break by PID, which are unique.
221 */
222 (cur_group_deadline(first) ==
223 cur_group_deadline(second) &&
224 first->pid < second->pid))))));
225}
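
/* Worked illustration of the tie-breaking chain above (illustrative
 * example): suppose two subtasks share a deadline at quantum 10. The one
 * with b-bit 1 beats the one with b-bit 0, because its window overlaps the
 * successor window and postponing it would squeeze that successor. If both
 * b-bits are 1, the subtask with the *later* group deadline wins (note the
 * use of time_after() above), since it heads the longer cascade of
 * overlapping windows. Only when deadline, b-bit, and group deadline all
 * tie does the PID comparison decide, which keeps the priority order total.
 */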
226
227int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
228{
229 return pfair_higher_prio(bheap2task(a), bheap2task(b));
230}
231
232static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
233{
234 struct pfair_cluster* cluster = from_domain(rt);
235 unsigned long flags;
236
237 raw_spin_lock_irqsave(&cluster->release_lock, flags);
238
239 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
240
241 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
242}
243
244static void prepare_release(struct task_struct* t, quanta_t at)
245{
246 tsk_pfair(t)->release = at;
247 tsk_pfair(t)->cur = 0;
248}
249
250/* pull released tasks from the release queue */
251static void poll_releases(struct pfair_cluster* cluster)
252{
253 raw_spin_lock(&cluster->release_lock);
254 __merge_ready(&cluster->pfair, &cluster->release_queue);
255 raw_spin_unlock(&cluster->release_lock);
256}
257
258static void check_preempt(struct task_struct* t)
259{
260 int cpu = NO_CPU;
261 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
262 is_present(t)) {
263 /* the task can be scheduled and
264 * is not scheduled where it ought to be scheduled
265 */
266 cpu = tsk_rt(t)->linked_on != NO_CPU ?
267 tsk_rt(t)->linked_on :
268 tsk_rt(t)->scheduled_on;
269 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
270 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
271 /* preempt */
272 litmus_reschedule(cpu);
273 }
274}
275
276/* caller must hold pfair.ready_lock */
277static void drop_all_references(struct task_struct *t)
278{
279 int cpu;
280 struct pfair_state* s;
281 struct pfair_cluster* cluster;
282 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
283		/* It must be in the ready queue; drop_all_references() isn't
284		 * called when the job is in a release queue. */
285 cluster = tsk_pfair(t)->cluster;
286 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
287 tsk_rt(t)->heap_node);
288 }
289 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
290 s = &per_cpu(pfair_state, cpu);
291 if (s->linked == t)
292 s->linked = NULL;
293 if (s->local == t)
294 s->local = NULL;
295 if (s->scheduled == t)
296 s->scheduled = NULL;
297 }
298 /* make sure we don't have a stale linked_on field */
299 tsk_rt(t)->linked_on = NO_CPU;
300
301 /* make sure we're not queued for re-releasing */
302 if (in_list(&tsk_rt(t)->list))
303 {
304 TRACE_TASK(t, "removing from out_of_budget queue\n");
305 list_del(&tsk_rt(t)->list);
306 }
307}
308
309static void pfair_prepare_next_period(struct task_struct* t)
310{
311 struct pfair_param* p = tsk_pfair(t);
312
313 prepare_for_next_period(t);
314 tsk_rt(t)->completed = 0;
315 p->release = time2quanta(get_release(t), CEIL);
316}
317
318/* returns 1 if the task needs to go to the release queue */
319static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
320{
321 struct pfair_param* p = tsk_pfair(t);
322 int to_relq;
323 p->cur = (p->cur + 1) % p->quanta;
324 if (!p->cur) {
325 if (is_present(t)) {
326 /* The job overran; we start a new budget allocation. */
327 TRACE_TASK(t, "overran budget, preparing next period\n");
328 sched_trace_task_completion(t, 1);
329 pfair_prepare_next_period(t);
330 } else {
331 /* remove task from system until it wakes */
332 drop_all_references(t);
333 p->needs_requeue = 1;
334 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
335 cpu, p->cur);
336 return 0;
337 }
338 }
339 to_relq = time_after(cur_release(t), time);
340 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d "
341 "(cur_release:%lu time:%lu present:%d on_cpu=%d)\n",
342 cpu, p->cur, to_relq, cur_release(t), time,
343 tsk_rt(t)->present, tsk_rt(t)->scheduled_on);
344 return to_relq;
345}
346
347static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
348{
349 struct task_struct* l;
350 struct pfair_param* p;
351 struct list_head* pos;
352 struct pfair_state* cpu;
353
354 list_for_each(pos, &cluster->topology.cpus) {
355 cpu = from_cluster_list(pos);
356 l = cpu->linked;
357 cpu->missed_updates += cpu->linked != cpu->local;
358 if (l) {
359 p = tsk_pfair(l);
360 p->last_quantum = time;
361 p->last_cpu = cpu_id(cpu);
362 if (advance_subtask(time, l, cpu_id(cpu))) {
363 cpu->linked = NULL;
364 tsk_rt(l)->linked_on = NO_CPU;
365 PTRACE_TASK(l, "should go to release queue. "
366 "scheduled_on=%d present=%d\n",
367 tsk_rt(l)->scheduled_on,
368 tsk_rt(l)->present);
369 list_add(&tsk_rt(l)->list, &cpu->out_of_budget);
370 }
371 }
372 }
373}
374
375static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
376{
377 int cpu;
378 if (tsk_rt(t)->scheduled_on != NO_CPU) {
379 /* always observe scheduled_on linkage */
380 default_cpu = tsk_rt(t)->scheduled_on;
381 } else if (tsk_pfair(t)->last_quantum == time - 1) {
382 /* back2back quanta */
383 /* Only observe last_quantum if no scheduled_on is in the way.
384 * This should only kick in if a CPU missed quanta, and that
385 * *should* only happen in QEMU.
386 */
387 cpu = tsk_pfair(t)->last_cpu;
388 if (!pstate[cpu]->linked ||
389 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
390 default_cpu = cpu;
391 }
392 }
393 return default_cpu;
394}
395
396/* returns one if linking was redirected */
397static int pfair_link(quanta_t time, int cpu,
398 struct task_struct* t)
399{
400 int target = target_cpu(time, t, cpu);
401 struct task_struct* prev = pstate[cpu]->linked;
402 struct task_struct* other;
403 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
404
405 if (target != cpu) {
406 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
407 other = pstate[target]->linked;
408 pstate[target]->linked = t;
409 tsk_rt(t)->linked_on = target;
410 if (!other)
411 /* linked ok, but reschedule this CPU */
412 return 1;
413 if (target < cpu) {
414 /* link other to cpu instead */
415 tsk_rt(other)->linked_on = cpu;
416 pstate[cpu]->linked = other;
417 if (prev) {
418 /* prev got pushed back into the ready queue */
419 tsk_rt(prev)->linked_on = NO_CPU;
420 __add_ready(&cluster->pfair, prev);
421 }
422 /* we are done with this cpu */
423 return 0;
424 } else {
425			/* re-add other, its original CPU has not been considered yet */
426 tsk_rt(other)->linked_on = NO_CPU;
427 __add_ready(&cluster->pfair, other);
428 /* reschedule this CPU */
429 return 1;
430 }
431 } else {
432 pstate[cpu]->linked = t;
433 tsk_rt(t)->linked_on = cpu;
434 if (prev) {
435 /* prev got pushed back into the ready queue */
436 tsk_rt(prev)->linked_on = NO_CPU;
437 __add_ready(&cluster->pfair, prev);
438 }
439 /* we are done with this CPU */
440 return 0;
441 }
442}
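
/* Illustrative walk-through of the redirection above: suppose
 * schedule_subtasks() tries to link task T to CPU 2, but T is still
 * scheduled on CPU 0, so target_cpu() returns 0. T is then linked to
 * CPU 0; if CPU 0 already had a task U linked and 0 < 2, U is re-linked
 * to CPU 2 (displacing whatever CPU 2 had linked back into the ready
 * queue) and the function returns 0, because CPU 2 now has a task and
 * needs no further linking attempt. Had the target index been greater
 * than 2 instead, U would be pushed back into the ready queue to be
 * reconsidered when its own CPU is visited, and the function would
 * return 1 so that CPU 2 retries with the next ready task.
 */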
443
444static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
445{
446 int retry;
447 struct list_head *pos;
448 struct pfair_state *cpu_state;
449
450 list_for_each(pos, &cluster->topology.cpus) {
451 cpu_state = from_cluster_list(pos);
452 retry = 1;
453#ifdef CONFIG_RELEASE_MASTER
454 /* skip release master */
455 if (cluster->pfair.release_master == cpu_id(cpu_state))
456 continue;
457#endif
458 while (retry) {
459 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
460 cpu_state->linked))
461 retry = pfair_link(time, cpu_id(cpu_state),
462 __take_ready(&cluster->pfair));
463 else
464 retry = 0;
465 }
466 }
467}
468
469static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
470{
471 struct pfair_state *cpu;
472 struct list_head* pos;
473
474 /* called with interrupts disabled */
475 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
476 time, litmus_clock());
477 raw_spin_lock(cluster_lock(cluster));
478 PTRACE("<<< Q %lu at %llu\n",
479 time, litmus_clock());
480
481 sched_trace_quantum_boundary();
482
483 advance_subtasks(cluster, time);
484 poll_releases(cluster);
485 schedule_subtasks(cluster, time);
486
487 list_for_each(pos, &cluster->topology.cpus) {
488 cpu = from_cluster_list(pos);
489 if (cpu->linked)
490 PTRACE_TASK(cpu->linked,
491 " linked on %d.\n", cpu_id(cpu));
492 else
493 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
494 }
495 /* We are done. Advance time. */
496 mb();
497 list_for_each(pos, &cluster->topology.cpus) {
498 cpu = from_cluster_list(pos);
499 if (cpu->local_tick != cpu->cur_tick) {
500 TRACE("BAD Quantum not acked on %d "
501 "(l:%lu c:%lu p:%lu)\n",
502 cpu_id(cpu),
503 cpu->local_tick,
504 cpu->cur_tick,
505 cluster->pfair_time);
506 cpu->missed_quanta++;
507 }
508 cpu->cur_tick = time;
509 }
510 PTRACE(">>> Q %lu at %llu\n",
511 time, litmus_clock());
512 raw_spin_unlock(cluster_lock(cluster));
513}
514
515static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
516{
517 quanta_t loc;
518
519 goto first; /* skip mb() on first iteration */
520 do {
521 cpu_relax();
522 mb();
523 first: loc = state->cur_tick;
524 /* FIXME: what if loc > cur? */
525 } while (time_before(loc, q));
526 PTRACE("observed cur_tick:%lu >= q:%lu\n",
527 loc, q);
528}
529
530static quanta_t current_quantum(struct pfair_state* state)
531{
532 lt_t t = litmus_clock() - state->offset;
533 return time2quanta(t, FLOOR);
534}
535
536static void catchup_quanta(quanta_t from, quanta_t target,
537 struct pfair_state* state)
538{
539 quanta_t cur = from, time;
540 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
541 from, target);
542 while (time_before(cur, target)) {
543 wait_for_quantum(cur, state);
544 cur++;
545 time = cmpxchg(&cpu_cluster(state)->pfair_time,
546 cur - 1, /* expected */
547 cur /* next */
548 );
549 if (time == cur - 1)
550 schedule_next_quantum(cpu_cluster(state), cur);
551 }
552 TRACE("+++> catching up done\n");
553}
554
555/* pfair_tick - this function is called for every local timer
556 * interrupt.
557 */
558static void pfair_tick(struct task_struct* t)
559{
560 struct pfair_state* state = this_cpu_ptr(&pfair_state);
561 quanta_t time, cur;
562 int retry = 10;
563
564 do {
565 cur = current_quantum(state);
566 PTRACE("q %lu at %llu\n", cur, litmus_clock());
567
568 /* Attempt to advance time. First CPU to get here
569 * will prepare the next quantum.
570 */
571 time = cpu_cluster(state)->pfair_time;
572 if (time == cur - 1)
573 {
574 /* looks good, see if we can advance the time */
575 time = cmpxchg(&cpu_cluster(state)->pfair_time,
576 cur - 1, /* expected */
577 cur /* next */
578 );
579 }
580
581 if (time == cur - 1) {
582 /* exchange succeeded */
583 wait_for_quantum(cur - 1, state);
584 schedule_next_quantum(cpu_cluster(state), cur);
585 retry = 0;
586 } else if (time_before(time, cur - 1)) {
587 /* the whole system missed a tick !? */
588 catchup_quanta(time, cur, state);
589 retry--;
590 } else if (time_after(time, cur)) {
591			/* our timer is lagging behind!? */
592 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
593 retry--;
594 } else {
595 /* Some other CPU already started scheduling
596 * this quantum. Let it do its job and then update.
597 */
598 retry = 0;
599 }
600 } while (retry);
601
602 /* Spin locally until time advances. */
603 wait_for_quantum(cur, state);
604
605 /* copy assignment */
606 /* FIXME: what if we race with a future update? Corrupted state? */
607 state->local = state->linked;
608 /* signal that we are done */
609 mb();
610 state->local_tick = state->cur_tick;
611
612 if (state->local != current
613 && (is_realtime(current) || is_present(state->local)))
614 litmus_reschedule_local();
615}
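
/* Illustrative example of the advancement protocol above: if two CPUs in
 * the same cluster take the tick for quantum Q concurrently, only the CPU
 * whose cmpxchg() observes pfair_time == Q - 1 prepares quantum Q via
 * schedule_next_quantum(). The other CPU, whether it loses the cmpxchg()
 * race or simply reads the already-advanced value, ends up in the final
 * branch and merely spins in wait_for_quantum() until cur_tick reaches Q
 * before copying its ->linked assignment into ->local.
 */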
616
617static void process_out_of_budget_tasks(
618 struct pfair_state* state,
619 struct task_struct* prev,
620 unsigned int blocks)
621{
622 struct task_struct *t;
623
624 while (!list_empty(&state->out_of_budget))
625 {
626
627 t = list_first_entry(&state->out_of_budget,
628 struct task_struct, rt_param.list);
629 TRACE_TASK(t, "found on out_of_budget queue is_prev=%d\n", t == prev);
630 list_del(&tsk_rt(t)->list);
631 if (t != prev || !blocks)
632 {
633 if (time_after(cur_release(t), state->local_tick)) {
634 TRACE_TASK(t, "adding to release queue (budget exhausted)\n");
635 add_release(&cpu_cluster(state)->pfair, t);
636 } else {
637 TRACE_TASK(t, "adding to ready queue (budget exhausted)\n");
638 sched_trace_task_release(t);
639 __add_ready(&cpu_cluster(state)->pfair, t);
640 }
641 } else {
642 TRACE_TASK(t, "not added to release queue (blocks=%d)\n", blocks);
643 tsk_pfair(t)->needs_requeue = 1;
644 }
645 if (unlikely(state->local == t)) {
646 TRACE_TASK(t, "still linked as ->local, cleaning up\n");
647 state->local = NULL;
648 }
649 }
650}
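
/* Note on the release-vs-ready decision above: as described in the commit
 * message, a timer-based release via add_release() is only used when the
 * task's next release still lies in the future (cur_release(t) is after
 * state->local_tick). A tardy task in an overloaded system has already
 * passed its release point, so it is put straight back into the ready
 * queue with __add_ready() instead of arming a timer for a time that has
 * already elapsed.
 */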
651
652/* Custom scheduling tick: called on each quantum boundary. */
653static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
654{
655 TS_QUANTUM_BOUNDARY_START;
656
657 pfair_tick(current);
658 hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);
659
660 TS_QUANTUM_BOUNDARY_END;
661 return HRTIMER_RESTART;
662}
663
664static int safe_to_schedule(struct task_struct* t, int cpu)
665{
666 int where = tsk_rt(t)->scheduled_on;
667 if (where != NO_CPU && where != cpu) {
668 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
669 "scheduled already on %d.\n", cpu, where);
670 return 0;
671 } else
672 return is_present(t) && !is_completed(t);
673}
674
675static struct task_struct* pfair_schedule(struct task_struct * prev)
676{
677 struct pfair_state* state = this_cpu_ptr(&pfair_state);
678 struct pfair_cluster* cluster = cpu_cluster(state);
679 int blocks, completion, out_of_time;
680 struct task_struct* next = NULL;
681
682#ifdef CONFIG_RELEASE_MASTER
683 /* Bail out early if we are the release master.
684 * The release master never schedules any real-time tasks.
685 */
686 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
687 goto out;
688 }
689#endif
690
691 raw_spin_lock(cpu_lock(state));
692
693 blocks = is_realtime(prev) && !is_current_running();
694 completion = is_realtime(prev) && is_completed(prev);
695 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
696 state->local_tick);
697
698 if (is_realtime(prev))
699 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
700 blocks, completion, out_of_time);
701
702 if (completion && !out_of_time) {
703 sched_trace_task_completion(prev, 0);
704 pfair_prepare_next_period(prev);
705 prepare_release(prev, cur_release(prev));
706 drop_all_references(prev);
707 list_add(&tsk_rt(prev)->list, &state->out_of_budget);
708 }
709
710 process_out_of_budget_tasks(state, prev, blocks);
711
712 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
713 next = state->local;
714
715 if (prev != next) {
716 tsk_rt(prev)->scheduled_on = NO_CPU;
717 if (next)
718 tsk_rt(next)->scheduled_on = cpu_id(state);
719 }
720 sched_state_task_picked();
721 raw_spin_unlock(cpu_lock(state));
722
723 if (next)
724 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
725 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
726 else if (is_realtime(prev))
727 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
728
729#ifdef CONFIG_RELEASE_MASTER
730out:
731#endif
732
733 if (unlikely(!hrtimer_active(&state->quantum_timer))) {
734 TRACE("activating quantum timer start=%llu\n",
735 hrtimer_get_expires(&state->quantum_timer));
736 hrtimer_start(&state->quantum_timer,
737 hrtimer_get_expires(&state->quantum_timer),
738 HRTIMER_MODE_ABS);
739 }
740
741 return next;
742}
743
744static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
745{
746 unsigned long flags;
747 struct pfair_cluster* cluster;
748
749 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
750
751 cluster = tsk_pfair(t)->cluster;
752
753 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
754
755 prepare_release(t, cluster->pfair_time + 1);
756 release_at(t, quanta2time(cur_release(t)));
757
758 t->rt_param.scheduled_on = NO_CPU;
759 t->rt_param.linked_on = NO_CPU;
760
761 if (is_scheduled) {
762#ifdef CONFIG_RELEASE_MASTER
763 if (task_cpu(t) != cluster->pfair.release_master)
764#endif
765 t->rt_param.scheduled_on = task_cpu(t);
766 }
767
768 if (on_rq || is_scheduled) {
769 tsk_rt(t)->present = 1;
770 __add_ready(&cluster->pfair, t);
771 } else {
772 tsk_rt(t)->present = 0;
773 tsk_pfair(t)->needs_requeue = 1;
774 }
775
776 check_preempt(t);
777
778 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
779}
780
781static void pfair_task_wake_up(struct task_struct *t)
782{
783 unsigned long flags;
784 lt_t now;
785 struct pfair_cluster* cluster;
786 struct pfair_state* state;
787 int sporadic_release = 0;
788
789 cluster = tsk_pfair(t)->cluster;
790
791 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
792 litmus_clock(), cur_release(t), cluster->pfair_time);
793
794 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
795
796 state = this_cpu_ptr(&pfair_state);
797
798 /* If a task blocks and wakes before its next job release,
799 * then it may resume if it is currently linked somewhere
800 * (as if it never blocked at all). Otherwise, we have a
801 * new sporadic job release.
802 */
803 now = litmus_clock();
804 if (is_tardy(t, now)) {
805 TRACE_TASK(t, "sporadic release!\n");
806 sporadic_release = 1;
807 inferred_sporadic_job_release_at(t, now);
808 prepare_release(t, time2quanta(now, CEIL));
809 }
810
811 /* only add to ready queue if the task isn't still linked somewhere */
812 if (tsk_pfair(t)->needs_requeue) {
813 tsk_pfair(t)->needs_requeue = 0;
814 TRACE_TASK(t, "requeueing required (released:%d)\n",
815 !time_after(cur_release(t), state->local_tick));
816 tsk_rt(t)->completed = 0;
817 if (time_after(cur_release(t), state->local_tick)
818 && !sporadic_release)
819 add_release(&cluster->pfair, t);
820 else
821 __add_ready(&cluster->pfair, t);
822 }
823
824 check_preempt(t);
825
826 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
827 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
828}
829
830static void pfair_task_block(struct task_struct *t)
831{
832 BUG_ON(!is_realtime(t));
833 TRACE_TASK(t, "blocks at %llu, state:%d\n",
834 litmus_clock(), t->state);
835}
836
837static void pfair_task_exit(struct task_struct * t)
838{
839 unsigned long flags;
840 struct pfair_cluster *cluster;
841
842 BUG_ON(!is_realtime(t));
843
844 cluster = tsk_pfair(t)->cluster;
845
846	/* Remove the task from the release or ready queue, and ensure
847	 * that it is not the scheduled task for ANY CPU. We
848	 * do this blanket check because occasionally when
849 * tasks exit while blocked, the task_cpu of the task
850 * might not be the same as the CPU that the PFAIR scheduler
851 * has chosen for it.
852 */
853 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
854
855 TRACE_TASK(t, "RIP, state:%d\n", t->state);
856 drop_all_references(t);
857
858 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
859
860 kfree(t->rt_param.pfair);
861 t->rt_param.pfair = NULL;
862}
863
864static void init_subtask(struct subtask* sub, unsigned long i,
865 lt_t quanta, lt_t period)
866{
867 /* since i is zero-based, the formulas are shifted by one */
868 lt_t tmp;
869
870 /* release */
871 tmp = period * i;
872 do_div(tmp, quanta); /* floor */
873 sub->release = (quanta_t) tmp;
874
875 /* deadline */
876 tmp = period * (i + 1);
877 if (do_div(tmp, quanta)) /* ceil */
878 tmp++;
879 sub->deadline = (quanta_t) tmp;
880
881 /* next release */
882 tmp = period * (i + 1);
883 do_div(tmp, quanta); /* floor */
884 sub->overlap = sub->deadline - (quanta_t) tmp;
885
886 /* Group deadline.
887 * Based on the formula given in Uma's thesis.
888 */
889 if (2 * quanta >= period) {
890 /* heavy */
891 tmp = (sub->deadline - (i + 1)) * period;
892 if (period > quanta &&
893 do_div(tmp, (period - quanta))) /* ceil */
894 tmp++;
895 sub->group_deadline = (quanta_t) tmp;
896 } else
897 sub->group_deadline = 0;
898}
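
/* Worked example for the formulas above, for a task of weight 3/5
 * (quanta = 3, period = 5; i is the zero-based subtask index):
 *
 *   i | release = floor(5*i/3) | deadline = ceil(5*(i+1)/3) | b-bit | group deadline
 *   0 |           0            |             2              |   1   |       3
 *   1 |           1            |             4              |   1   |       5
 *   2 |           3            |             5              |   0   |       5
 *
 * Since 2 * quanta = 6 >= period = 5, the task is "heavy" and the group
 * deadlines are non-zero, e.g. for i = 0: ceil((2 - 1) * 5 / (5 - 3)) = 3.
 */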
899
900static void dump_subtasks(struct task_struct* t)
901{
902 unsigned long i;
903 for (i = 0; i < t->rt_param.pfair->quanta; i++)
904 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
905 i + 1,
906 t->rt_param.pfair->subtasks[i].release,
907 t->rt_param.pfair->subtasks[i].deadline,
908 t->rt_param.pfair->subtasks[i].overlap,
909 t->rt_param.pfair->subtasks[i].group_deadline);
910}
911
912static long pfair_admit_task(struct task_struct* t)
913{
914 lt_t quanta;
915 lt_t period;
916 s64 quantum_length = LITMUS_QUANTUM_LENGTH_NS;
917 struct pfair_param* param;
918 unsigned long i;
919
920 /* first check that the task is in the right cluster */
921 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
922 cpu_cluster(pstate[task_cpu(t)]))
923 return -EINVAL;
924
925 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
926 printk(KERN_INFO "%s: Admission rejected. "
927 "Only implicit deadlines are currently supported.\n",
928 litmus->plugin_name);
929 return -EINVAL;
930 }
931
932 /* Pfair is a tick-based scheduler, so the unit of time
933 * is one quantum. Calculate quantum-based parameters for everything.
934 * (Ceiling of exec cost, floor of period.)
935 */
936
937 quanta = get_exec_cost(t);
938 period = get_rt_period(t);
939
940 quanta = time2quanta(get_exec_cost(t), CEIL);
941
942 if (do_div(period, quantum_length))
943 printk(KERN_WARNING
944 "The period of %s/%d is not a multiple of %llu.\n",
945 t->comm, t->pid, (unsigned long long) quantum_length);
946
947 if (quanta == period) {
948 PTRACE_TASK(t, "Admitting weight 1.0 task. (%llu, %llu).\n", quanta, period);
949 }
950
951 param = kzalloc(sizeof(*param) +
952 quanta * sizeof(struct subtask), GFP_ATOMIC);
953
954 if (!param)
955 return -ENOMEM;
956
957 param->quanta = quanta;
958 param->period = period;
959
960 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
961
962 for (i = 0; i < quanta; i++)
963 init_subtask(param->subtasks + i, i, quanta, period);
964
965 if (t->rt_param.pfair)
966 /* get rid of stale allocation */
967 kfree(t->rt_param.pfair);
968
969 t->rt_param.pfair = param;
970
971 /* spew out some debug info */
972 dump_subtasks(t);
973
974 /* Disable generic budget enforcement (if enabled).
975 * The plugin provides its own (non-optional) enforcement
976 * of allocations at quantum granularity. */
977 tsk_rt(t)->task_params.budget_policy = NO_ENFORCEMENT;
978
979 return 0;
980}
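
/* Example of the parameter conversion above, assuming a 1 ms quantum
 * (LITMUS_QUANTUM_LENGTH_NS = 1,000,000): a task with an execution cost of
 * 2.5 ms and a period of 10 ms is admitted with
 * quanta = time2quanta(2,500,000 ns, CEIL) = 3 and period = 10 quanta.
 * Had the period been 10.5 ms, do_div() would leave a non-zero remainder,
 * the "not a multiple" warning above would be printed, and the period
 * would silently be rounded down to 10 quanta.
 */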
981
982static void pfair_init_cluster(struct pfair_cluster* cluster)
983{
984 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
985 bheap_init(&cluster->release_queue);
986 raw_spin_lock_init(&cluster->release_lock);
987 INIT_LIST_HEAD(&cluster->topology.cpus);
988}
989
990static void cleanup_clusters(void)
991{
992 int i;
993
994 if (num_pfair_clusters)
995 kfree(pfair_clusters);
996 pfair_clusters = NULL;
997 num_pfair_clusters = 0;
998
999 /* avoid stale pointers */
1000 for (i = 0; i < num_online_cpus(); i++) {
1001 pstate[i]->topology.cluster = NULL;
1002 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
1003 pstate[i]->missed_updates, pstate[i]->missed_quanta);
1004 }
1005}
1006
1007static struct domain_proc_info pfair_domain_proc_info;
1008static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
1009{
1010 *ret = &pfair_domain_proc_info;
1011 return 0;
1012}
1013
1014static void pfair_setup_domain_proc(void)
1015{
1016 int i, cpu, domain;
1017#ifdef CONFIG_RELEASE_MASTER
1018 int release_master = atomic_read(&release_master_cpu);
1019 /* skip over the domain with the release master if cluster size is 1 */
1020 int cluster_size = num_online_cpus() / num_pfair_clusters;
1021 int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
1022 release_master : NO_CPU;
1023#else
1024 int release_master = NO_CPU;
1025 int skip_domain = NO_CPU;
1026#endif
1027 int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
1028 int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
1029 struct cd_mapping *map;
1030
1031 memset(&pfair_domain_proc_info, 0, sizeof(pfair_domain_proc_info));
1032 init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
1033 pfair_domain_proc_info.num_cpus = num_rt_cpus;
1034 pfair_domain_proc_info.num_domains = num_rt_domains;
1035
1036 for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
1037 if (cpu == release_master)
1038 continue;
1039 map = &pfair_domain_proc_info.cpu_to_domains[i];
1040 /* pointer math to figure out the domain index */
1041 domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
1042 map->id = cpu;
1043 cpumask_set_cpu(domain, map->mask);
1044 ++i;
1045 }
1046
1047 for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
1048 struct pfair_cluster *cluster;
1049 struct list_head *pos;
1050
1051 if (domain == skip_domain)
1052 continue;
1053
1054 cluster = &pfair_clusters[domain];
1055 map = &pfair_domain_proc_info.domain_to_cpus[i];
1056 map->id = i;
1057
1058 list_for_each(pos, &cluster->topology.cpus) {
1059 cpu = cpu_id(from_cluster_list(pos));
1060 if (cpu != release_master)
1061 cpumask_set_cpu(cpu, map->mask);
1062 }
1063 ++i;
1064 }
1065}
1066
1067static long pfair_activate_plugin(void)
1068{
1069 int err, i;
1070 struct pfair_state* state;
1071 struct pfair_cluster* cluster;
1072 quanta_t now, start;
1073 int cluster_size;
1074 struct cluster_cpu* cpus[NR_CPUS];
1075 struct scheduling_cluster* clust[NR_CPUS];
1076 lt_t quantum_timer_start;
1077
1078 cluster_size = get_cluster_size(pfair_cluster_level);
1079
1080 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
1081 return -EINVAL;
1082
1083 num_pfair_clusters = num_online_cpus() / cluster_size;
1084
1085 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
1086 if (!pfair_clusters) {
1087 num_pfair_clusters = 0;
1088 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
1089 return -ENOMEM;
1090 }
1091
1092 state = this_cpu_ptr(&pfair_state);
1093 now = current_quantum(state);
1094 start = now + 50;
1095 quantum_timer_start = quanta2time(start);
1096 TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
1097 litmus_clock(),
1098 now,
1099 quantum_timer_start,
1100 time2quanta(quantum_timer_start, CEIL));
1101
1102 for (i = 0; i < num_pfair_clusters; i++) {
1103 cluster = &pfair_clusters[i];
1104 pfair_init_cluster(cluster);
1105 cluster->pfair_time = start;
1106 clust[i] = &cluster->topology;
1107#ifdef CONFIG_RELEASE_MASTER
1108 cluster->pfair.release_master = atomic_read(&release_master_cpu);
1109#endif
1110 }
1111
1112 for_each_online_cpu(i) {
1113 state = &per_cpu(pfair_state, i);
1114 state->cur_tick = start;
1115 state->local_tick = start;
1116 state->missed_quanta = 0;
1117 state->missed_updates = 0;
1118 state->offset = cpu_stagger_offset(i);
1119 hrtimer_set_expires(&state->quantum_timer,
1120 ns_to_ktime(quantum_timer_start + state->offset));
1121 cpus[i] = &state->topology;
1122 TRACE("cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
1123 INIT_LIST_HEAD(&state->out_of_budget);
1124 /* force rescheduling to start quantum timer */
1125 litmus_reschedule(i);
1126
1127 WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
1128 KERN_ERR "WARNING: no high resolution timers available!?\n");
1129 }
1130
1131 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
1132 cpus, num_online_cpus());
1133
1134 if (err < 0)
1135 cleanup_clusters();
1136 else
1137 pfair_setup_domain_proc();
1138
1139 return err;
1140}
1141
1142static long pfair_deactivate_plugin(void)
1143{
1144 int cpu;
1145 struct pfair_state* state;
1146
1147 for_each_online_cpu(cpu) {
1148 state = &per_cpu(pfair_state, cpu);
1149 TRACE("stopping quantum timer on CPU%d\n", cpu);
1150 hrtimer_cancel(&state->quantum_timer);
1151 }
1152 cleanup_clusters();
1153 destroy_domain_proc_info(&pfair_domain_proc_info);
1154 return 0;
1155}
1156
1157/* Plugin object */
1158static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1159 .plugin_name = "PFAIR",
1160 .task_new = pfair_task_new,
1161 .task_exit = pfair_task_exit,
1162 .schedule = pfair_schedule,
1163 .task_wake_up = pfair_task_wake_up,
1164 .task_block = pfair_task_block,
1165 .admit_task = pfair_admit_task,
1166 .complete_job = complete_job,
1167 .activate_plugin = pfair_activate_plugin,
1168 .deactivate_plugin = pfair_deactivate_plugin,
1169 .get_domain_proc_info = pfair_get_domain_proc_info,
1170};
1171
1172
1173static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1174
1175static int __init init_pfair(void)
1176{
1177 int cpu, err, fs;
1178 struct pfair_state *state;
1179
1180 /*
1181	 * Initialize the pstate shortcut array for per-CPU pfair state.
1182	 * There could be a problem here if a CPU is removed while we are
1183	 * doing this initialization, or if CPUs are added or removed
1184	 * later, but we do not support CPU hotplug at the moment anyway.
1185 */
1186 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1187
1188 /* initialize CPU state */
1189 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1190 state = &per_cpu(pfair_state, cpu);
1191 hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1192 state->quantum_timer.function = on_quantum_boundary;
1193 state->topology.id = cpu;
1194 state->cur_tick = 0;
1195 state->local_tick = 0;
1196 state->linked = NULL;
1197 state->local = NULL;
1198 state->scheduled = NULL;
1199 state->missed_quanta = 0;
1200 state->offset = cpu_stagger_offset(cpu);
1201 pstate[cpu] = state;
1202 }
1203
1204 pfair_clusters = NULL;
1205 num_pfair_clusters = 0;
1206
1207 err = register_sched_plugin(&pfair_plugin);
1208 if (!err) {
1209 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1210 if (!fs)
1211 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1212 else
1213 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1214 }
1215
1216 return err;
1217}
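
/* Usage note (assuming the standard LITMUS^RT proc interface and
 * liblitmus tooling, which are not part of this file): once the module is
 * initialized, the plugin is typically activated while no real-time tasks
 * are present, e.g. via
 *
 *     echo PFAIR > /proc/litmus/active_plugin
 *
 * or the liblitmus `setsched PFAIR` helper, which in turn triggers
 * pfair_activate_plugin() above.
 */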
1218
1219static void __exit clean_pfair(void)
1220{
1221 kfree(pstate);
1222
1223 if (cluster_file)
1224 remove_proc_entry("cluster", pfair_dir);
1225 if (pfair_dir)
1226 remove_plugin_proc_dir(&pfair_plugin);
1227}
1228
1229module_init(init_pfair);
1230module_exit(clean_pfair);