author    Zelin Tong <ztong@ludwig.cs.unc.edu>    2020-08-24 18:52:28 -0400
committer Zelin Tong <ztong@ludwig.cs.unc.edu>    2020-08-24 18:52:28 -0400
commit    ddad9de416639a19016f38cd3161d4840315a7a7 (patch)
tree      37429d746725b81ac565140e18a6cc18d979361a /litmus
parent    c8feef5aa054eb6d2fac2d5f65fc6118874ddb2c (diff)
Extended Reservation Initial Commit
Diffstat (limited to 'litmus')
-rw-r--r--   litmus/sched_ext_res.c   612
1 file changed, 612 insertions, 0 deletions
diff --git a/litmus/sched_ext_res.c b/litmus/sched_ext_res.c
new file mode 100644
index 000000000000..0a3270346656
--- /dev/null
+++ b/litmus/sched_ext_res.c
@@ -0,0 +1,612 @@
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <asm/uaccess.h>

#include <litmus/sched_plugin.h>
#include <litmus/preempt.h>
#include <litmus/debug_trace.h>

#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/budget.h>
#include <litmus/litmus_proc.h>
#include <litmus/sched_trace.h>

#include <litmus/reservations/reservation.h>
#include <litmus/reservations/alloc.h>

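/* Per-task bookkeeping: the reservation client through which the task is
 * enqueued, the CPU whose reservation environment hosts it, and the
 * task_client instance that embeds that client. */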
struct pres_task_state {
        struct reservation_client *client;
        int cpu;
        struct task_client res_info;
};

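/* Per-CPU scheduling state: a uniprocessor ("sup") reservation environment,
 * the hrtimer that triggers scheduler updates, and the task currently
 * dispatched on this CPU. All fields are protected by the lock. */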
struct pres_cpu_state {
        raw_spinlock_t lock;

        struct sup_reservation_environment sup_env;
        struct hrtimer timer;

        int cpu;
        struct task_struct* scheduled;
};

static DEFINE_PER_CPU(struct pres_cpu_state, pres_cpu_state);

#define cpu_state_for(cpu_id) (&per_cpu(pres_cpu_state, cpu_id))
#define local_cpu_state() (this_cpu_ptr(&pres_cpu_state))

static struct pres_task_state* get_pres_state(struct task_struct *tsk)
{
        return (struct pres_task_state*) tsk_rt(tsk)->plugin_state;
}

static void task_departs(struct task_struct *tsk, int job_complete)
{
        struct pres_task_state* state = get_pres_state(tsk);
        struct reservation* res;
        struct reservation_client *client;

        client = state->client;
        res = client->reservation;

        res->ops->client_departs(res, client, job_complete);
        TRACE_TASK(tsk, "client_departs: removed from reservation R%d\n", res->id);
}

static void task_arrives(struct task_struct *tsk)
{
        struct pres_task_state* state = get_pres_state(tsk);
        struct reservation* res;
        struct reservation_client *client;

        client = state->client;
        res = client->reservation;

        res->ops->client_arrives(res, client);
        TRACE_TASK(tsk, "client_arrives: added to reservation R%d\n", res->id);
}

/* NOTE: drops state->lock */
static void pres_update_timer_and_unlock(struct pres_cpu_state *state)
{
        int local;
        lt_t update, now;

        update = state->sup_env.next_scheduler_update;
        now = state->sup_env.env.current_time;

        /* Be sure we're actually running on the right core,
         * as pres_update_timer() is also called from pres_task_resume(),
         * which might be called on any CPU when a thread resumes.
         */
        local = local_cpu_state() == state;

        /* Must drop state lock before calling into hrtimer_start(), which
         * may raise a softirq, which in turn may wake ksoftirqd. */
        raw_spin_unlock(&state->lock);

        if (update <= now) {
                litmus_reschedule(state->cpu);
        } else if (likely(local && update != SUP_NO_SCHEDULER_UPDATE)) {
                /* Reprogram only if not already set correctly. */
                if (!hrtimer_active(&state->timer) ||
                    ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
                        TRACE("canceling timer...\n");
                        hrtimer_cancel(&state->timer);
                        TRACE("setting scheduler timer for %llu\n", update);
                        hrtimer_start(&state->timer,
                                      ns_to_ktime(update),
                                      HRTIMER_MODE_ABS_PINNED);
                        if (update < litmus_clock()) {
                                /* uh oh, timer expired while trying to set it */
                                TRACE("timer expired during setting "
                                      "update:%llu now:%llu actual:%llu\n",
                                      update, now, litmus_clock());
                                /* The timer HW may not have been reprogrammed
                                 * correctly; force rescheduling now. */
                                litmus_reschedule(state->cpu);
                        }
                }
        } else if (unlikely(!local && update != SUP_NO_SCHEDULER_UPDATE)) {
                /* Poke remote core only if timer needs to be set earlier than
                 * it is currently set.
                 */
                TRACE("pres_update_timer for remote CPU %d (update=%llu, "
                      "active:%d, set:%llu)\n",
                      state->cpu,
                      update,
                      hrtimer_active(&state->timer),
                      ktime_to_ns(hrtimer_get_expires(&state->timer)));
                if (!hrtimer_active(&state->timer) ||
                    ktime_to_ns(hrtimer_get_expires(&state->timer)) > update) {
                        TRACE("poking CPU %d so that it can update its "
                              "scheduling timer (active:%d, set:%llu)\n",
                              state->cpu,
                              hrtimer_active(&state->timer),
                              ktime_to_ns(hrtimer_get_expires(&state->timer)));
                        litmus_reschedule(state->cpu);
                }
        }
}

static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
{
        unsigned long flags;
        enum hrtimer_restart restart = HRTIMER_NORESTART;
        struct pres_cpu_state *state;
        lt_t update, now;

        state = container_of(timer, struct pres_cpu_state, timer);

        /* The scheduling timer should only fire on the local CPU, because
         * otherwise deadlocks via timer_cancel() are possible.
         * Note: this does not interfere with dedicated interrupt handling, as
         * even under dedicated interrupt handling scheduling timers for
         * budget enforcement must occur locally on each CPU.
         */
        BUG_ON(state->cpu != raw_smp_processor_id());

        raw_spin_lock_irqsave(&state->lock, flags);
        sup_update_time(&state->sup_env, litmus_clock());

        update = state->sup_env.next_scheduler_update;
        now = state->sup_env.env.current_time;

        TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d)\n",
                  now, update, state->cpu);

        if (update <= now) {
                litmus_reschedule_local();
        } else if (update != SUP_NO_SCHEDULER_UPDATE) {
                hrtimer_set_expires(timer, ns_to_ktime(update));
                restart = HRTIMER_RESTART;
        }

        raw_spin_unlock_irqrestore(&state->lock, flags);

        return restart;
}

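/* Main scheduling callback: advance the reservation environment to the
 * current time, let sup_dispatch() pick the next client to run, and then
 * reprogram the per-CPU scheduler timer (which drops state->lock). */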
static struct task_struct* pres_schedule(struct task_struct * prev)
{
        /* next == NULL means "schedule background work". */
        struct pres_cpu_state *state = local_cpu_state();

        raw_spin_lock(&state->lock);

        BUG_ON(state->scheduled && state->scheduled != prev);
        BUG_ON(state->scheduled && !is_realtime(prev));

        /* update time */
        state->sup_env.will_schedule = true;
        sup_update_time(&state->sup_env, litmus_clock());

        /* figure out what to schedule next */
        state->scheduled = sup_dispatch(&state->sup_env);

        /* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
        sched_state_task_picked();

        /* program scheduler timer */
        state->sup_env.will_schedule = false;
        /* NOTE: drops state->lock */
        pres_update_timer_and_unlock(state);

        if (prev != state->scheduled && is_realtime(prev))
                TRACE_TASK(prev, "descheduled.\n");
        if (state->scheduled)
                TRACE_TASK(state->scheduled, "scheduled.\n");

        return state->scheduled;
}

static void resume_legacy_task_model_updates(struct task_struct *tsk)
{
        lt_t now;
        if (is_sporadic(tsk)) {
                /* If this sporadic task was gone for a "long" time and woke up past
                 * its deadline, then give it a new budget by triggering a job
                 * release. This is purely cosmetic and has no effect on the
                 * P-RES scheduler. */

                now = litmus_clock();
                if (is_tardy(tsk, now)) {
                        inferred_sporadic_job_release_at(tsk, now);
                }
        }
}


/* Called when a task should be removed from the ready queue.
 */
static void pres_task_block(struct task_struct *tsk)
{
        unsigned long flags;
        struct pres_task_state* tinfo = get_pres_state(tsk);
        struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);

        TRACE_TASK(tsk, "thread suspends at %llu (state:%d, running:%d)\n",
                   litmus_clock(), tsk->state, is_current_running());

        raw_spin_lock_irqsave(&state->lock, flags);
        sup_update_time(&state->sup_env, litmus_clock());
        task_departs(tsk, is_completed(tsk));
        raw_spin_unlock_irqrestore(&state->lock, flags);
}


/* Called when the state of tsk changes back to TASK_RUNNING.
 * We need to requeue the task.
 */
static void pres_task_resume(struct task_struct *tsk)
{
        unsigned long flags;
        struct pres_task_state* tinfo = get_pres_state(tsk);
        struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);

        TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());

        raw_spin_lock_irqsave(&state->lock, flags);
        /* Assumption: litmus_clock() is synchronized across cores,
         * since we might not actually be executing on tinfo->cpu
         * at the moment. */
        sup_update_time(&state->sup_env, litmus_clock());
        task_arrives(tsk);
        /* NOTE: drops state->lock */
        pres_update_timer_and_unlock(state);
        local_irq_restore(flags);

        resume_legacy_task_model_updates(tsk);
}

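/* Admission control: look up the reservation whose ID is given in
 * task_params.cpu on the task's partition and attach the task to it
 * as a client. */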
static long pres_admit_task(struct task_struct *tsk)
{
        long err = -EINVAL;
        unsigned long flags;
        struct reservation *res;
        struct pres_cpu_state *state;
        struct pres_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_ATOMIC);

        if (!tinfo)
                return -ENOMEM;

        preempt_disable();

        /* NOTE: this is obviously racy w.r.t. affinity changes since
         * we are not holding any runqueue locks. */
        if (tsk->nr_cpus_allowed != 1) {
                printk(KERN_WARNING "%s/%d: task does not have "
                       "singleton affinity mask\n",
                       tsk->comm, tsk->pid);
                state = cpu_state_for(task_cpu(tsk));
        } else {
                state = cpu_state_for(cpumask_first(&tsk->cpus_allowed));
        }

        TRACE_TASK(tsk, "on CPU %d, valid?:%d\n",
                   task_cpu(tsk), cpumask_test_cpu(task_cpu(tsk), &tsk->cpus_allowed));

        raw_spin_lock_irqsave(&state->lock, flags);

        res = sup_find_by_id(&state->sup_env, tsk_rt(tsk)->task_params.cpu);

        /* found the appropriate reservation (or vCPU) */
        if (res) {
                task_client_init(&tinfo->res_info, tsk, res);
                tinfo->cpu = state->cpu;
                tinfo->client = &tinfo->res_info.client;
                tsk_rt(tsk)->plugin_state = tinfo;
                err = 0;

                /* disable LITMUS^RT's per-thread budget enforcement */
                tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
        } else {
                printk(KERN_WARNING "Could not find reservation %d on "
                       "core %d for task %s/%d\n",
                       tsk_rt(tsk)->task_params.cpu, state->cpu,
                       tsk->comm, tsk->pid);
        }

        raw_spin_unlock_irqrestore(&state->lock, flags);

        preempt_enable();

        if (err)
                kfree(tinfo);

        return err;
}

static void task_new_legacy_task_model_updates(struct task_struct *tsk)
{
        lt_t now = litmus_clock();

        /* the first job exists starting as of right now */
        release_at(tsk, now);
        sched_trace_task_release(tsk);
}

static void pres_task_new(struct task_struct *tsk, int on_runqueue,
                          int is_running)
{
        unsigned long flags;
        struct pres_task_state* tinfo = get_pres_state(tsk);
        struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);

        TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
                   litmus_clock(), on_runqueue, is_running);

        /* acquire the lock protecting the state and disable interrupts */
        raw_spin_lock_irqsave(&state->lock, flags);

        if (is_running) {
                state->scheduled = tsk;
                /* make sure this task should actually be running */
                litmus_reschedule_local();
        }

        if (on_runqueue || is_running) {
                /* Assumption: litmus_clock() is synchronized across cores
                 * [see comment in pres_task_resume()] */
                sup_update_time(&state->sup_env, litmus_clock());
                task_arrives(tsk);
                /* NOTE: drops state->lock */
                pres_update_timer_and_unlock(state);
                local_irq_restore(flags);
        } else
                raw_spin_unlock_irqrestore(&state->lock, flags);

        task_new_legacy_task_model_updates(tsk);
}

static bool pres_fork_task(struct task_struct *tsk)
{
        TRACE_CUR("is forking\n");
        TRACE_TASK(tsk, "forked child rt:%d cpu:%d task_cpu:%d "
                   "wcet:%llu per:%llu\n",
                   is_realtime(tsk),
                   tsk_rt(tsk)->task_params.cpu,
                   task_cpu(tsk),
                   tsk_rt(tsk)->task_params.exec_cost,
                   tsk_rt(tsk)->task_params.period);

        /* We always allow forking. */
        /* The newly forked task will be in the same reservation. */
        return true;
}

static void pres_task_exit(struct task_struct *tsk)
{
        unsigned long flags;
        struct pres_task_state* tinfo = get_pres_state(tsk);
        struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);

        raw_spin_lock_irqsave(&state->lock, flags);

        TRACE_TASK(tsk, "task exits at %llu (present:%d sched:%d)\n",
                   litmus_clock(), is_present(tsk), state->scheduled == tsk);

        if (state->scheduled == tsk)
                state->scheduled = NULL;

        /* remove from queues */
        if (is_present(tsk)) {
                /* Assumption: litmus_clock() is synchronized across cores
                 * [see comment in pres_task_resume()] */
                sup_update_time(&state->sup_env, litmus_clock());
                task_departs(tsk, 0);
                /* NOTE: drops state->lock */
                pres_update_timer_and_unlock(state);
                local_irq_restore(flags);
        } else
                raw_spin_unlock_irqrestore(&state->lock, flags);

        kfree(tsk_rt(tsk)->plugin_state);
        tsk_rt(tsk)->plugin_state = NULL;
}

static void pres_current_budget(lt_t *used_so_far, lt_t *remaining)
{
        struct pres_task_state *tstate = get_pres_state(current);
        struct pres_cpu_state *state;

        /* FIXME: protect against concurrent task_exit() */

        local_irq_disable();

        state = cpu_state_for(tstate->cpu);

        raw_spin_lock(&state->lock);

        sup_update_time(&state->sup_env, litmus_clock());
        if (remaining)
                *remaining = tstate->client->reservation->cur_budget;
        if (used_so_far)
                *used_so_far = tstate->client->reservation->budget_consumed;
        pres_update_timer_and_unlock(state);

        local_irq_enable();
}

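/* Allocate a reservation of the requested type and register it with the
 * per-CPU reservation environment, rejecting duplicate reservation IDs. */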
static long do_pres_reservation_create(
        int res_type,
        struct reservation_config *config)
{
        struct pres_cpu_state *state;
        struct reservation* res;
        struct reservation* new_res = NULL;
        unsigned long flags;
        long err;

        /* Allocate before we grab a spin lock. */
        switch (res_type) {
        case PERIODIC_POLLING:
        case SPORADIC_POLLING:
                err = alloc_polling_reservation(res_type, config, &new_res);
                break;

        case TABLE_DRIVEN:
                err = alloc_table_driven_reservation(config, &new_res);
                break;

        default:
                err = -EINVAL;
                break;
        }

        if (err)
                return err;

        state = cpu_state_for(config->cpu);
        raw_spin_lock_irqsave(&state->lock, flags);

        res = sup_find_by_id(&state->sup_env, config->id);
        if (!res) {
                sup_add_new_reservation(&state->sup_env, new_res);
                err = config->id;
        } else {
                err = -EEXIST;
        }

        raw_spin_unlock_irqrestore(&state->lock, flags);

        if (err < 0)
                kfree(new_res);

        return err;
}

static long pres_reservation_create(int res_type, void* __user _config)
{
        struct reservation_config config;

        TRACE("Attempt to create reservation (%d)\n", res_type);

        if (copy_from_user(&config, _config, sizeof(config)))
                return -EFAULT;

        if (config.cpu < 0 || !cpu_online(config.cpu)) {
                printk(KERN_ERR "invalid polling reservation (%u): "
                       "CPU %d offline\n", config.id, config.cpu);
                return -EINVAL;
        }

        return do_pres_reservation_create(res_type, &config);
}

static struct domain_proc_info pres_domain_proc_info;

static long pres_get_domain_proc_info(struct domain_proc_info **ret)
{
        *ret = &pres_domain_proc_info;
        return 0;
}

static void pres_setup_domain_proc(void)
{
        int i, cpu;
        int num_rt_cpus = num_online_cpus();

        struct cd_mapping *cpu_map, *domain_map;

        memset(&pres_domain_proc_info, 0, sizeof(pres_domain_proc_info));
        init_domain_proc_info(&pres_domain_proc_info, num_rt_cpus, num_rt_cpus);
        pres_domain_proc_info.num_cpus = num_rt_cpus;
        pres_domain_proc_info.num_domains = num_rt_cpus;

        i = 0;
        for_each_online_cpu(cpu) {
                cpu_map = &pres_domain_proc_info.cpu_to_domains[i];
                domain_map = &pres_domain_proc_info.domain_to_cpus[i];

                cpu_map->id = cpu;
                domain_map->id = i;
                cpumask_set_cpu(i, cpu_map->mask);
                cpumask_set_cpu(cpu, domain_map->mask);
                ++i;
        }
}

static long pres_activate_plugin(void)
{
        int cpu;
        struct pres_cpu_state *state;

        for_each_online_cpu(cpu) {
                TRACE("Initializing CPU%d...\n", cpu);

                state = cpu_state_for(cpu);

                raw_spin_lock_init(&state->lock);
                state->cpu = cpu;
                state->scheduled = NULL;

                sup_init(&state->sup_env);

                hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
                state->timer.function = on_scheduling_timer;
        }

        pres_setup_domain_proc();

        return 0;
}

static long pres_deactivate_plugin(void)
{
        int cpu;
        struct pres_cpu_state *state;
        struct reservation *res;

        for_each_online_cpu(cpu) {
                state = cpu_state_for(cpu);
                raw_spin_lock(&state->lock);

                hrtimer_cancel(&state->timer);

                /* Delete all reservations --- assumes struct reservation
                 * is prefix of containing struct. */

                while (!list_empty(&state->sup_env.all_reservations)) {
                        res = list_first_entry(
                                &state->sup_env.all_reservations,
                                struct reservation, all_list);
                        list_del(&res->all_list);
                        if (res->ops->shutdown)
                                res->ops->shutdown(res);
                        kfree(res);
                }

                raw_spin_unlock(&state->lock);
        }

        destroy_domain_proc_info(&pres_domain_proc_info);
        return 0;
}

static struct sched_plugin pres_plugin = {
        .plugin_name = "P-RES",
        .schedule = pres_schedule,
        .task_block = pres_task_block,
        .task_wake_up = pres_task_resume,
        .admit_task = pres_admit_task,
        .task_new = pres_task_new,
        .fork_task = pres_fork_task,
        .task_exit = pres_task_exit,
        .complete_job = complete_job_oneshot,
        .get_domain_proc_info = pres_get_domain_proc_info,
        .activate_plugin = pres_activate_plugin,
        .deactivate_plugin = pres_deactivate_plugin,
        .reservation_create = pres_reservation_create,
        .current_budget = pres_current_budget,
};

static int __init init_pres(void)
{
        return register_sched_plugin(&pres_plugin);
}

module_init(init_pres);