author     Bjoern Brandenburg <bbb@mpi-sws.org>  2016-03-16 08:01:32 -0400
committer  Bjoern Brandenburg <bbb@mpi-sws.org>  2016-03-20 14:30:37 -0400
commit     dbf173a2fbe2abe9a5ee149390705f18c2b17f25 (patch)
tree       4f5b6105e2a5e25fa6e30776215191b810ddf78f /litmus
parent     2a38056cc098c56a04bbe18f4e752f4fa782599f (diff)
Add partitioned reservation-based scheduler plugin (P-RES)
A simple partitioned scheduler that provides a reservation environment on each core, based on the generic reservations code. Hierarchical scheduling is not supported in this version.
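For context (not part of this patch): under P-RES, userspace first creates a per-CPU reservation through the new reservation_create plugin hook, and then attaches a task by admitting it with its task_params.cpu field set to the reservation ID, which pres_admit_task() resolves via sup_find_by_id(). The rough sketch below illustrates that flow from userspace; it assumes liblitmus wrappers (reservation_create(), init_rt_task_param(), set_rt_task_param(), be_migrate_to_cpu(), task_mode()) and the reservation_config field names from litmus/rt_param.h, so the exact names and constants may differ from the actual userspace API.

/* Hedged usage sketch only -- not part of this patch. Assumes liblitmus
 * wrappers and field names; consult litmus/rt_param.h and liblitmus for
 * the authoritative interface. */
#include <stdio.h>
#include <unistd.h>
#include <litmus.h>	/* liblitmus */

#define RES_ID	123	/* arbitrary reservation ID (hypothetical choice) */
#define RES_CPU	1	/* partition/core the reservation lives on */

int main(void)
{
	struct reservation_config config = {0};
	struct rt_task param;

	init_litmus();

	/* 1) Create a periodic polling reservation on RES_CPU; in the kernel
	 *    this request ends up in pres_reservation_create() below. */
	config.id = RES_ID;
	config.cpu = RES_CPU;
	config.priority = LITMUS_LOWEST_PRIORITY;	/* assumed constant */
	config.polling_params.budget = ms2ns(10);	/* 10 ms budget   */
	config.polling_params.period = ms2ns(100);	/* 100 ms period  */
	config.polling_params.relative_deadline = 0;	/* implicit deadline */
	config.polling_params.offset = 0;
	if (reservation_create(PERIODIC_POLLING, &config) < 0)
		perror("reservation_create");

	/* 2) Attach this process. pres_admit_task() looks up the reservation
	 *    on the task's current CPU and interprets task_params.cpu as the
	 *    reservation ID, so migrate there first and pass RES_ID as "cpu". */
	be_migrate_to_cpu(RES_CPU);
	init_rt_task_param(&param);
	param.exec_cost = ms2ns(10);
	param.period = ms2ns(100);
	param.cpu = RES_ID;
	set_rt_task_param(getpid(), &param);
	task_mode(LITMUS_RT_TASK);

	/* ... periodic real-time work, e.g. a loop around sleep_next_period() ... */

	task_mode(BACKGROUND_TASK);
	return 0;
}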
Diffstat (limited to 'litmus')
-rw-r--r--   litmus/Makefile        2
-rw-r--r--   litmus/sched_pres.c  578
2 files changed, 580 insertions, 0 deletions
diff --git a/litmus/Makefile b/litmus/Makefile
index c969ce59db67..ecaa28dc68ad 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -31,4 +31,6 @@ obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
 obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
 obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
 
+obj-y += sched_pres.o
+
 obj-y += reservations/
diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
new file mode 100644
index 000000000000..5992c55ee737
--- /dev/null
+++ b/litmus/sched_pres.c
@@ -0,0 +1,578 @@
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+#include <litmus/sched_plugin.h>
+#include <litmus/preempt.h>
+#include <litmus/debug_trace.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/budget.h>
+#include <litmus/litmus_proc.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/reservations/reservation.h>
+#include <litmus/reservations/alloc.h>
+
+struct pres_task_state {
+	struct reservation_client *client;
+	int cpu;
+	struct task_client res_info;
+};
+
+struct pres_cpu_state {
+	raw_spinlock_t lock;
+
+	struct sup_reservation_environment sup_env;
+	struct hrtimer timer;
+
+	int cpu;
+	struct task_struct* scheduled;
+};
+
+static DEFINE_PER_CPU(struct pres_cpu_state, pres_cpu_state);
+
+#define cpu_state_for(cpu_id)	(&per_cpu(pres_cpu_state, cpu_id))
+#define local_cpu_state()	(this_cpu_ptr(&pres_cpu_state))
+
+static struct pres_task_state* get_pres_state(struct task_struct *tsk)
+{
+	return (struct pres_task_state*) tsk_rt(tsk)->plugin_state;
+}
+
+static void task_departs(struct task_struct *tsk, int job_complete)
+{
+	struct pres_task_state* state = get_pres_state(tsk);
+	struct reservation* res;
+	struct reservation_client *client;
+
+	client = state->client;
+	res = client->reservation;
+
+	res->ops->client_departs(res, client, job_complete);
+	TRACE_TASK(tsk, "client_departs: removed from reservation R%d\n", res->id);
+}
+
+static void task_arrives(struct task_struct *tsk)
+{
+	struct pres_task_state* state = get_pres_state(tsk);
+	struct reservation* res;
+	struct reservation_client *client;
+
+	client = state->client;
+	res = client->reservation;
+
+	res->ops->client_arrives(res, client);
+	TRACE_TASK(tsk, "client_arrives: added to reservation R%d\n", res->id);
+}
+
+/* NOTE: drops state->lock */
+static void pres_update_timer_and_unlock(struct pres_cpu_state *state)
+{
+	int local;
+	lt_t update, now;
+
+	update = state->sup_env.next_scheduler_update;
+	now = state->sup_env.env.current_time;
+
+	/* Be sure we're actually running on the right core, as
+	 * pres_update_timer_and_unlock() is also called from pres_task_resume(),
+	 * which might be called on any CPU when a thread resumes.
+	 */
+	local = local_cpu_state() == state;
+
+	/* Must drop state lock before calling into hrtimer_start(), which
+	 * may raise a softirq, which in turn may wake ksoftirqd. */
+	raw_spin_unlock(&state->lock);
+
+	if (update <= now) {
+		litmus_reschedule(state->cpu);
+	} else if (likely(local && update != SUP_NO_SCHEDULER_UPDATE)) {
+		/* Reprogram only if not already set correctly. */
+		if (!hrtimer_active(&state->timer) ||
+		    ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
+			TRACE("canceling timer...\n");
+			hrtimer_cancel(&state->timer);
+			TRACE("setting scheduler timer for %llu\n", update);
+			/* We cannot use hrtimer_start() here because the
+			 * wakeup flag must be set to zero. */
+			__hrtimer_start_range_ns(&state->timer,
+					ns_to_ktime(update),
+					0 /* timer coalescing slack */,
+					HRTIMER_MODE_ABS_PINNED,
+					0 /* wakeup */);
+			if (update < litmus_clock()) {
+				/* uh oh, timer expired while trying to set it */
+				TRACE("timer expired during setting "
+				      "update:%llu now:%llu actual:%llu\n",
+				      update, now, litmus_clock());
+				/* The timer HW may not have been reprogrammed
+				 * correctly; force rescheduling now. */
+				litmus_reschedule(state->cpu);
+			}
+		}
+	} else if (unlikely(!local && update != SUP_NO_SCHEDULER_UPDATE)) {
+		/* Poke remote core only if timer needs to be set earlier than
+		 * it is currently set.
+		 */
+		TRACE("pres_update_timer for remote CPU %d (update=%llu, "
+		      "active:%d, set:%llu)\n",
+			state->cpu,
+			update,
+			hrtimer_active(&state->timer),
+			ktime_to_ns(hrtimer_get_expires(&state->timer)));
+		if (!hrtimer_active(&state->timer) ||
+		    ktime_to_ns(hrtimer_get_expires(&state->timer)) > update) {
+			TRACE("poking CPU %d so that it can update its "
+			      "scheduling timer (active:%d, set:%llu)\n",
+			      state->cpu,
+			      hrtimer_active(&state->timer),
+			      ktime_to_ns(hrtimer_get_expires(&state->timer)));
+			litmus_reschedule(state->cpu);
+		}
+	}
+}
+
+static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
+{
+	unsigned long flags;
+	enum hrtimer_restart restart = HRTIMER_NORESTART;
+	struct pres_cpu_state *state;
+	lt_t update, now;
+
+	state = container_of(timer, struct pres_cpu_state, timer);
+
+	/* The scheduling timer should only fire on the local CPU, because
+	 * otherwise deadlocks via timer_cancel() are possible.
+	 * Note: this does not interfere with dedicated interrupt handling, as
+	 * even under dedicated interrupt handling scheduling timers for
+	 * budget enforcement must occur locally on each CPU.
+	 */
+	BUG_ON(state->cpu != raw_smp_processor_id());
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	sup_update_time(&state->sup_env, litmus_clock());
+
+	update = state->sup_env.next_scheduler_update;
+	now = state->sup_env.env.current_time;
+
+	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d)\n",
+		now, update, state->cpu);
+
+	if (update <= now) {
+		litmus_reschedule_local();
+	} else if (update != SUP_NO_SCHEDULER_UPDATE) {
+		hrtimer_set_expires(timer, ns_to_ktime(update));
+		restart = HRTIMER_RESTART;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	return restart;
+}
+
+static struct task_struct* pres_schedule(struct task_struct * prev)
+{
+	/* next == NULL means "schedule background work". */
+	struct pres_cpu_state *state = local_cpu_state();
+
+	raw_spin_lock(&state->lock);
+
+	BUG_ON(state->scheduled && state->scheduled != prev);
+	BUG_ON(state->scheduled && !is_realtime(prev));
+
+	/* update time */
+	state->sup_env.will_schedule = true;
+	sup_update_time(&state->sup_env, litmus_clock());
+
+	/* figure out what to schedule next */
+	state->scheduled = sup_dispatch(&state->sup_env);
+
+	/* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
+	sched_state_task_picked();
+
+	/* program scheduler timer */
+	state->sup_env.will_schedule = false;
+	/* NOTE: drops state->lock */
+	pres_update_timer_and_unlock(state);
+
+	if (prev != state->scheduled && is_realtime(prev))
+		TRACE_TASK(prev, "descheduled.\n");
+	if (state->scheduled)
+		TRACE_TASK(state->scheduled, "scheduled.\n");
+
+	return state->scheduled;
+}
+
+static void resume_legacy_task_model_updates(struct task_struct *tsk)
+{
+	lt_t now;
+	if (is_sporadic(tsk)) {
+		/* If this sporadic task was gone for a "long" time and woke up past
+		 * its deadline, then give it a new budget by triggering a job
+		 * release. This is purely cosmetic and has no effect on the
+		 * P-RES scheduler. */
+
+		now = litmus_clock();
+		if (is_tardy(tsk, now))
+			release_at(tsk, now);
+	}
+}
+
+
+/* Called when a task should be removed from the ready queue.
+ */
+static void pres_task_block(struct task_struct *tsk)
+{
+	unsigned long flags;
+	struct pres_task_state* tinfo = get_pres_state(tsk);
+	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	TRACE_TASK(tsk, "thread suspends at %llu (state:%d, running:%d)\n",
+		litmus_clock(), tsk->state, is_current_running());
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	task_departs(tsk, is_completed(tsk));
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+
+/* Called when the state of tsk changes back to TASK_RUNNING.
+ * We need to requeue the task.
+ */
+static void pres_task_resume(struct task_struct *tsk)
+{
+	unsigned long flags;
+	struct pres_task_state* tinfo = get_pres_state(tsk);
+	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	/* Assumption: litmus_clock() is synchronized across cores,
+	 * since we might not actually be executing on tinfo->cpu
+	 * at the moment. */
+	sup_update_time(&state->sup_env, litmus_clock());
+	task_arrives(tsk);
+	/* NOTE: drops state->lock */
+	pres_update_timer_and_unlock(state);
+	local_irq_restore(flags);
+
+	resume_legacy_task_model_updates(tsk);
+}
+
+static long pres_admit_task(struct task_struct *tsk)
+{
+	long err = -ESRCH;
+	unsigned long flags;
+	struct reservation *res;
+	struct pres_cpu_state *state;
+	struct pres_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_ATOMIC);
+
+	if (!tinfo)
+		return -ENOMEM;
+
+	preempt_disable();
+
+	state = cpu_state_for(task_cpu(tsk));
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	res = sup_find_by_id(&state->sup_env, tsk_rt(tsk)->task_params.cpu);
+
+	/* found the appropriate reservation (or vCPU) */
+	if (res) {
+		task_client_init(&tinfo->res_info, tsk, res);
+		tinfo->cpu = task_cpu(tsk);
+		tinfo->client = &tinfo->res_info.client;
+		tsk_rt(tsk)->plugin_state = tinfo;
+		err = 0;
+
+		/* disable LITMUS^RT's per-thread budget enforcement */
+		tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	preempt_enable();
+
+	if (err)
+		kfree(tinfo);
+
+	return err;
+}
+
+static void task_new_legacy_task_model_updates(struct task_struct *tsk)
+{
+	lt_t now = litmus_clock();
+
+	/* the first job exists starting as of right now */
+	release_at(tsk, now);
+	sched_trace_task_release(tsk);
+}
+
+static void pres_task_new(struct task_struct *tsk, int on_runqueue,
+			  int is_running)
+{
+	unsigned long flags;
+	struct pres_task_state* tinfo = get_pres_state(tsk);
+	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
+		   litmus_clock(), on_runqueue, is_running);
+
+	/* acquire the lock protecting the state and disable interrupts */
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	if (is_running) {
+		state->scheduled = tsk;
+		/* make sure this task should actually be running */
+		litmus_reschedule_local();
+	}
+
+	if (on_runqueue || is_running) {
+		/* Assumption: litmus_clock() is synchronized across cores
+		 * [see comment in pres_task_resume()] */
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_arrives(tsk);
+		/* NOTE: drops state->lock */
+		pres_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	task_new_legacy_task_model_updates(tsk);
+}
+
+static void pres_task_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+	struct pres_task_state* tinfo = get_pres_state(tsk);
+	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	TRACE_TASK(tsk, "task exits at %llu (present:%d sched:%d)\n",
+		litmus_clock(), is_present(tsk), state->scheduled == tsk);
+
+	if (state->scheduled == tsk)
+		state->scheduled = NULL;
+
+	/* remove from queues */
+	if (is_present(tsk)) {
+		/* Assumption: litmus_clock() is synchronized across cores
+		 * [see comment in pres_task_resume()] */
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_departs(tsk, 0);
+		/* NOTE: drops state->lock */
+		pres_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	kfree(tsk_rt(tsk)->plugin_state);
+	tsk_rt(tsk)->plugin_state = NULL;
+}
+
+static void pres_current_budget(lt_t *used_so_far, lt_t *remaining)
+{
+	struct pres_task_state *tstate = get_pres_state(current);
+	struct pres_cpu_state *state;
+
+	/* FIXME: protect against concurrent task_exit() */
+
+	local_irq_disable();
+
+	state = cpu_state_for(tstate->cpu);
+
+	raw_spin_lock(&state->lock);
+
+	sup_update_time(&state->sup_env, litmus_clock());
+	if (remaining)
+		*remaining = tstate->client->reservation->cur_budget;
+	if (used_so_far)
+		*used_so_far = tstate->client->reservation->budget_consumed;
+	pres_update_timer_and_unlock(state);
+
+	local_irq_enable();
+}
+
+static long do_pres_reservation_create(
+	int res_type,
+	struct reservation_config *config)
+{
+	struct pres_cpu_state *state;
+	struct reservation* res;
+	struct reservation* new_res = NULL;
+	unsigned long flags;
+	long err;
+
+	/* Allocate before we grab a spin lock. */
+	switch (res_type) {
+	case PERIODIC_POLLING:
+	case SPORADIC_POLLING:
+		err = alloc_polling_reservation(res_type, config, &new_res);
+		break;
+
+	case TABLE_DRIVEN:
+		err = alloc_table_driven_reservation(config, &new_res);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	if (err)
+		return err;
+
+	state = cpu_state_for(config->cpu);
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	res = sup_find_by_id(&state->sup_env, config->id);
+	if (!res) {
+		sup_add_new_reservation(&state->sup_env, new_res);
+		err = config->id;
+	} else {
+		err = -EEXIST;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	if (err < 0)
+		kfree(new_res);
+
+	return err;
+}
+
+static long pres_reservation_create(int res_type, void* __user _config)
+{
+	struct reservation_config config;
+
+	TRACE("Attempt to create reservation (%d)\n", res_type);
+
+	if (copy_from_user(&config, _config, sizeof(config)))
+		return -EFAULT;
+
+	if (config.cpu < 0 || !cpu_online(config.cpu)) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "CPU %d offline\n", config.id, config.cpu);
+		return -EINVAL;
+	}
+
+	return do_pres_reservation_create(res_type, &config);
+}
+
+static struct domain_proc_info pres_domain_proc_info;
+
+static long pres_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &pres_domain_proc_info;
+	return 0;
+}
+
+static void pres_setup_domain_proc(void)
+{
+	int i, cpu;
+	int num_rt_cpus = num_online_cpus();
+
+	struct cd_mapping *cpu_map, *domain_map;
+
+	memset(&pres_domain_proc_info, 0, sizeof(pres_domain_proc_info));
+	init_domain_proc_info(&pres_domain_proc_info, num_rt_cpus, num_rt_cpus);
+	pres_domain_proc_info.num_cpus = num_rt_cpus;
+	pres_domain_proc_info.num_domains = num_rt_cpus;
+
+	i = 0;
+	for_each_online_cpu(cpu) {
+		cpu_map = &pres_domain_proc_info.cpu_to_domains[i];
+		domain_map = &pres_domain_proc_info.domain_to_cpus[i];
+
+		cpu_map->id = cpu;
+		domain_map->id = i;
+		cpumask_set_cpu(i, cpu_map->mask);
+		cpumask_set_cpu(cpu, domain_map->mask);
+		++i;
+	}
+}
+
+static long pres_activate_plugin(void)
+{
+	int cpu;
+	struct pres_cpu_state *state;
+
+	for_each_online_cpu(cpu) {
+		TRACE("Initializing CPU%d...\n", cpu);
+
+		state = cpu_state_for(cpu);
+
+		raw_spin_lock_init(&state->lock);
+		state->cpu = cpu;
+		state->scheduled = NULL;
+
+		sup_init(&state->sup_env);
+
+		hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+		state->timer.function = on_scheduling_timer;
+	}
+
+	pres_setup_domain_proc();
+
+	return 0;
+}
+
+static long pres_deactivate_plugin(void)
+{
+	int cpu;
+	struct pres_cpu_state *state;
+	struct reservation *res;
+
+	for_each_online_cpu(cpu) {
+		state = cpu_state_for(cpu);
+		raw_spin_lock(&state->lock);
+
+		hrtimer_cancel(&state->timer);
+
+		/* Delete all reservations --- assumes struct reservation
+		 * is prefix of containing struct. */
+
+		while (!list_empty(&state->sup_env.all_reservations)) {
+			res = list_first_entry(
+				&state->sup_env.all_reservations,
+				struct reservation, all_list);
+			list_del(&res->all_list);
+			if (res->ops->shutdown)
+				res->ops->shutdown(res);
+			kfree(res);
+		}
+
+		raw_spin_unlock(&state->lock);
+	}
+
+	destroy_domain_proc_info(&pres_domain_proc_info);
+	return 0;
+}
+
+static struct sched_plugin pres_plugin = {
+	.plugin_name		= "P-RES",
+	.schedule		= pres_schedule,
+	.task_block		= pres_task_block,
+	.task_wake_up		= pres_task_resume,
+	.admit_task		= pres_admit_task,
+	.task_new		= pres_task_new,
+	.task_exit		= pres_task_exit,
+	.complete_job		= complete_job_oneshot,
+	.get_domain_proc_info	= pres_get_domain_proc_info,
+	.activate_plugin	= pres_activate_plugin,
+	.deactivate_plugin	= pres_deactivate_plugin,
+	.reservation_create	= pres_reservation_create,
+	.current_budget		= pres_current_budget,
+};
+
+static int __init init_pres(void)
+{
+	return register_sched_plugin(&pres_plugin);
+}
+
+module_init(init_pres);
+