diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h index 5d20276..8672398 100644 --- a/include/litmus/litmus.h +++ b/include/litmus/litmus.h @@ -88,7 +88,7 @@ inline static int budget_exhausted(struct task_struct* t) inline static lt_t budget_remaining(struct task_struct* t) { if (!budget_exhausted(t)) - return get_exec_time(t) - get_exec_cost(t); + return get_exec_cost(t) - get_exec_time(t); else /* avoid overflow */ return 0; diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h index a7a183f..6f43dea 100644 --- a/include/litmus/rt_param.h +++ b/include/litmus/rt_param.h @@ -1,3 +1,5 @@ +#include + /* * Definition of the scheduler plugin interface. * @@ -33,6 +35,72 @@ typedef enum { PRECISE_ENFORCEMENT /* NOT IMPLEMENTED - enforced with hrtimers */ } budget_policy_t; + +/* The parameters for EDF-Fm scheduling algorithm. + * Each task may be fixed or migratory. Migratory tasks may + * migrate on 2 (contiguous) CPU only. NR_CPUS_EDF_FM = 2. + */ +#define NR_CPUS_EDF_FM 2 + +struct edffm_params { + /* EDF-fm where can a migratory task execute? */ + unsigned int cpus[NR_CPUS_EDF_FM]; + /* how many cpus are used by this task? + * fixed = 0, migratory = (NR_CPUS_EDF_FM - 1) + * Efficient way to allow writing cpus[nr_cpus]. + */ + unsigned int nr_cpus; + /* Fraction of this task exec_cost that each CPU should handle. + * We keep the fraction divided in num/denom : a matrix of + * (NR_CPUS_EDF_FM rows) x (2 columns). + * The first column is the numerator of the fraction. + * The second column is the denominator. + * In EDF-fm this is a 2*2 matrix + */ + lt_t fraction[2][NR_CPUS_EDF_FM]; +}; + +/* Parameters for NPS-F semi-partitioned scheduling algorithm. + * Each (cpu, budget) entry defines the share ('budget' in ns, a % of + * the slot_length) of the notional processor on the CPU 'cpu'. + * This structure is used by the library - syscall interface in order + * to go through the overhead of a syscall only once per server. + */ +struct npsf_budgets { + int cpu; + lt_t budget; +}; + +/* The parameters for the EDF-WM semi-partitioned scheduler. + * Each task may be split across multiple cpus. Each per-cpu allocation + * is called a 'slice'. + */ +#define MAX_EDF_WM_SLICES 24 +#define MIN_EDF_WM_SLICE_SIZE 50000 /* .05 millisecond = 50us */ + +struct edf_wm_slice { + /* on which CPU is this slice allocated */ + unsigned int cpu; + /* relative deadline from job release (not from slice release!) */ + lt_t deadline; + /* budget of this slice; must be precisely enforced */ + lt_t budget; + /* offset of this slice relative to the job release */ + lt_t offset; +}; + +/* If a job is not sliced across multiple CPUs, then + * count is set to zero and none of the slices is used. + * This implies that count == 1 is illegal. + */ +struct edf_wm_params { + /* enumeration of all slices */ + struct edf_wm_slice slices[MAX_EDF_WM_SLICES]; + + /* how many slices are defined? 
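+	 *
+	 * A minimal sketch of how user space might fill this structure in
+	 * (illustrative values only, not taken from this patch; times in ns):
+	 * a job with exec_cost = 3 ms split across CPUs 0 and 1 could use
+	 *
+	 *	count     = 2;
+	 *	slices[0] = (struct edf_wm_slice)
+	 *		{ .cpu = 0, .offset = 0,       .deadline = 4000000, .budget = 2000000 };
+	 *	slices[1] = (struct edf_wm_slice)
+	 *		{ .cpu = 1, .offset = 4000000, .deadline = 9000000, .budget = 1000000 };
+	 *
+	 * i.e., the slice budgets sum to exec_cost, slice 1 is released only at
+	 * slice 0's (relative) deadline, consecutive slices sit on different
+	 * CPUs, the task's cpu field matches slices[0].cpu, and the task uses
+	 * PRECISE_ENFORCEMENT, as required by wm_check_params() in
+	 * sched_edf_wm.c.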
*/ + unsigned int count; +}; + struct rt_task { lt_t exec_cost; lt_t period; @@ -40,6 +108,22 @@ struct rt_task { unsigned int cpu; task_class_t cls; budget_policy_t budget_policy; /* ignored by pfair */ + + /* parameters used by the semi-partitioned algorithms */ + union { + /* EDF-Fm; defined in sched_edf_fm.c */ + struct edffm_params fm; + + /* NPS-F; defined in sched_npsf.c + * id for the server (notional processor) that holds + * this task; the same npfs_id can be assigned to "the same" + * server split on different cpus + */ + int npsf_id; + + /* EDF-WM; defined in sched_edf_wm.c */ + struct edf_wm_params wm; + } semi_part; }; /* The definition of the data that is shared between the kernel and real-time @@ -184,6 +268,27 @@ struct rt_param { /* Pointer to the page shared between userspace and kernel. */ struct control_page * ctrl_page; + + /* runtime info for the semi-part plugins */ + union { + /* EDF-Fm runtime information + * number of jobs handled by this cpu + * (to determine next cpu for a migrating task) + */ + unsigned int cpu_job_no[NR_CPUS_EDF_FM]; + + /* EDF-WM runtime information */ + struct { + /* at which exec time did the current slice start? */ + lt_t exec_time; + /* when did the job suspend? */ + lt_t suspend_time; + /* cached job parameters */ + lt_t job_release, job_deadline; + /* pointer to the current slice */ + struct edf_wm_slice* slice; + } wm; + } semi_part; }; /* Possible RT flags */ diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h index 9c1c9f2..7ea9176 100644 --- a/include/litmus/sched_plugin.h +++ b/include/litmus/sched_plugin.h @@ -6,6 +6,8 @@ #define _LINUX_SCHED_PLUGIN_H_ #include +/* NSEC_PER... conversions */ +#include /* struct for semaphore with priority inheritance */ struct pi_semaphore { @@ -136,6 +138,9 @@ extern struct sched_plugin *litmus; /* cluster size: cache_index = 2 L2, cache_index = 3 L3 */ extern int cluster_cache_index; +/* Slot length (ns) for NPS-F semi-part. 
algo */ +extern lt_t npsf_slot_length; + int register_sched_plugin(struct sched_plugin* plugin); struct sched_plugin* find_sched_plugin(const char* name); int print_sched_plugins(char* buf, int max); diff --git a/include/litmus/trace.h b/include/litmus/trace.h index b32c711..6afbf96 100644 --- a/include/litmus/trace.h +++ b/include/litmus/trace.h @@ -78,6 +78,8 @@ feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu) #define TS_TICK_START(t) TTIMESTAMP(110, t) #define TS_TICK_END(t) TTIMESTAMP(111, t) +#define TS_PULL_TIMER_START TIMESTAMP(112) +#define TS_PULL_TIMER_END TIMESTAMP(113) #define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */ #define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */ diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h index f0618e7..4e82c52 100644 --- a/include/litmus/unistd_64.h +++ b/include/litmus/unistd_64.h @@ -33,5 +33,7 @@ __SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release) __SYSCALL(__NR_release_ts, sys_release_ts) #define __NR_null_call __LSC(13) __SYSCALL(__NR_null_call, sys_null_call) +#define __NR_add_server __LSC(14) +__SYSCALL(__NR_add_server, sys_add_server) -#define NR_litmus_syscalls 14 +#define NR_litmus_syscalls 15 diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fdf9596..23d3712 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -47,6 +47,7 @@ #include #include +#include #include @@ -1063,6 +1064,7 @@ void hrtimer_pull(void) struct hrtimer_start_on_info *info; struct list_head *pos, *safe, list; + TS_PULL_TIMER_START; raw_spin_lock(&base->lock); list_replace_init(&base->to_pull, &list); raw_spin_unlock(&base->lock); @@ -1073,6 +1075,7 @@ void hrtimer_pull(void) list_del(pos); hrtimer_start(info->timer, info->time, info->mode); } + TS_PULL_TIMER_END; } /** diff --git a/litmus/Makefile b/litmus/Makefile index f301d28..5533a58 100644 --- a/litmus/Makefile +++ b/litmus/Makefile @@ -14,7 +14,10 @@ obj-y = sched_plugin.o litmus.o \ bheap.o \ ctrldev.o \ sched_gsn_edf.o \ - sched_psn_edf.o + sched_psn_edf.o \ + sched_edf_wm.o \ + sched_npsf.o \ + sched_edf_fm.o obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o diff --git a/litmus/litmus.c b/litmus/litmus.c index b04a42b..2f78022 100644 --- a/litmus/litmus.c +++ b/litmus/litmus.c @@ -632,6 +632,55 @@ static int proc_write_cluster_size(struct file *file, return len; } +static int proc_read_npsf_slot_length(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + return snprintf(page, PAGE_SIZE, "%d us\n", + (int) (npsf_slot_length / NSEC_PER_USEC)); +} + +extern void npsf_hrtimers_cleanup(void); +/* NPS-F slot length in us. + * + * Writing 0 as npsf_slot_length will trigger the removal of the + * hrtimers for the domain_reschedule_tick() in the NPS-F plugin. 
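+ *
+ * A usage sketch from user space (assuming the LITMUS^RT proc directory
+ * sits at the usual /proc/litmus location):
+ *
+ *	echo 2500 > /proc/litmus/npsf_slot_length   (set a 2.5 ms slot)
+ *	echo 0    > /proc/litmus/npsf_slot_length   (tear down the timers and
+ *	                                             fall back to 5000 us)
+ *
+ * The value is parsed as an integer number of microseconds and converted
+ * to nanoseconds below.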
+ */ +static int proc_write_npsf_slot_length(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int err, slot_length; + char msg[64]; + + if (count > 63) + return -EINVAL; + + if (copy_from_user(msg, buffer, count)) + return -EFAULT; + + /* terminate */ + msg[count] = '\0'; + /* chomp */ + if (count > 1 && msg[count - 1] == '\n') + msg[count - 1] = '\0'; + + err = sscanf(msg, "%d", &slot_length); + + if (err == 1) { + if (!slot_length) { + npsf_hrtimers_cleanup(); + /* reset to default */ + slot_length = 5000; + } + npsf_slot_length = (lt_t)((lt_t) slot_length * NSEC_PER_USEC); + return count; + } + + return -EINVAL; +} + #ifdef CONFIG_RELEASE_MASTER static int proc_read_release_master(char *page, char **start, off_t off, int count, @@ -691,7 +740,8 @@ static struct proc_dir_entry *litmus_dir = NULL, #ifdef CONFIG_RELEASE_MASTER *release_master_file = NULL, #endif - *clus_cache_idx_file = NULL; + *clus_cache_idx_file = NULL, + *npsf_slot_length_file = NULL; static int __init init_litmus_proc(void) { @@ -733,6 +783,16 @@ static int __init init_litmus_proc(void) clus_cache_idx_file->read_proc = proc_read_cluster_size; clus_cache_idx_file->write_proc = proc_write_cluster_size; + npsf_slot_length_file = create_proc_entry("npsf_slot_length", + 0644, litmus_dir); + if (!npsf_slot_length_file) { + printk(KERN_ERR "Could not allocate npsf_slot_length " + "procfs entry.\n"); + return -ENOMEM; + } + npsf_slot_length_file->read_proc = proc_read_npsf_slot_length; + npsf_slot_length_file->write_proc = proc_write_npsf_slot_length; + stat_file = create_proc_read_entry("stats", 0444, litmus_dir, proc_read_stats, NULL); @@ -752,6 +812,8 @@ static void exit_litmus_proc(void) remove_proc_entry("active_plugin", litmus_dir); if (clus_cache_idx_file) remove_proc_entry("cluster_cache", litmus_dir); + if (npsf_slot_length_file) + remove_proc_entry("npsf_slot_length", litmus_dir); #ifdef CONFIG_RELEASE_MASTER if (release_master_file) remove_proc_entry("release_master", litmus_dir); diff --git a/litmus/sched_edf_fm.c b/litmus/sched_edf_fm.c new file mode 100644 index 0000000..b721072 --- /dev/null +++ b/litmus/sched_edf_fm.c @@ -0,0 +1,565 @@ +/* + * litmus/sched_edf_fm.c + * + * Implementation of the EDF-fm scheduling algorithm. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +typedef struct { + rt_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ +/* domain lock */ +#define slock domain.ready_lock +} edffm_domain_t; + +DEFINE_PER_CPU(edffm_domain_t, edffm_domains); + +#define local_edffm (&__get_cpu_var(edffm_domains)) +#define remote_edf(cpu) (&per_cpu(edffm_domains, cpu).domain) +#define remote_edffm(cpu) (&per_cpu(edffm_domains, cpu)) +#define task_edf(task) remote_edf(get_partition(task)) +#define task_edffm(task) remote_edffm(get_partition(task)) + +#define edffm_params(t) (t->rt_param.task_params.semi_part.fm) + +/* Is the task a migratory task? */ +#define is_migrat_task(task) (edffm_params(task).nr_cpus) +/* t is on the wrong CPU (it should be requeued properly) */ +#define wrong_cpu(t) is_migrat_task((t)) && task_cpu((t)) != get_partition((t)) +/* Get next CPU */ +#define migrat_next_cpu(t) \ + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \ + edffm_params(t).cpus[1] : \ + edffm_params(t).cpus[0]) +/* Get current cpu */ +#define migrat_cur_cpu(t) \ + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? 
\ + edffm_params(t).cpus[0] : \ + edffm_params(t).cpus[1]) +/* Manipulate share for current cpu */ +#define cur_cpu_fract_num(t) \ + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \ + edffm_params(t).fraction[0][0] : \ + edffm_params(t).fraction[0][1]) +#define cur_cpu_fract_den(t) \ + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \ + edffm_params(t).fraction[1][0] : \ + edffm_params(t).fraction[1][1]) +/* Get job number for current cpu */ +#define cur_cpu_job_no(t) \ + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \ + tsk_rt(t)->semi_part.cpu_job_no[0] : \ + tsk_rt(t)->semi_part.cpu_job_no[1]) +/* What is the current cpu position in the array? */ +#define edffm_cpu_pos(cpu,t) \ + ((cpu == edffm_params(t).cpus[0]) ? \ + 0 : 1) + +/* + * EDF-fm: migratory tasks have higher prio than fixed, EDF in both classes. + * (Both first and second may be NULL). + */ +int edffm_higher_prio(struct task_struct* first, struct task_struct* second) +{ + if ((first && edffm_params(first).nr_cpus) || + (second && edffm_params(second).nr_cpus)) { + if ((first && edffm_params(first).nr_cpus) && + (second && edffm_params(second).nr_cpus)) + /* both are migrating */ + return edf_higher_prio(first, second); + + if (first && edffm_params(first).nr_cpus) + /* first is migrating */ + return 1; + else + /* second is migrating */ + return 0; + } + + /* both are fixed or not real time */ + return edf_higher_prio(first, second); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + */ +int edffm_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for edf_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!__jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || edffm_higher_prio(__next_ready(rt), t); +} + +/* we assume the lock is being held */ +static void preempt(edffm_domain_t *edffm) +{ + preempt_if_preemptable(edffm->scheduled, edffm->cpu); +} + +static void edffm_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + edffm_domain_t *edffm = container_of(rt, edffm_domain_t, domain); + + raw_spin_lock_irqsave(&edffm->slock, flags); + + __merge_ready(rt, tasks); + + if (edffm_preemption_needed(rt, edffm->scheduled)) + preempt(edffm); + + raw_spin_unlock_irqrestore(&edffm->slock, flags); +} + +/* EDF-fm uses the "release_master" field to force the next release for + * the task 'task' to happen on a remote CPU. The remote cpu for task is + * previously set up during job_completion() taking into consideration + * whether a task is a migratory task or not. 
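+ *
+ * Note that release_master is only overridden for the duration of this
+ * call: it is restored to NO_CPU right after __add_release() below, so
+ * the redirection affects just this one enqueue.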
+ */ +static inline void +edffm_add_release_remote(struct task_struct *task) +{ + unsigned long flags; + rt_domain_t *rt = task_edf(task); + + raw_spin_lock_irqsave(&rt->tobe_lock, flags); + + /* "modify" destination cpu */ + rt->release_master = get_partition(task); + + TRACE_TASK(task, "Add remote release: smp_proc_id = %d, cpu = %d, remote = %d\n", + smp_processor_id(), task_cpu(task), rt->release_master); + + /* trigger future release */ + __add_release(rt, task); + + /* reset proper release_master and unlock */ + rt->release_master = NO_CPU; + raw_spin_unlock_irqrestore(&rt->tobe_lock, flags); +} + +/* perform double ready_queue locking in an orderwise fashion + * this is called with: interrupt disabled and rq->lock held (from + * schedule()) + */ +static noinline void double_domain_lock(edffm_domain_t *dom1, edffm_domain_t *dom2) +{ + if (dom1 == dom2) { + /* fake */ + raw_spin_lock(&dom1->slock); + } else { + if (dom1 < dom2) { + raw_spin_lock(&dom1->slock); + raw_spin_lock(&dom2->slock); + TRACE("acquired %d and %d\n", dom1->cpu, dom2->cpu); + } else { + raw_spin_lock(&dom2->slock); + raw_spin_lock(&dom1->slock); + TRACE("acquired %d and %d\n", dom2->cpu, dom1->cpu); + } + } +} + +/* Directly insert a task in a remote ready queue. This function + * should only be called if this task is a migrating task and its + * last job for this CPU just completed (a new one is released for + * a remote CPU), but the new job is already tardy. + */ +static noinline void insert_task_in_remote_ready(struct task_struct *task) +{ + edffm_domain_t *this = remote_edffm(task_cpu(task)); + edffm_domain_t *remote = remote_edffm(get_partition(task)); + + BUG_ON(get_partition(task) != remote->cpu); + + TRACE_TASK(task, "Migrate From P%d -> To P%d\n", + this->cpu, remote->cpu); + TRACE_TASK(task, "Inserting in remote ready queue\n"); + + WARN_ON(!irqs_disabled()); + + raw_spin_unlock(&this->slock); + mb(); + TRACE_TASK(task,"edffm_lock %d released\n", this->cpu); + + /* lock both ready queues */ + double_domain_lock(this, remote); + mb(); + + __add_ready(&remote->domain, task); + + /* release remote but keep ours */ + raw_spin_unlock(&remote->slock); + TRACE_TASK(task,"edffm_lock %d released\n", remote->cpu); + + /* ask remote cpu to reschedule, we are already rescheduling on this */ + preempt(remote); +} + +static void requeue(struct task_struct* t, rt_domain_t *edf) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) { + if (wrong_cpu(t)) { + /* this should only happen if t just completed, but + * its next release is already tardy, so it should be + * migrated and inserted in the remote ready queue + */ + TRACE_TASK(t, "Migrating task already released, " + "move from P%d to P%d\n", + task_cpu(t), get_partition(t)); + + insert_task_in_remote_ready(t); + } else { + /* not a migrat task or the job is on the right CPU */ + __add_ready(edf, t); + } + } else { + if (wrong_cpu(t)) { + + TRACE_TASK(t, "Migrating task, adding remote release\n"); + edffm_add_release_remote(t); + } else { + TRACE_TASK(t, "Adding local release\n"); + add_release(edf, t); + } + } +} + +/* Update statistics for the _current_ job. + * - job_no was incremented _before_ starting this job + * (release_at / prepare_for_next_period) + * - cpu_job_no is incremented when the job completes + */ +static void update_job_counter(struct task_struct *t) +{ + int cpu_pos; + + /* Which CPU counter should be incremented? 
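+	 * E.g. (hypothetical CPU numbers): for a migratory task with
+	 * cpus = {2, 3} whose task_params.cpu is currently 3, edffm_cpu_pos()
+	 * yields 1 and cpu_job_no[1] is incremented.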
*/ + cpu_pos = edffm_cpu_pos(t->rt_param.task_params.cpu, t); + t->rt_param.semi_part.cpu_job_no[cpu_pos]++; + + TRACE_TASK(t, "job_no = %d, cpu_job_no(pos %d) = %d, cpu %d\n", + t->rt_param.job_params.job_no, cpu_pos, cur_cpu_job_no(t), + t->rt_param.task_params.cpu); +} + +/* What is the next cpu for this job? (eq. 8, in EDF-Fm paper) */ +static int next_cpu_for_job(struct task_struct *t) +{ + BUG_ON(!is_migrat_task(t)); + + TRACE_TASK(t, "%u = %u * %u / %u\n", + t->rt_param.job_params.job_no, cur_cpu_job_no(t), + cur_cpu_fract_den(t), cur_cpu_fract_num(t)); + if ((t->rt_param.job_params.job_no) == + (((lt_t) cur_cpu_job_no(t) * cur_cpu_fract_den(t)) / + cur_cpu_fract_num(t))) + return edffm_params(t).cpus[0]; + + return edffm_params(t).cpus[1]; +} + +/* If needed (the share for task t on this CPU is exhausted), updates + * the task_params.cpu for the _migrating_ task t + */ +static void change_migrat_cpu_if_needed(struct task_struct *t) +{ + BUG_ON(!is_migrat_task(t)); + /* EDF-fm: if it is a migrating task and it has already executed + * the required number of jobs on this CPU, we need to move it + * on its next CPU; changing the cpu here will affect the requeue + * and the next release + */ + if (unlikely(next_cpu_for_job(t) != migrat_cur_cpu(t))) { + + tsk_rt(t)->task_params.cpu = migrat_next_cpu(t); + TRACE_TASK(t, "EDF-fm: will migrate job %d -> %d\n", + task_cpu(t), tsk_rt(t)->task_params.cpu); + return; + } + + TRACE_TASK(t, "EDF-fm: job will stay on %d -> %d\n", + task_cpu(t), tsk_rt(t)->task_params.cpu); +} + +static void job_completion(struct task_struct* t, int forced) +{ + sched_trace_task_completion(t,forced); + TRACE_TASK(t, "job_completion().\n"); + + if (unlikely(is_migrat_task(t))) { + update_job_counter(t); + change_migrat_cpu_if_needed(t); + } + + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static void edffm_tick(struct task_struct *t) +{ + edffm_domain_t *edffm = local_edffm; + + BUG_ON(is_realtime(t) && t != edffm->scheduled); + + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + set_tsk_need_resched(t); + TRACE("edffm_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } +} + +static struct task_struct* edffm_schedule(struct task_struct * prev) +{ + edffm_domain_t* edffm = local_edffm; + rt_domain_t* edf = &edffm->domain; + struct task_struct* next; + + int out_of_time, sleep, preempt, exists, blocks, change_cpu, resched; + + raw_spin_lock(&edffm->slock); + + BUG_ON(edffm->scheduled && edffm->scheduled != prev); + BUG_ON(edffm->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = edffm->scheduled != NULL; + blocks = exists && !is_running(edffm->scheduled); + out_of_time = exists && + budget_enforced(edffm->scheduled) && + budget_exhausted(edffm->scheduled); + sleep = exists && get_rt_flags(edffm->scheduled) == RT_F_SLEEP; + change_cpu = exists && wrong_cpu(edffm->scheduled); + preempt = edffm_preemption_needed(edf, prev); + + BUG_ON(blocks && change_cpu); + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d sleep:%d preempt:%d " + "wrong_cpu:%d state:%d sig:%d\n", + blocks, out_of_time, sleep, preempt, + change_cpu, prev->state, signal_pending(prev)); + + /* If we need to preempt do so. */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. 
*/ + if (blocks) + resched = 1; + + /* If a task has just woken up, it was tardy and the wake up + * raced with this schedule, a new job has already been released, + * but scheduled should be enqueued on a remote ready queue, and a + * new task should be selected for the current queue. + */ + if (change_cpu) + resched = 1; + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if ((out_of_time || sleep) && !blocks) { + job_completion(edffm->scheduled, !sleep); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + next = NULL; + if (resched || !exists) { + + if (edffm->scheduled && !blocks) + requeue(edffm->scheduled, edf); + next = __take_ready(edf); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + if (next) { + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + set_rt_flags(next, RT_F_RUNNING); + } else { + TRACE("becoming idle at %llu\n", litmus_clock()); + } + + edffm->scheduled = next; + raw_spin_unlock(&edffm->slock); + + return next; +} + +/* Prepare a task for running in RT mode + */ +static void edffm_task_new(struct task_struct * t, int on_rq, int running) +{ + rt_domain_t* edf = task_edf(t); + edffm_domain_t* edffm = task_edffm(t); + unsigned long flags; + + TRACE_TASK(t, "EDF-fm: task new, cpu = %d\n", + t->rt_param.task_params.cpu); + + release_at(t, litmus_clock()); + update_job_counter(t); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + raw_spin_lock_irqsave(&edffm->slock, flags); + if (running) { + /* there shouldn't be anything else running at the time */ + BUG_ON(edffm->scheduled); + edffm->scheduled = t; + } else { + requeue(t, edf); + /* maybe we have to reschedule */ + preempt(edffm); + } + raw_spin_unlock_irqrestore(&edffm->slock, flags); +} + +static void edffm_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + edffm_domain_t* edffm = task_edffm(task); + rt_domain_t* edf = task_edf(task); + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + TRACE_TASK(task, "acquire edffm %d\n", edffm->cpu); + raw_spin_lock_irqsave(&edffm->slock, flags); + + BUG_ON(edffm != task_edffm(task)); + BUG_ON(is_queued(task)); + + now = litmus_clock(); + if (is_tardy(task, now)) { + if (unlikely(is_migrat_task(task))) { + /* a new job will be released. + * Update current job counter */ + update_job_counter(task); + /* Switch CPU if needed */ + change_migrat_cpu_if_needed(task); + } + /* new sporadic release */ + TRACE_TASK(task, "release new\n"); + release_at(task, now); + sched_trace_task_release(task); + } + + /* Only add to ready queue if it is not the currently-scheduled + * task. This could be the case if a task was woken up concurrently + * on a remote CPU before the executing CPU got around to actually + * de-scheduling the task, i.e., wake_up() raced with schedule() + * and won. 
+ */ + if (edffm->scheduled != task) + requeue(task, edf); + + raw_spin_unlock_irqrestore(&edffm->slock, flags); + TRACE_TASK(task, "release edffm %d\n", edffm->cpu); + TRACE_TASK(task, "wake up done\n"); +} + +static void edffm_task_block(struct task_struct *t) +{ + TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state); + + BUG_ON(!is_realtime(t)); + if (is_queued(t)) { + edffm_domain_t *edffm = local_edffm; + TRACE_TASK(t, "task blocked, race with wakeup, " + "remove from queue %d\n", edffm->cpu); + remove(&edffm->domain, t); + } +} + +static void edffm_task_exit(struct task_struct * t) +{ + unsigned long flags; + edffm_domain_t* edffm = task_edffm(t); + rt_domain_t* edf; + + raw_spin_lock_irqsave(&edffm->slock, flags); + if (is_queued(t)) { + /* dequeue */ + edf = task_edf(t); + remove(edf, t); + } + if (edffm->scheduled == t) + edffm->scheduled = NULL; + + TRACE_TASK(t, "RIP\n"); + + preempt(edffm); + raw_spin_unlock_irqrestore(&edffm->slock, flags); +} + +static long edffm_admit_task(struct task_struct* tsk) +{ + return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; +} + +/* Plugin object */ +static struct sched_plugin edffm_plugin __cacheline_aligned_in_smp = { + .plugin_name = "EDF-fm", + .tick = edffm_tick, + .task_new = edffm_task_new, + .complete_job = complete_job, + .task_exit = edffm_task_exit, + .schedule = edffm_schedule, + .task_wake_up = edffm_task_wake_up, + .task_block = edffm_task_block, + .admit_task = edffm_admit_task +}; + +static int __init init_edffm(void) +{ + int i; + edffm_domain_t *edffm; + + /* Note, broken if num_online_cpus() may change */ + for (i = 0; i < num_online_cpus(); i++) { + edffm = remote_edffm(i); + edffm->cpu = i; + edffm->scheduled = NULL; + edf_domain_init(&edffm->domain, NULL, edffm_release_jobs); + } + + return register_sched_plugin(&edffm_plugin); +} + +module_init(init_edffm); + diff --git a/litmus/sched_edf_wm.c b/litmus/sched_edf_wm.c new file mode 100644 index 0000000..8b7be32 --- /dev/null +++ b/litmus/sched_edf_wm.c @@ -0,0 +1,688 @@ +/* EDF-WM: based on PSN-EDF. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +typedef struct { + rt_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ + +/* + * scheduling lock slock + * protects the domain and serializes scheduling decisions + */ +#define slock domain.ready_lock + +} wm_domain_t; + +DEFINE_PER_CPU(wm_domain_t, wm_domains); + +#define TRACE_DOM(dom, fmt, args...) \ + TRACE("(wm_domains[%d]) " fmt, (dom)->cpu, ##args) + + +#define local_domain (&__get_cpu_var(wm_domains)) +#define remote_domain(cpu) (&per_cpu(wm_domains, cpu)) +#define domain_of_task(task) (remote_domain(get_partition(task))) + +static int is_sliced_task(struct task_struct* t) +{ + return tsk_rt(t)->task_params.semi_part.wm.count; +} + +static struct edf_wm_slice* get_last_slice(struct task_struct* t) +{ + int idx = tsk_rt(t)->task_params.semi_part.wm.count - 1; + return tsk_rt(t)->task_params.semi_part.wm.slices + idx; +} + +static void compute_slice_params(struct task_struct* t) +{ + struct rt_param* p = tsk_rt(t); + /* Here we do a little trick to make the generic EDF code + * play well with job slices. We overwrite the job-level + * release and deadline fields with the slice-specific values + * so that we can enqueue this task in an EDF rt_domain_t + * without issue. The actual values are cached in the semi_part.wm + * structure. 
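+	 *
+	 * For instance (hypothetical numbers): with job_release = 100 ms,
+	 * slice->offset = 4 ms and slice->deadline = 9 ms, the task is
+	 * enqueued with an effective release of 104 ms and an effective
+	 * deadline of 109 ms.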
*/ + p->job_params.deadline = p->semi_part.wm.job_release + + p->semi_part.wm.slice->deadline; + p->job_params.release = p->semi_part.wm.job_release + + p->semi_part.wm.slice->offset; + + /* Similarly, we play a trick on the cpu field. */ + p->task_params.cpu = p->semi_part.wm.slice->cpu; + + /* update the per-slice budget reference */ + p->semi_part.wm.exec_time = p->job_params.exec_time; +} + +static void complete_sliced_job(struct task_struct* t) +{ + struct rt_param* p = tsk_rt(t); + + /* We need to undo our trickery to the + * job parameters (see above). */ + p->job_params.release = p->semi_part.wm.job_release; + p->job_params.deadline = p->semi_part.wm.job_deadline; + + /* Ok, now let generic code do the actual work. */ + prepare_for_next_period(t); + + /* And finally cache the updated parameters. */ + p->semi_part.wm.job_release = p->job_params.release; + p->semi_part.wm.job_deadline = p->job_params.deadline; +} + +static lt_t slice_exec_time(struct task_struct* t) +{ + struct rt_param* p = tsk_rt(t); + + /* Compute how much execution time has been consumed + * since last slice advancement. */ + return p->job_params.exec_time - p->semi_part.wm.exec_time; +} + +static lt_t slice_budget(struct task_struct* t) +{ + return tsk_rt(t)->semi_part.wm.slice->budget; +} + +static int slice_budget_exhausted(struct task_struct* t) +{ + return slice_exec_time(t) >= slice_budget(t); +} + +/* assumes positive remainder; overflows otherwise */ +static lt_t slice_budget_remaining(struct task_struct* t) +{ + return slice_budget(t) - slice_exec_time(t); +} + +static int wm_budget_exhausted(struct task_struct* t) +{ + if (is_sliced_task(t)) + return slice_budget_exhausted(t); + else + return budget_exhausted(t); +} + +static void advance_next_slice(struct task_struct* t, int completion_signaled) +{ + int idx; + struct rt_param* p = tsk_rt(t); + + /* make sure this is actually a sliced job */ + BUG_ON(!is_sliced_task(t)); + BUG_ON(is_queued(t)); + + /* determine index of current slice */ + idx = p->semi_part.wm.slice - + p->task_params.semi_part.wm.slices; + + TRACE_TASK(t, "advancing slice %d; excess=%lluns; " + "completion_signaled=%d.\n", + idx, slice_exec_time(t) - slice_budget(t), + completion_signaled); + + if (completion_signaled) + idx = 0; + else + /* increment and wrap around, if necessary */ + idx = (idx + 1) % p->task_params.semi_part.wm.count; + + /* point to next slice */ + p->semi_part.wm.slice = + p->task_params.semi_part.wm.slices + idx; + + /* Check if we need to update essential job parameters. */ + if (!idx) { + /* job completion */ + sched_trace_task_completion(t, !completion_signaled); + TRACE_TASK(t, "completed sliced job" + "(signaled:%d)\n", completion_signaled); + complete_sliced_job(t); + } + + /* Update job parameters for new slice. */ + compute_slice_params(t); +} + +/* assumes time_passed does not advance past the last slice */ +static void fast_forward_slices(struct task_struct* t, lt_t time_passed) +{ + TRACE_TASK(t, "fast forwarding %lluns\n", time_passed); + + /* this is NOT the slice version */ + BUG_ON(budget_remaining(t) <= time_passed); + + if (wm_budget_exhausted(t)) { + /* This can happen if a suspension raced + * with a normal slice advancement. wm_schedule() + * does not process out_of_time when a task blocks. 
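+		 * Advancing one slice here keeps slice_budget_remaining() in
+		 * the loop below well-defined: it assumes a positive remainder
+		 * and would underflow on an already-exhausted slice.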
*/ + TRACE_TASK(t, "block raced with out_of_time?\n"); + advance_next_slice(t, 0); + } + + while (time_passed && + time_passed >= slice_budget_remaining(t)) { + /* slice completely exhausted */ + time_passed -= slice_budget_remaining(t); + tsk_rt(t)->job_params.exec_time += + slice_budget_remaining(t); + + BUG_ON(!slice_budget_exhausted(t)); + BUG_ON(slice_budget_remaining(t) != 0); + BUG_ON(tsk_rt(t)->semi_part.wm.slice == get_last_slice(t)); + + advance_next_slice(t, 0); + } + /* add remainder to exec cost */ + tsk_rt(t)->job_params.exec_time += time_passed; +} + +/* we assume the lock is being held */ +static void preempt(wm_domain_t *dom) +{ + TRACE_DOM(dom, "will be preempted.\n"); + /* We pass NULL as the task since non-preemptive sections are not + * supported in this plugin, so per-task checks are not needed. */ + preempt_if_preemptable(NULL, dom->cpu); +} + +static void wm_domain_init(wm_domain_t* dom, + check_resched_needed_t check, + release_jobs_t release, + int cpu) +{ + edf_domain_init(&dom->domain, check, release); + dom->cpu = cpu; + dom->scheduled = NULL; +} + +static void wm_requeue_remote(struct task_struct *t) +{ + wm_domain_t *dom = domain_of_task(t); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + /* acquires necessary lock */ + add_ready(&dom->domain, t); + else + /* force timer on remote CPU */ + add_release_on(&dom->domain, t, get_partition(t)); +} + +static void wm_requeue_local(struct task_struct* t, rt_domain_t *edf) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + __add_ready(edf, t); + else + add_release(edf, t); /* it has got to wait */ +} + +static int wm_check_resched(rt_domain_t *edf) +{ + wm_domain_t *dom = container_of(edf, wm_domain_t, domain); + + /* because this is a callback from rt_domain_t we already hold + * the necessary lock for the ready queue + */ + if (edf_preemption_needed(edf, dom->scheduled)) { + preempt(dom); + return 1; + } else + return 0; +} + +static void regular_job_completion(struct task_struct* t, int forced) +{ + sched_trace_task_completion(t, forced); + TRACE_TASK(t, "job_completion().\n"); + + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static void wm_job_or_slice_completion(struct task_struct* t, + int completion_signaled) +{ + if (is_sliced_task(t)) + advance_next_slice(t, completion_signaled); + else + regular_job_completion(t, !completion_signaled); +} + +static void wm_tick(struct task_struct *t) +{ + wm_domain_t *dom = local_domain; + + /* Check for inconsistency. We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != dom->scheduled); + + if (is_realtime(t) && budget_enforced(t) && wm_budget_exhausted(t)) { + set_tsk_need_resched(t); + TRACE_DOM(dom, "budget of %d exhausted in tick\n", + t->pid); + } +} + +static struct task_struct* wm_schedule(struct task_struct * prev) +{ + wm_domain_t *dom = local_domain; + rt_domain_t *edf = &dom->domain; + struct task_struct *next, *migrate = NULL; + + int out_of_time, sleep, preempt, wrong_cpu, exists, blocks, resched; + + raw_spin_lock(&dom->slock); + + /* Sanity checking: + * When a task exits (dead) dom->schedule may be null + * and prev _is_ realtime. 
*/ + BUG_ON(dom->scheduled && dom->scheduled != prev); + BUG_ON(dom->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = dom->scheduled != NULL; + wrong_cpu = exists && get_partition(dom->scheduled) != dom->cpu; + blocks = exists && !is_running(dom->scheduled); + out_of_time = exists + && budget_enforced(dom->scheduled) + && wm_budget_exhausted(dom->scheduled); + sleep = exists && get_rt_flags(dom->scheduled) == RT_F_SLEEP; + preempt = edf_preemption_needed(edf, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d sleep:%d preempt:%d " + "wrong_cpu:%d state:%d sig:%d\n", + blocks, out_of_time, sleep, preempt, wrong_cpu, + prev->state, signal_pending(prev)); + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* This can happen if sliced task was moved to the next slice + * by the wake_up() code path while still being scheduled. + */ + if (wrong_cpu) + resched = 1; + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if ((out_of_time || sleep) && !blocks) { + wm_job_or_slice_completion(dom->scheduled, sleep); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + next = NULL; + if (resched || !exists) { + if (dom->scheduled && !blocks) { + if (get_partition(dom->scheduled) == dom->cpu) + /* local task */ + wm_requeue_local(dom->scheduled, edf); + else + /* not local anymore; wait until we drop the + * ready queue lock */ + migrate = dom->scheduled; + } + next = __take_ready(edf); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. */ + if (exists) + next = prev; + + if (next) { + TRACE_TASK(next, "scheduled at %llu (state:%d/%d)\n", litmus_clock(), + next->state, is_running(next)); + set_rt_flags(next, RT_F_RUNNING); + } else if (exists) { + TRACE("becoming idle at %llu\n", litmus_clock()); + } + + dom->scheduled = next; + raw_spin_unlock(&dom->slock); + + /* check if we need to push the previous task onto another queue */ + if (migrate) { + TRACE_TASK(migrate, "schedule-initiated migration to %d\n", + get_partition(migrate)); + wm_requeue_remote(migrate); + } + + return next; +} + + +/* Prepare a task for running in RT mode + */ +static void wm_task_new(struct task_struct * t, int on_rq, int running) +{ + wm_domain_t* dom = domain_of_task(t); + rt_domain_t* edf = &dom->domain; + unsigned long flags; + + TRACE_TASK(t, "edf-wm: task new, cpu = %d\n", + t->rt_param.task_params.cpu); + + /* setup job parameters */ + release_at(t, litmus_clock()); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. 
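+	 *
+	 * For sliced tasks, the block below additionally resets the per-job
+	 * slice state so that the job starts in its first slice with both
+	 * exec_time and the slice-budget reference at zero.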
+ */ + raw_spin_lock_irqsave(&dom->slock, flags); + + if (is_sliced_task(t)) { + /* make sure parameters are initialized consistently */ + tsk_rt(t)->semi_part.wm.exec_time = 0; + tsk_rt(t)->semi_part.wm.job_release = get_release(t); + tsk_rt(t)->semi_part.wm.job_deadline = get_deadline(t); + tsk_rt(t)->semi_part.wm.slice = tsk_rt(t)->task_params.semi_part.wm.slices; + tsk_rt(t)->job_params.exec_time = 0; + } + + if (running) { + /* there shouldn't be anything else running at the time */ + BUG_ON(dom->scheduled); + dom->scheduled = t; + } else { + wm_requeue_local(t, edf); + /* maybe we have to reschedule */ + preempt(dom); + } + raw_spin_unlock_irqrestore(&dom->slock, flags); +} + +static void wm_release_at(struct task_struct *t, lt_t start) +{ + struct rt_param* p = tsk_rt(t); + + if (is_sliced_task(t)) { + /* simulate wrapping to the first slice */ + p->semi_part.wm.job_deadline = start; + p->semi_part.wm.slice = get_last_slice(t); + /* FIXME: creates bogus completion event... */ + advance_next_slice(t, 0); + set_rt_flags(t, RT_F_RUNNING); + } else + /* generic code handles it */ + release_at(t, start); +} + +static lt_t wm_earliest_release(struct task_struct *t, lt_t now) +{ + lt_t deadline; + if (is_sliced_task(t)) + deadline = tsk_rt(t)->semi_part.wm.job_deadline; + else + deadline = get_deadline(t); + if (lt_before(deadline, now)) + return now; + else + return deadline; +} + +static void wm_task_wake_up(struct task_struct *t) +{ + unsigned long flags; + wm_domain_t* dom = domain_of_task(t); + rt_domain_t* edf = &dom->domain; + struct rt_param* p = tsk_rt(t); + lt_t now, sleep_time; + int migrate = 0; + + raw_spin_lock_irqsave(&dom->slock, flags); + BUG_ON(is_queued(t)); + + now = litmus_clock(); + + sleep_time = now - p->semi_part.wm.suspend_time; + + TRACE_TASK(t, "wake_up at %llu after %llu, still-scheduled:%d\n", + now, sleep_time, dom->scheduled == t); + + /* account sleep time as execution time */ + if (get_exec_time(t) + sleep_time >= get_exec_cost(t)) { + /* new sporadic release */ + TRACE_TASK(t, "new sporadic release\n"); + wm_release_at(t, wm_earliest_release(t, now)); + sched_trace_task_release(t); + } else if (is_sliced_task(t)) { + /* figure out which slice we should be executing on */ + fast_forward_slices(t, sleep_time); + /* can't be exhausted now */ + BUG_ON(wm_budget_exhausted(t)); + } else { + /* simply add to the execution time */ + tsk_rt(t)->job_params.exec_time += sleep_time; + } + + + /* Only add to ready queue if it is not the currently-scheduled + * task. This could be the case if a task was woken up concurrently + * on a remote CPU before the executing CPU got around to actually + * de-scheduling the task, i.e., wake_up() raced with schedule() + * and won. 
+ */ + if (dom->scheduled != t) { + if (get_partition(t) == dom->cpu) + wm_requeue_local(t, edf); + else + /* post-pone migration until after unlocking */ + migrate = 1; + } + + raw_spin_unlock_irqrestore(&dom->slock, flags); + + if (migrate) { + TRACE_TASK(t, "wake_up-initiated migration to %d\n", + get_partition(t)); + wm_requeue_remote(t); + } + + TRACE_TASK(t, "wake up done\n"); +} + +static void wm_task_block(struct task_struct *t) +{ + wm_domain_t* dom = domain_of_task(t); + unsigned long flags; + lt_t now = litmus_clock(); + + TRACE_TASK(t, "block at %llu, state=%d\n", now, t->state); + + tsk_rt(t)->semi_part.wm.suspend_time = now; + + raw_spin_lock_irqsave(&dom->slock, flags); + if (is_queued(t)) { + TRACE_TASK(t, "still queued; migration invariant failed?\n"); + remove(&dom->domain, t); + } + raw_spin_unlock_irqrestore(&dom->slock, flags); + + BUG_ON(!is_realtime(t)); +} + +static void wm_task_exit(struct task_struct * t) +{ + unsigned long flags; + wm_domain_t* dom = domain_of_task(t); + rt_domain_t* edf = &dom->domain; + + raw_spin_lock_irqsave(&dom->slock, flags); + if (is_queued(t)) { + /* dequeue */ + remove(edf, t); + } + if (dom->scheduled == t) + dom->scheduled = NULL; + + TRACE_TASK(t, "RIP, now reschedule\n"); + + preempt(dom); + raw_spin_unlock_irqrestore(&dom->slock, flags); +} + +static long wm_check_params(struct task_struct *t) +{ + struct rt_param* p = tsk_rt(t); + struct edf_wm_params* wm = &p->task_params.semi_part.wm; + int i; + lt_t tmp; + + if (!is_sliced_task(t)) { + /* regular task; nothing to check */ + TRACE_TASK(t, "accepted regular (non-sliced) task with " + "%d slices\n", + wm->count); + return 0; + } + + /* (1) Either not sliced, or more than 1 slice. */ + if (wm->count == 1 || wm->count > MAX_EDF_WM_SLICES) { + TRACE_TASK(t, "bad number of slices (%u) \n", + wm->count); + return -EINVAL; + } + + /* (2) The partition has to agree with the first slice. */ + if (get_partition(t) != wm->slices[0].cpu) { + TRACE_TASK(t, "partition and first slice CPU differ " + "(%d != %d)\n", get_partition(t), wm->slices[0].cpu); + return -EINVAL; + } + + /* (3) The total budget must agree. */ + for (i = 0, tmp = 0; i < wm->count; i++) + tmp += wm->slices[i].budget; + if (get_exec_cost(t) != tmp) { + TRACE_TASK(t, "total budget and sum of slice budgets differ\n"); + return -EINVAL; + } + + /* (4) The release of each slice must not precede the previous + * deadline. */ + for (i = 0; i < wm->count - 1; i++) + if (wm->slices[i].deadline > wm->slices[i + 1].offset) { + TRACE_TASK(t, "slice %d overlaps with slice %d\n", + i, i + 1); + return -EINVAL; + } + + /* (5) The budget of each slice must fit within [offset, deadline] */ + for (i = 0; i < wm->count; i++) + if (lt_before(wm->slices[i].deadline, wm->slices[i].offset) || + wm->slices[i].deadline - wm->slices[i].offset < + wm->slices[i].budget) { + TRACE_TASK(t, "slice %d is overloaded\n", i); + return -EINVAL; + } + + /* (6) The budget of each slice must exceed the minimum budget size. */ + for (i = 0; i < wm->count; i++) + if (wm->slices[i].budget < MIN_EDF_WM_SLICE_SIZE) { + TRACE_TASK(t, "slice %d is too short\n", i); + return -EINVAL; + } + + /* (7) The CPU of each slice must be different from the previous CPU. */ + for (i = 0; i < wm->count - 1; i++) + if (wm->slices[i].cpu == wm->slices[i + 1].cpu) { + TRACE_TASK(t, "slice %d does not migrate\n", i); + return -EINVAL; + } + + /* (8) The CPU of each slice must be online. 
*/ + for (i = 0; i < wm->count; i++) + if (!cpu_online(wm->slices[i].cpu)) { + TRACE_TASK(t, "slice %d is allocated on offline CPU\n", + i); + return -EINVAL; + } + + /* (9) A sliced task's budget must be precisely enforced. */ + if (!budget_precisely_enforced(t)) { + TRACE_TASK(t, "budget is not precisely enforced " + "(policy: %d).\n", + tsk_rt(t)->task_params.budget_policy); + return -EINVAL; + } + + TRACE_TASK(t, "accepted sliced task with %d slices\n", + wm->count); + + return 0; +} + +static long wm_admit_task(struct task_struct* t) +{ + return task_cpu(t) == get_partition(t) ? wm_check_params(t) : -EINVAL; +} + +/* Plugin object */ +static struct sched_plugin edf_wm_plugin __cacheline_aligned_in_smp = { + .plugin_name = "EDF-WM", + .tick = wm_tick, + .task_new = wm_task_new, + .complete_job = complete_job, + .task_exit = wm_task_exit, + .schedule = wm_schedule, + .release_at = wm_release_at, + .task_wake_up = wm_task_wake_up, + .task_block = wm_task_block, + .admit_task = wm_admit_task +}; + + +static int __init init_edf_wm(void) +{ + int i; + + /* FIXME: breaks with CPU hotplug + */ + for (i = 0; i < num_online_cpus(); i++) { + wm_domain_init(remote_domain(i), + wm_check_resched, + NULL, i); + } + return register_sched_plugin(&edf_wm_plugin); +} + +module_init(init_edf_wm); + diff --git a/litmus/sched_npsf.c b/litmus/sched_npsf.c new file mode 100644 index 0000000..aad99c7 --- /dev/null +++ b/litmus/sched_npsf.c @@ -0,0 +1,1185 @@ +/* + * litmus/sched_npsf.c + * + * Implementation of the NPS-F scheduling algorithm. + * + * A _server_ may span on multiple _reserves_ on different CPUs. + * + * * 1 + * +--------------+ +--> +--------------+ +--> +--------------+ + * | cpu_entry_t | | | npsf_reserve | | | npsf_server | + * +--------------+ | +--------------+ | +--------------+ + * | |1 | | |1 | | | + * | cpu_reserve |--+ 1| server |--+ 1| | + * | | +---| cpu | +---| curr_reserve | + * +--------------+ <-+ +--------------+ <-+ +--------------+ + * 1 * + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/* Be extra verbose (log spam) */ +#define NPSF_VERBOSE + +#ifdef NPSF_VERBOSE +#define npsf_printk(fmt, arg...) printk(KERN_INFO fmt, ##arg) +#else +#define npsf_printk(fmt, arg...) +#endif + +struct npsf_reserve; + +/* cpu_entry_t + * + * Each cpu has a list of reserves assigned on the cpu. + * Each reserve has a pointer to its server (Notional processor) + * that may be shared among multiple reserves. + */ +typedef struct { + /* lock to protect cpu_reserve and list changes */ + raw_spinlock_t cpu_res_lock; + /* the reserve currently executing on this cpu */ + struct npsf_reserve *cpu_reserve; + /* list of reserves on this cpu */ + struct list_head npsf_reserves; + /* cpu ID */ + int cpu; + /* timer to control reserve switching */ + struct hrtimer timer; + /* virtual timer expiring (wrt time_origin) */ + lt_t should_expire; + /* delegate timer firing to proper cpu */ + struct hrtimer_start_on_info info; + /* FIXME: the ids for servers should be an increasing int >=0 */ + int last_seen_npsf_id; +} cpu_entry_t; + +/* one cpu_entry_t per CPU */ +DEFINE_PER_CPU(cpu_entry_t, npsf_cpu_entries); + +/* This is the "notional processor" (i.e., simple server) abstraction. 
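+ *
+ * Note that the per-server 'lock' below is an alias for the ready_lock of
+ * the embedded rt_domain_t, so holding it serializes ready-queue updates
+ * as well as changes to highest_prio and curr_reserve.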
*/ +typedef struct npsf_server { + /* shared among reserves */ + rt_domain_t dom; + /* the real-time task that this server *SHOULD* be scheduling */ + struct task_struct *highest_prio; + /* current reserve where this dom is executing */ + struct npsf_reserve *curr_reserve; + /* The "first" reserve for this server in a time slot. + * For non-migrating servers this will always be the same as curr_reserve. */ + struct npsf_reserve *first_reserve; + /* Prevent a race between the last CPU in a reserve chain an the first. */ + int first_cpu_wants_ipi; + /* rt_domain_t lock + npsf_server_t lock */ +#define lock dom.ready_lock +} npsf_server_t; + +typedef struct npsf_reserve { + /* Pointer to the server for this reserve: a server may be shared among + * multiple cpus with different budget per cpu, but same npsf_id. */ + npsf_server_t *server; + /* we queue here in npsf_reserves */ + struct list_head node; + /* budget of this npsf_id on this cpu */ + lt_t budget; + /* cpu for this (portion of) server */ + cpu_entry_t *cpu; + /* id of this server, it is the same for the + * same server on different cpus */ + int npsf_id; + /* Can be used to identify if a reserve continues + * next npsf in the chain, needed for proper server deletion */ + struct npsf_reserve *next_npsf; + /* flag that is true if the reserve is currently scheduled */ + int is_currently_scheduled; +} npsf_reserve_t; + +/* synchronization point to start moving and switching servers only + * when all servers have been properly set up by the user. + */ +static atomic_t all_servers_added; +static atomic_t timers_activated = ATOMIC_INIT(0); + +/* Virtual time starts here */ +static lt_t time_origin; + +/* save number of online cpus seen at init time */ +static unsigned int _online_cpus = 1; + +#define no_reserves(entry) (list_empty(&((entry)->npsf_reserves))) +#define local_entry (&__get_cpu_var(npsf_cpu_entries)) +#define remote_entry(cpu) (&per_cpu(npsf_cpu_entries, (cpu))) + +#define server_from_dom(domain) (container_of((domain), npsf_server_t, dom)) + +/* task_entry uses get_partition() therefore we must take care of + * updating correclty the task_params.cpu whenever we switch task, + * otherwise we'll deadlock. + */ +#define task_entry(task) remote_entry(get_partition(task)) +#define domain_edf(npsf) (&((npsf)->server->dom)) + +#define task_npsfid(task) ((task)->rt_param.task_params.semi_part.npsf_id) + +static inline int owns_server(npsf_reserve_t *npsf) +{ + return (npsf->server->curr_reserve == npsf); +} + +/* utility functions to get next and prev domains; must hold entry lock */ +static inline npsf_reserve_t* local_next_reserve(npsf_reserve_t *curr, + cpu_entry_t *entry) +{ + return (list_is_last(&curr->node, &entry->npsf_reserves)) ? + list_entry(entry->npsf_reserves.next, npsf_reserve_t, node) : + list_entry(curr->node.next, npsf_reserve_t, node); + +} + +static inline npsf_reserve_t* local_prev_reserve(npsf_reserve_t *curr, + cpu_entry_t *entry) +{ + return ((curr->node.prev == &entry->npsf_reserves) ? 
+ list_entry(entry->npsf_reserves.prev, npsf_reserve_t, node) : + list_entry(curr->node.prev, npsf_reserve_t, node)); +} +static void requeue(struct task_struct* t, rt_domain_t *edf) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + BUG_ON(is_queued(t)); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + __add_ready(edf, t); + else + add_release(edf, t); /* it has got to wait */ +} + +/* we assume the lock is being held */ +static void preempt(npsf_reserve_t *npsf) +{ + /* Since we do not support non-preemptable sections, + * we don't need to pass in a task. If we call this, + * we want the remote CPU to reschedule, no matter what. + */ + preempt_if_preemptable(NULL, npsf->cpu->cpu); +} + + +static void npsf_preempt_if_server_is_scheduled(npsf_server_t* srv) +{ + npsf_reserve_t *reserve = srv->curr_reserve; + if (reserve->is_currently_scheduled) { + preempt(reserve); + } +} + +/* assumes lock is held by caller */ +static void npsf_reschedule_server(npsf_server_t* srv) +{ + struct task_struct* hp = srv->highest_prio; + rt_domain_t* edf = &srv->dom; + + if (edf_preemption_needed(edf, hp)) { + srv->highest_prio = __take_ready(edf); + if (hp) { + TRACE_TASK(hp, "requeue: no longer highest prio\n"); + requeue(hp, edf); + } + npsf_preempt_if_server_is_scheduled(srv); + } +} + +static void npsf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + npsf_server_t *srv = server_from_dom(rt); + unsigned long flags; + + raw_spin_lock_irqsave(&srv->lock, flags); + + __merge_ready(rt, tasks); + npsf_reschedule_server(srv); + + raw_spin_unlock_irqrestore(&srv->lock, flags); +} + +static void job_completion(struct task_struct* t, int forced) +{ + sched_trace_task_completion(t, forced); + TRACE_TASK(t, "job_completion().\n"); + + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +/* When did this slot start ? */ +static inline lt_t slot_begin(lt_t now) +{ + return (((now - time_origin) / npsf_slot_length) + * npsf_slot_length + time_origin); +} + +/* Compute the delta from the beginning of the current slot. */ +static inline lt_t delta_from_slot_begin(lt_t now) +{ + return (now - slot_begin(now)); +} + +/* Given an offset into a slot, return the corresponding eligible reserve. + * The output param reservation_end is used to return the (relative) time at which + * the returned reserve ends. + */ +static npsf_reserve_t* get_reserve_for_offset(cpu_entry_t *entry, lt_t offset, + lt_t *reservation_end) +{ + npsf_reserve_t *tmp; + + *reservation_end = 0; + + /* linear search through all reserves, figure out which one is the last one + * to become eligible before delta */ + list_for_each_entry(tmp, &entry->npsf_reserves, node) { + *reservation_end += tmp->budget; + + /* We are always "late". Found tmp is the right one */ + if ((*reservation_end > offset)) + return tmp; + } + + /* error: we should never fall of the reserve list */ + BUG(); + return NULL; +} + +/* Determine which reserve is eligible based on the current time. + */ +static npsf_reserve_t* get_current_reserve(cpu_entry_t *entry) +{ + lt_t reservation_end; + lt_t offset = delta_from_slot_begin(litmus_clock()); + return get_reserve_for_offset(entry, offset, &reservation_end); +} + +/* This is used to ensure that we are "always" late, i.e., to make + * sure that the timer jitter is always positive. This should + * only trigger in KVM (or in real machines with bad TSC drift after + * an IPI). + * + * ATM proper tracing for this event is done in reserve_switch_tick(). 
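+ *
+ * A hypothetical example: with a reserve boundary (should_expire) at
+ * 12 ms, a timer interrupt that arrives at 11.9 ms spins here for the
+ * remaining ~0.1 ms so that the next reserve never starts early.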
+ */ +static noinline ktime_t catchup_time(lt_t from, lt_t target) +{ + while(lt_before(from, target)) { + from = litmus_clock(); + + mb(); + cpu_relax(); + } + + return ns_to_ktime(from); +} + + +/* compute the next ABSOLUTE timer value */ +static lt_t get_next_reserve_switch_time(void) +{ + cpu_entry_t *entry = local_entry; + lt_t now = litmus_clock(); + lt_t slot_start = slot_begin(now); + lt_t offset = now - slot_start; + lt_t next_time; + npsf_reserve_t* reserve; + + /* compute the absolute litmus time of the next reserve switch */ + reserve = get_reserve_for_offset(entry, offset, &next_time); + /* get_reserve_for_offset returns a relative start time; let's make it + absolute */ + next_time += slot_start; + + /* Let's see if we need to skip the next timer. */ + reserve = local_next_reserve(reserve, entry); + /* if the next reserve is a continuing reserve + * (i.e., if it belongs to a migrating server), + * then we skip the timer event because we will + * receive an IPI from the previous processor instead. */ + if (reserve->server->first_reserve != reserve) { + /* it is indeed not the first reserve */ + next_time += reserve->budget; + } + + return next_time; +} + +/* This is the callback for reserve-switching interrupts. + * The timer is reprogrammed to expire at the beginning of every logical + * reserve (i.e., a continuing reserve may be split among different CPUs + * but is a _single_ logical reserve). get_next_reserve_switch_time() + * will return the right next_expire time. + */ +static enum hrtimer_restart reserve_switch_tick(struct hrtimer *timer) +{ + unsigned long flags; + cpu_entry_t *entry; + /* we are using CLOCK_MONOTONIC */ + ktime_t now = ktime_get(); + ktime_t delta; + int late; + + entry = container_of(timer, cpu_entry_t, timer); + raw_spin_lock_irqsave(&entry->cpu_res_lock, flags); + + /* jitter wrt virtual time */ + delta = ktime_sub(now, ns_to_ktime(entry->should_expire)); + late = (ktime_to_ns(delta) >= 0) ? 1 : 0; + +#ifdef NPSF_VERBOSE + if (entry->cpu_reserve && atomic_read(&all_servers_added)) + TRACE("(npsf_id: %d) tick starts at %Ld, " + "now - should_expire: %Ld\n", + entry->cpu_reserve->npsf_id, + ktime_to_ns(now), ktime_to_ns(delta)); +#endif + /* if the timer expires earlier than the should_expire time, + * we delay the switching until time it's synchronized with + * the switch boundary. Otherwise next reserve will execute + * longer (wrong). + */ + if (!late) { + TRACE("+++ Timer fired early, waiting...\n"); + now = catchup_time(ktime_to_ns(now), entry->should_expire); + + delta = ktime_sub(now, ns_to_ktime(entry->should_expire)); + TRACE("+++ done, tick restarts at %Ld, " + "now - should_expire: %Ld\n", + ktime_to_ns(now), ktime_to_ns(delta)); + } + + BUG_ON(!atomic_read(&all_servers_added)); + BUG_ON(no_reserves(entry)); + + /* Compute the next time that we need to be notified. */ + entry->should_expire = get_next_reserve_switch_time(); + + /* kindly ask the Penguin to let us know... 
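+	 * (i.e., re-arm this hrtimer for the next reserve boundary; together
+	 * with the HRTIMER_RESTART return value at the end of this function
+	 * it fires once per logical reserve).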
*/ + hrtimer_set_expires(timer, ns_to_ktime(entry->should_expire)); + + /* set resched flag to reschedule local cpu */ + set_need_resched(); + + raw_spin_unlock_irqrestore(&entry->cpu_res_lock, flags); +#ifdef NPSF_VERBOSE + if (atomic_read(&all_servers_added)) + TRACE("(npsf_id: %d) tick ends at %Ld, should_expire: %llu\n", + entry->cpu_reserve->npsf_id, ktime_to_ns(ktime_get()), + entry->should_expire); +#endif + + return HRTIMER_RESTART; +} + +static void npsf_scheduler_tick(struct task_struct *t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + set_tsk_need_resched(t); + TRACE("npsf_tick: %d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } +} + +/* Assumption: caller holds srv lock and prev belongs to + * the currently-scheduled reservation. + */ +static void npsf_schedule_server(struct task_struct* prev, + cpu_entry_t *entry) +{ + npsf_server_t* srv = entry->cpu_reserve->server; + + int out_of_time, sleep, exists, blocks; + + exists = is_realtime(prev); + blocks = exists && !is_running(prev); + out_of_time = exists && + budget_enforced(prev) && + budget_exhausted(prev); + sleep = exists && get_rt_flags(prev) == RT_F_SLEEP; + + if (exists) + TRACE_TASK(prev, "(npsf_id %d) blocks:%d " + "out_of_time:%d sleep:%d state:%d sig:%d\n", + task_npsfid(prev), + blocks, out_of_time, sleep, + prev->state, + signal_pending(prev)); + + /* Any task that is preemptable and either exhausts its + * execution budget or wants to sleep completes. We may have + * to reschedule after this. + */ + if ((out_of_time || sleep) && !blocks) { + job_completion(prev, !sleep); + + if (srv->highest_prio != prev) { + BUG_ON(!is_queued(prev)); + remove(&srv->dom, prev); + } + + requeue(prev, &srv->dom); + + if (srv->highest_prio == prev) + srv->highest_prio = __take_ready(&srv->dom); + } + + BUG_ON(blocks && prev == srv->highest_prio); +// BUG_ON(!srv->highest_prio && jobs_pending(&srv->dom)); +} + +static void npsf_notify_next_cpu(npsf_reserve_t *npsf_prev) +{ + npsf_server_t *srv; + + if (unlikely(npsf_prev->next_npsf != npsf_prev)) { + /* This reserve is actually shared. Let's update its 'owner' + * and notify the next CPU. */ + srv = npsf_prev->server; + raw_spin_lock(&srv->lock); + srv->curr_reserve = npsf_prev->next_npsf; + if (srv->first_reserve != srv->curr_reserve || + srv->first_cpu_wants_ipi) { + /* send an IPI to notify next CPU in chain */ + srv->first_cpu_wants_ipi = 0; + TRACE("sending IPI\n"); + preempt(srv->curr_reserve); + } + raw_spin_unlock(&srv->lock); + } +} + +static struct task_struct* npsf_schedule(struct task_struct * prev) +{ + npsf_reserve_t *npsf_prev, *npsf_next; + npsf_server_t *srv_prev, *srv_next; + cpu_entry_t *entry = local_entry; + struct task_struct *next; + + int reserve_switch; + + /* servers not ready yet, yield to linux */ + if (!atomic_read(&all_servers_added)) + return NULL; + +#ifdef NPSF_VERBOSE + TRACE_TASK(prev, "schedule\n"); +#endif + raw_spin_lock(&entry->cpu_res_lock); + + BUG_ON(no_reserves(entry)); + + /* step 1: what are we currently serving? */ + npsf_prev = entry->cpu_reserve; + srv_prev = npsf_prev->server; + + /* step 2: what SHOULD we be currently serving? */ + npsf_next = get_current_reserve(entry); + srv_next = npsf_next->server; + + /* TODO second measuring point for IPI receiving + * if (!srv_next->measure_wait_IPI) --- the remote reset + * trace_time_end. 
+	 */
+	raw_spin_lock(&srv_prev->lock);
+
+
+	/* step 3: update prev server */
+	if (is_realtime(prev) && task_npsfid(prev) == entry->cpu_reserve->npsf_id)
+		npsf_schedule_server(prev, entry);
+	else if (is_realtime(prev))
+		TRACE_TASK(prev, "npsf_id %d != cpu_reserve npsf_id %d\n",
+				task_npsfid(prev), entry->cpu_reserve->npsf_id);
+
+	/* step 4: determine if we need to switch to another reserve */
+	reserve_switch = npsf_prev != npsf_next;
+
+	if (!reserve_switch) {
+		/* easy case: just enact what the server scheduler decided */
+		next = srv_prev->highest_prio;
+
+		/* Unlock AFTER observing highest_prio to avoid races with
+		 * remote rescheduling activity. */
+		raw_spin_unlock(&srv_prev->lock);
+	} else {
+		/* In this case we have a reserve switch. We are done with the
+		 * previous server, so release its lock. */
+		TRACE("switch reserve npsf_id %d -> npsf_id %d\n",
+				npsf_prev->npsf_id, npsf_next->npsf_id);
+		npsf_prev->is_currently_scheduled = 0;
+		raw_spin_unlock(&srv_prev->lock);
+
+		/* Move on to the next server. */
+
+		raw_spin_lock(&srv_next->lock);
+		npsf_next->is_currently_scheduled = 1;
+
+		/* make sure we are the owner of the server (if it is shared) */
+		if (unlikely(srv_next->curr_reserve != npsf_next)) {
+			/* We raced with the previous owner. Let's schedule
+			 * the previous reserve for now. The previous owner
+			 * will send us an IPI when the server has been pushed
+			 * to us.
+			 */
+			TRACE("(npsf_id %d) raced with previous server owner\n",
+					npsf_next->npsf_id);
+
+			/* check if we are the first CPU, in which case we need
+			 * to request a notification explicitly */
+			if (srv_next->first_reserve == npsf_next)
+				srv_next->first_cpu_wants_ipi = 1;
+
+			npsf_next->is_currently_scheduled = 0;
+			raw_spin_unlock(&srv_next->lock);
+
+			/* just keep the previous reserve one more time */
+			raw_spin_lock(&srv_prev->lock);
+
+			npsf_prev->is_currently_scheduled = 1;
+			/* Note that there is no race condition here:
+			 * curr_reserve did not yet point to this reserve, so
+			 * no processor can have observed the flag we briefly
+			 * set in npsf_next. A processor might have observed
+			 * the flag being zero in npsf_prev and decided not to
+			 * send an IPI, which doesn't matter since we are going
+			 * to reschedule below anyway. */
+
+			next = srv_prev->highest_prio;
+
+			raw_spin_unlock(&srv_prev->lock);
+
+			/* TODO first measuring point for '0'-switching time
+			 * remote is not ready yet and will send us an IPI
+			 * when it's done.
+			 * local:
+			 *	srv_next->measure_wait_IPI = 1;
+			 * remote before sending IPI:
+			 *	if (srv_next->measure_wait_IPI) reset;
+			 */
+		} else {
+			/* invariant: srv->highest_prio is always the
+			 * highest-priority job in the server, and it is always
+			 * runnable. Any update to the server must maintain
+			 * this invariant. */
+			next = srv_next->highest_prio;
+
+			entry->cpu_reserve = npsf_next;
+			raw_spin_unlock(&srv_next->lock);
+
+			/* send an IPI (if necessary) */
+			npsf_notify_next_cpu(npsf_prev);
+		}
+
+	}
+
+	if (next) {
+		TRACE_TASK(next, "(npsf_id %d) scheduled at %llu\n",
+				task_npsfid(next), litmus_clock());
+		set_rt_flags(next, RT_F_RUNNING);
+		/* The TASK_RUNNING flag is set by the Penguin _way_ after
+		 * activating a task. This doesn't matter much to Linux, as
+		 * the rq lock will prevent any changes, but it matters to
+		 * us: a remote cpu waking up this task may requeue it before
+		 * it is marked runnable and send an IPI here; we then schedule
+		 * that task (still "not runnable"), and the running flag is
+		 * only set right before next really starts executing.
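+		 *
+		 * For illustration, one possible interleaving (the wake-up
+		 * path below is the stock kernel's try_to_wake_up()):
+		 *
+		 *	remote CPU                      this CPU
+		 *	----------                      --------
+		 *	try_to_wake_up(next)
+		 *	  -> plugin task_wake_up():
+		 *	     requeue + preemption IPI
+		 *	                                npsf_schedule() picks next
+		 *	                                is_running(next) == 0
+		 *	  next->state = TASK_RUNNING
+		 *	                                next starts executing
+		 *
+		 * The is_running() check below only logs this window.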
+ */ + if (!is_running(next)) + TRACE_TASK(next, "BAD: !TASK_RUNNING\n"); + } else { + /* FIXME npsf_id is wrong if reserve switch but "switching back" + * if we race */ + TRACE("(npsf_id %d) becoming idle at %llu\n", + reserve_switch ? npsf_next->npsf_id : npsf_prev->npsf_id, + litmus_clock()); + } + + raw_spin_unlock(&entry->cpu_res_lock); + + return next; +} + +/* Prepare a task for running in RT mode + * + * We can only be sure that the cpu is a right one (admit checks + * against tasks released on a cpu that doesn't host the right npsf_id) + * but we _cannot_ be sure that: + * 1) the found npsf is the reserve currently running on this cpu. + * 2) the current reserve (the one in charge of scheduling) is not + * running on a different cpu. + */ +static void npsf_task_new(struct task_struct * t, int on_rq, int running) +{ + npsf_reserve_t *npsf; + npsf_server_t *srv; + cpu_entry_t *entry = task_entry(t); + rt_domain_t *edf; + unsigned long flags; + + BUG_ON(no_reserves(entry)); + + /* search the proper npsf_server where to add the new task */ + list_for_each_entry(npsf, &entry->npsf_reserves, node) { + if (npsf->npsf_id == task_npsfid(t)) + break; + } + + + srv = npsf->server; + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + raw_spin_lock_irqsave(&entry->cpu_res_lock, flags); + raw_spin_lock(&srv->lock); + + edf = domain_edf(npsf); + tsk_rt(t)->domain = edf; + + TRACE_TASK(t, "task_new: P%d, task_npsfid %d, " + "npsf->npsf_id %d, entry->cpu %d\n", + t->rt_param.task_params.cpu, task_npsfid(t), + npsf->npsf_id, entry->cpu); + + /* setup job parameters */ + release_at(t, litmus_clock()); + + /* There are four basic scenarios that could happen: + * 1) the server is on another cpu and scheduled; + * 2) the server is on another cpu and not scheduled; + * 3) the server is on this cpu and scheduled; and + * 4) the server is on this cpu and not scheduled. + * + * Whatever scenario we're in, it cannot change while we are + * holding the server lock. + * + * If the new task does not have a high priority, then + * we can just queue it and be done. + * + * In theory, the requeue() and reschedule_server() code + * take care of all that. + */ + + requeue(t, edf); + /* reschedule will cause a remote preemption, if required */ + npsf_reschedule_server(srv); + /* always preempt to make sure we don't + * use the stack if it needs to migrate */ + set_tsk_need_resched(t); + + raw_spin_unlock(&srv->lock); + raw_spin_unlock_irqrestore(&entry->cpu_res_lock, flags); +} + +static void npsf_task_wake_up(struct task_struct *t) +{ + rt_domain_t *edf; + npsf_server_t* srv; + unsigned long flags; + lt_t now; + + BUG_ON(!is_realtime(t)); + + edf = tsk_rt(t)->domain; + srv = server_from_dom(edf); + + raw_spin_lock_irqsave(&srv->lock, flags); + + BUG_ON(is_queued(t)); + + now = litmus_clock(); + /* FIXME: this should be a configurable policy... */ + if (is_tardy(t, now)) { + /* new sporadic release */ + release_at(t, now); + sched_trace_task_release(t); + } + + /* Only add to ready queue if it is not the + * currently-scheduled task. 
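+	 *
+	 * srv->highest_prio plays roughly the role of a per-CPU 'scheduled'
+	 * pointer: the job it names was already removed from the ready queue
+	 * by __take_ready(). Requeueing t while it is still named there would
+	 * leave the same job both selected and queued, so in that case the
+	 * wake-up does nothing and npsf_schedule() simply keeps running it.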
+ */ + if (srv->highest_prio != t) { + requeue(t, edf); + npsf_reschedule_server(srv); + } +#ifdef NPSF_VERBOSE + else + TRACE_TASK(t, "wake_up, is curr_sched, not requeued\n"); +#endif + + raw_spin_unlock_irqrestore(&srv->lock, flags); + + TRACE_TASK(t, "wake up done\n"); +} + +static void remove_from_server(struct task_struct *t, npsf_server_t* srv) +{ + if (srv->highest_prio == t) { + TRACE_TASK(t, "remove from server: is highest-prio task\n"); + srv->highest_prio = NULL; + npsf_reschedule_server(srv); + } else if (is_queued(t)) { + TRACE_TASK(t, "remove from server: removed from queue\n"); + remove(&srv->dom, t); + } +#ifdef NPSF_VERBOSE + else + TRACE_TASK(t, "WARN: where is this task?\n"); +#endif +} + +static void npsf_task_block(struct task_struct *t) +{ + rt_domain_t *edf; + npsf_server_t* srv; + unsigned long flags; + + TRACE_TASK(t, "(npsf_id %d) block at %llu, state=%d\n", + task_npsfid(t), litmus_clock(), t->state); + + BUG_ON(!is_realtime(t)); + + edf = tsk_rt(t)->domain; + srv = server_from_dom(edf); + + raw_spin_lock_irqsave(&srv->lock, flags); + + remove_from_server(t, srv); + + raw_spin_unlock_irqrestore(&srv->lock, flags); +} + +static void npsf_task_exit(struct task_struct * t) +{ + rt_domain_t *edf; + npsf_server_t* srv; + unsigned long flags; + + BUG_ON(!is_realtime(t)); + + edf = tsk_rt(t)->domain; + srv = server_from_dom(edf); + + raw_spin_lock_irqsave(&srv->lock, flags); + + remove_from_server(t, srv); + + raw_spin_unlock_irqrestore(&srv->lock, flags); + + TRACE_TASK(t, "RIP, now reschedule\n"); +} + +static long npsf_admit_task(struct task_struct* tsk) +{ + npsf_reserve_t *npsf; + cpu_entry_t *entry = task_entry(tsk); + int id_ok = 0; + + if (!atomic_read(&all_servers_added)) { + printk(KERN_DEBUG "not all servers added\n"); + return -ENODEV; + } + + /* check to be on the right cpu and on the right server */ + if (task_cpu(tsk) != tsk->rt_param.task_params.cpu) { + printk(KERN_DEBUG "wrong CPU(%d, %d, %d) for npsf_id %d\n", + task_cpu(tsk), tsk->rt_param.task_params.cpu, + entry->cpu, task_npsfid(tsk)); + return -EINVAL; + } + + /* 1) this cpu should have the proper npsf_id in the list + * 2) the rt_domain for the proper npsf_id is not null + */ + list_for_each_entry(npsf, &entry->npsf_reserves, node) { + if (npsf->npsf_id == task_npsfid(tsk)) { + id_ok = 1; + break; + } + } + if (!id_ok) + printk(KERN_DEBUG "wrong npsf_id (%d) for entry %d\n", + task_npsfid(tsk), entry->cpu); + + return id_ok ? 0 : -EINVAL; +} + +/* in litmus.c */ +extern atomic_t rt_task_count; + +/* initialization status control */ +static int reserves_allocated = 0; + +#ifdef NPSF_VERBOSE +static void print_reserve(cpu_entry_t *cpu) +{ + npsf_reserve_t *tmp; + + printk(KERN_INFO "NPS-F: reserves on CPU %d:\n", cpu->cpu); + list_for_each_entry(tmp, &cpu->npsf_reserves, node) { + BUG_ON(!tmp->server); + BUG_ON(!&(tmp->server->dom)); + BUG_ON(tmp->server->highest_prio); + printk(KERN_INFO "%d: %d us\n", tmp->npsf_id, + (int)(tmp->budget / 1000)); + } +} +#endif +/* + * do_add_reserve: add a reserve(cpu, id, budget) + * + * Callback for syscall add_server(); it allows to add the reserve "id" + * to the CPU "cpu". "budget" is the length of the reserve for the + * notional processor (server) id on the cpu cpu. + */ +static long do_add_reserve(npsf_reserve_t **new, cpu_entry_t *cpu, + npsf_server_t *the_dom, int npsf_id, lt_t budget) +{ + unsigned long flags; + + /* npsf_id for each cpu should be given in increasing order, + * it doesn't make sense the same np on the same cpu. 
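+	 *
+	 * For example, assuming two CPUs and notional processors 0, 1, 2,
+	 * with server 1 split across both CPUs:
+	 *
+	 *	CPU0: add id 0, then id 1	-> accepted (ids strictly increase)
+	 *	CPU1: add id 1, then id 2	-> accepted (the check is per CPU)
+	 *	CPU0: add id 1 again		-> rejected with -EINVAL below
+	 *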
+	 * The last_seen_npsf_id is reset when the plugin is activated.
+	 */
+	if (cpu->last_seen_npsf_id >= npsf_id)
+		return -EINVAL;
+
+	/* don't allow server changes if there are tasks in the system */
+	if (atomic_read(&rt_task_count))
+		return -EACCES;
+
+	if ((*new = kmalloc(sizeof(npsf_reserve_t), GFP_ATOMIC)) == NULL)
+		return -ENOMEM;
+
+	(*new)->server = the_dom;
+	(*new)->npsf_id = npsf_id;
+	(*new)->budget = budget;
+	(*new)->cpu = cpu;
+
+	npsf_printk("Add npsf_id %d on P%d with budget %llu\n", (*new)->npsf_id,
+			(*new)->cpu->cpu, (*new)->budget);
+
+	raw_spin_lock_irqsave(&cpu->cpu_res_lock, flags);
+
+	list_add_tail(&(*new)->node, &cpu->npsf_reserves);
+	cpu->last_seen_npsf_id = npsf_id;
+	cpu->cpu_reserve = list_first_entry(&cpu->npsf_reserves, npsf_reserve_t, node);
+
+	raw_spin_unlock_irqrestore(&cpu->cpu_res_lock, flags);
+
+	return 0;
+}
+
+static void kickoff_timers(void)
+{
+	int cpu;
+	cpu_entry_t *entry;
+	lt_t kickoff;
+
+	kickoff = slot_begin(litmus_clock() + npsf_slot_length * 2);
+
+	for_each_online_cpu(cpu) {
+		entry = &per_cpu(npsf_cpu_entries, cpu);
+		hrtimer_start_on(cpu, &entry->info, &entry->timer,
+				ns_to_ktime(kickoff),
+				HRTIMER_MODE_ABS_PINNED);
+		entry->should_expire = kickoff;
+	}
+	atomic_set(&timers_activated, 1);
+}
+
+/* We offer the library a budgets array interface (so we go through the
+ * syscall path only once) and we internally cycle on do_add_reserve.
+ *
+ * last == 1 means that the user is adding the last server and after
+ * the insertion the plugin is properly set up. (FIXME it should be
+ * done in a better way, but I doubt this plugin will ever go
+ * to the master branch).
+ */
+asmlinkage long sys_add_server(int __user *__id,
+		struct npsf_budgets __user *__budgets, int last)
+{
+	int id, i;
+	int ret = -EFAULT;
+	struct npsf_budgets *budgets;
+	cpu_entry_t *entry;
+	npsf_server_t *npsfserver;
+	npsf_reserve_t *npsf_reserve_array[NR_CPUS];
+	npsf_reserve_t *first_reserve;
+
+	if (_online_cpus != num_online_cpus())
+		return ret;
+
+	if (copy_from_user(&id, __id, sizeof(id)))
+		return ret;
+
+	budgets = kmalloc(_online_cpus * sizeof(struct npsf_budgets),
+			GFP_ATOMIC);
+	if (!budgets)
+		return -ENOMEM;
+
+	for (i = 0; i < _online_cpus; i++) {
+		budgets[i].cpu = NO_CPU;
+		budgets[i].budget = 0;
+	}
+
+	if (copy_from_user(budgets, __budgets,
+				sizeof(struct npsf_budgets) * _online_cpus))
+		goto err;
+
+	/* initialize the npsf_server_t for this npsf_server series */
+	npsfserver = kmalloc(sizeof(npsf_server_t), GFP_ATOMIC);
+	if (!npsfserver) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	edf_domain_init(&npsfserver->dom, NULL, npsf_release_jobs);
+	npsfserver->highest_prio = NULL;
+
+	/* initialize all npsf_reserve_t for this server */
+	for (i = 0; budgets[i].cpu != NO_CPU && i < _online_cpus; i++) {
+		entry = &per_cpu(npsf_cpu_entries, budgets[i].cpu);
+		if ((ret = do_add_reserve(&npsf_reserve_array[i], entry,
+					npsfserver,
+					id, budgets[i].budget)) < 0)
+			goto err;
+	}
+	/* set the current reserve to the first (and possibly unique)
+	 * slice for this npsf_id */
+	npsfserver->curr_reserve = npsf_reserve_array[0];
+	npsfserver->first_reserve = npsf_reserve_array[0];
+	npsfserver->first_cpu_wants_ipi = 0;
+	for (i = 0; budgets[i].cpu != NO_CPU && i < _online_cpus; i++) {
+
+		if (i == 0 && budgets[i+1].cpu == NO_CPU) {
+			/* Fixed reserve always has itself as next */
+			npsf_reserve_array[i]->next_npsf = npsf_reserve_array[i];
+		} else if (((i+1) < _online_cpus) &&
+				(i > 0 && budgets[i+1].cpu == NO_CPU)) {
+			/* Last reserve in the chain has the first reserve as next */
+			npsf_reserve_array[i]->next_npsf = 
npsf_reserve_array[0]; + } else { + /* Normal continuing reserve */ + npsf_reserve_array[i]->next_npsf = npsf_reserve_array[i+1]; + } + } +#ifdef NPSF_VERBOSE + for (i = 0; budgets[i].cpu != NO_CPU && i < _online_cpus; i++) { + entry = &per_cpu(npsf_cpu_entries, budgets[i].cpu); + print_reserve(entry); + } +#endif + + if (last) { + /* force the first slot switching by setting the + * current_reserve to the last server for each cpu. + * + * FIXME:don't assume there exists at least one reserve per CPU + */ + for_each_online_cpu(i) { + entry = &per_cpu(npsf_cpu_entries, i); + first_reserve = list_entry(entry->npsf_reserves.next, + npsf_reserve_t, node); + + first_reserve->server->curr_reserve = first_reserve; + entry->cpu_reserve = first_reserve; + npsf_printk("npsf_id %d is the current reserve " + "and server on CPU %d\n", + first_reserve->npsf_id, entry->cpu); + + } + + kickoff_timers(); + + /* real plugin enable */ + atomic_set(&all_servers_added, 1); + mb(); + } + + /* at least one server was initialized and may need deletion */ + reserves_allocated = 1; +err: + kfree(budgets); + return ret; +} + + +/* Cancel server_reschedule_tick() hrtimers. Wait for all callbacks + * to complete. The function is triggered writing 0 as npsf_slot_length. + */ +void npsf_hrtimers_cleanup(void) +{ + int cpu; + cpu_entry_t *entry; + int redo; + + if (!atomic_read(&timers_activated)) + return; + + atomic_set(&timers_activated, 0); + + /* prevent the firing of the timer on this cpu */ + do { + redo = 0; + for_each_online_cpu(cpu) { + entry = &per_cpu(npsf_cpu_entries, cpu); + + /* if callback active, skip it for now and redo later */ + if (hrtimer_try_to_cancel(&entry->timer) == -1) { + redo = 1; +#ifdef NPSF_VERBOSE + printk(KERN_INFO "(P%d) hrtimer on P%d was " + "active, try to delete again\n", + get_cpu(), cpu); + put_cpu(); +#endif + } + } + } while (redo); + + printk(KERN_INFO "npsf hrtimers deleted\n"); +} + +static void cleanup_npsf(void) +{ + int cpu; + cpu_entry_t *entry; + struct list_head *nd, *next; + npsf_reserve_t *tmp, *tmp_save; + + for_each_online_cpu(cpu) { + entry = &per_cpu(npsf_cpu_entries, cpu); + + /* FIXME probably not needed as we should be the only cpu + * doing the removal */ + raw_spin_lock(&entry->cpu_res_lock); + + list_for_each_safe(nd, next, &entry->npsf_reserves) { + tmp = list_entry(nd, npsf_reserve_t, node); + npsf_printk("Del. (id, cpu):(%d, %d)\n", + tmp->npsf_id, + tmp->cpu->cpu); + if (tmp->server) { + npsf_printk("Del. reserves for npsf_id %d\n", + tmp->npsf_id); + tmp_save = tmp; + while (tmp_save->next_npsf && + tmp_save->next_npsf != tmp) { + tmp_save = tmp_save->next_npsf; + tmp_save->server = NULL; + } + npsf_printk("Freeing server 0x%p\n", tmp->server); + kfree(tmp->server); + } + npsf_printk("Freeing npsf_reserve_t 0x%p\n", tmp); + kfree(tmp); + } + list_del(&entry->npsf_reserves); + raw_spin_unlock(&entry->cpu_res_lock); + } +} + +/* prevent plugin deactivation if timers are still active */ +static long npsf_deactivate_plugin(void) +{ + return (atomic_read(&timers_activated)) ? 
-1 : 0; +} + +static long npsf_activate_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + ktime_t now = ktime_get(); + + /* prevent plugin switching if timers are active */ + if (atomic_read(&timers_activated)) + return -1; + + atomic_set(&all_servers_added, 0); + + /* de-allocate old servers (if any) */ + if (reserves_allocated) + cleanup_npsf(); + + _online_cpus = num_online_cpus(); + + for_each_online_cpu(cpu) { + entry = &per_cpu(npsf_cpu_entries, cpu); + + raw_spin_lock_init(&entry->cpu_res_lock); + + entry->cpu_reserve = NULL; + INIT_LIST_HEAD(&entry->npsf_reserves); + + entry->cpu = cpu; + hrtimer_init(&entry->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); + + /* initialize (reinitialize) pull timers */ + hrtimer_start_on_info_init(&entry->info); + + entry->timer.function = reserve_switch_tick; + entry->last_seen_npsf_id = -1; + } + + printk(KERN_INFO "NPS-F activated: slot length = %lld ns\n", + npsf_slot_length); + + /* time starts now! */ + time_origin = (lt_t) ktime_to_ns(now); + TRACE("Time_origin = %llu\n", time_origin); + return 0; +} + +/* Plugin object */ +static struct sched_plugin npsf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "NPS-F", + + .tick = npsf_scheduler_tick, + .task_new = npsf_task_new, + .complete_job = complete_job, + .task_exit = npsf_task_exit, + .schedule = npsf_schedule, + .task_wake_up = npsf_task_wake_up, + .task_block = npsf_task_block, + .admit_task = npsf_admit_task, + .activate_plugin = npsf_activate_plugin, + .deactivate_plugin = npsf_deactivate_plugin, +}; + +static int __init init_npsf(void) +{ + return register_sched_plugin(&npsf_plugin); +} + +static void __exit exit_npsf(void) +{ + if (atomic_read(&timers_activated)) { + atomic_set(&timers_activated, 0); + return; + } + + if (reserves_allocated) + cleanup_npsf(); +} + +module_init(init_npsf); +module_exit(exit_npsf); + diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c index 3543b7b..3036df9 100644 --- a/litmus/sched_plugin.c +++ b/litmus/sched_plugin.c @@ -179,6 +179,12 @@ struct sched_plugin linux_sched_plugin = { int cluster_cache_index = 2; /* + * Slot length (in ns) for NPS-F semi-partitioned plugin. + * This value can be changed at "runtime" through proc file. + */ +lt_t npsf_slot_length = 5 * NSEC_PER_MSEC; + +/* * The reference to current plugin that is used to schedule tasks within * the system. It stores references to actual function implementations * Should be initialized by calling "init_***_plugin()"
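
To make the budgets-array interface of sys_add_server() concrete, a minimal user-space sketch follows. It assumes a 2-CPU machine; the add_server() wrapper, the MS() helper, and the NO_CPU value are illustrative assumptions (the library binding is not part of this patch), while struct npsf_budgets mirrors the kernel definition above.

/* Sketch (not part of the patch): configure one fixed server (id 0 on
 * CPU 0) and one server split across CPU 0 and CPU 1 (id 1), assuming
 * the default 5 ms slot length.
 */

#define NO_CPU	0xffffffff			/* assumed to match the kernel */
#define MS(x)	((unsigned long long)(x) * 1000 * 1000)	/* ms -> ns */

struct npsf_budgets {				/* mirrors include/litmus/rt_param.h */
	int cpu;
	unsigned long long budget;		/* lt_t, in nanoseconds */
};

/* hypothetical wrapper around the add_server syscall */
int add_server(int *id, struct npsf_budgets *budgets, int last);

int setup_servers(void)
{
	struct npsf_budgets budgets[2];		/* one entry per online CPU */
	int id, ret;

	/* server 0: fixed, 2 ms of every slot on CPU 0;
	 * the unused entry is terminated with NO_CPU */
	budgets[0].cpu = 0;
	budgets[0].budget = MS(2);
	budgets[1].cpu = NO_CPU;
	budgets[1].budget = 0;
	id = 0;
	ret = add_server(&id, budgets, 0);	/* more servers follow */
	if (ret)
		return ret;

	/* server 1: split, 3 ms on CPU 0 and 2 ms on CPU 1; last == 1
	 * makes the plugin arm the reserve-switching timers and enable
	 * scheduling (all servers added) */
	budgets[0].cpu = 0;
	budgets[0].budget = MS(3);
	budgets[1].cpu = 1;
	budgets[1].budget = MS(2);
	id = 1;
	return add_server(&id, budgets, 1);
}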