/*
 * kernel/sched_psn_edf.c
 *
 * Implementation of the PSN-EDF scheduler plugin.
 * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
 *
 * Suspensions and non-preemptable sections are supported.
 * Priority inheritance is not supported.
 */

#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#include <linux/litmus.h>
#include <linux/sched_plugin.h>
#include <linux/edf_common.h>

typedef struct {
	rt_domain_t		domain;
	int			cpu;
	struct task_struct*	scheduled; /* only RT tasks */
	spinlock_t		lock;	   /* protects the domain and
					    * serializes scheduling decisions
					    */
} psnedf_domain_t;

DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);

#define local_edf		(&__get_cpu_var(psnedf_domains).domain)
#define local_pedf		(&__get_cpu_var(psnedf_domains))
#define remote_edf(cpu)		(&per_cpu(psnedf_domains, cpu).domain)
#define remote_pedf(cpu)	(&per_cpu(psnedf_domains, cpu))
#define task_edf(task)		remote_edf(get_partition(task))
#define task_pedf(task)		remote_pedf(get_partition(task))


static void psnedf_domain_init(psnedf_domain_t* pedf,
			       check_resched_needed_t check,
			       int cpu)
{
	edf_domain_init(&pedf->domain, check);
	pedf->cpu       = cpu;
	pedf->lock      = SPIN_LOCK_UNLOCKED;
	pedf->scheduled = NULL;
}

static void requeue(struct task_struct* t, rt_domain_t *edf)
{
	/* only requeue if t is actually running */
	BUG_ON(!is_running(t));

	if (t->state != TASK_RUNNING)
		TRACE_TASK(t, "requeue: !TASK_RUNNING");

	set_rt_flags(t, RT_F_RUNNING);
	if (!is_released(t) || get_rt_mode() != MODE_RT_RUN)
		__add_release(edf, t); /* it has got to wait */
	else
		__add_ready(edf, t);
}

/* we assume the lock is being held */
static void preempt(psnedf_domain_t *pedf)
{
	if (smp_processor_id() == pedf->cpu) {
		if (pedf->scheduled && is_np(pedf->scheduled))
			request_exit_np(pedf->scheduled);
		else
			set_tsk_need_resched(current);
	} else
		/* in case it is a remote CPU we have to defer the
		 * decision to the remote CPU
		 */
		smp_send_reschedule(pedf->cpu);
}

/* This check is trivial in partitioned systems as we only have to consider
 * the CPU of the partition.
 */
static int psnedf_check_resched(rt_domain_t *edf)
{
	psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
	int ret = 0;

	/* because this is a callback from rt_domain_t we already hold
	 * the necessary lock for the ready queue
	 */
	if (edf_preemption_needed(edf, pedf->scheduled)) {
		preempt(pedf);
		ret = 1;
	}
	return ret;
}

static reschedule_check_t psnedf_scheduler_tick(void)
{
	unsigned long flags;
	struct task_struct *t = current;
	reschedule_check_t want_resched = NO_RESCHED;
	rt_domain_t *edf = local_edf;
	psnedf_domain_t *pedf = local_pedf;

	/* Check for inconsistency. We don't need the lock for this since
	 * ->scheduled is only changed in schedule, which obviously is not
	 * executing in parallel on this CPU
	 */
	BUG_ON(is_realtime(t) && t != pedf->scheduled);

	if (is_realtime(t))
		TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid);

	/* expire tasks even if not in real-time mode
	 * this makes sure that at the end of real-time mode
	 * no tasks "run away forever".
	 */
	if (is_realtime(t) && t->time_slice && !--t->time_slice) {
		if (!is_np(t)) {
			want_resched = FORCE_RESCHED;
		} else {
			TRACE("psnedf_scheduler_tick: "
			      "%d is non-preemptable, "
			      "preemption delayed.\n", t->pid);
			request_exit_np(t);
		}
	}

	if (get_rt_mode() == MODE_RT_RUN) {
		/* check whether anything is waiting to be released
		 * this could probably be moved to the global timer
		 * interrupt handler since the state will only change
		 * once per jiffy
		 */
		spin_lock_irqsave(&pedf->lock, flags);
		__release_pending(edf);
		if (want_resched != FORCE_RESCHED &&
		    edf_preemption_needed(edf, t))
			want_resched = FORCE_RESCHED;
		spin_unlock_irqrestore(&pedf->lock, flags);
	}
	return want_resched;
}

static void job_completion(struct task_struct* t)
{
	TRACE_TASK(t, "job_completion().\n");
	set_rt_flags(t, RT_F_SLEEP);
	edf_prepare_for_next_period(t);
}

static int psnedf_schedule(struct task_struct * prev,
			   struct task_struct ** next,
			   runqueue_t * rq)
{
	psnedf_domain_t*	pedf = local_pedf;
	rt_domain_t*		edf  = &pedf->domain;

	int			out_of_time, sleep, preempt,
				np, exists, rt, blocks, resched;

	spin_lock(&pedf->lock);

	/* sanity checking */
	BUG_ON(pedf->scheduled && pedf->scheduled != prev);
	BUG_ON(pedf->scheduled && !is_realtime(prev));

	/* (0) Determine state */
	exists      = pedf->scheduled != NULL;
	blocks      = exists && !is_running(pedf->scheduled);
	out_of_time = exists && !pedf->scheduled->time_slice;
	np          = exists && is_np(pedf->scheduled);
	sleep       = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
	preempt     = edf_preemption_needed(edf, prev);
	rt          = get_rt_mode() == MODE_RT_RUN;

	/* If we need to preempt do so.
	 * The following checks set resched to 1 in case of special
	 * circumstances.
	 */
	resched = preempt;

	/* If a task blocks we have no choice but to reschedule.
	 */
	if (blocks)
		resched = 1;

	/* Request a sys_exit_np() call if we would like to preempt but cannot.
	 * Multiple calls to request_exit_np() don't hurt.
	 */
	if (np && (out_of_time || preempt || sleep))
		request_exit_np(pedf->scheduled);

	/* Any task that is preemptable and either exhausts its execution
	 * budget or wants to sleep completes. We may have to reschedule after
	 * this.
	 */
	if (!np && (out_of_time || sleep)) {
		job_completion(pedf->scheduled);
		resched = 1;
	}

	/* Stop real-time tasks when we leave real-time mode
	 */
	if (!rt && exists)
		resched = 1;

	/* The final scheduling decision. Do we need to switch for some reason?
	 * Switch if we are in RT mode and have no task or if we need to
	 * resched.
	 */
	*next = NULL;
	if ((!np || blocks) && (resched || (!exists && rt))) {
		/* Take care of a previously scheduled
		 * job by taking it out of the Linux runqueue.
		 */
		if (pedf->scheduled) {
			/* as opposed to global schedulers that switch without
			 * a lock being held we can requeue already here since
			 * no other CPU will schedule from this domain.
			 */
			if (!blocks)
				requeue(pedf->scheduled, edf);
			if (prev->array)
				/* take it out of the run queue */
				deactivate_task(prev, rq);
		}

		/* only pick tasks if we are actually in RT mode */
		if (rt)
			*next = __take_ready(edf);
		if (*next) {
			/* stick the task into the runqueue */
			__activate_task(*next, rq);
			set_task_cpu(*next, smp_processor_id());
		}

	} else
		/* Only override Linux scheduler if we have a real-time task
		 * scheduled that needs to continue.
		 */
		if (exists)
			*next = prev;

	if (*next)
		set_rt_flags(*next, RT_F_RUNNING);

	pedf->scheduled = *next;
	spin_unlock(&pedf->lock);
	return 0;
}


/*	Prepare a task for running in RT mode
 *	Enqueues the task into master queue data structure
 *	returns
 *		-EPERM if task is not TASK_STOPPED
 */
static long psnedf_prepare_task(struct task_struct * t)
{
	rt_domain_t*	 edf  = task_edf(t);
	psnedf_domain_t* pedf = task_pedf(t);
	unsigned long flags;

	TRACE("[%d] psn edf: prepare task %d on CPU %d\n",
	      smp_processor_id(), t->pid, get_partition(t));
	if (t->state == TASK_STOPPED) {
		__setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);

		if (get_rt_mode() == MODE_RT_RUN)
			/* The action is already on.
			 * Prepare immediate release.
			 */
			edf_release_now(t);
		/* The task should be running in the queue, otherwise signal
		 * code will try to wake it up with fatal consequences.
		 */
		t->state = TASK_RUNNING;
		spin_lock_irqsave(&pedf->lock, flags);
		__add_release(edf, t);
		spin_unlock_irqrestore(&pedf->lock, flags);
		return 0;
	} else
		return -EPERM;
}

static void psnedf_wake_up_task(struct task_struct *task)
{
	unsigned long flags;
	psnedf_domain_t* pedf = task_pedf(task);
	rt_domain_t* edf = task_edf(task);

	TRACE("psnedf: %d unsuspends with budget=%d\n",
	      task->pid, task->time_slice);

	/* After fixing the litmus_controlled bug,
	 * this should hold again.
	 */
	BUG_ON(in_list(&task->rt_list));

	/* FIXME:
	 * There exists a race between this function, suspensions due to IO,
	 * and switching in and out of real-time mode. For some reason, the
	 * BUG_ON triggered after a task system warm-up phase.
	 *
	 * BUG_ON(in_list(&task->rt_list));
	 *
	 * Replaced by an if to gather more information.
	 */
	/*
	if (unlikely(in_list(&task->rt_list))) {
		TRACE(KERN_CRIT "wake_up_task: Why is %s/%d in rt list? "
		      "state=%ld next=%p prev=%p flags=0x%8lx mode=%d "
		      "partition=%d cpu=%d deadline=%ld now=%ld release=%ld"
		      "rtflags=%d timeslice=%d job=%u knp=%d",
		      task->comm, task->pid, task->state,
		      task->rt_list.next, task->rt_list.prev, task->flags,
		      get_rt_mode(), get_partition(task), smp_processor_id(),
		      get_deadline(task), jiffies, get_release(task),
		      get_rt_flags(task), task->time_slice,
		      task->rt_param.times.job_no, task->rt_param.kernel_np);
		task->state = TASK_RUNNING;
		return;
	}
	*/

	task->state = TASK_RUNNING;

	/* We need to take suspensions because of semaphores into
	 * account! If a job resumes after being suspended due to acquiring
	 * a semaphore, it should never be treated as a new job release.
	 */
	if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) {
		/* new sporadic release */
		edf_release_now(task);
		sched_trace_job_release(task);
	}
	spin_lock_irqsave(&pedf->lock, flags);
	requeue(task, edf);
	spin_unlock_irqrestore(&pedf->lock, flags);
}

static void psnedf_task_blocks(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	/* not really anything to do since it can only block if
	 * it is running, and when it is not running it is not in any
	 * queue anyway.
	 */
	TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
	BUG_ON(in_list(&t->rt_list));
}


/* When _tear_down is called, the task should not be in any queue any more
 * as it must have blocked first. We don't have any internal state for the
 * task, it is all in the task_struct.
 */
static long psnedf_tear_down(struct task_struct * t)
{
	BUG_ON(!is_realtime(t));
	TRACE_TASK(t, "tear down called");
	BUG_ON(t->array);
	BUG_ON(in_list(&t->rt_list));
	return 0;
}

static long psnedf_pi_block(struct pi_semaphore *sem,
			    struct task_struct *new_waiter)
{
	psnedf_domain_t*	pedf;
	rt_domain_t*		edf;
	struct task_struct*	t;
	int cpu = get_partition(new_waiter);

	BUG_ON(!new_waiter);

	if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
		TRACE_TASK(new_waiter, " boosts priority\n");
		pedf = task_pedf(new_waiter);
		edf  = task_edf(new_waiter);

		/* interrupts already disabled */
		spin_lock(&pedf->lock);

		/* store new highest-priority task */
		sem->hp.cpu_task[cpu] = new_waiter;

		if (sem->holder &&
		    get_partition(sem->holder) == get_partition(new_waiter)) {
			/* let holder inherit */
			sem->holder->rt_param.inh_task = new_waiter;
			t = sem->holder;
			if (in_list(&t->rt_list)) {
				/* queued in domain */
				list_del(&t->rt_list);
				/* re-add to make priority change take place */
				if (is_released(t))
					__add_ready(edf, t);
				else
					__add_release(edf, t);
			}
		}

		/* check if we need to reschedule */
		if (edf_preemption_needed(edf, current))
			preempt(pedf);

		spin_unlock(&pedf->lock);
	}

	return 0;
}

static long psnedf_inherit_priority(struct pi_semaphore *sem,
				    struct task_struct *new_owner)
{
	int cpu = get_partition(new_owner);

	new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
	if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
		TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
			   sem->hp.cpu_task[cpu]->comm,
			   sem->hp.cpu_task[cpu]->pid);
	} else
		TRACE_TASK(new_owner, "cannot inherit priority: "
			   "no higher priority job waits on this CPU!\n");

	/* make new owner non-preemptable as required by FMLP under
	 * PSN-EDF.
	 */
	make_np(new_owner);
	return 0;
}


/* This function is called on a semaphore release, and assumes that
 * the current task is also the semaphore holder.
 */
static long psnedf_return_priority(struct pi_semaphore *sem)
{
	struct task_struct*	t    = current;
	psnedf_domain_t*	pedf = task_pedf(t);
	rt_domain_t*		edf  = task_edf(t);
	int			ret  = 0;
	int			cpu  = get_partition(current);


	/* Find new highest-priority semaphore task
	 * if holder task is the current hp.cpu_task[cpu].
	 *
	 * Calling function holds sem->wait.lock.
	 */
	if (t == sem->hp.cpu_task[cpu])
		edf_set_hp_cpu_task(sem, cpu);

	take_np(t);
	if (current->rt_param.inh_task) {
		TRACE_CUR("return priority of %s/%d\n",
			  current->rt_param.inh_task->comm,
			  current->rt_param.inh_task->pid);
		spin_lock(&pedf->lock);

		/* Reset inh_task to NULL. */
		current->rt_param.inh_task = NULL;

		/* check if we need to reschedule */
		if (edf_preemption_needed(edf, current))
			preempt(pedf);

		spin_unlock(&pedf->lock);
	} else
		TRACE_CUR(" no priority to return %p\n", sem);

	return ret;
}

static int psnedf_mode_change(int new_mode)
{
	int cpu;

	if (new_mode == MODE_RT_RUN)
		for_each_online_cpu(cpu) {
			spin_lock(&remote_pedf(cpu)->lock);
			__rerelease_all(remote_edf(cpu), edf_release_at);
			spin_unlock(&remote_pedf(cpu)->lock);
		}

	TRACE("[%d] psn edf: mode changed to %d\n",
	      smp_processor_id(), new_mode);
	return 0;
}


/*	Plugin object	*/
static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
	.ready_to_use = 0
};


/*
 *	Plugin initialization code.
 */
#define INIT_SCHED_PLUGIN (struct sched_plugin) {		\
	.plugin_name		= "PSN-EDF",			\
	.ready_to_use		= 1,				\
	.scheduler_tick		= psnedf_scheduler_tick,	\
	.prepare_task		= psnedf_prepare_task,		\
	.sleep_next_period	= edf_sleep_next_period,	\
	.tear_down		= psnedf_tear_down,		\
	.shutdown_hook		= NULL,				\
	.schedule		= psnedf_schedule,		\
	.mode_change		= psnedf_mode_change,		\
	.wake_up_task		= psnedf_wake_up_task,		\
	.task_blocks		= psnedf_task_blocks,		\
	.pi_block		= psnedf_pi_block,		\
	.inherit_priority	= psnedf_inherit_priority,	\
	.return_priority	= psnedf_return_priority	\
}


sched_plugin_t *__init init_psn_edf_plugin(void)
{
	int i;

	if (!s_plugin.ready_to_use) {
		for (i = 0; i < NR_CPUS; i++) {
			psnedf_domain_init(remote_pedf(i),
					   psnedf_check_resched, i);
			printk("PSN-EDF: CPU partition %d initialized.\n", i);
		}
		s_plugin = INIT_SCHED_PLUGIN;
	}
	return &s_plugin;
}