/*
* litmus/sched_edf_os.c
*
* Implementation of the EDF-os scheduling algorithm.
*/
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/sched_plugin.h>
#include <litmus/edf_common.h>
typedef struct {
rt_domain_t domain;
int cpu;
struct task_struct* scheduled; /* only RT tasks */
/* domain lock */
#define slock domain.ready_lock
} edfos_domain_t;
DEFINE_PER_CPU(edfos_domain_t, edfos_domains);
#define local_edfos (&__get_cpu_var(edfos_domains))
#define remote_edf(cpu) (&per_cpu(edfos_domains, cpu).domain)
#define remote_edfos(cpu) (&per_cpu(edfos_domains, cpu))
#define task_edf(task) remote_edf(get_partition(task))
#define task_edfos(task) remote_edfos(get_partition(task))
#define edfos_params(t) (t->rt_param.task_params.semi_part.os)
/* Is the task a migratory task? */
#define is_migrat_task(task) (edfos_params(task).migrat)
/* t is on the wrong CPU (it should be requeued properly) */
#define wrong_cpu(t) (is_migrat_task((t)) \
	&& task_cpu((t)) != get_partition((t)))
/* Manipulate share for current cpu */
#define cur_cpu_fract_num(t) edfos_params(t).fraction[get_partition(t)][0]
#define cur_cpu_fract_den(t) edfos_params(t).fraction[get_partition(t)][1]
/* Get job number for current cpu */
#define cur_cpu_job_no(t) \
tsk_rt(t)->semi_part.cpu_job_no[get_partition(t)]
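/* Illustrative example (an assumption about the encoding, not from the
 * original source): a migratory task with fraction[0] = {2, 5} and
 * fraction[1] = {3, 5} runs 2 out of every 5 of its jobs on CPU 0 and
 * the remaining 3 on CPU 1; cpu_job_no[] counts how many jobs have run
 * on each CPU so the fake Pfair sub-schedule below can decide where the
 * next job goes.
 */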
/*
 * EDF-os: migratory tasks always have higher priority than fixed tasks.
 * Two migratory tasks are ordered by their first_cpu assignment (lower
 * wins); fixed tasks are ordered among themselves by EDF.
 * (Both first and second may be NULL.)
 */
int edfos_higher_prio(struct task_struct* first, struct task_struct* second)
{
if ((first && edfos_params(first).migrat) ||
(second && edfos_params(second).migrat)) {
if ((first && edfos_params(first).migrat) &&
(second && edfos_params(second).migrat))
{
/* both are migrating */
if (edfos_params(first).first_cpu <
edfos_params(second).first_cpu)
return 1;
else
return 0;
}
if (first && edfos_params(first).migrat)
/* first is migrating */
return 1;
else
/* second is migrating */
return 0;
}
/* both are fixed or not real time */
return edf_higher_prio(first, second);
}
int edfos_ready_order(struct bheap_node* a, struct bheap_node* b)
{
return edfos_higher_prio(bheap2task(a), bheap2task(b));
}
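/* Order for the per-task "fake Pfair" heaps used below: each heap node's
 * value points into edfos_params(t).heap_data[] and holds either the
 * Pfair deadline (ready_queue) or the Pfair release (release_queue) of
 * that CPU's next subtask, so a simple integer min-order suffices.
 */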
static int fakepfair_ready_order(struct bheap_node* a, struct bheap_node* b)
{
return *((int*)a->value) < *((int*)b->value);
}
/* edfos_preemption_needed - check whether the task t needs to be preempted;
 * call only with irqs disabled and with ready_lock acquired
 */
int edfos_preemption_needed(rt_domain_t* rt, struct task_struct *t)
{
/* we need the read lock for edf_ready_queue */
/* no need to preempt if there is nothing pending */
if (!__jobs_pending(rt))
return 0;
/* we need to reschedule if t doesn't exist */
if (!t)
return 1;
/* make sure to get non-rt stuff out of the way */
return !is_realtime(t) || edfos_higher_prio(__next_ready(rt), t);
}
/* we assume the lock is being held */
static void preempt(edfos_domain_t *edfos)
{
preempt_if_preemptable(edfos->scheduled, edfos->cpu);
}
static void edfos_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
unsigned long flags;
edfos_domain_t *edfos = container_of(rt, edfos_domain_t, domain);
raw_spin_lock_irqsave(&edfos->slock, flags);
__merge_ready(rt, tasks);
if (edfos_preemption_needed(rt, edfos->scheduled))
preempt(edfos);
raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
/* EDF-os uses the "release_master" field to force the next release of
 * 'task' to happen on a remote CPU. The remote CPU for the task was set
 * up earlier, in job_completion(), depending on whether the task is
 * migratory or not.
 */
static inline void
edfos_add_release_remote(struct task_struct *task)
{
unsigned long flags;
rt_domain_t *rt = task_edf(task);
raw_spin_lock_irqsave(&rt->tobe_lock, flags);
/* "modify" destination cpu */
rt->release_master = get_partition(task);
TRACE_TASK(task, "Add remote release: smp_proc_id = %d, cpu = %d, remote = %d\n",
smp_processor_id(), task_cpu(task), rt->release_master);
/* trigger future release */
__add_release(rt, task);
/* reset proper release_master and unlock */
rt->release_master = NO_CPU;
raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
}
/* Acquire the two ready-queue locks in a fixed order to avoid deadlock.
 * This is called with interrupts disabled and rq->lock held (from
 * schedule()).
 */
static noinline void double_domain_lock(edfos_domain_t *dom1, edfos_domain_t *dom2)
{
if (dom1 == dom2) {
/* fake */
raw_spin_lock(&dom1->slock);
} else {
if (dom1 < dom2) {
raw_spin_lock(&dom1->slock);
raw_spin_lock(&dom2->slock);
TRACE("acquired %d and %d\n", dom1->cpu, dom2->cpu);
} else {
raw_spin_lock(&dom2->slock);
raw_spin_lock(&dom1->slock);
TRACE("acquired %d and %d\n", dom2->cpu, dom1->cpu);
}
}
}
/* Directly insert a task in a remote ready queue. This function
* should only be called if this task is a migrating task and its
* last job for this CPU just completed (a new one is released for
* a remote CPU), but the new job is already tardy.
*/
static noinline void insert_task_in_remote_ready(struct task_struct *task)
{
edfos_domain_t *this = remote_edfos(task_cpu(task));
edfos_domain_t *remote = remote_edfos(get_partition(task));
BUG_ON(get_partition(task) != remote->cpu);
TRACE_TASK(task, "Migrate From P%d -> To P%d\n",
this->cpu, remote->cpu);
TRACE_TASK(task, "Inserting in remote ready queue\n");
WARN_ON(!irqs_disabled());
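/* Drop our own ready-queue lock first so that double_domain_lock() can
 * re-acquire both locks in a fixed order, avoiding an ABBA deadlock with
 * the remote CPU; we re-take this->slock below and keep holding it on
 * return, as the caller expects.
 */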
raw_spin_unlock(&this->slock);
mb();
TRACE_TASK(task,"edfos_lock %d released\n", this->cpu);
/* lock both ready queues */
double_domain_lock(this, remote);
mb();
__add_ready(&remote->domain, task);
/* release remote but keep ours */
raw_spin_unlock(&remote->slock);
TRACE_TASK(task,"edfos_lock %d released\n", remote->cpu);
/* ask remote cpu to reschedule, we are already rescheduling on this */
preempt(remote);
}
static void requeue(struct task_struct* t, rt_domain_t *edf)
{
if (t->state != TASK_RUNNING)
TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
set_rt_flags(t, RT_F_RUNNING);
if (is_released(t, litmus_clock())) {
if (wrong_cpu(t)) {
/* this should only happen if t just completed, but
* its next release is already tardy, so it should be
* migrated and inserted in the remote ready queue
*/
TRACE_TASK(t, "Migrating task already released, "
"move from P%d to P%d\n",
task_cpu(t), get_partition(t));
insert_task_in_remote_ready(t);
} else {
/* not a migrat task or the job is on the right CPU */
__add_ready(edf, t);
}
} else {
if (wrong_cpu(t)) {
TRACE_TASK(t, "Migrating task, adding remote release\n");
edfos_add_release_remote(t);
} else {
TRACE_TASK(t, "Adding local release\n");
add_release(edf, t);
}
}
}
/* Update statistics for the _current_ job.
* - job_no was incremented _before_ starting this job
* (release_at / prepare_for_next_period)
* - cpu_job_no is incremented when the job completes
*/
static void update_job_counter(struct task_struct *t)
{
t->rt_param.semi_part.cpu_job_no[get_partition(t)]++;
TRACE_TASK(t, "job_no = %d, cpu_job_no(pos %d) = %d, cpu %d\n",
t->rt_param.job_params.job_no, get_partition(t),
cur_cpu_job_no(t), t->rt_param.task_params.cpu);
}
static int compute_pfair_deadline(lt_t wt_num, lt_t wt_den,
unsigned int job_no)
{
lt_t num;
num = job_no * wt_den;
if (do_div(num, wt_num))
num++;
return (int)num;
}
static int compute_pfair_release(lt_t wt_num, lt_t wt_den,
unsigned int job_no)
{
lt_t num;
num = (job_no - 1) * wt_den;
do_div(num, wt_num);
return (int)num;
}
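/* Worked example (illustrative, not from the original source): for a
 * per-CPU share of 2/5, the third subtask on that CPU (job index 3) gets
 *   release  = floor((3 - 1) * 5 / 2) = 5  (compute_pfair_release)
 *   deadline = ceil(3 * 5 / 2)        = 8  (compute_pfair_deadline)
 * Both values are in units of the task's overall job number and are
 * compared against job_params.job_no in next_cpu_for_job() below.
 */
/* Pick the CPU for the next job of migratory task t by simulating a
 * Pfair schedule over its per-CPU shares: first move any subtasks whose
 * release has passed from release_queue to ready_queue, then take the
 * ready subtask with the earliest Pfair deadline.
 */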
static int next_cpu_for_job(struct task_struct *t)
{
unsigned int cpu;
lt_t next_rel;
struct bheap_node* node;
BUG_ON(!is_migrat_task(t));
/* Process any new subtask releases. */
node = bheap_peek(fakepfair_ready_order,
&edfos_params(t).release_queue);
while (node && *((int*)node->value) <= tsk_rt(t)->job_params.job_no) {
node = bheap_take(fakepfair_ready_order,
&edfos_params(t).release_queue);
BUG_ON(!node);
cpu = ((int*)node->value) - edfos_params(t).heap_data;
*((int*)node->value) = compute_pfair_deadline(
edfos_params(t).fraction[cpu][0],
edfos_params(t).fraction[cpu][1],
tsk_rt(t)->semi_part.cpu_job_no[cpu] + 1);
bheap_insert(fakepfair_ready_order,
&edfos_params(t).ready_queue, node);
node = bheap_peek(fakepfair_ready_order,
&edfos_params(t).release_queue);
}
/* Choose the next Pfair subtask. */
node = bheap_take(fakepfair_ready_order,
&edfos_params(t).ready_queue);
BUG_ON(!node);
cpu = ((int*)node->value) - edfos_params(t).heap_data;
next_rel = compute_pfair_release(edfos_params(t).fraction[cpu][0],
edfos_params(t).fraction[cpu][1],
tsk_rt(t)->semi_part.cpu_job_no[cpu]
+ 1);
if (next_rel <= tsk_rt(t)->job_params.job_no)
{
/* Next subtask already released. */
*((int*)node->value) = compute_pfair_deadline(
edfos_params(t).fraction[cpu][0],
edfos_params(t).fraction[cpu][1],
tsk_rt(t)->semi_part.cpu_job_no[cpu] +
1);
bheap_insert(fakepfair_ready_order,
&edfos_params(t).ready_queue, node);
}
else
{
/* Next subtask not yet released. */
*((int*)node->value) = next_rel;
bheap_insert(fakepfair_ready_order,
&edfos_params(t).release_queue, node);
}
TRACE_TASK(t, "%u = %u * %u / %u\n",
t->rt_param.job_params.job_no, cur_cpu_job_no(t),
cur_cpu_fract_den(t), cur_cpu_fract_num(t));
return cpu;
}
/* If needed (the share for task t on this CPU is exhausted), updates
* the task_params.cpu for the _migrating_ task t
*/
static void change_migrat_cpu_if_needed(struct task_struct *t)
{
int cpu;
BUG_ON(!is_migrat_task(t));
/* EDF-os: if it is a migrating task and it has already executed
 * the required number of jobs on this CPU, we need to move it
 * to its next CPU; changing the cpu here will affect the requeue
 * and the next release
 */
cpu = next_cpu_for_job(t);
if (unlikely(cpu != get_partition(t))) {
tsk_rt(t)->task_params.cpu = cpu;
TRACE_TASK(t, "EDF-os: will migrate job %d -> %d\n",
task_cpu(t), tsk_rt(t)->task_params.cpu);
return;
}
TRACE_TASK(t, "EDF-os: job will stay on %d -> %d\n",
task_cpu(t), tsk_rt(t)->task_params.cpu);
}
static void job_completion(struct task_struct* t, int forced)
{
sched_trace_task_completion(t,forced);
TRACE_TASK(t, "job_completion().\n");
if (unlikely(is_migrat_task(t))) {
update_job_counter(t);
change_migrat_cpu_if_needed(t);
}
set_rt_flags(t, RT_F_SLEEP);
prepare_for_next_period(t);
}
static void edfos_tick(struct task_struct *t)
{
edfos_domain_t *edfos = local_edfos;
BUG_ON(is_realtime(t) && t != edfos->scheduled);
if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
set_tsk_need_resched(t);
TRACE("edfos_scheduler_tick: "
"%d is preemptable "
" => FORCE_RESCHED\n", t->pid);
}
}
static struct task_struct* edfos_schedule(struct task_struct * prev)
{
edfos_domain_t* edfos = local_edfos;
rt_domain_t* edf = &edfos->domain;
struct task_struct* next;
int out_of_time, sleep, preempt, exists, blocks, change_cpu, resched;
raw_spin_lock(&edfos->slock);
BUG_ON(edfos->scheduled && edfos->scheduled != prev);
BUG_ON(edfos->scheduled && !is_realtime(prev));
/* (0) Determine state */
exists = edfos->scheduled != NULL;
blocks = exists && !is_running(edfos->scheduled);
out_of_time = exists &&
budget_enforced(edfos->scheduled) &&
budget_exhausted(edfos->scheduled);
sleep = exists && get_rt_flags(edfos->scheduled) == RT_F_SLEEP;
change_cpu = exists && wrong_cpu(edfos->scheduled);
preempt = edfos_preemption_needed(edf, prev);
BUG_ON(blocks && change_cpu);
if (exists)
TRACE_TASK(prev,
"blocks:%d out_of_time:%d sleep:%d preempt:%d "
"wrong_cpu:%d state:%d sig:%d\n",
blocks, out_of_time, sleep, preempt,
change_cpu, prev->state, signal_pending(prev));
/* If we need to preempt do so. */
resched = preempt;
/* If a task blocks we have no choice but to reschedule. */
if (blocks)
resched = 1;
/* If the scheduled task is now on the wrong CPU (its wake-up was
 * tardy and raced with this schedule(), so a new job was already
 * released for a remote CPU), it must be enqueued on that remote
 * ready queue and a new task selected for the local one.
 */
if (change_cpu)
resched = 1;
/* Any task that is preemptable and either exhausts its execution
* budget or wants to sleep completes. We may have to reschedule after
* this.
*/
if ((out_of_time || sleep) && !blocks) {
job_completion(edfos->scheduled, !sleep);
resched = 1;
}
/* The final scheduling decision. Do we need to switch for some reason?
* Switch if we are in RT mode and have no task or if we need to
* resched.
*/
next = NULL;
if (resched || !exists) {
if (edfos->scheduled && !blocks)
requeue(edfos->scheduled, edf);
next = __take_ready(edf);
} else
/* Only override Linux scheduler if we have a real-time task
* scheduled that needs to continue.
*/
if (exists)
next = prev;
if (next) {
TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
set_rt_flags(next, RT_F_RUNNING);
} else {
TRACE("becoming idle at %llu\n", litmus_clock());
}
edfos->scheduled = next;
raw_spin_unlock(&edfos->slock);
return next;
}
/* Prepare a task for running in RT mode
*/
static void edfos_task_new(struct task_struct * t, int on_rq, int running)
{
rt_domain_t* edf = task_edf(t);
edfos_domain_t* edfos = task_edfos(t);
unsigned long flags;
unsigned int i;
unsigned int has_cpu = 0;
if (edfos_params(t).migrat) {
bheap_init(&edfos_params(t).release_queue);
bheap_init(&edfos_params(t).ready_queue);
for (i = 0; i < NR_CPUS; i++) {
if (edfos_params(t).fraction[i][0] > 0) {
has_cpu = 1;
edfos_params(t).heap_data[i] =
compute_pfair_deadline(
edfos_params(t).fraction[i][0],
edfos_params(t).fraction[i][1], 0);
bheap_add(fakepfair_ready_order,
&edfos_params(t).ready_queue,
&edfos_params(t).heap_data[i],
GFP_ATOMIC);
}
}
BUG_ON(!has_cpu);
/* Pick the first CPU to execute on. */
change_migrat_cpu_if_needed(t);
}
TRACE_TASK(t, "EDF-os: task new, cpu = %d\n",
t->rt_param.task_params.cpu);
release_at(t, litmus_clock());
update_job_counter(t);
/* The task should be running in the queue, otherwise signal
* code will try to wake it up with fatal consequences.
*/
raw_spin_lock_irqsave(&edfos->slock, flags);
if (running) {
/* there shouldn't be anything else running at the time */
BUG_ON(edfos->scheduled);
edfos->scheduled = t;
} else {
requeue(t, edf);
/* maybe we have to reschedule */
preempt(edfos);
}
raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
static void edfos_task_wake_up(struct task_struct *task)
{
unsigned long flags;
edfos_domain_t* edfos = task_edfos(task);
rt_domain_t* edf = task_edf(task);
lt_t now;
TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
TRACE_TASK(task, "acquire edfos %d\n", edfos->cpu);
raw_spin_lock_irqsave(&edfos->slock, flags);
BUG_ON(edfos != task_edfos(task));
BUG_ON(is_queued(task));
now = litmus_clock();
if (is_tardy(task, now)) {
if (unlikely(is_migrat_task(task))) {
/* a new job will be released.
* Update current job counter */
update_job_counter(task);
/* Switch CPU if needed */
change_migrat_cpu_if_needed(task);
}
/* new sporadic release */
TRACE_TASK(task, "release new\n");
release_at(task, now);
sched_trace_task_release(task);
}
/* Only add to ready queue if it is not the currently-scheduled
* task. This could be the case if a task was woken up concurrently
* on a remote CPU before the executing CPU got around to actually
* de-scheduling the task, i.e., wake_up() raced with schedule()
* and won.
*/
if (edfos->scheduled != task)
requeue(task, edf);
raw_spin_unlock_irqrestore(&edfos->slock, flags);
TRACE_TASK(task, "release edfos %d\n", edfos->cpu);
TRACE_TASK(task, "wake up done\n");
}
static void edfos_task_block(struct task_struct *t)
{
TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
BUG_ON(!is_realtime(t));
if (is_queued(t)) {
edfos_domain_t *edfos = local_edfos;
TRACE_TASK(t, "task blocked, race with wakeup, "
"remove from queue %d\n", edfos->cpu);
remove(&edfos->domain, t);
}
}
static void edfos_task_exit(struct task_struct * t)
{
unsigned long flags;
edfos_domain_t* edfos = task_edfos(t);
rt_domain_t* edf;
raw_spin_lock_irqsave(&edfos->slock, flags);
if (is_queued(t)) {
/* dequeue */
edf = task_edf(t);
remove(edf, t);
}
if (edfos->scheduled == t)
edfos->scheduled = NULL;
/* Deallocate heap nodes. */
while (bheap_take_del(fakepfair_ready_order,
&edfos_params(t).release_queue)) {}
while (bheap_take_del(fakepfair_ready_order,
&edfos_params(t).ready_queue)) {}
TRACE_TASK(t, "RIP\n");
preempt(edfos);
raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
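/* Admission check. Sketch of the assumed user-space protocol (not part
 * of this file): the task is expected to have set task_params.cpu and
 * migrated to that CPU (e.g., via liblitmus) before entering real-time
 * mode, so that task_cpu(tsk) already matches its partition here.
 */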
static long edfos_admit_task(struct task_struct* tsk)
{
return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
}
/* Plugin object */
static struct sched_plugin edfos_plugin __cacheline_aligned_in_smp = {
.plugin_name = "EDF-os",
.tick = edfos_tick,
.task_new = edfos_task_new,
.complete_job = complete_job,
.task_exit = edfos_task_exit,
.schedule = edfos_schedule,
.task_wake_up = edfos_task_wake_up,
.task_block = edfos_task_block,
.admit_task = edfos_admit_task
};
static int __init init_edfos(void)
{
int i;
edfos_domain_t *edfos;
/* Note: this is broken if the number of online CPUs can change
 * (CPU hotplug). */
for (i = 0; i < num_online_cpus(); i++) {
edfos = remote_edfos(i);
edfos->cpu = i;
edfos->scheduled = NULL;
rt_domain_init(&edfos->domain, edfos_ready_order, NULL,
edfos_release_jobs);
}
return register_sched_plugin(&edfos_plugin);
}
module_init(init_edfos);