/*
* litmus/sched_edf_os.c
*
* Implementation of the EDF-os scheduling algorithm.
*/
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/sched_plugin.h>
#include <litmus/edf_common.h>
typedef struct {
rt_domain_t domain;
int cpu;
struct task_struct* scheduled; /* only RT tasks */
/* domain lock */
#define slock domain.ready_lock
} edfos_domain_t;
DEFINE_PER_CPU(edfos_domain_t, edfos_domains);
#define local_edfos (&__get_cpu_var(edfos_domains))
#define remote_edf(cpu) (&per_cpu(edfos_domains, cpu).domain)
#define remote_edfos(cpu) (&per_cpu(edfos_domains, cpu))
#define task_edf(task) remote_edf(get_partition(task))
#define task_edfos(task) remote_edfos(get_partition(task))
#define edfos_params(t) (t->rt_param.task_params.semi_part.os)
/* Is the task a migratory task? */
#define is_migrat_task(task) (edfos_params(task).migrat)
/* t is on the wrong CPU (it should be requeued properly) */
#define wrong_cpu(t) (is_migrat_task((t)) \
	&& task_cpu((t)) != get_partition((t)))
/* Manipulate share for current cpu */
#define cur_cpu_fract_num(t) edfos_params(t).fraction[get_partition(t)][0]
#define cur_cpu_fract_den(t) edfos_params(t).fraction[get_partition(t)][1]
/* Get job number for current cpu */
#define cur_cpu_job_no(t) \
tsk_rt(t)->semi_part.cpu_job_no[get_partition(t)]
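/* Illustrative example (an assumption about the encoding, not from the
 * original source): a migratory task with fraction[0] = {2, 5} and
 * fraction[1] = {3, 5} runs 2 out of every 5 of its jobs on CPU 0 and
 * the remaining 3 on CPU 1; cpu_job_no[] counts how many jobs have run
 * on each CPU so the fake Pfair sub-schedule below can decide where the
 * next job goes.
 */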
/*
 * EDF-os: migratory tasks always have higher priority than fixed tasks.
 * Two migratory tasks are ordered by their first_cpu assignment (lower
 * wins); fixed tasks are ordered among themselves by EDF.
 * (Both first and second may be NULL.)
 */
int edfos_higher_prio(struct task_struct* first, struct task_struct* second)
{
if ((first && edfos_params(first).migrat) ||
(second && edfos_params(second).migrat)) {
if ((first && edfos_params(first).migrat) &&
(second && edfos_params(second).migrat))
{
/* both are migrating */
if (edfos_params(first).first_cpu <
edfos_params(second).first_cpu)
return 1;
else
return 0;
}
if (first && edfos_params(first).migrat)
/* first is migrating */
return 1;
else
/* second is migrating */
return 0;
}
/* both are fixed or not real time */
return edf_higher_prio(first, second);
}
int edfos_ready_order(struct bheap_node* a, struct bheap_node* b)
{
return edfos_higher_prio(bheap2task(a), bheap2task(b));
}
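/* Order for the per-task "fake Pfair" heaps used below: each heap node's
 * value points into edfos_params(t).heap_data[] and holds either the
 * Pfair deadline (ready_queue) or the Pfair release (release_queue) of
 * that CPU's next subtask, so a simple integer min-order suffices.
 */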
static int fakepfair_ready_order(struct bheap_node* a, struct bheap_node* b)
{
return *((int*)a->value) < *((int*)b->value);
}
/* edfos_preemption_needed - check whether the task t needs to be preempted;
 * call only with irqs disabled and with ready_lock acquired
 */
int edfos_preemption_needed(rt_domain_t* rt, struct task_struct *t)
{
/* we need the read lock for edf_ready_queue */
/* no need to preempt if there is nothing pending */
if (!__jobs_pending(rt))
return 0;
/* we need to reschedule if t doesn't exist */
if (!t)
return 1;
/* make sure to get non-rt stuff out of the way */
return !is_realtime(t) || edfos_higher_prio(__next_ready(rt), t);
}
/* we assume the lock is being held */
static void preempt(edfos_domain_t *edfos)
{
preempt_if_preemptable(edfos->scheduled, edfos->cpu);
}
static void edfos_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
unsigned long flags;
edfos_domain_t *edfos = container_of(rt, edfos_domain_t, domain);
raw_spin_lock_irqsave(&edfos->slock, flags);
__merge_ready(rt, tasks);
if (edfos_preemption_needed(rt, edfos->scheduled))
preempt(edfos);
raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
/* EDF-os uses the "release_master" field to force the next release of
 * 'task' to happen on a remote CPU. The remote CPU for the task was set
 * up earlier, in job_completion(), depending on whether the task is
 * migratory or not.
 */
static inline void
edfos_add_release_remote(struct task_struct *task)
{
unsigned long flags;
rt_domain_t *rt = task_edf(task);
raw_spin_lock_irqsave(&rt->tobe_lock, flags);
/* "modify" destination cpu */
rt->release_master = get_partition(task);
TRACE_TASK(task, "Add remote release: smp_proc_id = %d, cpu = %d, remote = %d\n",
smp_processor_id(), task_cpu(task), rt->release_master);
/* trigger future release */
__add_release(rt, task);
/* reset proper release_master and unlock */
rt->release_master = NO_CPU;
raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
}
/* Acquire the two ready-queue locks in a fixed order to avoid deadlock.
 * This is called with interrupts disabled and rq->lock held (from
 * schedule()).
 */
static noinline void double_domain_lock(edfos_domain_t *dom1, edfos_domain_t *dom2)
{
if (dom1 == dom2) {
/* fake */
raw_spin_lock(&dom1->slock);
} else {
if (dom1 < dom2) {
raw_spin_lock(&dom1->slock);
raw_spin_lock(&dom2->slock);
TRACE("acquired %d and %d\n", dom1->cpu, dom2->cpu);
} else {
raw_spin_lock(&dom2->slock);
raw_spin_lock(&dom1->slock);
TRACE("acquired %d and %d\n", dom2->cpu, dom1->cpu);
}
}
}
/* Directly insert a task in a remote ready queue. This function
* should only be called if this task is a migrating task and its
* last job for this CPU just completed (a new one is released for
* a remote CPU), but the new job is already tardy.
*/
static noinline void insert_task_in_remote_ready(struct task_struct *task)
{
edfos_domain_t *this = remote_edfos(task_cpu(task));
edfos_domain_t *remote = remote_edfos(get_partition(task));
BUG_ON(get_partition(task) != remote->cpu);
TRACE_TASK(task, "Migrate From P%d -> To P%d\n",
this->cpu, remote->cpu);
TRACE_TASK(task, "Inserting in remote ready queue\n");
WARN_ON(!irqs_disabled());
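/* Drop our own ready-queue lock first so that double_domain_lock() can
 * re-acquire both locks in a fixed order, avoiding an ABBA deadlock with
 * the remote CPU; we re-take this->slock below and keep holding it on
 * return, as the caller expects.
 */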
raw_spin_unlock(&this->slock);
mb();
TRACE_TASK(task,"edfos_lock %d released\n", this->cpu);
/* lock both ready queues */
double_domain_lock(this, remote);
mb();
__add_ready(&remote->domain, task);
/* release remote but keep ours */
raw_spin_unlock(&remote->slock);
TRACE_TASK(task,"edfos_lock %d released\n", remote->cpu);
/* ask remote cpu to reschedule, we are already rescheduling on this */
preempt(remote);
}
static void requeue(struct task_struct* t, rt_domain_t *edf)
{
if (t->state != TASK_RUNNING)
TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
set_rt_flags(t, RT_F_RUNNING);
if (is_released(t, litmus_clock())) {
if (wrong_cpu(t)) {
/* this should only happen if t just completed, but
* its next release is already tardy, so it should be
* migrated and inserted in the remote ready queue
*/
TRACE_TASK(t, "Migrating task already released, "
"move from P%d to P%d\n",
task_cpu(t), get_partition(t));
insert_task_in_remote_ready(t);
} else {
/* not a migrat task or the job is on the right CPU */
__add_ready(edf, t);
}
} else {
if (wrong_cpu(t)) {
TRACE_TASK(t, "Migrating task, adding remote release\n");
edfos_add_release_remote(t);
} else {
TRACE_TASK(t, "Adding local release\n");
add_release(edf, t);
}
}
}
/* Update statistics for the _current_ job.
* - job_no was incremented _before_ starting this job
* (release_at / prepare_for_next_period)
* - cpu_job_no is incremented when the job completes
*/
static void update_job_counter(struct task_struct *t)
{
t->rt_param.semi_part.cpu_job_no[get_partition(t)]++;
TRACE_TASK(t, "job_no = %d, cpu_job_no(pos %d) = %d, cpu %d\n",
t->rt_param.job_params.job_no, get_partition(t),
cur_cpu_job_no(t), t->rt_param.task_params.cpu);
}
static int compute_pfair_deadline(lt_t wt_num, lt_t wt_den,
unsigned int job_no)
{
lt_t num;
num = job_no * wt_den;
if (do_div(num, wt_num))
num++;
return (int)num;
}
static int compute_pfair_release(lt_t wt_num, lt_t wt_den,
unsigned int job_no)
{
lt_t num;
num = (job_no - 1) * wt_den;
do_div(num, wt_num);
return (int)num;
}
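/* Worked example (illustrative, not from the original source): for a
 * per-CPU share of 2/5, the third subtask on that CPU (job index 3) gets
 *   release  = floor((3 - 1) * 5 / 2) = 5  (compute_pfair_release)
 *   deadline = ceil(3 * 5 / 2)        = 8  (compute_pfair_deadline)
 * Both values are in units of the task's overall job number and are
 * compared against job_params.job_no in next_cpu_for_job() below.
 */
/* Pick the CPU for the next job of migratory task t by simulating a
 * Pfair schedule over its per-CPU shares: first move any subtasks whose
 * release has passed from release_queue to ready_queue, then take the
 * ready subtask with the earliest Pfair deadline.
 */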
static int next_cpu_for_job(struct task_struct *t)
{
unsigned int cpu;
lt_t next_rel;
struct bheap_node* node;
BUG_ON(!is_migrat_task(t));
/* Process any new subtask releases. */
node = bheap_peek(fakepfair_ready_order,
&edfos_params(t).release_queue);
while (node && *((int*)node->value) <= tsk_rt(t)->job_params.job_no) {
node = bheap_take(fakepfair_ready_order,
&edfos_params(t).release_queue);
BUG_ON(!node);
cpu = ((int*)node->value) - edfos_params(t).heap_data;
*((int*)node->value) = compute_pfair_deadline(
edfos_params(t).fraction[cpu][0],
edfos_params(t).fraction[cpu][1],
tsk_rt(t)->semi_part.cpu_job_no[cpu] + 1);
bheap_insert(fakepfair_ready_order,
&edfos_params(t).ready_queue, node);
node = bheap_peek(fakepfair_ready_order,
&edfos_params(t).release_queue);
}
/* Choose the next Pfair subtask. */
node = bheap_take(fakepfair_ready_order,
&edfos_params(t).ready_queue);
BUG_ON(!node);
cpu = ((int*)node->value) - edfos_params(t).heap_data;
next_rel = compute_pfair_release(edfos_params(t).fraction[cpu][0],
edfos_params(t).fraction[cpu][1],
tsk_rt(t)->semi_part.cpu_job_no[cpu]
+ 1);
if (next_rel <= tsk_rt(t)->job_params.job_no)
{
/* Next subtask already released. */
*((int*)node->value) = compute_pfair_deadline(
edfos_params(t).fraction[cpu][0],
edfos_params(t).fraction[cpu][1],
tsk_rt(t)->semi_part.cpu_job_no[cpu] +
1);
bheap_insert(fakepfair_ready_order,
&edfos_params(t).ready_queue, node);
}
else
{
/* Next subtask not yet released. */
*((int*)node->value) = next_rel;
bheap_insert(fakepfair_ready_order,
&edfos_params(t).release_queue, node);
}
TRACE_TASK(t, "%u = %u * %u / %u\n",
t->rt_param.job_params.job_no, cur_cpu_job_no(t),
cur_cpu_fract_den(t), cur_cpu_fract_num(t));
return cpu;
}
/* If needed (the share for task t on this CPU is exhausted), updates
* the task_params.cpu for the _migrating_ task t
*/
static void change_migrat_cpu_if_needed(struct task_struct *t)
{
int cpu;
BUG_ON(!is_migrat_task(t));
/* EDF-os: if it is a migrating task and it has already executed
 * the required number of jobs on this CPU, we need to move it
 * to its next CPU; changing the cpu here will affect the requeue
 * and the next release
 */
cpu = next_cpu_for_job(t);
if (unlikely(cpu != get_partition(t))) {
tsk_rt(t)->task_params.cpu = cpu;
TRACE_TASK(t, "EDF-os: will migrate job %d -> %d\n",
task_cpu(t), tsk_rt(t)->task_params.cpu);
return;
}
TRACE_TASK(t, "EDF-os: job will stay on %d -> %d\n",
task_cpu(t), tsk_rt(t)->task_params.cpu);
}
static void job_completion(struct task_struct* t, int forced)
{
sched_trace_task_completion(t,forced);
TRACE_TASK(t, "job_completion().\n");
if (unlikely(is_migrat_task(t))) {
update_job_counter(t);
change_migrat_cpu_if_needed(t);
}
set_rt_flags(t, RT_F_SLEEP);
prepare_for_next_period(t);
}
static void edfos_tick(struct task_struct *t)
{
edfos_domain_t *edfos = local_edfos;
BUG_ON(is_realtime(t) && t != edfos->scheduled);
if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
set_tsk_need_resched(t);
TRACE("edfos_scheduler_tick: "
"%d is preemptable "
" => FORCE_RESCHED\n", t->pid);
}
}
static struct task_struct* edfos_schedule(struct task_struct * prev)
{
edfos_domain_t* edfos = local_edfos;
rt_domain_t* edf = &edfos->domain;
struct task_struct* next;
int out_of_time, sleep, preempt, exists, blocks, change_cpu, resched;
raw_spin_lock(&edfos->slock);
BUG_ON(edfos->scheduled && edfos->scheduled != prev);
BUG_ON(edfos->scheduled && !is_realtime(prev));
/* (0) Determine state */
exists = edfos->scheduled != NULL;
blocks = exists && !is_running(edfos->scheduled);
out_of_time = exists &&
budget_enforced(edfos->scheduled) &&
budget_exhausted(edfos->scheduled);
sleep = exists && get_rt_flags(edfos->scheduled) == RT_F_SLEEP;
change_cpu = exists && wrong_cpu(edfos->scheduled);
preempt = edfos_preemption_needed(edf, prev);
BUG_ON(blocks && change_cpu);
if (exists)
TRACE_TASK(prev,
"blocks:%d out_of_time:%d sleep:%d preempt:%d "
"wrong_cpu:%d state:%d sig:%d\n",
blocks, out_of_time, sleep, preempt,
change_cpu, prev->state, signal_pending(prev));
/* If we need to preempt do so. */
resched = preempt;
/* If a task blocks we have no choice but to reschedule. */
if (blocks)
resched = 1;
/* If the scheduled task is now on the wrong CPU (its wake-up was
 * tardy and raced with this schedule(), so a new job was already
 * released for a remote CPU), it must be enqueued on that remote
 * ready queue and a new task selected for the local one.
 */
if (change_cpu)
resched = 1;
/* Any task that is preemptable and either exhausts its execution
* budget or wants to sleep completes. We may have to reschedule after
* this.
*/
if ((out_of_time || sleep) && !blocks) {
job_completion(edfos->scheduled, !sleep);
resched = 1;
}
/* The final scheduling decision. Do we need to switch for some reason?
* Switch if we are in RT mode and have no task or if we need to
* resched.
*/
next = NULL;
if (resched || !exists) {
if (edfos->scheduled && !blocks)
requeue(edfos->scheduled, edf);
next = __take_ready(edf);
} else
/* Only override Linux scheduler if we have a real-time task
* scheduled that needs to continue.
*/
if (exists)
next = prev;
if (next) {
TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
set_rt_flags(next, RT_F_RUNNING);
} else {
TRACE("becoming idle at %llu\n", litmus_clock());
}
edfos->scheduled = next;
raw_spin_unlock(&edfos->slock);
return next;
}
/* Prepare a task for running in RT mode
*/
static void edfos_task_new(struct task_struct * t, int on_rq, int running)
{
rt_domain_t* edf = task_edf(t);
edfos_domain_t* edfos = task_edfos(t);
unsigned long flags;
unsigned int i;
unsigned int has_cpu = 0;
if (edfos_params(t).migrat) {
bheap_init(&edfos_params(t).release_queue);
bheap_init(&edfos_params(t).ready_queue);
for (i = 0; i < NR_CPUS; i++) {
if (edfos_params(t).fraction[i][0] > 0) {
has_cpu = 1;
edfos_params(t).heap_data[i] =
compute_pfair_deadline(
edfos_params(t).fraction[i][0],
edfos_params(t).fraction[i][1], 0);
bheap_add(fakepfair_ready_order,
&edfos_params(t).ready_queue,
&edfos_params(t).heap_data[i],
GFP_ATOMIC);
}
}
BUG_ON(!has_cpu);
/* Pick the first CPU to execute on. */
change_migrat_cpu_if_needed(t);
}
TRACE_TASK(t, "EDF-os: task new, cpu = %d\n",
t->rt_param.task_params.cpu);
release_at(t, litmus_clock());
update_job_counter(t);
/* The task should be running in the queue, otherwise signal
* code will try to wake it up with fatal consequences.
*/
raw_spin_lock_irqsave(&edfos->slock, flags);
if (running) {
/* there shouldn't be anything else running at the time */
BUG_ON(edfos->scheduled);
edfos->scheduled = t;
} else {
requeue(t, edf);
/* maybe we have to reschedule */
preempt(edfos);
}
raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
static void edfos_task_wake_up(struct task_struct *task)
{
unsigned long flags;
edfos_domain_t* edfos = task_edfos(task);
rt_domain_t* edf = task_edf(task);
lt_t now;
TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
TRACE_TASK(task, "acquire edfos %d\n", edfos->cpu);
raw_spin_lock_irqsave(&edfos->slock, flags);
BUG_ON(edfos != task_edfos(task));
BUG_ON(is_queued(task));
now = litmus_clock();
if (is_tardy(task, now)) {
if (unlikely(is_migrat_task(task))) {
/* a new job will be released.
* Update current job counter */
update_job_counter(task);
/* Switch CPU if needed */
change_migrat_cpu_if_needed(task);
}
/* new sporadic release */
TRACE_TASK(task, "release new\n");
release_at(task, now);
sched_trace_task_release(task);
}
/* Only add to ready queue if it is not the currently-scheduled
* task. This could be the case if a task was woken up concurrently
* on a remote CPU before the executing CPU got around to actually
* de-scheduling the task, i.e., wake_up() raced with schedule()
* and won.
*/
if (edfos->scheduled != task)
requeue(task, edf);
raw_spin_unlock_irqrestore(&edfos->slock, flags);
TRACE_TASK(task, "release edfos %d\n", edfos->cpu);
TRACE_TASK(task, "wake up done\n");
}
static void edfos_task_block(struct task_struct *t)
{
TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
BUG_ON(!is_realtime(t));
if (is_queued(t)) {
edfos_domain_t *edfos = local_edfos;
TRACE_TASK(t, "task blocked, race with wakeup, "
"remove from queue %d\n", edfos->cpu);
remove(&edfos->domain, t);
}
}
static void edfos_task_exit(struct task_struct * t)
{
unsigned long flags;
edfos_domain_t* edfos = task_edfos(t);
rt_domain_t* edf;
raw_spin_lock_irqsave(&edfos->slock, flags);
if (is_queued(t)) {
/* dequeue */
edf = task_edf(t);
remove(edf, t);
}
if (edfos->scheduled == t)
edfos->scheduled = NULL;
/* Deallocate heap nodes. */
while (bheap_take_del(fakepfair_ready_order,
&edfos_params(t).release_queue)) {}
while (bheap_take_del(fakepfair_ready_order,
&edfos_params(t).ready_queue)) {}
TRACE_TASK(t, "RIP\n");
preempt(edfos);
raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
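/* Admission check. Sketch of the assumed user-space protocol (not part
 * of this file): the task is expected to have set task_params.cpu and
 * migrated to that CPU (e.g., via liblitmus) before entering real-time
 * mode, so that task_cpu(tsk) already matches its partition here.
 */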
static long edfos_admit_task(struct task_struct* tsk)
{
return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
}
/* Plugin object */
static struct sched_plugin edfos_plugin __cacheline_aligned_in_smp = {
.plugin_name = "EDF-os",
.tick = edfos_tick,
.task_new = edfos_task_new,
.complete_job = complete_job,
.task_exit = edfos_task_exit,
.schedule = edfos_schedule,
.task_wake_up = edfos_task_wake_up,
.task_block = edfos_task_block,
.admit_task = edfos_admit_task
};
static int __init init_edfos(void)
{
int i;
edfos_domain_t *edfos;
/* Note: this is broken if the number of online CPUs can change
 * (CPU hotplug). */
for (i = 0; i < num_online_cpus(); i++) {
edfos = remote_edfos(i);
edfos->cpu = i;
edfos->scheduled = NULL;
rt_domain_init(&edfos->domain, edfos_ready_order, NULL,
edfos_release_jobs);
}
return register_sched_plugin(&edfos_plugin);
}
module_init(init_edfos);