/*
 * kernel/sched_cedf.c
 *
 * Implementation of the Clustered EDF (C-EDF) scheduling algorithm.
 * Linking is included so that support for synchronization (e.g., through
 * the implementation of a "CSN-EDF" algorithm) can be added later if desired.
 *
 * This version uses the simple approach and serializes all scheduling
 * decisions by the use of a queue lock. This is probably not the
 * best way to do it, but it should suffice for now.
 */

#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/module.h>

#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/sched_plugin.h>
#include <litmus/edf_common.h>
#include <litmus/sched_trace.h>
#include <litmus/bheap.h>

/* Overview of C-EDF operations.
 *
 * link_task_to_cpu(T, cpu)	- Low-level operation to update the linkage
 *				  structure (NOT the actually scheduled
 *				  task). If there is another linked task To
 *				  already it will set To->linked_on = NO_CPU
 *				  (thereby removing its association with this
 *				  CPU). However, it will not requeue the
 *				  previously linked task (if any). It will set
 *				  T's state to RT_F_RUNNING and check whether
 *				  it is already running somewhere else. If T
 *				  is scheduled somewhere else it will link
 *				  it to that CPU instead (and pull the task
 *				  that was linked there to cpu). T may be NULL.
 *
 * unlink(T)			- Unlink removes T from all scheduler data
 *				  structures. If it is linked to some CPU it
 *				  will link NULL to that CPU. If it is
 *				  currently queued in the cedf queue for
 *				  a partition, it will be removed from
 *				  the rt_domain. It is safe to call
 *				  unlink(T) if T is not linked. T may not
 *				  be NULL.
 *
 * requeue(T)			- Requeue will insert T into the appropriate
 *				  queue. If the system is in real-time mode
 *				  and T has already been released, it will go
 *				  into the ready queue. If the system is not
 *				  in real-time mode, T will go into the
 *				  release queue. If T's release time is in the
 *				  future, it will go into the release
 *				  queue. That means that T's release time/job
 *				  no/etc. has to be updated before requeue(T)
 *				  is called. It is not safe to call requeue(T)
 *				  when T is already queued. T may not be NULL.
 *
 * cedf_job_arrival(T)		- This is the catch-all function when T enters
 *				  the system after either a suspension or at a
 *				  job release. It will queue T (which means it
 *				  is not safe to call cedf_job_arrival(T) if
 *				  T is already queued) and then check whether a
 *				  preemption is necessary. If a preemption is
 *				  necessary it will update the linkage
 *				  accordingly and cause schedule() to be called
 *				  (either with an IPI or need_resched). It is
 *				  safe to call cedf_job_arrival(T) if T's
 *				  next job has not been actually released yet
 *				  (release time in the future). T will be put
 *				  on the release queue in that case.
 *
 * job_completion(T)		- Take care of everything that needs to be done
 *				  to prepare T for its next release and place
 *				  it in the right queue with
 *				  cedf_job_arrival().
 *
 *
 * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
 * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
 * the functions will automatically propagate pending tasks from the ready
 * queue to a linked task. This is the job of the calling function (by means
 * of __take_ready).
 */
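
/*
 * Illustrative usage sketch: with the cluster's slock held, a job that
 * enters the system is handled by the two-step pattern
 *
 *	requeue(t);                            (queue by release state)
 *	check_for_preemptions(task_cedf(t));   (relink CPUs if necessary)
 *
 * which is exactly what cedf_job_arrival() below implements.
 */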

/* cpu_entry_t - maintain the linked and scheduled state
 */
typedef struct {
	int			cpu;
	struct task_struct*	linked;		/* only RT tasks */
	struct task_struct*	scheduled;	/* only RT tasks */
	struct list_head	list;
	atomic_t		will_schedule;	/* prevent unneeded IPIs */
} cpu_entry_t;
DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);

cpu_entry_t** cedf_cpu_entries_array;

#define set_will_schedule() \
	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1))
#define clear_will_schedule() \
	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0))
#define test_will_schedule(cpu) \
	(atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule))

/* Cluster size -- currently four. This is a variable to allow for
 * the possibility of changing the cluster size online in the future.
 */
int cluster_size = 4;

int do_cleanup = 1;

typedef struct {
	rt_domain_t		domain;
	int			first_cpu;
	int			last_cpu;

	/* the cpus queue themselves according to priority in here */
	struct list_head	cedf_cpu_queue;

	/* per-partition spinlock: protects the domain and
	 * serializes scheduling decisions
	 */
#define slock domain.ready_lock
} cedf_domain_t;

DEFINE_PER_CPU(cedf_domain_t*, cedf_domains) = NULL;

cedf_domain_t** cedf_domains_array;


/* These are defined similarly to partitioning, except that a
 * task's partition is any cpu of the cluster to which it
 * is assigned, typically the lowest-numbered cpu.
 */
#define local_edf		(&__get_cpu_var(cedf_domains)->domain)
#define local_cedf		__get_cpu_var(cedf_domains)
#define remote_edf(cpu)		(&per_cpu(cedf_domains, cpu)->domain)
#define remote_cedf(cpu)	per_cpu(cedf_domains, cpu)
#define task_edf(task)		remote_edf(get_partition(task))
#define task_cedf(task)		remote_cedf(get_partition(task))

/* update_cpu_position - Move the cpu entry to the correct place to maintain
 *                       order in the cpu queue. Caller must hold cedf lock.
 *
 *                       This really should be a heap.
 */
static void update_cpu_position(cpu_entry_t *entry)
{
	cpu_entry_t *other;
	struct list_head *cedf_cpu_queue =
		&(remote_cedf(entry->cpu))->cedf_cpu_queue;
	struct list_head *pos;

	BUG_ON(!cedf_cpu_queue);

	if (likely(in_list(&entry->list)))
		list_del(&entry->list);
	/* if we do not execute real-time jobs we just move
	 * to the end of the queue
	 */
	if (entry->linked) {
		list_for_each(pos, cedf_cpu_queue) {
			other = list_entry(pos, cpu_entry_t, list);
			if (edf_higher_prio(entry->linked, other->linked)) {
				__list_add(&entry->list, pos->prev, pos);
				return;
			}
		}
	}
	/* if we get this far we have the lowest priority job */
	list_add_tail(&entry->list, cedf_cpu_queue);
}
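
/*
 * Queue discipline (descriptive note): update_cpu_position() keeps
 * cedf_cpu_queue sorted by the priority of each entry's linked task,
 * highest priority first; entries without a linked task end up at the
 * tail. check_for_preemptions() below relies on this by always taking
 * the tail entry as its preemption candidate.
 */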

/* link_task_to_cpu - Update the link of a CPU.
 *                    Handles the case where the to-be-linked task is already
 *                    scheduled on a different CPU.
 */
static noinline void link_task_to_cpu(struct task_struct* linked,
				      cpu_entry_t *entry)
{
	cpu_entry_t *sched;
	struct task_struct* tmp;
	int on_cpu;

	BUG_ON(linked && !is_realtime(linked));

	/* Cannot link task to a CPU that doesn't belong to its partition... */
	BUG_ON(linked && remote_cedf(entry->cpu) != task_cedf(linked));

	/* Currently linked task is set to be unlinked. */
	if (entry->linked) {
		entry->linked->rt_param.linked_on = NO_CPU;
	}

	/* Link new task to CPU. */
	if (linked) {
		set_rt_flags(linked, RT_F_RUNNING);
		/* handle the case where the task is already scheduled
		 * somewhere else!
		 */
		on_cpu = linked->rt_param.scheduled_on;
		if (on_cpu != NO_CPU) {
			sched = &per_cpu(cedf_cpu_entries, on_cpu);
			/* this should only happen if not linked already */
			BUG_ON(sched->linked == linked);

			/* If we are already scheduled on the CPU to which we
			 * wanted to link, we don't need to do the swap --
			 * we just link ourselves to the CPU and depend on
			 * the caller to get things right.
			 */
			if (entry != sched) {
				tmp = sched->linked;
				linked->rt_param.linked_on = sched->cpu;
				sched->linked = linked;
				update_cpu_position(sched);
				linked = tmp;
			}
		}
		if (linked) /* might be NULL due to swap */
			linked->rt_param.linked_on = entry->cpu;
	}
	entry->linked = linked;

	if (entry->linked)
		TRACE_TASK(entry->linked, "linked to CPU %d, state:%d\n",
			   entry->cpu, entry->linked->state);
	else
		TRACE("NULL linked to CPU %d\n", entry->cpu);

	update_cpu_position(entry);
}

/* unlink - Make sure a task is not linked any longer to an entry
 *          where it was linked before. Must hold cedf_lock.
 */
static noinline void unlink(struct task_struct* t)
{
	cpu_entry_t *entry;

	if (unlikely(!t)) {
		TRACE_BUG_ON(!t);
		return;
	}

	if (t->rt_param.linked_on != NO_CPU) {
		/* unlink */
		entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
		t->rt_param.linked_on = NO_CPU;
		link_task_to_cpu(NULL, entry);
	} else if (is_queued(t)) {
		/* This is an interesting situation: t is scheduled,
		 * but was just recently unlinked. It cannot be
		 * linked anywhere else (because then it would have
		 * been relinked to this CPU), thus it must be in some
		 * queue. We must remove it from the list in this
		 * case.
		 */
		remove(task_edf(t), t);
	}
}

/* preempt - force a CPU to reschedule
 */
static noinline void preempt(cpu_entry_t *entry)
{
	preempt_if_preemptable(entry->scheduled, entry->cpu);
}

/* requeue - Put an unlinked task into the c-edf domain.
 *           Caller must hold cedf_lock.
 */
static noinline void requeue(struct task_struct* task)
{
	cedf_domain_t* cedf;
	rt_domain_t* edf;

	BUG_ON(!task);
	/* sanity check rt_list before insertion */
	BUG_ON(is_queued(task));

	/* Get correct real-time domain. */
	cedf = task_cedf(task);
	edf = &cedf->domain;

	if (is_released(task, litmus_clock()))
		__add_ready(edf, task);
	else {
		/* it has got to wait */
		add_release(edf, task);
	}
}

static void check_for_preemptions(cedf_domain_t* cedf)
{
	cpu_entry_t *last;
	struct task_struct *task;
	struct list_head *cedf_cpu_queue;

	cedf_cpu_queue = &cedf->cedf_cpu_queue;

	for (last = list_entry(cedf_cpu_queue->prev, cpu_entry_t, list);
	     edf_preemption_needed(&cedf->domain, last->linked);
	     last = list_entry(cedf_cpu_queue->prev, cpu_entry_t, list)) {
		/* preemption necessary */
		task = __take_ready(&cedf->domain);
		TRACE("check_for_preemptions: task %d linked to %d, state:%d\n",
		      task->pid, last->cpu, task->state);
		if (last->linked)
			requeue(last->linked);
		link_task_to_cpu(task, last);
		preempt(last);
	}
}
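
/*
 * Example of the two paths through requeue(): a job whose release time has
 * already passed goes straight into the ready queue and, if it has higher
 * priority than the lowest-priority linked task of its cluster,
 * check_for_preemptions() links it to that CPU and triggers a preemption.
 * A job whose release lies in the future goes onto the release queue and
 * re-enters via cedf_release_jobs() once its release timer fires.
 */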

/* cedf_job_arrival: task is either resumed or released
 */
static noinline void cedf_job_arrival(struct task_struct* task)
{
	cedf_domain_t* cedf;
	rt_domain_t* edf;

	BUG_ON(!task);

	/* Get correct real-time domain. */
	cedf = task_cedf(task);
	edf = &cedf->domain;

	/* first queue arriving job */
	requeue(task);

	/* then check for any necessary preemptions */
	check_for_preemptions(cedf);
}

/* check for current job releases */
static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
	cedf_domain_t* cedf = container_of(rt, cedf_domain_t, domain);
	unsigned long flags;

	spin_lock_irqsave(&cedf->slock, flags);

	__merge_ready(&cedf->domain, tasks);
	check_for_preemptions(cedf);

	spin_unlock_irqrestore(&cedf->slock, flags);
}

/* cedf_tick - this function is called for every local timer
 *             interrupt.
 *
 *             checks whether the current task has exhausted its budget
 *             and, if so, either forces a reschedule or, for a
 *             non-preemptable task, requests that it leave its
 *             non-preemptable section
 */
static void cedf_tick(struct task_struct* t)
{
	BUG_ON(!t);

	if (is_realtime(t) && budget_exhausted(t)) {
		if (!is_np(t)) {
			/* np tasks will be preempted when they become
			 * preemptable again
			 */
			set_tsk_need_resched(t);
			set_will_schedule();
			TRACE("cedf_scheduler_tick: "
			      "%d is preemptable (state:%d) "
			      " => FORCE_RESCHED\n", t->pid, t->state);
		} else if (is_user_np(t)) {
			TRACE("cedf_scheduler_tick: "
			      "%d is non-preemptable (state:%d), "
			      "preemption delayed.\n", t->pid, t->state);
			request_exit_np(t);
		}
	}
}

/* caller holds cedf_lock */
static noinline void job_completion(struct task_struct *t, int forced)
{
	BUG_ON(!t);

	sched_trace_task_completion(t, forced);

	TRACE_TASK(t, "job_completion(). [state:%d]\n", t->state);

	/* set flags */
	set_rt_flags(t, RT_F_SLEEP);
	/* prepare for next period */
	prepare_for_next_period(t);
	/* unlink */
	unlink(t);
	/* requeue
	 * But don't requeue a blocking task.
	 */
	if (is_running(t))
		cedf_job_arrival(t);
}
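
/*
 * How a completion is reached: when a job signals completion (RT_F_SLEEP is
 * set, e.g. via the complete_job callback registered in the plugin struct
 * below), the next cedf_schedule() invokes job_completion() with forced == 0;
 * when cedf_tick() observes an exhausted budget instead, job_completion() is
 * invoked with forced == 1. Either way, prepare_for_next_period() advances
 * the job's release time and deadline before cedf_job_arrival() requeues it.
 */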

/* Getting schedule() right is a bit tricky. schedule() may not make any
 * assumptions on the state of the current task since it may be called for a
 * number of reasons. The reasons include: scheduler_tick() determined that a
 * reschedule was necessary, sys_exit_np() was called, some Linux subsystem
 * decided so, or even (in the worst case) a bug is hidden somewhere. Thus, we
 * must take extreme care to determine what the current state is.
 *
 * The CPU could currently be scheduling a task (or not), be linked (or not).
 *
 * The following assertions for the scheduled task could hold:
 *
 *	- !is_running(scheduled)	// the job blocks
 *	- scheduled->timeslice == 0	// the job completed (forcefully)
 *	- get_rt_flag() == RT_F_SLEEP	// the job completed (by syscall)
 *	- linked != scheduled		// we need to reschedule (for any reason)
 *	- is_np(scheduled)		// rescheduling must be delayed,
 *					   sys_exit_np must be requested
 *
 * Any of these can occur together.
 */
static struct task_struct* cedf_schedule(struct task_struct * prev)
{
	cedf_domain_t*		cedf = local_cedf;
	rt_domain_t*		edf = &cedf->domain;
	cpu_entry_t*		entry = &__get_cpu_var(cedf_cpu_entries);
	int			out_of_time, sleep, preempt, np,
				exists, blocks;
	struct task_struct*	next = NULL;

	BUG_ON(!prev);
	BUG_ON(!cedf);
	BUG_ON(!edf);
	BUG_ON(!entry);
	BUG_ON(cedf != remote_cedf(entry->cpu));
	BUG_ON(is_realtime(prev) && cedf != task_cedf(prev));

	/* Will be released in finish_switch. */
	spin_lock(&cedf->slock);
	clear_will_schedule();

	/* sanity checking */
	BUG_ON(entry->scheduled && entry->scheduled != prev);
	BUG_ON(entry->scheduled && !is_realtime(prev));
	BUG_ON(is_realtime(prev) && !entry->scheduled);

	/* (0) Determine state */
	exists      = entry->scheduled != NULL;
	blocks      = exists && !is_running(entry->scheduled);
	out_of_time = exists && budget_exhausted(entry->scheduled);
	np          = exists && is_np(entry->scheduled);
	sleep       = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
	preempt     = entry->scheduled != entry->linked;

	/* If a task blocks we have no choice but to reschedule. */
	if (blocks)
		unlink(entry->scheduled);

	/* Request a sys_exit_np() call if we would like to preempt but cannot.
	 * We need to make sure to update the link structure anyway in case
	 * that we are still linked. Multiple calls to request_exit_np() don't
	 * hurt.
	 */
	if (np && (out_of_time || preempt || sleep)) {
		unlink(entry->scheduled);
		request_exit_np(entry->scheduled);
	}

	/* Any task that is preemptable and either exhausts its execution
	 * budget or wants to sleep completes. We may have to reschedule after
	 * this. Don't do a job completion if it blocks (can't have timers
	 * running for blocked jobs). Preemptions go first for the same reason.
	 */
	if (!np && (out_of_time || sleep) && !blocks && !preempt)
		job_completion(entry->scheduled, !sleep);

	/* Link pending task if we became unlinked. */
	if (!entry->linked)
		link_task_to_cpu(__take_ready(edf), entry);

	/* The final scheduling decision. Do we need to switch for some reason?
	 * If linked is different from scheduled, select linked as next.
	 */
	if ((!np || blocks) && entry->linked != entry->scheduled) {
		/* Schedule a linked job? */
		if (entry->linked) {
			entry->linked->rt_param.scheduled_on = entry->cpu;
			next = entry->linked;
		}
		if (entry->scheduled) {
			/* not gonna be scheduled soon */
			entry->scheduled->rt_param.scheduled_on = NO_CPU;
			TRACE_TASK(entry->scheduled,
				   "cedf_schedule: scheduled_on = NO_CPU\n");
		}
	} else
		/* Only override Linux scheduler if we have a real-time task
		 * scheduled that needs to continue.
		 */
		if (exists)
			next = prev;

	spin_unlock(&cedf->slock);

	return next;
}

/* _finish_switch - we just finished the switch away from prev
 */
static void cedf_finish_switch(struct task_struct *prev)
{
	cpu_entry_t*	entry = &__get_cpu_var(cedf_cpu_entries);

	BUG_ON(!prev);
	BUG_ON(!entry);

	entry->scheduled = is_realtime(current) ? current : NULL;
}

/* Prepare a task for running in RT mode
 */
static void cedf_task_new(struct task_struct *t, int on_rq, int running)
{
	unsigned long	flags;
	cedf_domain_t*	cedf = task_cedf(t);
	cpu_entry_t*	entry;

	BUG_ON(!cedf);

	spin_lock_irqsave(&cedf->slock, flags);
	if (running) {
		entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
		BUG_ON(!entry);
		BUG_ON(entry->scheduled);
		entry->scheduled = t;
		t->rt_param.scheduled_on = task_cpu(t);
	} else
		t->rt_param.scheduled_on = NO_CPU;
	t->rt_param.linked_on = NO_CPU;

	/* setup job params */
	release_at(t, litmus_clock());

	cedf_job_arrival(t);
	spin_unlock_irqrestore(&cedf->slock, flags);
}
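
/*
 * Note on cedf_task_new(): a task that is already running when it enters
 * real-time mode is only recorded as scheduled on its current CPU; it is
 * deliberately not linked anywhere yet. The cedf_job_arrival() call above
 * then decides, via check_for_preemptions(), whether it should be linked to
 * a CPU of its cluster or wait in the ready queue.
 */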

static void cedf_task_wake_up(struct task_struct *task)
{
	unsigned long	flags;
	cedf_domain_t*	cedf;
	lt_t		now;

	BUG_ON(!task);

	cedf = task_cedf(task);
	BUG_ON(!cedf);

	spin_lock_irqsave(&cedf->slock, flags);

	/* We need to take suspensions because of semaphores into
	 * account! If a job resumes after being suspended due to acquiring
	 * a semaphore, it should never be treated as a new job release.
	 */
	if (get_rt_flags(task) == RT_F_EXIT_SEM) {
		set_rt_flags(task, RT_F_RUNNING);
	} else {
		now = litmus_clock();
		if (is_tardy(task, now)) {
			/* new sporadic release */
			release_at(task, now);
			sched_trace_task_release(task);
		} else if (task->rt.time_slice)
			/* came back in time before deadline */
			set_rt_flags(task, RT_F_RUNNING);
	}
	cedf_job_arrival(task);
	spin_unlock_irqrestore(&cedf->slock, flags);
}

static void cedf_task_block(struct task_struct *t)
{
	unsigned long flags;

	BUG_ON(!t);

	/* unlink if necessary */
	spin_lock_irqsave(&task_cedf(t)->slock, flags);

	t->rt_param.scheduled_on = NO_CPU;
	unlink(t);

	spin_unlock_irqrestore(&task_cedf(t)->slock, flags);

	BUG_ON(!is_realtime(t));
}

static void cedf_task_exit(struct task_struct * t)
{
	unsigned long flags;

	BUG_ON(!t);

	/* unlink if necessary */
	spin_lock_irqsave(&task_cedf(t)->slock, flags);
	unlink(t);
	if (tsk_rt(t)->scheduled_on != NO_CPU) {
		cedf_cpu_entries_array[tsk_rt(t)->scheduled_on]->
			scheduled = NULL;
		tsk_rt(t)->scheduled_on = NO_CPU;
	}
	spin_unlock_irqrestore(&task_cedf(t)->slock, flags);

	BUG_ON(!is_realtime(t));
	TRACE_TASK(t, "RIP\n");
}

static long cedf_admit_task(struct task_struct* tsk)
{
	return (task_cpu(tsk) >= task_cedf(tsk)->first_cpu &&
		task_cpu(tsk) <= task_cedf(tsk)->last_cpu) ? 0 : -EINVAL;
}

/* Plugin object */
static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "C-EDF",
	.finish_switch		= cedf_finish_switch,
	.tick			= cedf_tick,
	.task_new		= cedf_task_new,
	.complete_job		= complete_job,
	.task_exit		= cedf_task_exit,
	.schedule		= cedf_schedule,
	.task_wake_up		= cedf_task_wake_up,
	.task_block		= cedf_task_block,
	.admit_task		= cedf_admit_task
};

static void cedf_domain_init(int first_cpu, int last_cpu)
{
	int cpu;

	/* Create new domain for this cluster. */
	cedf_domain_t *new_cedf_domain = kmalloc(sizeof(*new_cedf_domain),
						 GFP_KERNEL);

	/* Initialize cluster domain. */
	edf_domain_init(&new_cedf_domain->domain, NULL, cedf_release_jobs);
	new_cedf_domain->first_cpu	= first_cpu;
	new_cedf_domain->last_cpu	= last_cpu;
	INIT_LIST_HEAD(&new_cedf_domain->cedf_cpu_queue);

	/* Assign all cpus in cluster to point to this domain. */
	for (cpu = first_cpu; cpu <= last_cpu; cpu++) {
		remote_cedf(cpu) = new_cedf_domain;
		cedf_domains_array[cpu] = new_cedf_domain;
	}
}
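
/*
 * Example: on a machine with eight online CPUs and the default cluster_size
 * of four, init_cedf() below calls cedf_domain_init(0, 3) and
 * cedf_domain_init(4, 7), so CPUs 0-3 share one rt_domain (and one slock)
 * while CPUs 4-7 share the other; tasks only ever migrate within their
 * cluster.
 */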

static int __init init_cedf(void)
{
	int cpu;
	cpu_entry_t *entry;

	/* num_online_cpus() should have been set already;
	 * if the number of available cpus is less than the cluster
	 * size (currently 4) then it is pointless trying to use
	 * CEDF, so we disable this plugin
	 */
	if (num_online_cpus() < cluster_size) {
		printk(KERN_INFO "Not registering C-EDF plugin: "
		       "Num Online Cpus (%d) < Min Cluster Size (%d)\n",
		       num_online_cpus(), cluster_size);
		do_cleanup = 0;
		return 0;
	}

	/*
	 * initialize shortcut for per-cpu cedf state;
	 * there may be a problem here if someone removes a cpu
	 * while we are doing this initialization... and if cpus
	 * are added / removed later... is it a _real_ problem for cedf?
	 */
	cedf_cpu_entries_array =
		kmalloc(sizeof(cpu_entry_t *) * num_online_cpus(),
			GFP_KERNEL);

	cedf_domains_array =
		kmalloc(sizeof(cedf_domain_t *) * num_online_cpus(),
			GFP_KERNEL);

	/* initialize CPU state */
	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
		entry = &per_cpu(cedf_cpu_entries, cpu);
		cedf_cpu_entries_array[cpu] = entry;
		atomic_set(&entry->will_schedule, 0);
		entry->linked = NULL;
		entry->scheduled = NULL;
		entry->cpu = cpu;
		INIT_LIST_HEAD(&entry->list);
	}

	/* initialize all cluster domains */
	for (cpu = 0; cpu < num_online_cpus(); cpu += cluster_size)
		cedf_domain_init(cpu, cpu + cluster_size - 1);

	return register_sched_plugin(&cedf_plugin);
}

static void clean_cedf(void)
{
	if (do_cleanup) {
		kfree(cedf_cpu_entries_array);
		kfree(cedf_domains_array);
	}
}

module_init(init_cedf);
module_exit(clean_cedf);