/*
 * kernel/sched_part_edf.c
 *
 * Implementation of the partitioned EDF scheduler plugin.
 */

#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#include <linux/litmus.h>
#include <linux/sched_plugin.h>
#include <linux/edf_common.h>


typedef struct {
	rt_domain_t 		domain;
	int          		cpu;
	struct task_struct* 	scheduled; /* only RT tasks */
	spinlock_t   		lock;
} part_edf_domain_t;


#define local_edf		(&__get_cpu_var(part_edf_domains).domain)
#define local_pedf		(&__get_cpu_var(part_edf_domains))
#define remote_edf(cpu)		(&per_cpu(part_edf_domains, cpu).domain)
#define remote_pedf(cpu)	(&per_cpu(part_edf_domains, cpu))
#define task_edf(task)		remote_edf(get_partition(task))

static void part_edf_domain_init(part_edf_domain_t* pedf, 
				 check_resched_needed_t check,
				 int cpu)
{
	edf_domain_init(&pedf->domain, check);
	pedf->cpu      		= cpu;
	pedf->lock     		= SPIN_LOCK_UNLOCKED;
	pedf->scheduled		= NULL;
}

DEFINE_PER_CPU(part_edf_domain_t, part_edf_domains);

/* This check is trivial in partioned systems as we only have to consider
 * the CPU of the partition.
 *
 */
static int part_edf_check_resched(rt_domain_t *edf) 
{
	part_edf_domain_t *pedf = container_of(edf, part_edf_domain_t, domain);
	int ret = 0;

	spin_lock(&pedf->lock);
       
	/* because this is a callback from rt_domain_t we already hold
	 * the necessary lock for the ready queue
	 */
	if (edf_preemption_needed(edf, pedf->scheduled)) {
		if (pedf->cpu == smp_processor_id())
			set_tsk_need_resched(current);
		else
			smp_send_reschedule(pedf->cpu);
		ret = 1;
	}
	spin_unlock(&pedf->lock);
	return ret;
}


static reschedule_check_t part_edf_scheduler_tick(void)
{
	unsigned long flags;
	struct task_struct *t = current;
	reschedule_check_t want_resched = NO_RESCHED;
	rt_domain_t *edf       = local_edf;
	part_edf_domain_t *pedf = local_pedf;

	/* Check for inconsistency. We don't need the lock for this since
	 * ->scheduled is only changed in schedule, which obviously is not 
	 *  executing in parallel on this CPU
	 */
	BUG_ON(is_realtime(t) && t != pedf->scheduled);

	/* expire tasks even if not in real-time mode
	 * this makes sure that at the end of real-time mode
	 * no tasks "run away forever".
	 */
	if (is_realtime(t) && (!--t->time_slice)) {
		/* this task has exhausted its budget in this period */
		set_rt_flags(t, RT_F_SLEEP);
		want_resched = FORCE_RESCHED;
	} 
	if (get_rt_mode() == MODE_RT_RUN)
	{
		/* check whether anything is waiting to be released 
		 * this could probably be moved to the global timer
		 * interrupt handler since the state will only change
		 * once per jiffie
		 */
		try_release_pending(edf);
		if (want_resched != FORCE_RESCHED)
		{
			read_lock_irqsave(&edf->ready_lock, flags);
			if (edf_preemption_needed(edf, t))
				want_resched = FORCE_RESCHED;
			read_unlock_irqrestore(&edf->ready_lock, flags);
		}
	}       
	return want_resched;
}

static int part_edf_schedule(struct task_struct * prev, 
			     struct task_struct ** next, 
			     runqueue_t * rq)
{
	int 			need_deactivate = 1;
	part_edf_domain_t* 	pedf = local_pedf;
	rt_domain_t*		edf  = &pedf->domain; 


	if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
		edf_prepare_for_next_period(prev);	
	
	if (get_rt_mode() == MODE_RT_RUN) {
		write_lock(&edf->ready_lock);
		if (is_realtime(prev) && is_released(prev) && is_running(prev)
		    && !edf_preemption_needed(edf, prev)) {
			/* this really should only happen if the task has 
			 * 100% utilization... 
			 */
			TRACE("prev will be next, already released\n");
			*next = prev;
			need_deactivate = 0;
		} else {
			/* either not yet released, preempted, or non-rt */
			*next = __take_ready(edf);
			if (*next) {				
				/* stick the task into the runqueue */
				__activate_task(*next, rq);
				set_task_cpu(*next, smp_processor_id());
			}
		}
		spin_lock(&pedf->lock);
		pedf->scheduled = *next;
		spin_unlock(&pedf->lock);
		if (*next)
			set_rt_flags(*next, RT_F_RUNNING);
		
		write_unlock(&edf->ready_lock);
	} 

	if (is_realtime(prev) && need_deactivate && prev->array) {
		/* take it out of the run queue */
		deactivate_task(prev, rq);
	}

	return 0;
}


static void part_edf_finish_switch(struct task_struct *prev) 
{
	rt_domain_t* 	edf = local_edf;

	if (!is_realtime(prev) || !is_running(prev))
		return;

	if (get_rt_flags(prev) == RT_F_SLEEP || 
	    get_rt_mode() != MODE_RT_RUN) {
		/* this task has expired
		 * _schedule has already taken care of updating 
		 * the release and
		 * deadline. We just must check if has been released.
		 */
		if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
			/* already released */
			add_ready(edf, prev);
			TRACE("%d goes straight to ready queue\n", prev->pid);
		} else 
			/* it has got to wait */
			add_release(edf, prev);		
	} else {
		/* this is a forced preemption 
		 * thus the task stays in the ready_queue
		 * we only must make it available to others
		 */
		add_ready(edf, prev);
	}	
}


/*	Prepare a task for running in RT mode
 *	Enqueues the task into master queue data structure
 *	returns 
 *		-EPERM  if task is not TASK_STOPPED
 */
static long part_edf_prepare_task(struct task_struct * t)
{
	rt_domain_t* 	edf = task_edf(t);


	TRACE("[%d] part edf: prepare task %d on CPU %d\n", 
		smp_processor_id(), t->pid, get_partition(t));
	if (t->state == TASK_STOPPED) {
		__setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);

		if (get_rt_mode() == MODE_RT_RUN)
			/* The action is already on. 
			 * Prepare immediate release.
			 */
			edf_release_now(t);		
		/* The task should be running in the queue, otherwise signal 
		 * code will try to wake it up with fatal consequences.
		 */
		t->state = TASK_RUNNING; 
		add_release(edf, t);
		return 0;
	} else
		return -EPERM;
}

static void part_edf_wake_up_task(struct task_struct *task) 
{
	rt_domain_t* edf;

	edf = task_edf(task);

	/* We must determine whether task should go into the release 	   
	 * queue or into the ready queue. It may enter the ready queue 
	 * if it has credit left in its time slice and has not yet reached 
	 * its deadline. If it is now passed its deadline we assume this the 
	 * arrival of a new sporadic job and thus put it in the ready queue 
	 * anyway.If it has zero budget and the next release is in the future 
	 * it has to go to the release queue.
	 */
	TRACE("part edf: wake up %d with budget=%d for cpu %d\n", 
	      task->pid, task->time_slice, get_partition(task));
	task->state = TASK_RUNNING;
	if (is_tardy(task)) {
		/* new sporadic release */
		edf_release_now(task);
		add_ready(edf, task);

	} else if (task->time_slice) {
		/* Came back in time before deadline. This may cause
		 * deadline overruns, but since we don't handle suspensions
		 * in the analytical model, we don't care since we can't
		 * guarantee anything at all if tasks block.
		 */
		set_rt_flags(task, RT_F_RUNNING);
		add_ready(edf, task);

	} else {
		add_release(edf, task);
	}
		
}

static void part_edf_task_blocks(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	/* not really anything to do since it can only block if 
	 * it is running, and when it is not running it is not in any 
	 * queue anyway.
	 *
	 */
	TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
	BUG_ON(in_list(&t->rt_list));
}


/* When _tear_down is called, the task should not be in any queue any more
 * as it must have blocked first. We don't have any internal state for the task,
 * it is all in the task_struct.
 */
static long part_edf_tear_down(struct task_struct * t)
{
	BUG_ON(!is_realtime(t));
        TRACE("part edf: tear down called for %d \n", t->pid);
	BUG_ON(t->array);
	BUG_ON(in_list(&t->rt_list));
	return 0;
}


static int part_edf_mode_change(int new_mode)
{
	int cpu;

	if (new_mode == MODE_RT_RUN)
		for_each_online_cpu(cpu)
			rerelease_all(remote_edf(cpu), edf_release_at);
	TRACE("[%d] part edf: mode changed to %d\n", 
	       smp_processor_id(), new_mode);
	return 0;
}


/*	Plugin object	*/
static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
	.ready_to_use = 0 
};


/*
 *	Plugin initialization code.
 */
#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
	.plugin_name		= "Partitioned EDF",\
	.ready_to_use		= 1,\
	.scheduler_tick		= part_edf_scheduler_tick,\
	.prepare_task		= part_edf_prepare_task,\
	.sleep_next_period	= edf_sleep_next_period,\
	.tear_down		= part_edf_tear_down,\
	.schedule		= part_edf_schedule,\
	.finish_switch 		= part_edf_finish_switch,\
	.mode_change		= part_edf_mode_change,\
	.wake_up_task		= part_edf_wake_up_task,\
	.task_blocks		= part_edf_task_blocks \
}


sched_plugin_t *__init init_part_edf_plugin(void)
{
	int i;

	if (!s_plugin.ready_to_use)
	{
		for (i = 0; i < NR_CPUS; i++)
		{
			part_edf_domain_init(remote_pedf(i), 
					     part_edf_check_resched, i);
		        printk("CPU partition %d initialized.", i);
		}
		s_plugin = INIT_SCHED_PLUGIN;
	}
	return &s_plugin;
}