/*
 * kernel/sched-global-edf.c
 *
 * Re-implementation of the Global EDF scheduler.
 *
 * This version works without using the struct queue. It uses the
 * built-in kernel lists.
 */

/* NOTE: the header names were lost in the original listing; the following
 * set is what this file appears to need and is an assumption.
 */
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>

#include <linux/litmus.h>
#include <linux/sched_plugin.h>
#include <linux/sched_trace.h>
#include <linux/edf_common.h>

/* cpu_entry_t - maintain the state of the priority of each cpu's current
 * task; this is needed to check for priority inversions.
 */
typedef struct {
	int			cpu;
	int			executes_realtime;
	jiffie_t		cur_deadline;
	struct list_head	list;
	atomic_t		will_schedule;
} cpu_entry_t;
DEFINE_PER_CPU(cpu_entry_t, gedf_cpu_entries);

#define set_will_schedule() \
	(atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 1))
#define clear_will_schedule() \
	(atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 0))
#define test_will_schedule(cpu) \
	(atomic_read(&per_cpu(gedf_cpu_entries, cpu).will_schedule))

/* always acquire the cpu lock as the last lock to avoid deadlocks */
static spinlock_t gedf_cpu_lock = SPIN_LOCK_UNLOCKED;
/* the cpus queue themselves according to priority in here */
static LIST_HEAD(gedf_cpu_queue);

static rt_domain_t gedf;

#define DUMP(args...) TRACE(args)

/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
 *                    order in the cpu queue. Caller must hold the ready
 *                    write lock.
 */
static void adjust_cpu_queue(int exec_rt, jiffie_t deadline)
{
	struct list_head *pos;
	cpu_entry_t *other;
	cpu_entry_t *entry;

	spin_lock(&gedf_cpu_lock);

	entry = &__get_cpu_var(gedf_cpu_entries);
	entry->executes_realtime = exec_rt;
	entry->cur_deadline = deadline;

	/* TODO: move instead of del+reinsert */
	list_del(&entry->list);
	/* if we do not execute real-time jobs we just move
	 * to the end of the queue
	 */
	if (entry->executes_realtime)
		list_for_each(pos, &gedf_cpu_queue) {
			other = list_entry(pos, cpu_entry_t, list);
			if (!other->executes_realtime ||
			    time_before_eq(entry->cur_deadline,
					   other->cur_deadline)) {
				__list_add(&entry->list, pos->prev, pos);
				goto out;
			}
		}
	/* if we get this far we have the lowest priority task */
	list_add_tail(&entry->list, &gedf_cpu_queue);
out:
	spin_unlock(&gedf_cpu_lock);
}
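
/* In effect, adjust_cpu_queue() keeps gedf_cpu_queue sorted by the deadline
 * of each CPU's currently scheduled job, earliest deadline first, with CPUs
 * that run background (non-real-time) work at the tail. For example, with
 * deadlines given in jiffies:
 *
 *	head -> CPU1 (deadline 10) -> CPU0 (deadline 25) -> CPU2 (non-RT) <- tail
 *
 * gedf_check_resched() below relies on this invariant: it only has to compare
 * the head of the ready queue against the last entry of the CPU queue.
 */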

/* gedf_check_resched - Check whether another CPU needs to reschedule.
 *
 * The function only checks and kicks the last CPU. That CPU will reschedule
 * and kick the next one if necessary, and so on. The caller is responsible
 * for making sure that it is not the last entry or that a reschedule is not
 * necessary.
 *
 * TODO: This function is probably way too trigger-happy. It should only send
 *       IPIs if the other CPU is not going to reschedule anyway. But that is
 *       hard to detect reliably. Too many reschedules hurt performance but
 *       do not cause incorrect schedules.
 */
static int gedf_check_resched(rt_domain_t *edf)
{
	cpu_entry_t *last;
	int ret = 0;

	spin_lock(&gedf_cpu_lock);

	if (!list_empty(&edf->ready_queue)) {
		last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
		if (!last->executes_realtime ||
		    time_before(next_ready(edf)->rt_param.times.deadline,
				last->cur_deadline)) {
			if (smp_processor_id() == last->cpu)
				set_tsk_need_resched(current);
			else if (!test_will_schedule(last->cpu))
				smp_send_reschedule(last->cpu);
			ret = 1;
		}
	}

	spin_unlock(&gedf_cpu_lock);
	return ret;
}

/* gedf_scheduler_tick - this function is called for every local timer
 *                       interrupt.
 *
 * Checks whether the current task has expired and whether we need to
 * preempt it if it has not expired.
 */
static reschedule_check_t gedf_scheduler_tick(void)
{
	unsigned long flags;
	struct task_struct *t = current;
	reschedule_check_t want_resched = NO_RESCHED;

	/* expire tasks even if not in real-time mode;
	 * this makes sure that at the end of real-time mode
	 * no tasks "run away forever"
	 */
	BUG_ON(is_realtime(t) && t->time_slice > 100000);
	if (is_realtime(t) && (!--t->time_slice)) {
		/* this task has exhausted its budget in this period */
		set_rt_flags(t, RT_F_SLEEP);
		want_resched = FORCE_RESCHED;
		set_will_schedule();
		sched_trace_job_completion(t);
	}
	if (get_rt_mode() == MODE_RT_RUN) {
		/* check whether anything is waiting to be released;
		 * this could probably be moved to the global timer
		 * interrupt handler since the state will only change
		 * once per jiffie
		 */
		try_release_pending(&gedf);
		if (want_resched != FORCE_RESCHED) {
			read_lock_irqsave(&gedf.ready_lock, flags);
			if (edf_preemption_needed(&gedf, t)) {
				want_resched = FORCE_RESCHED;
				set_will_schedule();
			}
			read_unlock_irqrestore(&gedf.ready_lock, flags);
		}
	}
	return want_resched;
}

/* This is the main Global EDF schedule function.
 *
 * Assumes the caller holds the lock for rq and that irqs are disabled.
 * This function only works for indirect switching.
 */
static int gedf_schedule(struct task_struct *prev,
			 struct task_struct **next,
			 runqueue_t *rq)
{
	int need_deactivate = 1;
	int rt;
	jiffie_t deadline;
	unsigned long flags;

	if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP) {
		DUMP("preparing %d for next period\n", prev->pid);
		edf_prepare_for_next_period(prev);
	}

	if (get_rt_mode() == MODE_RT_RUN) {
		write_lock_irqsave(&gedf.ready_lock, flags);

		clear_will_schedule();

		if (is_realtime(prev) && is_released(prev) && is_running(prev)
		    && !edf_preemption_needed(&gedf, prev)) {
			/* Our current task's next job has already been
			 * released and has higher priority than the highest
			 * priority waiting task; in other words: it is tardy.
			 * We just keep it.
			 */
			DUMP("prev will be next, already released\n");
			*next = prev;
			rt = 1;
			deadline = prev->rt_param.times.deadline;
			need_deactivate = 0;
		} else {
			/* either not yet released, preempted, or non-rt */
			*next = __take_ready(&gedf);
			if (*next) {
				/* mark the task as executing on this cpu */
				set_task_cpu(*next, smp_processor_id());
				/* stick the task into the runqueue */
				__activate_task(*next, rq);
				rt = 1;
				deadline = (*next)->rt_param.times.deadline;
			} else
				rt = deadline = 0;
		}

		adjust_cpu_queue(rt, deadline);

		if (rt) {
			set_rt_flags(*next, RT_F_RUNNING);
			gedf.check_resched(&gedf);
		}
		write_unlock_irqrestore(&gedf.ready_lock, flags);
	}

	if (is_realtime(prev) && need_deactivate && prev->array) {
		/* take it out of the run queue */
		deactivate_task(prev, rq);
	}

	/* Don't put prev back into the release queue yet. We first need to
	 * actually switch stacks before it can execute on a different CPU.
	 */

	/* in the current implementation nobody cares about the return value */
	return 0;
}
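
/* Note on the gedf_schedule()/gedf_finish_switch() split: gedf_schedule()
 * deliberately does not requeue prev. Until the low-level context switch has
 * completed, prev's kernel stack is still in use on this CPU, so prev must
 * not yet be visible in the ready or release queue where another CPU could
 * pick it up and start executing it. gedf_finish_switch() below requeues
 * prev only once the switch away from it is done.
 */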

/* gedf_finish_switch - we just finished the switch away from prev;
 *                      it is now safe to requeue the task
 */
static void gedf_finish_switch(struct task_struct *prev)
{
	if (!is_realtime(prev) || !is_running(prev))
		return;

	/*printk(KERN_INFO "gedf finish switch for %d\n", prev->pid);*/
	if (get_rt_flags(prev) == RT_F_SLEEP ||
	    get_rt_mode() != MODE_RT_RUN) {
		/* this task has expired;
		 * _schedule has already taken care of updating the release
		 * and deadline. We just must check whether it has been
		 * released.
		 */
		if (time_before_eq(prev->rt_param.times.release, jiffies)
		    && get_rt_mode() == MODE_RT_RUN) {
			/* already released */
			add_ready(&gedf, prev);
			DUMP("%d goes straight to ready queue\n", prev->pid);
		} else
			/* it has got to wait */
			add_release(&gedf, prev);
	} else {
		/* this is a forced preemption,
		 * thus the task stays in the ready queue;
		 * we only must make it available to others
		 */
		add_ready(&gedf, prev);
	}
}

/* Prepare a task for running in RT mode.
 * Enqueues the task into the master queue data structure.
 * Returns
 *	-EPERM if the task is not TASK_STOPPED
 */
static long gedf_prepare_task(struct task_struct *t)
{
	TRACE("global edf: prepare task %d\n", t->pid);

	if (t->state == TASK_STOPPED) {
		__setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);

		if (get_rt_mode() == MODE_RT_RUN)
			/* The action is already on.
			 * Prepare immediate release.
			 */
			edf_release_now(t);
		/* The task should be running in the queue, otherwise signal
		 * code will try to wake it up with fatal consequences.
		 */
		t->state = TASK_RUNNING;
		add_release(&gedf, t);
		return 0;
	} else
		return -EPERM;
}

static void gedf_wake_up_task(struct task_struct *task)
{
	/* We must determine whether the task should go into the release
	 * queue or into the ready queue. It may enter the ready queue
	 * if it has credit left in its time slice and has not yet reached
	 * its deadline. If it is now past its deadline we assume this is
	 * the arrival of a new sporadic job and thus put it in the ready
	 * queue anyway. If it has zero budget and the next release is in
	 * the future it has to go to the release queue.
	 */
	TRACE("global edf: wake up %d with budget=%d\n",
	      task->pid, task->time_slice);
	task->state = TASK_RUNNING;
	if (is_tardy(task)) {
		/* new sporadic release */
		edf_release_now(task);
		sched_trace_job_release(task);
		add_ready(&gedf, task);
	} else if (task->time_slice) {
		/* came back in time before its deadline
		 * TODO: clip the budget to fit into the period, otherwise it
		 *	 could cause a deadline overrun in the next period,
		 *	 i.e. over-allocation in the next period.
		 */
		set_rt_flags(task, RT_F_RUNNING);
		add_ready(&gedf, task);
	} else {
		add_release(&gedf, task);
	}
}

static void gedf_task_blocks(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	/* not really anything to do since it can only block if
	 * it is running, and when it is not running it is not in any
	 * queue anyway
	 *
	 * TODO: Check whether the assumption is correct for SIGKILL and
	 *	 SIGSTOP.
	 */
	TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);

	BUG_ON(t->rt_list.next != LIST_POISON1);
	BUG_ON(t->rt_list.prev != LIST_POISON2);
}

/* When _tear_down is called, the task should not be in any queue any more
 * as it must have blocked first. We don't have any internal state for the
 * task, it is all in the task_struct.
 */
static long gedf_tear_down(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	TRACE("global edf: tear down called for %d\n", t->pid);
	BUG_ON(t->array);
	BUG_ON(t->rt_list.next != LIST_POISON1);
	BUG_ON(t->rt_list.prev != LIST_POISON2);
	return 0;
}

static int gedf_mode_change(int new_mode)
{
	int cpu;
	cpu_entry_t *entry;

	/*printk(KERN_INFO "[%d] global edf: mode changed to %d\n",
	  smp_processor_id(), new_mode);*/
	if (new_mode == MODE_RT_RUN) {
		rerelease_all(&gedf, edf_release_at);

		/* initialize per-CPU state;
		 * we can't do this at boot time because we don't know
		 * which CPUs will be online and we can't put non-existing
		 * cpus into the queue
		 */
		spin_lock(&gedf_cpu_lock);
		/* get old cruft out of the way in case we reenter real-time
		 * mode for a second time
		 */
		while (!list_empty(&gedf_cpu_queue))
			list_del(gedf_cpu_queue.next);
		/* reinitialize */
		for_each_online_cpu(cpu) {
			entry = &per_cpu(gedf_cpu_entries, cpu);
			atomic_set(&entry->will_schedule, 0);
			entry->executes_realtime = 0;
			entry->cur_deadline = 0;
			entry->cpu = cpu;
			list_add(&entry->list, &gedf_cpu_queue);
		}
		spin_unlock(&gedf_cpu_lock);
	}
	/*printk(KERN_INFO "[%d] global edf: mode change done\n",
	  smp_processor_id());*/
	return 0;
}

/*	Plugin object	*/
static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
	.ready_to_use = 0
};

/*
 *	Plugin initialization code.
 */
#define INIT_SCHED_PLUGIN (struct sched_plugin){		\
	.plugin_name		= "Global EDF",			\
	.ready_to_use		= 1,				\
	.scheduler_tick		= gedf_scheduler_tick,		\
	.prepare_task		= gedf_prepare_task,		\
	.sleep_next_period	= edf_sleep_next_period,	\
	.tear_down		= gedf_tear_down,		\
	.shutdown_hook		= 0,				\
	.schedule		= gedf_schedule,		\
	.finish_switch		= gedf_finish_switch,		\
	.mode_change		= gedf_mode_change,		\
	.wake_up_task		= gedf_wake_up_task,		\
	.task_blocks		= gedf_task_blocks		\
}

sched_plugin_t *__init init_global_edf_plugin(void)
{
	if (!s_plugin.ready_to_use) {
		edf_domain_init(&gedf, gedf_check_resched);
		s_plugin = INIT_SCHED_PLUGIN;
	}
	return &s_plugin;
}

/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*                        NON-PREEMPTIVE GLOBAL EDF                          */

/* gedf_np_scheduler_tick - this function is called for every local timer
 *                          interrupt.
 *
 * checks whether the current task has expired and checks
 * whether we need to preempt it if it has not expired
 */
static reschedule_check_t gedf_np_scheduler_tick(void)
{
	if (get_rt_mode() == MODE_RT_RUN) {
		/* check whether anything is waiting to be released;
		 * this could probably be moved to the global timer
		 * interrupt handler since the state will only change
		 * once per jiffie
		 */
		try_release_pending(&gedf);
	}

	/* expire tasks even if not in real-time mode;
	 * this makes sure that at the end of real-time mode
	 * no tasks "run away forever"
	 */
	BUG_ON(current->time_slice > 1000);
	if (is_realtime(current) && (!--current->time_slice)) {
		/* this task has exhausted its budget in this period */
		set_rt_flags(current, RT_F_SLEEP);
		return FORCE_RESCHED;
	} else
		return NO_RESCHED;
}
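
/* In the non-preemptive variant, the tick above only asks for a reschedule
 * when a real-time job has exhausted its budget, and gedf_np_check_resched()
 * below only ever kicks a CPU that is running background (non-real-time)
 * work. The intended combined effect is that a real-time job, once scheduled,
 * runs until its budget is exhausted and is never preempted by another
 * real-time job.
 */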

/* gedf_np_check_resched - Check whether another CPU needs to reschedule.
 *
 * The function only checks and kicks the last CPU. That CPU will reschedule
 * and kick the next one if necessary, and so on. The caller is responsible
 * for making sure that it is not the last entry or that a reschedule is not
 * necessary.
 */
static int gedf_np_check_resched(rt_domain_t *edf)
{
	cpu_entry_t *last;
	int ret = 0;

	spin_lock(&gedf_cpu_lock);

	if (!list_empty(&edf->ready_queue)) {
		last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
		/* preemption happens only for non-realtime tasks */
		if (!last->executes_realtime) {
			if (smp_processor_id() == last->cpu)
				set_tsk_need_resched(current);
			else
				smp_send_reschedule(last->cpu);
			ret = 1;
			goto out;
		}
	}
out:
	spin_unlock(&gedf_cpu_lock);
	return ret;
}

/* non-preemptive global EDF
 *
 * Non-preemptive EDF is almost the same as normal EDF. We only have to
 * adjust the scheduler tick and the resched function.
 */
#define INIT_SCHED_PLUGIN_NP (struct sched_plugin){		\
	.plugin_name		= "Non-Preemptive Global EDF",	\
	.ready_to_use		= 1,				\
	.scheduler_tick		= gedf_np_scheduler_tick,	\
	.prepare_task		= gedf_prepare_task,		\
	.sleep_next_period	= edf_sleep_next_period,	\
	.tear_down		= gedf_tear_down,		\
	.shutdown_hook		= 0,				\
	.schedule		= gedf_schedule,		\
	.finish_switch		= gedf_finish_switch,		\
	.mode_change		= gedf_mode_change,		\
	.wake_up_task		= gedf_wake_up_task,		\
	.task_blocks		= gedf_task_blocks		\
}

/* As we only set the plugin at boot time,
 * we use the same structure as preemptive EDF. This simplifies a lot
 * of the functions.
 */
sched_plugin_t *__init init_global_edf_np_plugin(void)
{
	if (!s_plugin.ready_to_use) {
		edf_domain_init(&gedf, gedf_np_check_resched);
		s_plugin = INIT_SCHED_PLUGIN_NP;
	}
	return &s_plugin;
}