/*
 * kernel/sched_psn_edf.c
 *
 * Implementation of the PSN-EDF scheduler plugin.
 * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
 *
 * Suspensions and non-preemptable sections are supported.
 * Priority inheritance is not supported.
 */

#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#include <linux/litmus.h>
#include <linux/sched_plugin.h>
#include <linux/edf_common.h>

typedef struct {
	rt_domain_t		domain;
	int			cpu;
	struct task_struct*	scheduled; /* only RT tasks */
	spinlock_t		lock;	   /* protects the domain and
					    * serializes scheduling decisions
					    */
} psnedf_domain_t;

DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);

#define local_edf		(&__get_cpu_var(psnedf_domains).domain)
#define local_pedf		(&__get_cpu_var(psnedf_domains))
#define remote_edf(cpu)		(&per_cpu(psnedf_domains, cpu).domain)
#define remote_pedf(cpu)	(&per_cpu(psnedf_domains, cpu))
#define task_edf(task)		remote_edf(get_partition(task))
#define task_pedf(task)		remote_pedf(get_partition(task))


static void psnedf_domain_init(psnedf_domain_t* pedf,
			       check_resched_needed_t check,
			       int cpu)
{
	edf_domain_init(&pedf->domain, check);
	pedf->cpu       = cpu;
	pedf->lock      = SPIN_LOCK_UNLOCKED;
	pedf->scheduled = NULL;
}

static void requeue(struct task_struct* t, rt_domain_t *edf)
{
	/* only requeue if t is actually running */
	BUG_ON(!is_running(t));

	if (t->state != TASK_RUNNING)
		TRACE_TASK(t, "requeue: !TASK_RUNNING");

	set_rt_flags(t, RT_F_RUNNING);
	if (!is_released(t) || get_rt_mode() != MODE_RT_RUN)
		__add_release(edf, t); /* it has got to wait */
	else
		__add_ready(edf, t);
}

/* we assume the lock is being held */
static void preempt(psnedf_domain_t *pedf)
{
	if (smp_processor_id() == pedf->cpu) {
		if (pedf->scheduled && is_np(pedf->scheduled))
			request_exit_np(pedf->scheduled);
		else
			set_tsk_need_resched(current);
	} else
		/* in case it is a remote CPU we have to defer the
		 * decision to the remote CPU
		 */
		smp_send_reschedule(pedf->cpu);
}

/* This check is trivial in partitioned systems as we only have to consider
 * the CPU of the partition.
 */
static int psnedf_check_resched(rt_domain_t *edf)
{
	psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
	int ret = 0;

	/* because this is a callback from rt_domain_t we already hold
	 * the necessary lock for the ready queue
	 */
	if (edf_preemption_needed(edf, pedf->scheduled)) {
		preempt(pedf);
		ret = 1;
	}
	return ret;
}

static reschedule_check_t psnedf_scheduler_tick(void)
{
	unsigned long flags;
	struct task_struct *t = current;
	reschedule_check_t want_resched = NO_RESCHED;
	rt_domain_t *edf = local_edf;
	psnedf_domain_t *pedf = local_pedf;

	/* Check for inconsistency. We don't need the lock for this since
	 * ->scheduled is only changed in schedule, which obviously is not
	 * executing in parallel on this CPU
	 */
	BUG_ON(is_realtime(t) && t != pedf->scheduled);

	if (is_realtime(t))
		TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid);

	/* expire tasks even if not in real-time mode
	 * this makes sure that at the end of real-time mode
	 * no tasks "run away forever".
	 */
	if (is_realtime(t) && t->time_slice && !--t->time_slice) {
		if (!is_np(t)) {
			want_resched = FORCE_RESCHED;
		} else {
			TRACE("psnedf_scheduler_tick: "
			      "%d is non-preemptable, "
			      "preemption delayed.\n", t->pid);
			request_exit_np(t);
		}
	}

	if (get_rt_mode() == MODE_RT_RUN) {
		/* check whether anything is waiting to be released
		 * this could probably be moved to the global timer
		 * interrupt handler since the state will only change
		 * once per jiffy
		 */
		spin_lock_irqsave(&pedf->lock, flags);
		__release_pending(edf);
		if (want_resched != FORCE_RESCHED &&
		    edf_preemption_needed(edf, t))
			want_resched = FORCE_RESCHED;
		spin_unlock_irqrestore(&pedf->lock, flags);
	}
	return want_resched;
}

static void job_completion(struct task_struct* t)
{
	TRACE_TASK(t, "job_completion().\n");
	set_rt_flags(t, RT_F_SLEEP);
	edf_prepare_for_next_period(t);
}

static int psnedf_schedule(struct task_struct * prev,
			   struct task_struct ** next,
			   runqueue_t * rq)
{
	psnedf_domain_t*	pedf = local_pedf;
	rt_domain_t*		edf  = &pedf->domain;

	int			out_of_time, sleep, preempt,
				np, exists, rt, blocks, resched;

	spin_lock(&pedf->lock);

	/* sanity checking */
	BUG_ON(pedf->scheduled && pedf->scheduled != prev);
	BUG_ON(pedf->scheduled && !is_realtime(prev));

	/* (0) Determine state */
	exists      = pedf->scheduled != NULL;
	blocks      = exists && !is_running(pedf->scheduled);
	out_of_time = exists && !pedf->scheduled->time_slice;
	np          = exists && is_np(pedf->scheduled);
	sleep       = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
	preempt     = edf_preemption_needed(edf, prev);
	rt          = get_rt_mode() == MODE_RT_RUN;

	/* If we need to preempt do so.
	 * The following checks set resched to 1 in case of special
	 * circumstances.
	 */
	resched = preempt;

	/* If a task blocks we have no choice but to reschedule.
	 */
	if (blocks)
		resched = 1;

	/* Request a sys_exit_np() call if we would like to preempt but cannot.
	 * Multiple calls to request_exit_np() don't hurt.
	 */
	if (np && (out_of_time || preempt || sleep))
		request_exit_np(pedf->scheduled);

	/* Any task that is preemptable and either exhausts its execution
	 * budget or wants to sleep completes. We may have to reschedule after
	 * this.
	 */
	if (!np && (out_of_time || sleep)) {
		job_completion(pedf->scheduled);
		resched = 1;
	}

	/* Stop real-time tasks when we leave real-time mode
	 */
	if (!rt && exists)
		resched = 1;

	/* The final scheduling decision. Do we need to switch for some reason?
	 * Switch if we are in RT mode and have no task or if we need to
	 * resched.
	 */
	*next = NULL;
	if ((!np || blocks) && (resched || (!exists && rt))) {
		/* Take care of a previously scheduled
		 * job by taking it out of the Linux runqueue.
		 */
		if (pedf->scheduled) {
			/* as opposed to global schedulers that switch without
			 * a lock being held we can requeue already here since
			 * no other CPU will schedule from this domain.
			 */
			if (!blocks)
				requeue(pedf->scheduled, edf);
			if (prev->array)
				/* take it out of the run queue */
				deactivate_task(prev, rq);
		}

		/* only pick tasks if we are actually in RT mode */
		if (rt)
			*next = __take_ready(edf);
		if (*next) {
			/* stick the task into the runqueue */
			__activate_task(*next, rq);
			set_task_cpu(*next, smp_processor_id());
		}

	} else
		/* Only override Linux scheduler if we have a real-time task
		 * scheduled that needs to continue.
		 */
		if (exists)
			*next = prev;

	if (*next)
		set_rt_flags(*next, RT_F_RUNNING);

	pedf->scheduled = *next;
	spin_unlock(&pedf->lock);
	return 0;
}


/*	Prepare a task for running in RT mode
 *	Enqueues the task into master queue data structure
 *	returns
 *		-EPERM if task is not TASK_STOPPED
 */
static long psnedf_prepare_task(struct task_struct * t)
{
	rt_domain_t*	 edf  = task_edf(t);
	psnedf_domain_t* pedf = task_pedf(t);
	unsigned long flags;

	TRACE("[%d] psn edf: prepare task %d on CPU %d\n",
	      smp_processor_id(), t->pid, get_partition(t));
	if (t->state == TASK_STOPPED) {
		__setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);

		if (get_rt_mode() == MODE_RT_RUN)
			/* The action is already on.
			 * Prepare immediate release.
			 */
			edf_release_now(t);
		/* The task should be running in the queue, otherwise signal
		 * code will try to wake it up with fatal consequences.
		 */
		t->state = TASK_RUNNING;
		spin_lock_irqsave(&pedf->lock, flags);
		__add_release(edf, t);
		spin_unlock_irqrestore(&pedf->lock, flags);
		return 0;
	} else
		return -EPERM;
}

static void psnedf_wake_up_task(struct task_struct *task)
{
	unsigned long flags;
	psnedf_domain_t* pedf = task_pedf(task);
	rt_domain_t* edf = task_edf(task);

	TRACE("psnedf: %d unsuspends with budget=%d\n",
	      task->pid, task->time_slice);

	/* After fixing the litmus_controlled bug,
	 * this should hold again.
	 */
	BUG_ON(in_list(&task->rt_list));

	/* FIXME:
	 * There exists a race between this function, suspensions due to IO,
	 * and switching in and out of real-time mode. For some reason, the
	 * BUG_ON triggered after a task system warm-up phase.
	 *
	 * BUG_ON(in_list(&task->rt_list));
	 *
	 * Replaced by an if to gather more information.
	 */
	/*
	if (unlikely(in_list(&task->rt_list))) {
		TRACE(KERN_CRIT "wake_up_task: Why is %s/%d in rt list? "
		      "state=%ld next=%p prev=%p flags=0x%8lx mode=%d "
		      "partition=%d cpu=%d deadline=%ld now=%ld release=%ld"
		      "rtflags=%d timeslice=%d job=%u knp=%d",
		      task->comm, task->pid, task->state,
		      task->rt_list.next, task->rt_list.prev, task->flags,
		      get_rt_mode(), get_partition(task), smp_processor_id(),
		      get_deadline(task), jiffies, get_release(task),
		      get_rt_flags(task), task->time_slice,
		      task->rt_param.times.job_no, task->rt_param.kernel_np);
		task->state = TASK_RUNNING;
		return;
	}
	*/

	task->state = TASK_RUNNING;

	/* We need to take suspensions because of semaphores into
	 * account! If a job resumes after being suspended due to acquiring
	 * a semaphore, it should never be treated as a new job release.
	 */
	if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) {
		/* new sporadic release */
		edf_release_now(task);
		sched_trace_job_release(task);
	}
	spin_lock_irqsave(&pedf->lock, flags);
	requeue(task, edf);
	spin_unlock_irqrestore(&pedf->lock, flags);
}

static void psnedf_task_blocks(struct task_struct *t)
{
	BUG_ON(!is_realtime(t));
	/* not really anything to do since it can only block if
	 * it is running, and when it is not running it is not in any
	 * queue anyway.
	 */
	TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
	BUG_ON(in_list(&t->rt_list));
}


/* When _tear_down is called, the task should not be in any queue any more
 * as it must have blocked first. We don't have any internal state for the
 * task, it is all in the task_struct.
 */
static long psnedf_tear_down(struct task_struct * t)
{
	BUG_ON(!is_realtime(t));
	TRACE_TASK(t, "tear down called");
	BUG_ON(t->array);
	BUG_ON(in_list(&t->rt_list));
	return 0;
}

static long psnedf_pi_block(struct pi_semaphore *sem,
			    struct task_struct *new_waiter)
{
	psnedf_domain_t*	pedf;
	rt_domain_t*		edf;
	struct task_struct*	t;
	int cpu = get_partition(new_waiter);

	BUG_ON(!new_waiter);

	if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
		TRACE_TASK(new_waiter, " boosts priority\n");
		pedf = task_pedf(new_waiter);
		edf  = task_edf(new_waiter);

		/* interrupts already disabled */
		spin_lock(&pedf->lock);

		/* store new highest-priority task */
		sem->hp.cpu_task[cpu] = new_waiter;

		if (sem->holder &&
		    get_partition(sem->holder) == get_partition(new_waiter)) {
			/* let holder inherit */
			sem->holder->rt_param.inh_task = new_waiter;
			t = sem->holder;
			if (in_list(&t->rt_list)) {
				/* queued in domain */
				list_del(&t->rt_list);
				/* re-add to make priority change take place */
				if (is_released(t))
					__add_ready(edf, t);
				else
					__add_release(edf, t);
			}
		}

		/* check if we need to reschedule */
		if (edf_preemption_needed(edf, current))
			preempt(pedf);

		spin_unlock(&pedf->lock);
	}

	return 0;
}

static long psnedf_inherit_priority(struct pi_semaphore *sem,
				    struct task_struct *new_owner)
{
	int cpu = get_partition(new_owner);

	new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
	if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
		TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
			   sem->hp.cpu_task[cpu]->comm,
			   sem->hp.cpu_task[cpu]->pid);
	} else
		TRACE_TASK(new_owner, "cannot inherit priority: "
			   "no higher priority job waits on this CPU!\n");

	/* make new owner non-preemptable as required by FMLP under
	 * PSN-EDF.
	 */
	make_np(new_owner);
	return 0;
}


/* This function is called on a semaphore release, and assumes that
 * the current task is also the semaphore holder.
 */
static long psnedf_return_priority(struct pi_semaphore *sem)
{
	struct task_struct*	t    = current;
	psnedf_domain_t*	pedf = task_pedf(t);
	rt_domain_t*		edf  = task_edf(t);
	int			ret  = 0;
	int			cpu  = get_partition(current);


	/* Find new highest-priority semaphore task
	 * if holder task is the current hp.cpu_task[cpu].
	 *
	 * Calling function holds sem->wait.lock.
	 */
	if (t == sem->hp.cpu_task[cpu])
		edf_set_hp_cpu_task(sem, cpu);

	take_np(t);
	if (current->rt_param.inh_task) {
		TRACE_CUR("return priority of %s/%d\n",
			  current->rt_param.inh_task->comm,
			  current->rt_param.inh_task->pid);
		spin_lock(&pedf->lock);

		/* Reset inh_task to NULL. */
		current->rt_param.inh_task = NULL;

		/* check if we need to reschedule */
		if (edf_preemption_needed(edf, current))
			preempt(pedf);

		spin_unlock(&pedf->lock);
	} else
		TRACE_CUR(" no priority to return %p\n", sem);

	return ret;
}

static int psnedf_mode_change(int new_mode)
{
	int cpu;

	if (new_mode == MODE_RT_RUN)
		for_each_online_cpu(cpu) {
			spin_lock(&remote_pedf(cpu)->lock);
			__rerelease_all(remote_edf(cpu), edf_release_at);
			spin_unlock(&remote_pedf(cpu)->lock);
		}

	TRACE("[%d] psn edf: mode changed to %d\n",
	      smp_processor_id(), new_mode);
	return 0;
}


/*	Plugin object	*/
static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
	.ready_to_use = 0
};


/*
 *	Plugin initialization code.
 */
#define INIT_SCHED_PLUGIN (struct sched_plugin) {		\
	.plugin_name		= "PSN-EDF",			\
	.ready_to_use		= 1,				\
	.scheduler_tick		= psnedf_scheduler_tick,	\
	.prepare_task		= psnedf_prepare_task,		\
	.sleep_next_period	= edf_sleep_next_period,	\
	.tear_down		= psnedf_tear_down,		\
	.shutdown_hook		= NULL,				\
	.schedule		= psnedf_schedule,		\
	.mode_change		= psnedf_mode_change,		\
	.wake_up_task		= psnedf_wake_up_task,		\
	.task_blocks		= psnedf_task_blocks,		\
	.pi_block		= psnedf_pi_block,		\
	.inherit_priority	= psnedf_inherit_priority,	\
	.return_priority	= psnedf_return_priority	\
}


sched_plugin_t *__init init_psn_edf_plugin(void)
{
	int i;

	if (!s_plugin.ready_to_use) {
		for (i = 0; i < NR_CPUS; i++) {
			psnedf_domain_init(remote_pedf(i),
					   psnedf_check_resched, i);
			printk("PSN-EDF: CPU partition %d initialized.\n", i);
		}
		s_plugin = INIT_SCHED_PLUGIN;
	}
	return &s_plugin;
}