/*
 * kernel/sched_psn_edf.c
 *
 * Implementation of the PSN-EDF scheduler plugin.
 * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
 *
 * Suspensions and non-preemptable sections are supported.
 * Priority inheritance is not supported.
 */

#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>

#include <litmus/litmus.h>
#include <litmus/jobs.h>
#include <litmus/preempt.h>
#include <litmus/budget.h>
#include <litmus/sched_plugin.h>
#include <litmus/edf_common.h>
#include <litmus/sched_trace.h>
#include <litmus/trace.h>

typedef struct {
	rt_domain_t 		domain;
	int          		cpu;
	struct task_struct* 	scheduled; /* only RT tasks */
/*
 * scheduling lock slock
 * protects the domain and serializes scheduling decisions
 */
#define slock domain.ready_lock

} psnedf_domain_t;

DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);

#define local_edf		(&__get_cpu_var(psnedf_domains).domain)
#define local_pedf		(&__get_cpu_var(psnedf_domains))
#define remote_edf(cpu)		(&per_cpu(psnedf_domains, cpu).domain)
#define remote_pedf(cpu)	(&per_cpu(psnedf_domains, cpu))
#define task_edf(task)		remote_edf(get_partition(task))
#define task_pedf(task)		remote_pedf(get_partition(task))


static void psnedf_domain_init(psnedf_domain_t* pedf,
			       check_resched_needed_t check,
			       release_jobs_t release,
			       int cpu)
{
	edf_domain_init(&pedf->domain, check, release);
	pedf->cpu      		= cpu;
	pedf->scheduled		= NULL;
}

static void requeue(struct task_struct* t, rt_domain_t *edf)
{
	if (t->state != TASK_RUNNING)
		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");

	tsk_rt(t)->completed = 0;
	if (is_early_releasing(t) || is_released(t, litmus_clock()))
		__add_ready(edf, t);
	else
		add_release(edf, t); /* it has got to wait */
}

/* we assume the lock is being held */
static void preempt(psnedf_domain_t *pedf)
{
	preempt_if_preemptable(pedf->scheduled, pedf->cpu);
}

#ifdef CONFIG_LITMUS_LOCKING

static void boost_priority(struct task_struct* t)
{
	unsigned long		flags;
	psnedf_domain_t* 	pedf = task_pedf(t);
	lt_t			now;

	raw_readyq_lock_irqsave(&pedf->slock, flags);
	now = litmus_clock();

	TRACE_TASK(t, "priority boosted at %llu\n", now);

	tsk_rt(t)->priority_boosted = 1;
	tsk_rt(t)->boost_start_time = now;

	if (pedf->scheduled != t) {
		/* holder may be queued: first stop queue changes */
		raw_spin_lock(&pedf->domain.release_lock);
		if (is_queued(t) &&
		    /* If it is queued, then we need to re-order. */
		    bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) &&
		    /* If we bubbled to the top, then we need to check for preemptions. */
		    edf_preemption_needed(&pedf->domain, pedf->scheduled))
				preempt(pedf);
		raw_spin_unlock(&pedf->domain.release_lock);
	} /* else: nothing to do since the job is not queued while scheduled */

	raw_readyq_unlock_irqrestore(&pedf->slock, flags);
}

static void unboost_priority(struct task_struct* t)
{
	unsigned long		flags;
	psnedf_domain_t* 	pedf = task_pedf(t);
	lt_t			now;

	raw_readyq_lock_irqsave(&pedf->slock, flags);
	now = litmus_clock();

	/* assumption: this only happens when the job is scheduled */
	BUG_ON(pedf->scheduled != t);

	TRACE_TASK(t, "priority restored at %llu\n", now);

	/* priority boosted jobs must be scheduled */
	BUG_ON(pedf->scheduled != t);

	tsk_rt(t)->priority_boosted = 0;
	tsk_rt(t)->boost_start_time = 0;

	/* check if this changes anything */
	if (edf_preemption_needed(&pedf->domain, pedf->scheduled))
		preempt(pedf);

	raw_readyq_unlock_irqrestore(&pedf->slock, flags);
}

#endif

static int psnedf_preempt_check(psnedf_domain_t *pedf)
{
	if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) {
		preempt(pedf);
		return 1;
	} else
		return 0;
}

/* This check is trivial in partioned systems as we only have to consider
 * the CPU of the partition.
 */
static int psnedf_check_resched(rt_domain_t *edf)
{
	psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);

	/* because this is a callback from rt_domain_t we already hold
	 * the necessary lock for the ready queue
	 */
	return psnedf_preempt_check(pedf);
}

static void job_completion(struct task_struct* t, int forced)
{
	sched_trace_task_completion(t,forced);
	TRACE_TASK(t, "job_completion().\n");

	tsk_rt(t)->completed = 1;
	prepare_for_next_period(t);
}

static enum hrtimer_restart psnedf_simple_on_exhausted(struct task_struct *t)
{
	/* Assumption: t is scheduled on the CPU executing this callback */

	if (budget_signalled(t) && !bt_flag_is_set(t, BTF_SIG_BUDGET_SENT)) {
		/* signal exhaustion */
		send_sigbudget(t); /* will set BTF_SIG_BUDGET_SENT */
	}

	if (budget_enforced(t) && !bt_flag_test_and_set(t, BTF_BUDGET_EXHAUSTED)) {
		if (!is_np(t)) {
			/* np tasks will be preempted when they become
			 * preemptable again
			 */
			litmus_reschedule_local();
			TRACE("cedf_scheduler_tick: "
				  "%d is preemptable "
				  " => FORCE_RESCHED\n", t->pid);
		} else if (is_user_np(t)) {
			TRACE("cedf_scheduler_tick: "
				  "%d is non-preemptable, "
				  "preemption delayed.\n", t->pid);
			request_exit_np(t);
		}
	}

	return HRTIMER_NORESTART;
}

static void psnedf_tick(struct task_struct *t)
{
	psnedf_domain_t *pedf = local_pedf;

	/* Check for inconsistency. We don't need the lock for this since
	 * ->scheduled is only changed in schedule, which obviously is not
	 *  executing in parallel on this CPU
	 */
	BUG_ON(is_realtime(t) && t != pedf->scheduled);

	if (is_realtime(t) &&
		tsk_rt(t)->budget.ops && budget_quantum_tracked(t) &&
		budget_exhausted(t)) {
		TRACE_TASK(t, "budget exhausted\n");
		tsk_rt(t)->budget.ops->on_exhausted(t);
	}
}

static struct task_struct* psnedf_schedule(struct task_struct * prev)
{
	psnedf_domain_t* 	pedf = local_pedf;
	rt_domain_t*		edf  = &pedf->domain;
	struct task_struct*	next;

	int out_of_time, sleep, preempt, np, exists, blocks, resched;

	raw_readyq_lock(&pedf->slock);

	/* sanity checking
	 * differently from gedf, when a task exits (dead)
	 * pedf->schedule may be null and prev _is_ realtime
	 */
	BUG_ON(pedf->scheduled && pedf->scheduled != prev);
	BUG_ON(pedf->scheduled && !is_realtime(prev));

	/* (0) Determine state */
	exists      = pedf->scheduled != NULL;
	blocks      = exists && !is_running(pedf->scheduled);
	out_of_time = exists &&
				  budget_enforced(pedf->scheduled) &&
				  bt_flag_is_set(pedf->scheduled, BTF_BUDGET_EXHAUSTED);
	np 	    = exists && is_np(pedf->scheduled);
	sleep	    = exists && is_completed(pedf->scheduled);
	preempt     = edf_preemption_needed(edf, prev);

	/* If we need to preempt do so.
	 * The following checks set resched to 1 in case of special
	 * circumstances.
	 */
	resched = preempt;

	/* Do budget stuff */
	if (tsk_rt(prev)->budget.ops) {
		if (blocks)
			tsk_rt(prev)->budget.ops->on_blocked(prev);
		else if (sleep)
			tsk_rt(prev)->budget.ops->on_sleep(prev);
		else if (preempt)
			tsk_rt(prev)->budget.ops->on_preempt(prev);
	}

	/* If a task blocks we have no choice but to reschedule.
	 */
	if (blocks)
		resched = 1;

	/* Request a sys_exit_np() call if we would like to preempt but cannot.
	 * Multiple calls to request_exit_np() don't hurt.
	 */
	if (np && (out_of_time || preempt || sleep))
		request_exit_np(pedf->scheduled);

	/* Any task that is preemptable and either exhausts its execution
	 * budget or wants to sleep completes. We may have to reschedule after
	 * this.
	 */
	if (!np && (out_of_time || sleep) && !blocks) {
		job_completion(pedf->scheduled, !sleep);
		resched = 1;
	}

	/* The final scheduling decision. Do we need to switch for some reason?
	 * Switch if we are in RT mode and have no task or if we need to
	 * resched.
	 */
	next = NULL;
	if ((!np || blocks) && (resched || !exists)) {
		/* When preempting a task that does not block, then
		 * re-insert it into either the ready queue or the
		 * release queue (if it completed). requeue() picks
		 * the appropriate queue.
		 */
		if (pedf->scheduled && !blocks)
			requeue(pedf->scheduled, edf);
		next = __take_ready(edf);
	} else
		/* Only override Linux scheduler if we have a real-time task
		 * scheduled that needs to continue.
		 */
		if (exists)
			next = prev;

	if (next) {
		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
		tsk_rt(next)->completed = 0;
	} else {
		TRACE("becoming idle at %llu\n", litmus_clock());
	}

	pedf->scheduled = next;
	sched_state_task_picked();
	raw_readyq_unlock(&pedf->slock);

	return next;
}


/*	Prepare a task for running in RT mode
 */
static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
{
	rt_domain_t* 		edf  = task_edf(t);
	psnedf_domain_t* 	pedf = task_pedf(t);
	unsigned long		flags;

	TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
		   t->rt_param.task_params.cpu);

	/* setup job parameters */
	release_at(t, litmus_clock());

	/* The task should be running in the queue, otherwise signal
	 * code will try to wake it up with fatal consequences.
	 */
	raw_readyq_lock_irqsave(&pedf->slock, flags);
	if (running) {
		/* there shouldn't be anything else running at the time */
		BUG_ON(pedf->scheduled);
		pedf->scheduled = t;
	} else {
		requeue(t, edf);
		/* maybe we have to reschedule */
		psnedf_preempt_check(pedf);
	}
	raw_readyq_unlock_irqrestore(&pedf->slock, flags);
}

static void psnedf_task_wake_up(struct task_struct *task)
{
	unsigned long		flags;
	psnedf_domain_t* 	pedf = task_pedf(task);
	rt_domain_t* 		edf  = task_edf(task);
	lt_t			now;

	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
	raw_readyq_lock_irqsave(&pedf->slock, flags);
	BUG_ON(is_queued(task));
	now = litmus_clock();
	if (is_sporadic(task) && is_tardy(task, now)
#ifdef CONFIG_LITMUS_LOCKING
	/* We need to take suspensions because of semaphores into
	 * account! If a job resumes after being suspended due to acquiring
	 * a semaphore, it should never be treated as a new job release.
	 */
	    && !is_priority_boosted(task)
#endif
		) {
		/* new sporadic release */
		release_at(task, now);
		sched_trace_task_release(task);
	}

	/* Only add to ready queue if it is not the currently-scheduled
	 * task. This could be the case if a task was woken up concurrently
	 * on a remote CPU before the executing CPU got around to actually
	 * de-scheduling the task, i.e., wake_up() raced with schedule()
	 * and won.
	 */
	if (pedf->scheduled != task) {
		requeue(task, edf);
		psnedf_preempt_check(pedf);
	}

	raw_readyq_unlock_irqrestore(&pedf->slock, flags);
	TRACE_TASK(task, "wake up done\n");
}

static void psnedf_task_block(struct task_struct *t)
{
	/* only running tasks can block, thus t is in no queue */
	TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);

	BUG_ON(!is_realtime(t));
	BUG_ON(is_queued(t));
}

static void psnedf_task_exit(struct task_struct * t)
{
	unsigned long flags;
	psnedf_domain_t* 	pedf = task_pedf(t);
	rt_domain_t*		edf;

	raw_readyq_lock_irqsave(&pedf->slock, flags);

	/* disable budget enforcement */
	if (tsk_rt(t)->budget.ops)
		tsk_rt(t)->budget.ops->on_exit(t);

	if (is_queued(t)) {
		/* dequeue */
		edf  = task_edf(t);
		remove(edf, t);
	}
	if (pedf->scheduled == t)
		pedf->scheduled = NULL;

	TRACE_TASK(t, "RIP, now reschedule\n");

	preempt(pedf);
	raw_readyq_unlock_irqrestore(&pedf->slock, flags);
}

#ifdef CONFIG_LITMUS_LOCKING

#include <litmus/fdso.h>
#include <litmus/srp.h>

/* ******************** SRP support ************************ */

static unsigned int psnedf_get_srp_prio(struct task_struct* t)
{
	/* assumes implicit deadlines */
	return get_rt_period(t);
}

/* ******************** FMLP support ********************** */

/* struct for semaphore with priority inheritance */
struct fmlp_semaphore {
	struct litmus_lock litmus_lock;

	/* current resource holder */
	struct task_struct *owner;

	/* FIFO queue of waiting tasks */
	wait_queue_head_t wait;
};

static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
{
	return container_of(lock, struct fmlp_semaphore, litmus_lock);
}
int psnedf_fmlp_lock(struct litmus_lock* l)
{
	struct task_struct* t = current;
	struct fmlp_semaphore *sem = fmlp_from_lock(l);
	wait_queue_t wait;
	unsigned long flags;

	if (!is_realtime(t))
		return -EPERM;

	/* prevent nested lock acquisition --- not supported by FMLP */
	if (tsk_rt(t)->num_locks_held ||
	    tsk_rt(t)->num_local_locks_held)
		return -EBUSY;

	spin_lock_irqsave(&sem->wait.lock, flags);

	if (sem->owner) {
		/* resource is not free => must suspend and wait */

		init_waitqueue_entry(&wait, t);

		/* FIXME: interruptible would be nice some day */
		set_task_state(t, TASK_UNINTERRUPTIBLE);

		__add_wait_queue_tail_exclusive(&sem->wait, &wait);

		TS_LOCK_SUSPEND;

		/* release lock before sleeping */
		spin_unlock_irqrestore(&sem->wait.lock, flags);

		/* We depend on the FIFO order.  Thus, we don't need to recheck
		 * when we wake up; we are guaranteed to have the lock since
		 * there is only one wake up per release.
		 */

		schedule();

		TS_LOCK_RESUME;

		/* Since we hold the lock, no other task will change
		 * ->owner. We can thus check it without acquiring the spin
		 * lock. */
		BUG_ON(sem->owner != t);
	} else {
		/* it's ours now */
		sem->owner = t;

		/* mark the task as priority-boosted. */
		boost_priority(t);

		spin_unlock_irqrestore(&sem->wait.lock, flags);
	}

	tsk_rt(t)->num_locks_held++;

	return 0;
}

int psnedf_fmlp_unlock(struct litmus_lock* l)
{
	struct task_struct *t = current, *next;
	struct fmlp_semaphore *sem = fmlp_from_lock(l);
	unsigned long flags;
	int err = 0;

	spin_lock_irqsave(&sem->wait.lock, flags);

	if (sem->owner != t) {
		err = -EINVAL;
		goto out;
	}

	tsk_rt(t)->num_locks_held--;

	/* we lose the benefit of priority boosting */

	unboost_priority(t);

	/* check if there are jobs waiting for this resource */
	next = __waitqueue_remove_first(&sem->wait);
	if (next) {
		/* boost next job */
		boost_priority(next);

		/* next becomes the resouce holder */
		sem->owner = next;

		/* wake up next */
		wake_up_process(next);
	} else
		/* resource becomes available */
		sem->owner = NULL;

out:
	spin_unlock_irqrestore(&sem->wait.lock, flags);
	return err;
}

int psnedf_fmlp_close(struct litmus_lock* l)
{
	struct task_struct *t = current;
	struct fmlp_semaphore *sem = fmlp_from_lock(l);
	unsigned long flags;

	int owner;

	spin_lock_irqsave(&sem->wait.lock, flags);

	owner = sem->owner == t;

	spin_unlock_irqrestore(&sem->wait.lock, flags);

	if (owner)
		psnedf_fmlp_unlock(l);

	return 0;
}

void psnedf_fmlp_free(struct litmus_lock* lock)
{
	kfree(fmlp_from_lock(lock));
}

static struct litmus_lock_ops psnedf_fmlp_lock_ops = {
	.close  = psnedf_fmlp_close,
	.lock   = psnedf_fmlp_lock,
	.unlock = psnedf_fmlp_unlock,
	.deallocate = psnedf_fmlp_free,
};

static struct litmus_lock* psnedf_new_fmlp(void)
{
	struct fmlp_semaphore* sem;

	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
	if (!sem)
		return NULL;

	sem->owner   = NULL;
	init_waitqueue_head(&sem->wait);
	sem->litmus_lock.ops = &psnedf_fmlp_lock_ops;

	return &sem->litmus_lock;
}

/* **** lock constructor **** */


static long psnedf_allocate_lock(struct litmus_lock **lock, int type,
				 void* __user unused)
{
	int err = -ENXIO;
	struct srp_semaphore* srp;

	/* PSN-EDF currently supports the SRP for local resources and the FMLP
	 * for global resources. */
	switch (type) {
	case FMLP_SEM:
		/* Flexible Multiprocessor Locking Protocol */
		*lock = psnedf_new_fmlp();
		if (*lock)
			err = 0;
		else
			err = -ENOMEM;
		break;

	case SRP_SEM:
		/* Baker's Stack Resource Policy */
		srp = allocate_srp_semaphore();
		if (srp) {
			*lock = &srp->litmus_lock;
			err = 0;
		} else
			err = -ENOMEM;
		break;
	};

	return err;
}

#endif


static long psnedf_activate_plugin(void)
{
#ifdef CONFIG_RELEASE_MASTER
	int cpu;

	for_each_online_cpu(cpu) {
		remote_edf(cpu)->release_master = atomic_read(&release_master_cpu);
	}
#endif

#ifdef CONFIG_LITMUS_LOCKING
	get_srp_prio = psnedf_get_srp_prio;
#endif

	return 0;
}

static struct budget_tracker_ops psnedf_drain_simple_ops =
{
	.on_scheduled = simple_on_scheduled,
	.on_blocked = simple_on_blocked,
	.on_preempt = simple_on_preempt,
	.on_sleep = simple_on_sleep,
	.on_exit = simple_on_exit,

	.on_exhausted = psnedf_simple_on_exhausted,

	.on_inherit = NULL,
	.on_disinherit = NULL,
};

static long psnedf_admit_task(struct task_struct* tsk)
{
	long ret = 0;

	if (task_cpu(tsk) == tsk->rt_param.task_params.cpu
#ifdef CONFIG_RELEASE_MASTER
	    /* don't allow tasks on release master CPU */
	     && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master
#endif
		) {

		if (budget_enforced(tsk) || budget_signalled(tsk)) {
			switch(get_drain_policy(tsk)) {
				case DRAIN_SIMPLE:
					init_budget_tracker(&tsk_rt(tsk)->budget, &psnedf_drain_simple_ops);
					break;
				default:
					TRACE_TASK(tsk, "Unsupported budget draining mode.\n");
					ret = -EINVAL;
					break;
			}
		}
	}
	else
		ret = -EINVAL;

	return ret;
}

/*	Plugin object	*/
static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "PSN-EDF",
	.tick			= psnedf_tick,
	.task_new		= psnedf_task_new,
	.complete_job		= complete_job,
	.task_exit		= psnedf_task_exit,
	.schedule		= psnedf_schedule,
	.task_wake_up		= psnedf_task_wake_up,
	.task_block		= psnedf_task_block,
	.admit_task		= psnedf_admit_task,
	.activate_plugin	= psnedf_activate_plugin,
#ifdef CONFIG_LITMUS_LOCKING
	.allocate_lock		= psnedf_allocate_lock,
#endif
};


static int __init init_psn_edf(void)
{
	int i;

	/* We do not really want to support cpu hotplug, do we? ;)
	 * However, if we are so crazy to do so,
	 * we cannot use num_online_cpu()
	 */
	for (i = 0; i < num_online_cpus(); i++) {
		psnedf_domain_init(remote_pedf(i),
				   psnedf_check_resched,
				   NULL, i);
	}
	return register_sched_plugin(&psn_edf_plugin);
}

module_init(init_psn_edf);