/*
 * kernel/sched_adaptive.c
 *
 * Implementation of Aaron's adaptive global EDF scheduling algorithm. It is
 * based on the GSN-EDF scheduler. However, it does not support synchronization
 * primitives.
 *
 * It implements a version of FC-GEDF with a bunch of linearity assumptions for
 * the optimizer and the weight-transfer function. The code is meant to be
 * clear, but you really need to read the paper if you want to understand
 * what is going on here.
 *
 * Block et al., "Feedback-Controlled Adaptive Multiprocessor Real-Time
 * Systems", submitted to RTAS 2008.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Overview of GSN-EDF operations.
 *
 * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
 * description only covers how the individual operations are implemented in
 * LITMUS.
 *
 * link_task_to_cpu(T, cpu)   - Low-level operation to update the linkage
 *                              structure (NOT the actually scheduled
 *                              task). If there is another linked task To
 *                              already it will set To->linked_on = NO_CPU
 *                              (thereby removing its association with this
 *                              CPU). However, it will not requeue the
 *                              previously linked task (if any). It will set
 *                              T's state to RT_F_RUNNING and check whether
 *                              it is already running somewhere else. If T
 *                              is scheduled somewhere else it will link
 *                              it to that CPU instead (and pull the linked
 *                              task to cpu). T may be NULL.
 *
 * unlink(T)                  - Unlink removes T from all scheduler data
 *                              structures. If it is linked to some CPU it
 *                              will link NULL to that CPU. If it is
 *                              currently queued in the gsnedf queue it will
 *                              be removed from the T->rt_list. It is safe to
 *                              call unlink(T) if T is not linked. T may not
 *                              be NULL.
 *
 * requeue(T)                 - Requeue will insert T into the appropriate
 *                              queue. If the system is in real-time mode and
 *                              T has already been released, it will go into
 *                              the ready queue. If the system is not in
 *                              real-time mode, T will go into the release
 *                              queue. If T's release time is in the future,
 *                              it will go into the release queue. That means
 *                              that T's release time/job no/etc. has to be
 *                              updated before requeue(T) is called. It is
 *                              not safe to call requeue(T) when T is already
 *                              queued. T may not be NULL.
 *
 * gsnedf_job_arrival(T)      - This is the catch-all function when T enters
 *                              the system after either a suspension or a
 *                              job release. It will queue T (which means it
 *                              is not safe to call gsnedf_job_arrival(T) if
 *                              T is already queued) and then check whether a
 *                              preemption is necessary. If a preemption is
 *                              necessary it will update the linkage
 *                              accordingly and cause schedule() to be called
 *                              (either with an IPI or need_resched). It is
 *                              safe to call gsnedf_job_arrival(T) if T's
 *                              next job has not actually been released yet
 *                              (release time in the future). T will be put
 *                              on the release queue in that case.
 *
 * job_completion(T)          - Take care of everything that needs to be done
 *                              to prepare T for its next release and place
 *                              it in the right queue with
 *                              gsnedf_job_arrival().
 *
 *
 * When we know that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
 * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
 * the functions will automatically propagate pending tasks from the ready
 * queue to a linked task. This is the job of the calling function (by means
 * of __take_ready).
 */
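/* Example of the linking rules above: suppose T is linked to CPU 0 but is
 * still scheduled on CPU 1 (it has not been preempted there yet). A call to
 * link_task_to_cpu() that targets CPU 0 will instead link T to CPU 1, where
 * it already runs, and link CPU 1's previously linked task to CPU 0,
 * avoiding an unnecessary migration.
 */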
static void unlink(struct task_struct* t);
static void adaptive_job_arrival(struct task_struct* task);

/* cpu_entry_t - maintain the linked and scheduled state
 */
typedef struct {
        int                     cpu;
        struct task_struct*     linked;         /* only RT tasks */
        struct task_struct*     scheduled;      /* only RT tasks */
        struct list_head        list;
        atomic_t                will_schedule;  /* prevent unneeded IPIs */
} cpu_entry_t;
DEFINE_PER_CPU(cpu_entry_t, adaptive_cpu_entries);

#define set_will_schedule() \
        (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 1))
#define clear_will_schedule() \
        (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 0))
#define test_will_schedule(cpu) \
        (atomic_read(&per_cpu(adaptive_cpu_entries, cpu).will_schedule))

#define NO_CPU 0xffffffff

/* The adaptive_lock is used to serialize all scheduling events.
 * It protects the data structures declared below.
 */
static queuelock_t adaptive_lock;

/* the cpus queue themselves according to priority in here */
static LIST_HEAD(adaptive_cpu_queue);

static rt_domain_t adaptive;

/* feedback control parameters */
static fp_t fc_a, fc_b;

/* optimizer trigger */
static jiffie_t last_optimizer_run;
static jiffie_t optimizer_min_invocation_sep;
static jiffie_t optimizer_period;
static fp_t     task_error_threshold;

static fp_t system_capacity;
/* total actual weight of the task system */
static fp_t total_weight;

/* optimizer time snapshot */
jiffie_t opt_time;

/* Delayed weight increase notification list.
 * This list gets clobbered on each optimizer run.
 */
static LIST_HEAD(adaptive_inc_list);

/* comment out to disable optimizer debugging */
#define ENABLE_OPTIMIZER_DEBUGGING

#ifdef ENABLE_OPTIMIZER_DEBUGGING
#define OPT_DBG   TRACE
#define OPT_DBG_T TRACE_TASK
#else
#define OPT_DBG
#define OPT_DBG_T
#endif

/******************************************************************************/
/*                              OPTIMIZER MATH                                */
/******************************************************************************/

/* All time-dependent functions
 * rely on opt_time.
 * Update in the optimizer before use!
 */
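/* For reference, an informal restatement of the estimation math implemented
 * by the functions below (derived from the code, not quoted from the paper):
 *
 *   ideal(w, dt)          = w * dt
 *   ideal_exec_time(T)    = round(est_weight(T) * (opt_time - last_release(T)))
 *   weight_transfer(T, from, to, act)
 *                         = act * weight(T, to) / weight(T, from)
 *                           (assumes the actual weight scales linearly with
 *                            the service-level weights)
 *   update_estimate(S, act, a, b):
 *                           err           = act - S.estimate
 *                           S.estimate    = a * err + b * S.accumulated
 *                           S.accumulated = S.accumulated + err
 */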
static inline fp_t ideal(fp_t weight, jiffie_t delta_t)
{
        return _mul(weight, FP(delta_t));
}

static noinline long ideal_exec_time(struct task_struct* t)
{
        jiffie_t delta = opt_time - get_last_release(t);
        return _round(ideal(get_est_weight(t), delta));
}

/* this makes a whole bunch of linearity assumptions */
static noinline fp_t weight_transfer(struct task_struct* t,
                                     unsigned int from, unsigned int to,
                                     fp_t act_weight)
{
        fp_t rel_from, rel_to, ret;
        rel_from = get_sl(t, from).weight;
        rel_to   = get_sl(t, to).weight;
        ret.val  = (act_weight.val * rel_to.val) / rel_from.val;
        OPT_DBG("weight_transfer(%ld, %ld, %ld) => %ld to=%u from=%u\n",
                rel_from.val, rel_to.val, act_weight.val, ret.val, from, to);

        return ret;
}

static noinline fp_t est_weight_at(struct task_struct* t, unsigned int level)
{
        if (t->rt_param.no_service_levels)
                return weight_transfer(t, get_cur_sl(t), level,
                                       get_est_weight(t));
        else
                return get_est_weight(t);
}

static noinline void update_estimate(predictor_state_t *state,
                                     fp_t actual_weight, fp_t a, fp_t b)
{
        fp_t err, new;

        OPT_DBG("OLD ESTIMATE Weight" _FP_ " ActWt " _FP_ " A:" _FP_
                ", B:" _FP_ "\n",
                fp2str(state->estimate), fp2str(actual_weight),
                fp2str(a), fp2str(b));

        err = _sub(actual_weight, state->estimate);
        new = _add(_mul(a, err), _mul(b, state->accumulated));

        total_weight = _sub(total_weight, state->estimate);
        state->estimate = new;
        total_weight = _add(total_weight, state->estimate);

        state->accumulated = _add(state->accumulated, err);

        OPT_DBG("ERROR " _FP_ ", NEW " _FP_ ", ACC" _FP_ "\n", fp2str(err),
                fp2str(new), fp2str(state->accumulated));
}

static noinline fp_t linear_metric(struct task_struct* t)
{
        fp_t v1, vmax, g1, gmax;
        fp_t est_w;
        unsigned int l = t->rt_param.no_service_levels;
        unsigned int lcur;

        if (l <= 1)
                return FP(0);

        lcur  = get_cur_sl(t);
        est_w = get_est_weight(t);

        OPT_DBG_T(t, " linear_metric: lcur=%u l=%u est_w=" _FP_ "\n",
                  lcur, l, est_w);
        OPT_DBG_T(t, " linear_metric: est_w.val=%ld\n", est_w.val);

        v1   = t->rt_param.service_level[0].value;
        vmax = t->rt_param.service_level[l - 1].value;

        OPT_DBG_T(t, " linear_metric: v1=" _FP_ " vmax=" _FP_ "\n", v1, vmax);
        OPT_DBG_T(t, " linear_metric: v1=%ld vmax=%ld\n", v1.val, vmax.val);

        g1   = weight_transfer(t, lcur, 0, est_w);
        gmax = weight_transfer(t, lcur, l - 1, est_w);

        OPT_DBG_T(t, " linear_metric: g1=" _FP_ " gmax=" _FP_ "\n", g1, gmax);
        OPT_DBG_T(t, " linear_metric: g1=%ld gmax=%ld\n", g1.val, gmax.val);

        TRACE_BUG_ON(_eq(_sub(gmax, g1), FP(0)));
        if (_eq(_sub(gmax, g1), FP(0)))
                return FP(0);

        return _div(_sub(vmax, v1), _sub(gmax, g1));
}

static noinline unsigned long reweighted_period(fp_t ow, fp_t nw,
                                                unsigned long alloc,
                                                jiffie_t deadline,
                                                jiffie_t release)
{
        fp_t dl;
        dl = _mul(FP(deadline - release), ow);
        dl = _sub(dl, FP(alloc));
        if (_eq(nw, FP(0)))
                return 0;
        dl = _div(dl, nw);
        return _round(dl);
}

static noinline int is_under_allocated(struct task_struct* t)
{
        return ideal_exec_time(t) >= t->rt_param.times.exec_time;
}

static noinline jiffie_t dec_equal_point_delay(struct task_struct* t)
{
        if (_lt(FP(0), get_est_weight(t)))
                /* when t was released plus time needed to equalize
                 * minus now
                 */
                return get_last_release(t) +
                        _round(_div(FP(t->rt_param.times.exec_time),
                                    get_est_weight(t))) -
                        opt_time;
        else
                /* if the weight is zero we just take the
                 * deadline
                 */
                return t->rt_param.times.deadline;
}
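/* Equal-point example (applies to both dec_ and inc_equal_point_delay):
 * suppose T was last released at time 1000, has executed for 30 time units,
 * and opt_time is 1100. With a weight of 0.25, ideal and actual allocation
 * are equal at 1000 + 30 / 0.25 = 1120, so the returned delay is
 * 1120 - 1100 = 20 time units.
 */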
static noinline jiffie_t inc_equal_point_delay(struct task_struct* t)
{
        if (_lt(FP(0), t->rt_param.opt_nw))
                /* when t was released plus time needed to equalize
                 * minus now
                 */
                return get_last_release(t) +
                        _round(_div(FP(t->rt_param.times.exec_time),
                                    t->rt_param.opt_nw)) -
                        opt_time;
        else
                /* if the weight is zero we just take the
                 * deadline
                 */
                return t->rt_param.times.deadline;
}

static noinline jiffie_t decrease_delay(struct task_struct* t)
{
        if (has_active_job(t) && !is_under_allocated(t))
                return dec_equal_point_delay(t);
        return 0;
}

/******************************************************************************/
/*                               SORT ORDERS                                  */
/******************************************************************************/

static int by_linear_metric(struct list_head* a, struct list_head* b)
{
        struct task_struct *ta, *tb;
        ta = list_entry(a, struct task_struct, rt_param.opt_list);
        tb = list_entry(b, struct task_struct, rt_param.opt_list);
        return _gt(ta->rt_param.opt_order, tb->rt_param.opt_order);
}

static int by_delta_weight(struct list_head* a, struct list_head* b)
{
        struct task_struct *ta, *tb;
        ta = list_entry(a, struct task_struct, rt_param.opt_list);
        tb = list_entry(b, struct task_struct, rt_param.opt_list);
        return _lt(ta->rt_param.opt_dw, tb->rt_param.opt_dw);
}

static int by_enactment_time(struct list_head* a, struct list_head* b)
{
        struct task_struct *ta, *tb;
        ta = list_entry(a, struct task_struct, rt_param.opt_list);
        tb = list_entry(b, struct task_struct, rt_param.opt_list);
        return ta->rt_param.opt_change < tb->rt_param.opt_change;
}

/******************************************************************************/
/*                          WEIGHT CHANGE MECHANICS                           */
/******************************************************************************/

static void set_service_level(struct task_struct* t, unsigned int level)
{
        service_level_t *new;
        unsigned int old;

        BUG_ON(!t);
        BUG_ON(t->rt_param.no_service_levels <= level);

        old = t->rt_param.cur_service_level;
        t->rt_param.cur_service_level = level;
        new = t->rt_param.service_level + level;
        t->rt_param.basic_params.period    = new->period;
        t->rt_param.basic_params.exec_cost = _round(_mul(new->weight,
                                                         FP(new->period)));

        scheduler_signal(t, SIGUSR1);

        sched_trace_service_level_change(t, old, level);
        OPT_DBG_T(t, "service level %u activated\n", level);
}

/* call this _before_ updating deadline and release of t */
static void update_weight_estimate(struct task_struct* t)
{
        fp_t nw, ow;
        jiffie_t sl_period, exec_time;

        ow        = get_est_weight(t);
        nw        = t->rt_param.opt_nw;
        exec_time = t->rt_param.times.exec_time;
        sl_period = get_sl(t, get_opt_sl(t)).period;

        OPT_DBG("ow=" _FP_ " nw=" _FP_ ", r-d " _FP_ ", deadline %d, "
                "release %d, exec_time=%ld sl_period=%lu\n",
                fp2str(ow), fp2str(nw),
                fp2str(FP(get_deadline(t) - get_last_release(t))),
                get_deadline(t), get_last_release(t), exec_time, sl_period);

        total_weight = _sub(total_weight, get_est_weight(t));
        t->rt_param.predictor_state.estimate = nw;
        OPT_DBG_T(t, "update_weight_estimate from " _FP_ " to " _FP_ "\n",
                  fp2str(ow), fp2str(nw));
        total_weight = _add(total_weight, get_est_weight(t));

        OPT_DBG_T(t, " update_weight_estimate: " _FP_ " => " _FP_ "\n",
                  fp2str(ow), fp2str(get_est_weight(t)));
}
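/* How the reweighting below is enacted:
 *  - decrease_weight(): the delay has already been computed by the optimizer
 *    (decrease_delay()); an over-allocated task waits until its actual and
 *    ideal allocations are equal, otherwise the change takes effect now.
 *  - increase_weight(): enacted immediately if the task is under-allocated or
 *    if re-releasing it now with the reweighted period still yields an
 *    earlier deadline; otherwise it waits for its equal point, or sits on
 *    adaptive_inc_list until the optimizer has found spare capacity.
 *
 * Example for reweighted_period(): with ow = 0.5, nw = 0.25, alloc = 10, and
 * deadline - release = 100, the remaining budget 100 * 0.5 - 10 = 40 is
 * stretched to 40 / 0.25 = 160 time units under the new weight.
 */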
static void decrease_weight(struct task_struct* t)
{
        fp_t ow, nw;
        jiffie_t last, period, delay;

        ow   = get_sl(t, get_cur_sl(t)).weight;
        nw   = get_sl(t, get_opt_sl(t)).weight;
        last = t->rt_param.times.last_release;
        period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
                                   t->rt_param.times.deadline, last);

        /* necessary delay has already been computed by the optimizer */
        delay = t->rt_param.opt_change;

        update_weight_estimate(t);

        if (!delay)
                t->rt_param.times.last_release = opt_time;
        t->rt_param.times.release  = opt_time + delay;
        t->rt_param.times.deadline = opt_time + delay + period;

        set_service_level(t, get_opt_sl(t));

        /* take out of queue/link structure */
        unlink(t);
        /* present as a new job */
        adaptive_job_arrival(t);
}

static void increase_weight(struct task_struct* t)
{
        fp_t ow, nw;
        jiffie_t last, period, delay;

        ow   = get_sl(t, get_cur_sl(t)).weight;
        nw   = get_sl(t, get_opt_sl(t)).weight;
        last = t->rt_param.times.last_release;
        period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
                                   t->rt_param.times.deadline, last);

        if (t->rt_param.opt_change == 0) {
                /* can be enacted now */
                if (is_under_allocated(t) ||
                    time_before(opt_time + period, get_deadline(t)))
                        /* do it now */
                        delay = 0;
                else {
                        if (is_under_allocated(t)) {
                                t->rt_param.opt_change += opt_time;
                                /* The next job release will notice that
                                 * opt != sl and initiate a weight change.
                                 */
                                return;
                        } else
                                /* nope, wait for equal point */
                                delay = inc_equal_point_delay(t);
                }

                update_weight_estimate(t);

                if (!delay)
                        t->rt_param.times.last_release = opt_time;
                t->rt_param.times.release  = opt_time + delay;
                t->rt_param.times.deadline = opt_time + delay + period;

                set_service_level(t, get_opt_sl(t));

                /* take out of queue/link structure */
                unlink(t);
                /* present as a new job */
                adaptive_job_arrival(t);
        } else {
                /* must wait until capacity is released */
                t->rt_param.opt_change += opt_time;
                list_insert(&t->rt_param.opt_list, &adaptive_inc_list,
                            by_enactment_time);
        }
}

static void delayed_increase_weight(void)
{
        struct list_head *p, *extra;
        struct task_struct* t;

        opt_time = jiffies;
        list_for_each_safe(p, extra, &adaptive_inc_list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                if (time_before_eq(t->rt_param.opt_change, opt_time)) {
                        list_del(p);
                        /* prevent recursion */
                        t->rt_param.opt_change = 0;
                        /* this takes care of everything */
                        increase_weight(t);
                } else
                        /* list is sorted */
                        break;
        }
}

static void change_weight(struct task_struct* t)
{
        if (get_cur_sl(t) < get_opt_sl(t))
                increase_weight(t);
        else
                decrease_weight(t);
        OPT_DBG_T(t, "after change_weight: last_rel:%d rel:%d dl:%d\n",
                  get_last_release(t),
                  get_release(t),
                  get_deadline(t));
}

/******************************************************************************/
/*                                OPTIMIZER                                   */
/******************************************************************************/

/* only invoke with adaptive_lock being held */
void adaptive_optimize(void)
{
        struct list_head list;
        struct list_head inc, dec;
        struct list_head *p, *extra;
        cpu_entry_t *cpu;
        struct task_struct* t;
        fp_t M = FP(0), w0, wl, tmp, estU = FP(0);
        unsigned int l;
        jiffie_t enactment_time;

        if (time_before(jiffies,
                        last_optimizer_run + optimizer_min_invocation_sep))
                return;

        OPT_DBG(":::::: running adaptive optimizer\n");

        opt_time = jiffies;

        INIT_LIST_HEAD(&list);

        /* 1) gather all tasks */
        list_for_each(p, &adaptive.ready_queue)
                list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
        list_for_each(p, &adaptive.release_queue)
                list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
        list_for_each(p, &adaptive_cpu_queue) {
                cpu = list_entry(p, cpu_entry_t, list);
                if (cpu->linked)
                        list_add(&cpu->linked->rt_param.opt_list, &list);
        }

        /* 2) determine current system capacity */
        M = system_capacity;
        OPT_DBG("opt: system capacity: " _FP_ "\n", fp2str(M));
        /* 3) Compute L value for all tasks,
         *    and set tasks to service level 0,
         *    also account for weight.
         *    Also establish current estimated utilization.
         */
        list_for_each_safe(p, extra, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                if (time_before(opt_time, get_last_release(t))) {
                        list_del(p);
                        continue;
                }
                t->rt_param.opt_order = linear_metric(t);
                OPT_DBG_T(t, "est_w = " _FP_ " L = " _FP_ "\n",
                          get_est_weight(t),
                          fp2str(t->rt_param.opt_order));
                t->rt_param.opt_level = 0;
                M = _sub(M, est_weight_at(t, 0));
                estU = _add(estU, get_est_weight(t));
        }
        OPT_DBG("opt: estimated utilization: " _FP_ "\n", fp2str(estU));
        OPT_DBG("opt: estimated capacity at all sl=0: " _FP_ "\n", fp2str(M));

        /* 4) sort list by decreasing linear metric */
        list_qsort(&list, by_linear_metric);

        /* 5) assign each task a service level */
        list_for_each(p, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                l = t->rt_param.no_service_levels;
                w0 = est_weight_at(t, 0);
                while (l > 1) {
                        l--;
                        wl = est_weight_at(t, l);
                        tmp = _sub(M, _sub(wl, w0));
                        if (_leq(FP(0), tmp)) {
                                /* this level fits in */
                                M = tmp;
                                t->rt_param.opt_level = l;
                                t->rt_param.opt_dw = _sub(wl,
                                                          get_est_weight(t));
                                t->rt_param.opt_nw = wl;
                                break; /* proceed to next task */
                        }
                }
                OPT_DBG_T(t, " will run at sl=%u, prior=%u dw=" _FP_ "\n",
                          l, get_cur_sl(t), fp2str(t->rt_param.opt_dw));
        }

        /* 6) filter tasks that reweight */
        INIT_LIST_HEAD(&inc);
        INIT_LIST_HEAD(&dec);
        list_for_each_safe(p, extra, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                list_del(p);
                if (t->rt_param.opt_level < get_cur_sl(t)) {
                        list_add(p, &dec);
                        t->rt_param.opt_change = decrease_delay(t);
                } else if (t->rt_param.opt_level > get_cur_sl(t)) {
                        list_add(p, &inc);
                        t->rt_param.opt_change = 0;
                }
                /* if t doesn't change we can ignore it from now on */
        }

        /* 7) sort dec and inc list */
        list_qsort(&dec, by_enactment_time);
        list_qsort(&inc, by_delta_weight);

        /* 8) Now figure out when we can enact weight increases.
         * It works like this: We know the current system utilization.
         * Thus, we know the remaining capacity. We also know when
         * decreases are going to be enacted (=> capacity increases).
         * Now we only need to find a spot where the weight increase will
         * not drive the system into overload.
         */

        /* Very ugly jump, but we need to force enactment_time = 0
         * during the first iteration.
         */
        M = system_capacity;
        enactment_time = 0;
        goto first_iteration;

        while (!list_empty(&inc)) {
                enactment_time = list_entry(dec.next, struct task_struct,
                                            rt_param.opt_list)
                        ->rt_param.opt_change;
        first_iteration:
                /* Start by collapsing the next decrease.
                 * Except in the first iteration, it will always
                 * pick off at least one task.
                 */
                list_for_each_safe(p, extra, &dec) {
                        t = list_entry(p, struct task_struct,
                                       rt_param.opt_list);
                        if (t->rt_param.opt_change == enactment_time) {
                                list_del(p);
                                /* opt_dw is negative */
                                estU = _add(estU, t->rt_param.opt_dw);
                                list_add(p, &list);

                                OPT_DBG_T(t, " weight decrease at %ld => estU="
                                          _FP_ "\n", enactment_time,
                                          fp2str(estU));

                        } else
                                /* stop decrease loop */
                                break;
                }

                /* now start setting enactment times for increases */
                while (!list_empty(&inc)) {
                        p = inc.next;
                        t = list_entry(p, struct task_struct,
                                       rt_param.opt_list);
                        tmp = _add(estU, t->rt_param.opt_dw);
                        if (_leq(tmp, M)) {
                                /* it fits */
                                estU = tmp;
                                t->rt_param.opt_change = enactment_time;
                                list_del(p);
                                list_add(p, &list);

                                OPT_DBG_T(t, " weight increase at %ld => estU="
                                          _FP_ "\n", enactment_time,
                                          fp2str(estU));

                        } else
                                /* stop increase loop */
                                break;
                }

                TRACE_BUG_ON(list_empty(&dec) && !list_empty(&inc));
                if (list_empty(&dec) && !list_empty(&inc))
                        /* break out in case of bug */
                        break;
        }
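        /* Example of the enactment loop above: with M = 4.0 and estU = 3.9,
         * a pending increase of dw = +0.3 does not fit at time 0. If a
         * decrease of dw = -0.4 is enacted at time t, estU drops to 3.5 and
         * the increase can be scheduled for the same enactment time t,
         * bringing estU to 3.8 <= M.
         */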
        /* 9) Wow. We made it. Every task now has a new service level
         *    assigned, together with a correct (earliest) enactment time.
         *    All we have left to do now is to enact changes that did not get
         *    delayed. Also convert change fields to actual timestamps to be
         *    nice to scheduler_tick().
         */
        INIT_LIST_HEAD(&adaptive_inc_list);
        list_for_each_safe(p, extra, &list) {
                t = list_entry(p, struct task_struct, rt_param.opt_list);
                list_del(p);
                change_weight(t);
        }

        last_optimizer_run = jiffies;
        OPT_DBG(":::::: optimizer run complete\n");
}

/* update_cpu_position - Move the cpu entry to the correct place to maintain
 *                       order in the cpu queue. Caller must hold adaptive
 *                       lock.
 */
static void update_cpu_position(cpu_entry_t *entry)
{
        cpu_entry_t *other;
        struct list_head *pos;
        list_del(&entry->list);
        /* if we do not execute real-time jobs we just move
         * to the end of the queue
         */
        if (entry->linked) {
                list_for_each(pos, &adaptive_cpu_queue) {
                        other = list_entry(pos, cpu_entry_t, list);
                        if (edf_higher_prio(entry->linked, other->linked)) {
                                __list_add(&entry->list, pos->prev, pos);
                                return;
                        }
                }
        }
        /* if we get this far we have the lowest priority job */
        list_add_tail(&entry->list, &adaptive_cpu_queue);
}

/* link_task_to_cpu - Update the link of a CPU.
 *                    Handles the case where the to-be-linked task is already
 *                    scheduled on a different CPU.
 */
static noinline void link_task_to_cpu(struct task_struct* linked,
                                      cpu_entry_t *entry)
{
        cpu_entry_t *sched;
        struct task_struct* tmp;
        int on_cpu;

        BUG_ON(linked && !is_realtime(linked));

        /* Currently linked task is set to be unlinked. */
        if (entry->linked)
                entry->linked->rt_param.linked_on = NO_CPU;

        /* Link new task to CPU. */
        if (linked) {
                set_rt_flags(linked, RT_F_RUNNING);
                /* handle the case that the task is already scheduled
                 * somewhere else!
                 */
                on_cpu = linked->rt_param.scheduled_on;
                if (on_cpu != NO_CPU) {
                        sched = &per_cpu(adaptive_cpu_entries, on_cpu);
                        /* this should only happen if not linked already */
                        BUG_ON(sched->linked == linked);

                        /* If we are already scheduled on the CPU to which we
                         * wanted to link, we don't need to do the swap --
                         * we just link ourselves to the CPU and depend on
                         * the caller to get things right.
                         */
                        if (entry != sched) {
                                tmp = sched->linked;
                                linked->rt_param.linked_on = sched->cpu;
                                sched->linked = linked;
                                update_cpu_position(sched);
                                linked = tmp;
                        }
                }
                if (linked) /* might be NULL due to swap */
                        linked->rt_param.linked_on = entry->cpu;
        }
        entry->linked = linked;
        update_cpu_position(entry);
}

/* unlink - Make sure a task is not linked any longer to an entry
 *          where it was linked before. Must hold adaptive_lock.
 */
static void unlink(struct task_struct* t)
{
        cpu_entry_t *entry;

        if (unlikely(!t)) {
                TRACE_BUG_ON(!t);
                return;
        }

        if (t->rt_param.linked_on != NO_CPU) {
                /* unlink */
                entry = &per_cpu(adaptive_cpu_entries, t->rt_param.linked_on);
                t->rt_param.linked_on = NO_CPU;
                link_task_to_cpu(NULL, entry);
        } else if (in_list(&t->rt_list)) {
                /* This is an interesting situation: t is scheduled,
                 * but was just recently unlinked. It cannot be
                 * linked anywhere else (because then it would have
                 * been relinked to this CPU), thus it must be in some
                 * queue. We must remove it from the list in this
                 * case.
                 */
                list_del(&t->rt_list);
        }
}
/* preempt - force a CPU to reschedule
 */
static noinline void preempt(cpu_entry_t *entry)
{
        /* We cannot make the is_np() decision here if it is a remote CPU
         * because requesting exit_np() requires that we currently use the
         * address space of the task. Thus, in the remote case we just send
         * the IPI and let schedule() handle the problem.
         */
        if (smp_processor_id() == entry->cpu) {
                if (entry->scheduled && is_np(entry->scheduled))
                        request_exit_np(entry->scheduled);
                else
                        set_tsk_need_resched(current);
        } else
                /* in case that it is a remote CPU we have to defer the
                 * decision to the remote CPU
                 * FIXME: We could save a few IPIs here if we leave the flag
                 *        set when we are waiting for a np_exit().
                 */
                if (!test_will_schedule(entry->cpu))
                        smp_send_reschedule(entry->cpu);
}

/* requeue - Put an unlinked task into the adaptive domain.
 *           Caller must hold adaptive_lock.
 */
static noinline void requeue(struct task_struct* task)
{
        BUG_ON(!task);
        /* sanity check rt_list before insertion */
        BUG_ON(in_list(&task->rt_list));

        if (get_rt_flags(task) == RT_F_SLEEP ||
            get_rt_mode() != MODE_RT_RUN) {
                /* this task has expired
                 * _schedule has already taken care of updating
                 * the release and deadline. We just need to check
                 * whether it has been released.
                 */
                if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
                        __add_ready(&adaptive, task);
                else {
                        /* it has got to wait */
                        __add_release(&adaptive, task);
                }
        } else
                /* this is a forced preemption
                 * thus the task stays in the ready_queue
                 * we only must make it available to others
                 */
                __add_ready(&adaptive, task);
}

/* adaptive_job_arrival: task is either resumed or released */
static void adaptive_job_arrival(struct task_struct* task)
{
        cpu_entry_t* last;

        BUG_ON(list_empty(&adaptive_cpu_queue));
        BUG_ON(!task);

        TRACE_TASK(task, "job_arrival: last_rel=%d rel=%d dl=%d now=%d\n",
                   get_last_release(task), get_release(task),
                   get_deadline(task), jiffies);

        /* first queue arriving job */
        requeue(task);

        /* then check for any necessary preemptions */
        last = list_entry(adaptive_cpu_queue.prev, cpu_entry_t, list);
        if (edf_preemption_needed(&adaptive, last->linked)) {
                /* preemption necessary */
                task = __take_ready(&adaptive);
                TRACE("job_arrival: task %d linked to %d\n",
                      task->pid, last->cpu);
                if (last->linked)
                        requeue(last->linked);

                link_task_to_cpu(task, last);
                preempt(last);
        }
}

/* check for current job releases */
static noinline void adaptive_release_jobs(void)
{
        struct list_head *pos, *save;
        struct task_struct *queued;

        list_for_each_safe(pos, save, &adaptive.release_queue) {
                queued = list_entry(pos, struct task_struct, rt_list);
                if (likely(is_released(queued))) {
                        TRACE_TASK(queued, "released rel=%d now=%d\n",
                                   get_release(queued), jiffies);
                        /* this one is ready to go */
                        list_del(pos);
                        set_rt_flags(queued, RT_F_RUNNING);
                        queued->rt_param.times.last_release =
                                queued->rt_param.times.release;

                        /* check for delayed weight increase */
                        if (get_opt_sl(queued) != get_cur_sl(queued) &&
                            time_before_eq(queued->rt_param.opt_change,
                                           jiffies)) {
                                opt_time = jiffies;
                                set_service_level(queued, get_opt_sl(queued));
                                queued->rt_param.times.deadline =
                                        get_last_release(queued) +
                                        get_rt_period(queued);
                                total_weight = _sub(total_weight,
                                                    get_est_weight(queued));
                                queued->rt_param.predictor_state.estimate =
                                        queued->rt_param.opt_nw;
                                total_weight = _add(total_weight,
                                                    get_est_weight(queued));
                        }

                        sched_trace_job_release(queued);
                        adaptive_job_arrival(queued);
                } else
                        /* the release queue is ordered */
                        break;
        }
}

/* adaptive_scheduler_tick - this function is called for every local timer
 *                           interrupt.
 *
 *                           It checks whether the current task has expired
 *                           and checks whether we need to preempt it if it
 *                           has not expired.
 */
static reschedule_check_t adaptive_scheduler_tick(void)
{
        unsigned long flags;
        struct task_struct* t = current;
        reschedule_check_t want_resched = NO_RESCHED;

        /* Account for exec time.
         * Since we don't preempt forcefully, nothing else needs to be done.
         */
        if (is_realtime(t))
                t->rt_param.times.exec_time++;

        /* only the first CPU needs to release jobs */
        if (get_rt_mode() == MODE_RT_RUN) {
                queue_lock_irqsave(&adaptive_lock, flags);

                /* (1) run the optimizer if it did not trigger often enough */
                if (time_before_eq(last_optimizer_run + optimizer_period,
                                   jiffies)) {
                        OPT_DBG("adaptive: optimizing due to period "
                                "threshold\n");
                        adaptive_optimize();
                }

                /* (2) enact delayed weight increases */
                delayed_increase_weight();

                /* (3) try to release pending jobs */
                adaptive_release_jobs();

                /* we don't need to check linked != scheduled since
                 * set_tsk_need_resched has been set by preempt() if necessary
                 */

                queue_unlock_irqrestore(&adaptive_lock, flags);
        }

        return want_resched;
}

/* caller holds adaptive_lock */
static noinline void job_completion(struct task_struct *t)
{
        long delta;
        fp_t actual_weight, old_estimate;
        unsigned int lcurr = get_cur_sl(t);
        fp_t v = t->rt_param.service_level[lcurr].value;

        int non_zero_weight;
        fp_t error_percentage;
        int exceeds_threshold;

        BUG_ON(!t);

        TRACE_TASK(t, " completion, last_rel=%d rel=%d dl=%d now=%d "
                   "period=%d\n",
                   get_last_release(t), get_release(t), get_deadline(t),
                   jiffies, get_rt_period(t));

        sched_trace_job_completion(t);
        delta = t->rt_param.times.exec_time -
                t->rt_param.basic_params.exec_cost;

        OPT_DBG_T(t, "job %d completes, delta WCET = %d\n",
                  t->rt_param.times.job_no, delta);

        actual_weight = _frac(t->rt_param.times.exec_time,
                              t->rt_param.basic_params.period);
        sched_trace_weight_error(t, actual_weight);
        old_estimate = get_est_weight(t);
        update_estimate(&t->rt_param.predictor_state, actual_weight,
                        fc_a, fc_b);

        OPT_DBG_T(t, "Job %d completes. Current value " _FP_
                  ", Weight estimation: error=" _FP_ " weight="
                  _FP_ " => " _FP_ "\n", t->rt_param.times.job_no, v,
                  _sub(get_est_weight(t), old_estimate),
                  old_estimate, get_est_weight(t));

        /* Now we have determined the task error.
         * Next we release the next job.
         * Then we optimize. It's easier for the optimizer to deal
         * with just-released jobs.
         */

        /* prepare for next period */
        edf_prepare_for_next_period(t);

        TRACE_TASK(t, " prepped, last_rel=%d rel=%d dl=%d now=%d\n",
                   get_last_release(t), get_release(t),
                   get_deadline(t), jiffies);

        if (is_released(t)) {
                /* set flags */
                /* prevent fake completions */
                set_rt_flags(t, RT_F_RUNNING);
                t->rt_param.times.last_release =
                        t->rt_param.times.release;
        }

        non_zero_weight = !_eq(get_est_weight(t), FP(0));
        if (non_zero_weight)
                error_percentage = _div(_abs(_sub(get_est_weight(t),
                                                  old_estimate)),
                                        get_est_weight(t));
        else
                error_percentage = FP(0);
        exceeds_threshold = _gt(error_percentage, task_error_threshold);

        if (exceeds_threshold) {
                OPT_DBG("adaptive: optimizing due to task error threshold\n");
                adaptive_optimize();
        } else if (_gt(total_weight, system_capacity)) {
                OPT_DBG("adaptive: optimizing due to system capacity "
                        "exceeded\n");
                adaptive_optimize();
        }

        /* unlink */
        unlink(t);
        /* requeue
         * But don't requeue a blocking task.
         */
        if (is_running(t))
                adaptive_job_arrival(t);
}
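/* Example for the error-threshold check in job_completion() above: with
 * task_error_threshold = 1/2, a job whose weight estimate moves from 0.20 to
 * 0.32 has a relative error of |0.32 - 0.20| / 0.32 = 0.375 and does not
 * trigger the optimizer; a jump from 0.20 to 0.50 (relative error 0.6) does.
 */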
/* Getting schedule() right is a bit tricky. schedule() may not make any
 * assumptions on the state of the current task since it may be called for a
 * number of reasons. The reasons include: a scheduler_tick() determined that
 * it was necessary, sys_exit_np() was called, some Linux subsystem determined
 * so, or even (in the worst case) there is a bug hidden somewhere. Thus, we
 * must take extreme care to determine what the current state is.
 *
 * The CPU could currently be scheduling a task (or not), be linked (or not).
 *
 * The following assertions for the scheduled task could hold:
 *
 *      - !is_running(scheduled)        // the job blocks
 *      - get_rt_flag() == RT_F_SLEEP   // the job completed (by syscall)
 *      - linked != scheduled           // we need to reschedule (for any
 *                                         reason)
 *
 * Any of these can occur together.
 */
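/* In summary, adaptive_schedule() proceeds as follows: a blocking job is
 * unlinked; a sleeping (completed) job goes through job_completion(); when
 * real-time mode is off, a linked job is unlinked and requeued; when
 * real-time mode is on and the CPU is unlinked, the next ready job is linked;
 * finally, if linked != scheduled, the previously scheduled job is taken off
 * the Linux runqueue and the linked job is activated in its place.
 */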
static int adaptive_schedule(struct task_struct * prev,
                             struct task_struct ** next,
                             runqueue_t * rq)
{
        cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
        int sleep, preempt, exists, rt, blocks;
        struct task_struct* linked;

        /* Will be released in finish_switch. */
        queue_lock(&adaptive_lock);
        clear_will_schedule();

        /* sanity checking */
        BUG_ON(entry->scheduled && entry->scheduled != prev);
        BUG_ON(entry->scheduled && !is_realtime(prev));

        /* (0) Determine state */
        exists  = entry->scheduled != NULL;
        blocks  = exists && !is_running(entry->scheduled);
        sleep   = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
        preempt = entry->scheduled != entry->linked;
        rt      = get_rt_mode() == MODE_RT_RUN;

        /* If a task blocks we have no choice but to reschedule. */
        if (blocks)
                unlink(entry->scheduled);

        /* Task wants to sleep -> job is done. */
        if (sleep)
                job_completion(entry->scheduled);

        /* Stop real-time tasks when we leave real-time mode */
        if (!rt && entry->linked) {
                /* task will be preempted once it is preemptable
                 * (which it may be already)
                 */
                linked = entry->linked;
                unlink(linked);
                requeue(linked);
        }

        /* Link pending task if we became unlinked. */
        if (rt && !entry->linked)
                link_task_to_cpu(__take_ready(&adaptive), entry);

        /* The final scheduling decision. Do we need to switch for some
         * reason? If linked differs from scheduled, select linked as next.
         */
        if (entry->linked != entry->scheduled) {
                /* Take care of a previously scheduled
                 * job by taking it out of the Linux runqueue.
                 */
                if (entry->scheduled)
                        if (prev->array)
                                /* take it out of the run queue */
                                deactivate_task(prev, rq);

                /* Schedule a linked job? */
                if (entry->linked) {
                        *next = entry->linked;
                        /* mark the task as executing on this cpu */
                        set_task_cpu(*next, smp_processor_id());
                        /* stick the task into the runqueue */
                        __activate_task(*next, rq);
                }
        } else
                /* Only override the Linux scheduler if we have a real-time
                 * task scheduled that needs to continue.
                 */
                if (exists)
                        *next = prev;

        /* Unlock in case that we don't affect real-time tasks or
         * if nothing changed and finish_switch won't be called.
         */
        if (prev == *next || (!is_realtime(prev) && !*next))
                queue_unlock(&adaptive_lock);

        return 0;
}

/* _finish_switch - we just finished the switch away from prev */
static void adaptive_finish_switch(struct task_struct *prev)
{
        cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);

        if (is_realtime(current))
                entry->scheduled = current;
        else
                entry->scheduled = NULL;

        prev->rt_param.scheduled_on = NO_CPU;
        current->rt_param.scheduled_on = smp_processor_id();

        /* unlock in case schedule() left it locked */
        if (is_realtime(current) || is_realtime(prev))
                queue_unlock(&adaptive_lock);
}

/* Prepare a task for running in RT mode.
 * Enqueues the task into the master queue data structure.
 * Returns
 *      -EPERM if the task is not TASK_STOPPED
 */
static long adaptive_prepare_task(struct task_struct * t)
{
        unsigned long flags;

        TRACE("adaptive: prepare task %d\n", t->pid);

        if (t->state == TASK_STOPPED) {
                __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);

                t->rt_param.scheduled_on = NO_CPU;
                t->rt_param.linked_on    = NO_CPU;
                if (t->rt_param.no_service_levels) {
                        t->rt_param.predictor_state.estimate =
                                get_sl(t, 0).weight;
                } else
                        t->rt_param.predictor_state.estimate =
                                _frac(get_exec_cost(t), get_rt_period(t));

                TRACE_TASK(t, "est_weight=" _FP_ "\n", get_est_weight(t));

                if (get_rt_mode() == MODE_RT_RUN)
                        /* The action is already on.
                         * Prepare immediate release.
                         */
                        edf_release_now(t);
                /* The task should be running in the queue, otherwise signal
                 * code will try to wake it up with fatal consequences.
                 */
                t->state = TASK_RUNNING;

                queue_lock_irqsave(&adaptive_lock, flags);
                total_weight = _add(total_weight, get_est_weight(t));
                requeue(t);
                queue_unlock_irqrestore(&adaptive_lock, flags);
                return 0;
        } else
                return -EPERM;
}

static void adaptive_wake_up_task(struct task_struct *task)
{
        unsigned long flags;

        /* We must determine whether the task should go into the release
         * queue or into the ready queue. It may enter the ready queue
         * if it has credit left in its time slice and has not yet reached
         * its deadline. If it is now past its deadline we assume this is
         * the arrival of a new sporadic job and thus put it in the ready
         * queue anyway. If it has zero budget and the next release is in
         * the future it has to go to the release queue.
         */
        TRACE("adaptive: %d unsuspends\n", task->pid);

        task->state = TASK_RUNNING;

        if (is_tardy(task)) {
                /* new sporadic release */
                edf_release_now(task);
                sched_trace_job_release(task);
        } else if (task->time_slice)
                /* came back in time before deadline */
                set_rt_flags(task, RT_F_RUNNING);

        queue_lock_irqsave(&adaptive_lock, flags);
        total_weight = _add(total_weight, get_est_weight(task));
        adaptive_job_arrival(task);
        queue_unlock_irqrestore(&adaptive_lock, flags);
}

static void adaptive_task_blocks(struct task_struct *t)
{
        unsigned long flags;

        /* unlink if necessary */
        queue_lock_irqsave(&adaptive_lock, flags);
        total_weight = _sub(total_weight, get_est_weight(t));
        unlink(t);
        queue_unlock_irqrestore(&adaptive_lock, flags);

        BUG_ON(!is_realtime(t));
        TRACE("task %d suspends\n", t->pid);

        BUG_ON(t->rt_list.next != LIST_POISON1);
        BUG_ON(t->rt_list.prev != LIST_POISON2);
}

/* When _tear_down is called, the task should not be in any queue any more
 * as it must have blocked first. We don't have any internal state for the
 * task, it is all in the task_struct.
 */
static long adaptive_tear_down(struct task_struct * t)
{
        BUG_ON(!is_realtime(t));
        TRACE_TASK(t, "RIP\n");
        BUG_ON(t->array);
        BUG_ON(t->rt_list.next != LIST_POISON1);
        BUG_ON(t->rt_list.prev != LIST_POISON2);
        return 0;
}

static int adaptive_mode_change(int new_mode)
{
        unsigned long flags;
        int cpu;
        cpu_entry_t *entry;
        struct task_struct* t;
        struct list_head* pos;

        if (new_mode == MODE_RT_RUN) {
                queue_lock_irqsave(&adaptive_lock, flags);

                system_capacity = FP(0);
                for_each_online_cpu(cpu)
                        system_capacity = _add(system_capacity, FP(1));

                __rerelease_all(&adaptive, edf_release_at);

                total_weight = FP(0);
                list_for_each(pos, &adaptive.release_queue) {
                        t = list_entry(pos, struct task_struct, rt_list);
                        total_weight = _add(total_weight, get_est_weight(t));
                }
                TRACE("adaptive: total weight: " _FP_
                      " (at mode change)\n", total_weight);

                /* get old cruft out of the way in case we reenter real-time
                 * mode for a second time
                 */
                while (!list_empty(&adaptive_cpu_queue))
                        list_del(adaptive_cpu_queue.next);
                /* reinitialize */
                for_each_online_cpu(cpu) {
                        entry = &per_cpu(adaptive_cpu_entries, cpu);
                        atomic_set(&entry->will_schedule, 0);
                        entry->linked    = NULL;
                        entry->scheduled = NULL;
                        list_add(&entry->list, &adaptive_cpu_queue);
                }

                adaptive_optimize();

                queue_unlock_irqrestore(&adaptive_lock, flags);
        }
        return 0;
}

typedef enum {
        ADAPTIVE_SET_MIN_OPT_SEP = 1
} adaptive_cmds_t;

static int adaptive_setup(int cmd, void __user *up)
{
        int error = -EINVAL;
        unsigned int val;

        if (copy_from_user(&val, up, sizeof(unsigned int))) {
                error = -EFAULT;
                goto out;
        }

        switch (cmd) {
        case ADAPTIVE_SET_MIN_OPT_SEP:
                optimizer_min_invocation_sep = val;
                TRACE("adaptive: min opt sep set to %d\n",
                      optimizer_min_invocation_sep);
                return 0;
        }

out:
        return error;
}

/* Plugin object */
static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
        .ready_to_use = 0
};

/*
 * Plugin initialization code.
 */
#define INIT_SCHED_PLUGIN (struct sched_plugin) {               \
        .plugin_name            = "ADAPTIVE",                   \
        .ready_to_use           = 1,                            \
        .scheduler_tick         = adaptive_scheduler_tick,      \
        .prepare_task           = adaptive_prepare_task,        \
        .sleep_next_period      = edf_sleep_next_period,        \
        .tear_down              = adaptive_tear_down,           \
        .schedule               = adaptive_schedule,            \
        .finish_switch          = adaptive_finish_switch,       \
        .mode_change            = adaptive_mode_change,         \
        .wake_up_task           = adaptive_wake_up_task,        \
        .task_blocks            = adaptive_task_blocks,         \
        .scheduler_setup        = adaptive_setup                \
}

sched_plugin_t *__init init_adaptive_plugin(void)
{
        int cpu;
        cpu_entry_t *entry;

        /* magic values given in the paper */
        fc_a = _frac(102, 1000);
        fc_b = _frac(303, 1000);

        optimizer_period             = 1000;
        optimizer_min_invocation_sep = 50;
        task_error_threshold         = _frac(1, 2);

        if (!s_plugin.ready_to_use) {
                /* initialize CPU state */
                for (cpu = 0; cpu < NR_CPUS; cpu++) {
                        entry = &per_cpu(adaptive_cpu_entries, cpu);
                        atomic_set(&entry->will_schedule, 0);
                        entry->linked    = NULL;
                        entry->scheduled = NULL;
                        entry->cpu       = cpu;
                }

                queue_lock_init(&adaptive_lock);
                edf_domain_init(&adaptive, NULL);
                s_plugin = INIT_SCHED_PLUGIN;
        }
        return &s_plugin;
}