#ifndef _LINUX_RT_PARAM_H_
#define _LINUX_RT_PARAM_H_
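/*
 * Definition of the real-time task parameters and related data types.
 *
 * The first part of this header is shared with user space (liblitmus);
 * everything inside #ifdef __KERNEL__ is kernel-internal.
 */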

#include <litmus/fpmath.h>

/* Litmus time type. */
typedef unsigned long long lt_t;

/* Wraparound-safe time comparisons.
 *
 * The difference is computed in unsigned arithmetic (which wraps by
 * definition) and only then interpreted as a signed value. Subtracting two
 * values that were already cast to signed can overflow, which is undefined
 * behavior in C.
 */

/* Returns non-zero iff time a is strictly after time b. */
static inline int lt_after(lt_t a, lt_t b)
{
	return (long long) (b - a) < 0;
}
/* Returns non-zero iff time a is strictly before time b. */
#define lt_before(a, b) lt_after(b, a)

/* Returns non-zero iff time a is after or equal to time b. */
static inline int lt_after_eq(lt_t a, lt_t b)
{
	return (long long) (a - b) >= 0;
}
/* Returns non-zero iff time a is before or equal to time b. */
#define lt_before_eq(a, b) lt_after_eq(b, a)

/* Task classes (the different types of clients).
 * The numeric values are spelled out; they equal the implicit numbering. */
typedef enum {
	RT_CLASS_HARD        = 0,
	RT_CLASS_SOFT        = 1,
	RT_CLASS_BEST_EFFORT = 2
} task_class_t;

/* How a job's execution budget is enforced. */
typedef enum {
	/* job may overrun unhindered */
	NO_ENFORCEMENT      = 0,
	/* budgets are only checked on quantum boundaries */
	QUANTUM_ENFORCEMENT = 1,
	/* budgets are enforced with hrtimers */
	PRECISE_ENFORCEMENT = 2,
} budget_policy_t;

/* Whether and when a job is signalled about budget exhaustion. */
typedef enum {
	/* job receives no signals when it exhausts its budget */
	NO_SIGNALS      = 0,
	/* budget signals are only sent on quantum boundaries */
	QUANTUM_SIGNALS = 1,
	/* budget signals are triggered with hrtimers */
	PRECISE_SIGNALS = 2,
} budget_signal_policy_t;

/* Auxiliary-task flags; each enumerator is a distinct bit so that values
 * can be OR-ed together.
 *
 * Note that AUX_FUTURE is bit 3 (0x8), not bit 2: bit 2 (0x4) is unused.
 * The values are kept exactly as they were.
 */
typedef enum {
	AUX_ENABLE  = (1 << 0),	/* 0x1 */
	AUX_CURRENT = (1 << 1),	/* 0x2 */
	AUX_FUTURE  = (1 << 3)	/* 0x8 */
} aux_flags_t;

/* Mirror of st_event_record_type_t.
 *
 * Events are UNsupported unless marked "supported" below. Numbering starts
 * at 1 and is written out explicitly.
 */
typedef enum {
	ST_INJECT_NAME        = 1,	/* supported */
	ST_INJECT_PARAM       = 2,	/* supported */
	ST_INJECT_RELEASE     = 3,	/* supported */
	ST_INJECT_ASSIGNED    = 4,
	ST_INJECT_SWITCH_TO   = 5,
	ST_INJECT_SWITCH_AWAY = 6,
	ST_INJECT_COMPLETION  = 7,	/* supported */
	ST_INJECT_BLOCK       = 8,
	ST_INJECT_RESUME      = 9,
	ST_INJECT_ACTION      = 10,
	ST_INJECT_SYS_RELEASE = 11,	/* supported */
} sched_trace_injection_events_t;

/* Arguments accompanying an injected sched_trace event.
 * NOTE(review): presumably used together with
 * sched_trace_injection_events_t; the consumer is not visible in this
 * header -- confirm against the caller.
 */
struct st_inject_args {
	lt_t release;		/* release time to record */
	lt_t deadline;		/* deadline to record */
	unsigned int job_no;	/* job sequence number to record */
};

/* We use the common priority interpretation "lower index == higher priority",
 * which is commonly used in fixed-priority schedulability analysis papers.
 * So, a numerically lower priority value implies higher scheduling priority,
 * with priority 1 being the highest priority. Priority 0 is reserved for
 * priority boosting. LITMUS_MAX_PRIORITY denotes the size of the priority
 * value range, so the valid fixed priorities are
 * LITMUS_HIGHEST_PRIORITY (1) through LITMUS_LOWEST_PRIORITY (511).
 */

#define LITMUS_MAX_PRIORITY     512
#define LITMUS_HIGHEST_PRIORITY   1
#define LITMUS_LOWEST_PRIORITY    (LITMUS_MAX_PRIORITY - 1)

/* Provide generic comparison macros for userspace,
 * in case that we change this later.
 *
 * Arguments are fully parenthesized so that expression arguments such as
 * (x | y) are compared as a whole. Note that litmus_is_valid_fixed_prio()
 * evaluates its argument twice; do not pass expressions with side effects.
 */
#define litmus_higher_fixed_prio(a, b)	((a) < (b))
#define litmus_lower_fixed_prio(a, b)	((a) > (b))
#define litmus_is_valid_fixed_prio(p)		\
	((p) >= LITMUS_HIGHEST_PRIORITY &&	\
	 (p) <= LITMUS_LOWEST_PRIORITY)

/* Per-task real-time parameters. Embedded in struct rt_param as the
 * "user controlled parameters" (task_params).
 *
 * This definition is visible to user space as well, so the field order and
 * types must not be changed casually.
 */
struct rt_task {
	lt_t 		exec_cost;
	lt_t 		period;
	lt_t		relative_deadline;
	lt_t		phase;
	unsigned int	cpu;
	/* presumably a fixed priority as described by the LITMUS_*_PRIORITY
	 * macros -- TODO confirm against the plugins */
	unsigned int	priority;
	task_class_t	cls;
	budget_policy_t budget_policy; /* ignored by pfair */
	budget_signal_policy_t budget_signal_policy; /* currently ignored by pfair */
};

/* Scheduler flag word of the control page (see the 'sched' field of
 * struct control_page).
 *
 * 'raw' overlays the bit-fields so that the whole 64-bit word can be read or
 * written at once. The two bit-fields together use 32 bits; where exactly
 * they are placed within 'raw' is determined by the compiler's bit-field
 * layout.
 */
union np_flag {
	uint64_t raw;
	struct {
		/* Is the task currently in a non-preemptive section? */
		uint64_t flag:31;
		/* Should the task call into the scheduler? */
		uint64_t preempt:1;
	} np;
};

/* Arguments for an affinity observer.
 * Also embedded as the first member of struct gpu_affinity_observer_args.
 */
struct affinity_observer_args
{
	/* presumably the object descriptor of the lock to observe --
	 * TODO confirm against the locking code */
	int lock_od;
};

/* Arguments for a GPU affinity observer; extends the generic
 * affinity_observer_args, which is embedded as the first member.
 * NOTE(review): the meaning of the GPU-specific fields is inferred from
 * their names only; confirm against the GPU affinity code.
 */
struct gpu_affinity_observer_args
{
	struct affinity_observer_args obs;
	int replica_to_gpu_offset;
	int nr_simult_users;
	int relaxed_rules;
};

/* The definition of the data that is shared between the kernel and real-time
 * tasks via a shared page (see litmus/ctrldev.c).
 *
 * WARNING: User space can write to this, so don't trust
 * the correctness of the fields!
 *
 * This serves two purposes: to enable efficient signaling
 * of non-preemptive sections (user->kernel) and
 * delayed preemptions (kernel->user), and to export
 * some real-time relevant statistics such as preemption and
 * migration data to user space. (We can't use a device to export
 * statistics because we want to avoid system call overhead when
 * determining preemption/migration overheads.)
 *
 * The field order must stay in sync with the LITMUS_CP_OFFSET_* constants.
 */
struct control_page {
	/* This flag is used by userspace to communicate non-preemptive
	 * sections. */
	volatile union np_flag sched;

	volatile uint64_t irq_count; /* Incremented by the kernel each time an IRQ is
				      * handled. */

	/* Locking overhead tracing: userspace records here the time stamp
	 * and IRQ counter prior to starting the system call. */
	uint64_t ts_syscall_start;  /* Feather-Trace cycles */
	uint64_t irq_syscall_start; /* Snapshot of irq_count when the syscall
				     * started. */

	/* to be extended */
};

/* Expected offsets (in bytes) within the control page.
 *
 * These follow the field order of struct control_page, in which every field
 * is 64 bits wide: sched, irq_count, ts_syscall_start, irq_syscall_start.
 */

#define LITMUS_CP_OFFSET_SCHED		0
#define LITMUS_CP_OFFSET_IRQ_COUNT	8
#define LITMUS_CP_OFFSET_TS_SC_START	16
#define LITMUS_CP_OFFSET_IRQ_SC_START	24

/* don't export internal data structures to user space (liblitmus) */
#ifdef __KERNEL__

#include <litmus/binheap.h>
#include <linux/semaphore.h>

#ifdef CONFIG_LITMUS_SOFTIRQD
#include <linux/interrupt.h>
#endif

struct _rt_domain;
struct bheap_node;
struct release_heap;

/* Per-job timing state of a real-time task. Embedded in struct rt_param as
 * the "timing parameters" (job_params). */
struct rt_job {
	/* Time instant at which the job was or will be released.  */
	lt_t	release;
	/* What is the current deadline? */
	lt_t   	deadline;

	/* How much service has this job received so far? */
	lt_t	exec_time;

	/* By how much did the prior job miss its deadline?
	 * Value differs from tardiness in that lateness may
	 * be negative (when job finishes before its deadline).
	 */
	long long	lateness;

	/* Which job is this. This is used to let user space
	 * specify which job to wait for, which is important if jobs
	 * overrun. If we just call sys_sleep_next_period() then we
	 * will unintentionally miss jobs after an overrun.
	 *
	 * Increase this sequence number when a job is released.
	 */
	unsigned int    job_no;

	/* bits:
	 * 0th: Set if a budget exhaustion signal has already been sent for
	 *      the current job (see RT_JOB_SIG_BUDGET_SENT). */
	unsigned long	flags;
};

/* Bit index within rt_job.flags: set if a budget exhaustion signal has
 * already been sent for the current job. */
#define RT_JOB_SIG_BUDGET_SENT  0

struct pfair_param;

/* Semaphore status values for klmirqd.
 * NOTE(review): the state machine that uses these values is not visible in
 * this header; the numeric values are spelled out and equal the implicit
 * numbering.
 */
enum klmirqd_sem_status
{
	NEED_TO_REACQUIRE = 0,
	REACQUIRING       = 1,
	NOT_HELD          = 2,
	HELD              = 3
};

/* GPU migration distance classes, numbered sequentially from MIG_LOCAL (0).
 * MIG_LAST aliases the last enumerator and is used to size per-distance
 * arrays (MIG_LAST + 1 entries).
 *
 * TODO: Make this variable against NR_NVIDIA_GPUS
 */
typedef enum gpu_migration_dist
{
	MIG_LOCAL = 0,
	MIG_NEAR,		/* 1 */
	MIG_MED,		/* 2 */
	MIG_FAR,		/* 3; 8 GPUs in a binary tree hierarchy */
	MIG_NONE,		/* 4 */

	MIG_LAST = MIG_NONE
} gpu_migration_dist_t;

/* State of a feedback-based estimator.
 * NOTE(review): fp_t presumably is the fixed-point type from
 * <litmus/fpmath.h>; field meanings are inferred from their names --
 * confirm against the estimator code.
 */
typedef struct feedback_est{
	fp_t est;		/* current estimate */
	fp_t accum_err;		/* accumulated error */
} feedback_est_t;


/* Number of samples kept in the history window of avg_est_t. */
#define AVG_EST_WINDOW_SIZE 20

/* Forward declaration: without a prior file-scope declaration, a struct tag
 * first mentioned inside a parameter list has prototype scope only and would
 * not be compatible with the real struct task_struct. Redundant (and
 * harmless) if an earlier include already declared it. */
struct task_struct;

/* Callback type taking a task and returning an int status.
 * Stored in rt_param.rsrc_exit_cb when CONFIG_LITMUS_AFFINITY_LOCKING is
 * enabled. NOTE(review): presumably invoked when the task exits while
 * holding a resource -- confirm against the caller. */
typedef int (*notify_rsrc_exit_t)(struct task_struct* tsk);

/* State of a windowed-average estimator over the last AVG_EST_WINDOW_SIZE
 * samples.
 * NOTE(review): the update logic is not in this header; the per-field notes
 * below are inferred from names -- confirm against the estimator code.
 */
typedef struct avg_est{
	lt_t history[AVG_EST_WINDOW_SIZE];	/* sample window */
	uint16_t count;	/* presumably the number of valid samples */
	uint16_t idx;	/* presumably the current position in history[] */
	lt_t sum;
	lt_t std;
	lt_t avg;
} avg_est_t;


#ifdef CONFIG_LITMUS_SOFTIRQD
/* State of a klmirqd thread (only with CONFIG_LITMUS_SOFTIRQD).
 * Referenced from rt_param.klmirqd_info, which is only valid if
 * rt_param.is_interrupt_thread == 1.
 */
struct klmirqd_info
{
	struct task_struct*		klmirqd;
    unsigned int			terminating:1;

	raw_spinlock_t			lock;

	u32						pending;
	/* counters of pending items, one per queue below */
	atomic_t				num_hi_pending;
	atomic_t				num_low_pending;
	atomic_t				num_work_pending;

	/* in order of priority */
	struct tasklet_head     pending_tasklets_hi;
	struct tasklet_head		pending_tasklets;
	struct list_head		worklist;

	/* list linkage; presumably the registry of klmirqd threads --
	 * TODO confirm */
	struct list_head		klmirqd_reg;
};
#endif


/*	RT task parameters for scheduling extensions
 *	These parameters are inherited during clone and therefore must
 *	be explicitly set up before the task set is launched.
 *
 *	Kernel-only (inside #ifdef __KERNEL__). Several groups of fields
 *	exist only if the corresponding CONFIG_* option is enabled, so the
 *	layout of this struct depends on the kernel configuration.
 */
struct rt_param {
	/* is the task sleeping?
	 * NOTE(review): this is an 8-bit field; the individual flag values
	 * are not defined in this header. */
	unsigned int 		flags:8;

	/* do we need to check for srp blocking? */
	unsigned int		srp_non_recurse:1;

	/* is the task present? (true if it can be scheduled) */
	unsigned int		present:1;

	/* has the task completed? */
	unsigned int		completed:1;

#ifdef CONFIG_LITMUS_SOFTIRQD
    /* proxy threads have minimum priority by default */
    unsigned int        is_interrupt_thread:1;

	/* pointer to data used by klmirqd thread.
	 *
	 * ptr only valid if is_interrupt_thread == 1
	 */
	struct klmirqd_info* klmirqd_info;
#endif

#ifdef CONFIG_LITMUS_NVIDIA
	long unsigned int			held_gpus;  // bitmap of held GPUs.
	struct binheap_node			gpu_owner_node; // just one GPU for now...
	unsigned int				hide_from_gpu:1;

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	/* one estimator per gpu_migration_dist_t value (MIG_LOCAL..MIG_LAST) */
	avg_est_t gpu_migration_est[MIG_LAST+1];

	gpu_migration_dist_t	gpu_migration;
	int				last_gpu;
	lt_t accum_gpu_time;
	lt_t gpu_time_stamp;

	unsigned int suspend_gpu_tracker_on_block:1;
#endif
#endif

#ifdef CONFIG_LITMUS_AFFINITY_LOCKING
	/* callback of type notify_rsrc_exit_t and its opaque argument */
	notify_rsrc_exit_t  rsrc_exit_cb;
	void* rsrc_exit_cb_args;
#endif

#ifdef CONFIG_LITMUS_LOCKING
	/* Is the task being priority-boosted by a locking protocol? */
	unsigned int		priority_boosted:1;
	/* If so, when did this start? */
	lt_t			boost_start_time;
#endif

	/* user controlled parameters */
	struct rt_task 		task_params;

	/* timing parameters */
	struct rt_job 		job_params;

	/* task representing the current "inherited" task
	 * priority, assigned by inherit_priority and
	 * return priority in the scheduler plugins.
	 * could point to self if PI does not result in
	 * an increased task priority.
	 */
	struct task_struct*	inh_task;

#ifdef CONFIG_LITMUS_NESTED_LOCKING
	raw_spinlock_t			hp_blocked_tasks_lock;
	struct binheap			hp_blocked_tasks;

	/* pointer to the lock upon which the task is currently blocked */
	struct litmus_lock* blocked_lock;
#endif


#ifdef CONFIG_REALTIME_AUX_TASKS
	unsigned int		is_aux_task:1;
	unsigned int		aux_ready:1;
	unsigned int		has_aux_tasks:1;
	unsigned int		hide_from_aux_tasks:1;

	struct list_head	aux_task_node;
	struct binheap_node	aux_task_owner_node;
#endif


#ifdef CONFIG_NP_SECTION
	/* For the FMLP under PSN-EDF, it is required to make the task
	 * non-preemptive from kernel space. In order not to interfere with
	 * user space, this counter indicates the kernel space np setting.
	 * kernel_np > 0 => task is non-preemptive
	 */
	unsigned int	kernel_np;
#endif

	/* This field can be used by plugins to store where the task
	 * is currently scheduled. It is the responsibility of the
	 * plugin to avoid race conditions.
	 *
	 * This is used by GSN-EDF and PFAIR.
	 */
	volatile int		scheduled_on;

	/* Is the stack of the task currently in use? This is updated by
	 * the LITMUS core.
	 *
	 * Be careful to avoid deadlocks!
	 */
	volatile int		stack_in_use;

	/* This field can be used by plugins to store where the task
	 * is currently linked. It is the responsibility of the plugin
	 * to avoid race conditions.
	 *
	 * Used by GSN-EDF.
	 */
	volatile int		linked_on;

	/* PFAIR/PD^2 state. Allocated on demand. */
	struct pfair_param*	pfair;

	/* Fields saved before BE->RT transition.
	 */
	int old_policy;
	int old_prio;

	/* ready queue for this task */
	struct _rt_domain* domain;

	/* heap element for this task
	 *
	 * Warning: Don't statically allocate this node. The heap
	 *          implementation swaps these between tasks, thus after
	 *          dequeuing from a heap you may end up with a different node
	 *          than the one you had when enqueuing the task.  For the same
	 *          reason, don't obtain and store references to this node
	 *          other than this pointer (which is updated by the heap
	 *          implementation).
	 */
	struct bheap_node*	heap_node;
	struct release_heap*	rel_heap;

	/* Used by rt_domain to queue task in release list.
	 */
	struct list_head list;

	/* Pointer to the page shared between userspace and kernel. */
	struct control_page * ctrl_page;
};

#ifdef CONFIG_REALTIME_AUX_TASKS
/* Bookkeeping for auxiliary tasks (only with CONFIG_REALTIME_AUX_TASKS).
 * NOTE(review): presumably the counterparts of rt_param.aux_task_node and
 * rt_param.aux_task_owner_node are linked into the two containers below --
 * confirm against the aux-task code.
 */
struct aux_data
{
	struct list_head	aux_tasks;
	struct binheap		aux_task_owners;
	unsigned int		initialized:1;
	unsigned int		aux_future:1;
};
#endif

#endif  /* __KERNEL__ */

#endif