/* * Definition of the scheduler plugin interface. * */ #ifndef _LINUX_RT_PARAM_H_ #define _LINUX_RT_PARAM_H_ #include /* Litmus time type. */ typedef unsigned long long lt_t; static inline int lt_after(lt_t a, lt_t b) { return ((long long) b) - ((long long) a) < 0; } #define lt_before(a, b) lt_after(b, a) static inline int lt_after_eq(lt_t a, lt_t b) { return ((long long) a) - ((long long) b) >= 0; } #define lt_before_eq(a, b) lt_after_eq(b, a) /* different types of clients */ typedef enum { RT_CLASS_HARD, RT_CLASS_SOFT, RT_CLASS_BEST_EFFORT } task_class_t; typedef enum { NO_ENFORCEMENT, /* job may overrun unhindered */ QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */ PRECISE_ENFORCEMENT, /* budgets are enforced with hrtimers */ } budget_policy_t; typedef enum { NO_SIGNALS, /* job receives no signals when it exhausts its budget */ QUANTUM_SIGNALS, /* budget signals are only sent on quantum boundaries */ PRECISE_SIGNALS, /* budget signals are triggered with hrtimers */ } budget_signal_policy_t; /* We use the common priority interpretation "lower index == higher priority", * which is commonly used in fixed-priority schedulability analysis papers. * So, a numerically lower priority value implies higher scheduling priority, * with priority 1 being the highest priority. Priority 0 is reserved for * priority boosting. LITMUS_MAX_PRIORITY denotes the maximum priority value * range. */ #define LITMUS_MAX_PRIORITY 512 #define LITMUS_HIGHEST_PRIORITY 1 #define LITMUS_LOWEST_PRIORITY (LITMUS_MAX_PRIORITY - 1) /* Provide generic comparison macros for userspace, * in case that we change this later. */ #define litmus_higher_fixed_prio(a, b) (a < b) #define litmus_lower_fixed_prio(a, b) (a > b) #define litmus_is_valid_fixed_prio(p) \ ((p) >= LITMUS_HIGHEST_PRIORITY && \ (p) <= LITMUS_LOWEST_PRIORITY) struct rt_task { lt_t exec_cost; lt_t period; lt_t relative_deadline; lt_t phase; unsigned int cpu; unsigned int priority; task_class_t cls; budget_policy_t budget_policy; /* ignored by pfair */ budget_signal_policy_t budget_signal_policy; /* currently ignored by pfair */ }; union np_flag { uint32_t raw; struct { /* Is the task currently in a non-preemptive section? */ uint32_t flag:31; /* Should the task call into the scheduler? */ uint32_t preempt:1; } np; }; struct affinity_observer_args { int lock_od; }; struct gpu_affinity_observer_args { struct affinity_observer_args obs; int replica_to_gpu_offset; int nr_simult_users; int relaxed_rules; }; /* The definition of the data that is shared between the kernel and real-time * tasks via a shared page (see litmus/ctrldev.c). * * WARNING: User space can write to this, so don't trust * the correctness of the fields! * * This servees two purposes: to enable efficient signaling * of non-preemptive sections (user->kernel) and * delayed preemptions (kernel->user), and to export * some real-time relevant statistics such as preemption and * migration data to user space. We can't use a device to export * statistics because we want to avoid system call overhead when * determining preemption/migration overheads). */ struct control_page { volatile union np_flag sched; /* to be extended */ }; /* don't export internal data structures to user space (liblitmus) */ #ifdef __KERNEL__ #include #include struct _rt_domain; struct bheap_node; struct release_heap; struct rt_job { /* Time instant the the job was or will be released. */ lt_t release; /* What is the current deadline? */ lt_t deadline; /* How much service has this job received so far? */ lt_t exec_time; /* By how much did the prior job miss its deadline by? * Value differs from tardiness in that lateness may * be negative (when job finishes before its deadline). */ long long lateness; /* Which job is this. This is used to let user space * specify which job to wait for, which is important if jobs * overrun. If we just call sys_sleep_next_period() then we * will unintentionally miss jobs after an overrun. * * Increase this sequence number when a job is released. */ unsigned int job_no; /* bits: * 0th: Set if a budget exhaustion signal has already been sent for * the current job. */ unsigned long flags; }; #define RT_JOB_SIG_BUDGET_SENT 0 struct pfair_param; enum klitirqd_sem_status { NEED_TO_REACQUIRE, REACQUIRING, NOT_HELD, HELD }; typedef enum gpu_migration_dist { // TODO: Make this variable against NR_NVIDIA_GPUS MIG_LOCAL = 0, MIG_NEAR = 1, MIG_MED = 2, MIG_FAR = 3, // 8 GPUs in a binary tree hierarchy MIG_NONE = 4, MIG_LAST = MIG_NONE } gpu_migration_dist_t; typedef struct feedback_est{ fp_t est; fp_t accum_err; } feedback_est_t; #define AVG_EST_WINDOW_SIZE 20 typedef struct avg_est{ lt_t history[AVG_EST_WINDOW_SIZE]; uint16_t count; uint16_t idx; lt_t sum; lt_t std; lt_t avg; } avg_est_t; /* RT task parameters for scheduling extensions * These parameters are inherited during clone and therefore must * be explicitly set up before the task set is launched. */ struct rt_param { /* is the task sleeping? */ unsigned int flags:8; /* do we need to check for srp blocking? */ unsigned int srp_non_recurse:1; /* is the task present? (true if it can be scheduled) */ unsigned int present:1; #ifdef CONFIG_LITMUS_SOFTIRQD /* proxy threads have minimum priority by default */ unsigned int is_proxy_thread:1; /* pointer to klitirqd currently working on this task_struct's behalf. only set by the task pointed to by klitirqd. ptr only valid if is_proxy_thread == 0 */ struct task_struct* cur_klitirqd; /* Used to implement mutual execution exclusion between * job and klitirqd execution. Job must always hold * it's klitirqd_sem to execute. klitirqd instance * must hold the semaphore before executing on behalf * of a job. */ struct mutex klitirqd_sem; /* status of held klitirqd_sem, even if the held klitirqd_sem is from another task (only proxy threads do this though). */ atomic_t klitirqd_sem_stat; #endif #ifdef CONFIG_LITMUS_NVIDIA /* number of top-half interrupts handled on behalf of current job */ atomic_t nv_int_count; long unsigned int held_gpus; // bitmap of held GPUs. #ifdef CONFIG_LITMUS_AFFINITY_LOCKING avg_est_t gpu_migration_est[MIG_LAST+1]; gpu_migration_dist_t gpu_migration; int last_gpu; lt_t accum_gpu_time; lt_t gpu_time_stamp; unsigned int suspend_gpu_tracker_on_block:1; #endif #endif #ifdef CONFIG_LITMUS_LOCKING /* Is the task being priority-boosted by a locking protocol? */ unsigned int priority_boosted:1; /* If so, when did this start? */ lt_t boost_start_time; #endif /* user controlled parameters */ struct rt_task task_params; /* timing parameters */ struct rt_job job_params; /* task representing the current "inherited" task * priority, assigned by inherit_priority and * return priority in the scheduler plugins. * could point to self if PI does not result in * an increased task priority. */ struct task_struct* inh_task; #ifdef CONFIG_LITMUS_NESTED_LOCKING raw_spinlock_t hp_blocked_tasks_lock; struct binheap hp_blocked_tasks; /* pointer to lock upon which is currently blocked */ struct litmus_lock* blocked_lock; #endif struct task_struct* hp_group; unsigned int is_slave:1; unsigned int has_slaves:1; #ifdef CONFIG_NP_SECTION /* For the FMLP under PSN-EDF, it is required to make the task * non-preemptive from kernel space. In order not to interfere with * user space, this counter indicates the kernel space np setting. * kernel_np > 0 => task is non-preemptive */ unsigned int kernel_np; #endif /* This field can be used by plugins to store where the task * is currently scheduled. It is the responsibility of the * plugin to avoid race conditions. * * This used by GSN-EDF and PFAIR. */ volatile int scheduled_on; /* Is the stack of the task currently in use? This is updated by * the LITMUS core. * * Be careful to avoid deadlocks! */ volatile int stack_in_use; /* This field can be used by plugins to store where the task * is currently linked. It is the responsibility of the plugin * to avoid race conditions. * * Used by GSN-EDF. */ volatile int linked_on; /* PFAIR/PD^2 state. Allocated on demand. */ struct pfair_param* pfair; /* Fields saved before BE->RT transition. */ int old_policy; int old_prio; /* ready queue for this task */ struct _rt_domain* domain; /* heap element for this task * * Warning: Don't statically allocate this node. The heap * implementation swaps these between tasks, thus after * dequeuing from a heap you may end up with a different node * then the one you had when enqueuing the task. For the same * reason, don't obtain and store references to this node * other than this pointer (which is updated by the heap * implementation). */ struct bheap_node* heap_node; struct release_heap* rel_heap; /* Used by rt_domain to queue task in release list. */ struct list_head list; /* Pointer to the page shared between userspace and kernel. */ struct control_page * ctrl_page; }; /* Possible RT flags */ #define RT_F_RUNNING 0x00000000 #define RT_F_SLEEP 0x00000001 #define RT_F_EXIT_SEM 0x00000008 #endif #endif